diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23639 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 16856, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0023730422401518746, + "grad_norm": 3.3012346274534408, + "learning_rate": 1.4827995255041518e-07, + "loss": 0.9532, + "step": 5 + }, + { + "epoch": 0.004746084480303749, + "grad_norm": 3.2453901600434927, + "learning_rate": 2.9655990510083035e-07, + "loss": 0.9506, + "step": 10 + }, + { + "epoch": 0.007119126720455624, + "grad_norm": 3.3063811487253774, + "learning_rate": 4.4483985765124564e-07, + "loss": 0.9561, + "step": 15 + }, + { + "epoch": 0.009492168960607499, + "grad_norm": 3.0124679320627834, + "learning_rate": 5.931198102016607e-07, + "loss": 0.9495, + "step": 20 + }, + { + "epoch": 0.011865211200759373, + "grad_norm": 2.39745977667125, + "learning_rate": 7.41399762752076e-07, + "loss": 0.9406, + "step": 25 + }, + { + "epoch": 0.014238253440911248, + "grad_norm": 2.162974880299377, + "learning_rate": 8.896797153024913e-07, + "loss": 0.9204, + "step": 30 + }, + { + "epoch": 0.016611295681063124, + "grad_norm": 1.4719613723599203, + "learning_rate": 1.0379596678529062e-06, + "loss": 0.8899, + "step": 35 + }, + { + "epoch": 0.018984337921214997, + "grad_norm": 1.4362384937789894, + "learning_rate": 1.1862396204033214e-06, + "loss": 0.8729, + "step": 40 + }, + { + "epoch": 0.021357380161366873, + "grad_norm": 1.0617274864665578, + "learning_rate": 1.3345195729537368e-06, + "loss": 0.8665, + "step": 45 + }, + { + "epoch": 0.023730422401518746, + "grad_norm": 0.8587915416684542, + "learning_rate": 1.482799525504152e-06, + "loss": 0.8311, + "step": 50 + }, + { + "epoch": 0.026103464641670623, + "grad_norm": 0.8260285973003328, + "learning_rate": 1.631079478054567e-06, + "loss": 0.8161, + "step": 55 + }, + { + "epoch": 0.028476506881822496, + "grad_norm": 0.586588115262803, + "learning_rate": 1.7793594306049826e-06, + "loss": 0.7932, + "step": 60 + }, + { + "epoch": 0.030849549121974372, + "grad_norm": 0.7005559681270839, + "learning_rate": 1.9276393831553977e-06, + "loss": 0.7598, + "step": 65 + }, + { + "epoch": 0.03322259136212625, + "grad_norm": 0.5202240451810206, + "learning_rate": 2.0759193357058125e-06, + "loss": 0.7462, + "step": 70 + }, + { + "epoch": 0.03559563360227812, + "grad_norm": 0.45468689989845074, + "learning_rate": 2.224199288256228e-06, + "loss": 0.7258, + "step": 75 + }, + { + "epoch": 0.037968675842429994, + "grad_norm": 0.40893045293016167, + "learning_rate": 2.372479240806643e-06, + "loss": 0.7164, + "step": 80 + }, + { + "epoch": 0.04034171808258187, + "grad_norm": 0.41141265860075205, + "learning_rate": 2.520759193357058e-06, + "loss": 0.7079, + "step": 85 + }, + { + "epoch": 0.04271476032273375, + "grad_norm": 0.381263611063291, + "learning_rate": 2.6690391459074736e-06, + "loss": 0.7095, + "step": 90 + }, + { + "epoch": 0.045087802562885616, + "grad_norm": 0.35766613991996504, + "learning_rate": 2.8173190984578884e-06, + "loss": 0.7024, + "step": 95 + }, + { + "epoch": 0.04746084480303749, + "grad_norm": 0.3204572771435286, + "learning_rate": 2.965599051008304e-06, + "loss": 0.6993, + "step": 100 + }, + { + "epoch": 0.04983388704318937, + "grad_norm": 0.31747242887588323, + "learning_rate": 3.113879003558719e-06, + "loss": 0.6876, + "step": 105 + }, + { + "epoch": 0.052206929283341245, + "grad_norm": 0.3334831168209942, + "learning_rate": 3.262158956109134e-06, + "loss": 0.6775, + "step": 110 + }, + { + "epoch": 0.054579971523493115, + "grad_norm": 0.32189039077080556, + "learning_rate": 3.4104389086595495e-06, + "loss": 0.6719, + "step": 115 + }, + { + "epoch": 0.05695301376364499, + "grad_norm": 0.32700405994012854, + "learning_rate": 3.558718861209965e-06, + "loss": 0.6567, + "step": 120 + }, + { + "epoch": 0.05932605600379687, + "grad_norm": 0.3178545439579411, + "learning_rate": 3.7069988137603794e-06, + "loss": 0.6698, + "step": 125 + }, + { + "epoch": 0.061699098243948744, + "grad_norm": 0.32683530632015984, + "learning_rate": 3.8552787663107955e-06, + "loss": 0.6446, + "step": 130 + }, + { + "epoch": 0.06407214048410062, + "grad_norm": 0.30220930997119105, + "learning_rate": 4.00355871886121e-06, + "loss": 0.6505, + "step": 135 + }, + { + "epoch": 0.0664451827242525, + "grad_norm": 0.3014602595417579, + "learning_rate": 4.151838671411625e-06, + "loss": 0.6443, + "step": 140 + }, + { + "epoch": 0.06881822496440437, + "grad_norm": 0.29452550378866854, + "learning_rate": 4.3001186239620406e-06, + "loss": 0.6563, + "step": 145 + }, + { + "epoch": 0.07119126720455624, + "grad_norm": 0.3298457973156086, + "learning_rate": 4.448398576512456e-06, + "loss": 0.6436, + "step": 150 + }, + { + "epoch": 0.07356430944470811, + "grad_norm": 0.31172790883810075, + "learning_rate": 4.596678529062871e-06, + "loss": 0.6272, + "step": 155 + }, + { + "epoch": 0.07593735168485999, + "grad_norm": 0.3238660393104736, + "learning_rate": 4.744958481613286e-06, + "loss": 0.6427, + "step": 160 + }, + { + "epoch": 0.07831039392501186, + "grad_norm": 0.32201152715486914, + "learning_rate": 4.893238434163701e-06, + "loss": 0.6267, + "step": 165 + }, + { + "epoch": 0.08068343616516374, + "grad_norm": 0.31797029109753616, + "learning_rate": 5.041518386714116e-06, + "loss": 0.6316, + "step": 170 + }, + { + "epoch": 0.08305647840531562, + "grad_norm": 0.3308386737313696, + "learning_rate": 5.189798339264532e-06, + "loss": 0.6354, + "step": 175 + }, + { + "epoch": 0.0854295206454675, + "grad_norm": 0.3256177337072402, + "learning_rate": 5.338078291814947e-06, + "loss": 0.6098, + "step": 180 + }, + { + "epoch": 0.08780256288561937, + "grad_norm": 0.3135232710649523, + "learning_rate": 5.486358244365362e-06, + "loss": 0.6179, + "step": 185 + }, + { + "epoch": 0.09017560512577123, + "grad_norm": 0.32031765482804947, + "learning_rate": 5.634638196915777e-06, + "loss": 0.6082, + "step": 190 + }, + { + "epoch": 0.09254864736592311, + "grad_norm": 0.3262188292383802, + "learning_rate": 5.782918149466192e-06, + "loss": 0.6146, + "step": 195 + }, + { + "epoch": 0.09492168960607499, + "grad_norm": 0.35372195923163335, + "learning_rate": 5.931198102016608e-06, + "loss": 0.6108, + "step": 200 + }, + { + "epoch": 0.09729473184622686, + "grad_norm": 0.32958268543483216, + "learning_rate": 6.079478054567023e-06, + "loss": 0.5955, + "step": 205 + }, + { + "epoch": 0.09966777408637874, + "grad_norm": 0.35359713100368884, + "learning_rate": 6.227758007117438e-06, + "loss": 0.6099, + "step": 210 + }, + { + "epoch": 0.10204081632653061, + "grad_norm": 0.34542718222677854, + "learning_rate": 6.376037959667854e-06, + "loss": 0.6007, + "step": 215 + }, + { + "epoch": 0.10441385856668249, + "grad_norm": 0.32226149915126967, + "learning_rate": 6.524317912218268e-06, + "loss": 0.6024, + "step": 220 + }, + { + "epoch": 0.10678690080683437, + "grad_norm": 0.34736857682005506, + "learning_rate": 6.6725978647686826e-06, + "loss": 0.6003, + "step": 225 + }, + { + "epoch": 0.10915994304698623, + "grad_norm": 0.33713082240259973, + "learning_rate": 6.820877817319099e-06, + "loss": 0.622, + "step": 230 + }, + { + "epoch": 0.1115329852871381, + "grad_norm": 0.334902995756442, + "learning_rate": 6.969157769869514e-06, + "loss": 0.6087, + "step": 235 + }, + { + "epoch": 0.11390602752728998, + "grad_norm": 0.35150297438228933, + "learning_rate": 7.11743772241993e-06, + "loss": 0.5981, + "step": 240 + }, + { + "epoch": 0.11627906976744186, + "grad_norm": 0.32947417974802357, + "learning_rate": 7.265717674970345e-06, + "loss": 0.604, + "step": 245 + }, + { + "epoch": 0.11865211200759374, + "grad_norm": 0.3210384873066011, + "learning_rate": 7.413997627520759e-06, + "loss": 0.6026, + "step": 250 + }, + { + "epoch": 0.12102515424774561, + "grad_norm": 0.3855413772535901, + "learning_rate": 7.562277580071175e-06, + "loss": 0.6023, + "step": 255 + }, + { + "epoch": 0.12339819648789749, + "grad_norm": 0.3868968781664726, + "learning_rate": 7.710557532621591e-06, + "loss": 0.5813, + "step": 260 + }, + { + "epoch": 0.12577123872804935, + "grad_norm": 0.39154737958589847, + "learning_rate": 7.858837485172005e-06, + "loss": 0.5824, + "step": 265 + }, + { + "epoch": 0.12814428096820124, + "grad_norm": 0.35185515323762334, + "learning_rate": 8.00711743772242e-06, + "loss": 0.5957, + "step": 270 + }, + { + "epoch": 0.1305173232083531, + "grad_norm": 0.36915906222914, + "learning_rate": 8.155397390272836e-06, + "loss": 0.5984, + "step": 275 + }, + { + "epoch": 0.132890365448505, + "grad_norm": 0.3595523452429201, + "learning_rate": 8.30367734282325e-06, + "loss": 0.586, + "step": 280 + }, + { + "epoch": 0.13526340768865686, + "grad_norm": 0.35245804687138305, + "learning_rate": 8.451957295373667e-06, + "loss": 0.601, + "step": 285 + }, + { + "epoch": 0.13763644992880875, + "grad_norm": 0.3538366217776793, + "learning_rate": 8.600237247924081e-06, + "loss": 0.5964, + "step": 290 + }, + { + "epoch": 0.1400094921689606, + "grad_norm": 0.3502800597403107, + "learning_rate": 8.748517200474495e-06, + "loss": 0.5774, + "step": 295 + }, + { + "epoch": 0.14238253440911247, + "grad_norm": 0.37156130032807116, + "learning_rate": 8.896797153024912e-06, + "loss": 0.5888, + "step": 300 + }, + { + "epoch": 0.14475557664926436, + "grad_norm": 0.33397596452013123, + "learning_rate": 9.045077105575326e-06, + "loss": 0.589, + "step": 305 + }, + { + "epoch": 0.14712861888941622, + "grad_norm": 0.3268109520246717, + "learning_rate": 9.193357058125742e-06, + "loss": 0.5774, + "step": 310 + }, + { + "epoch": 0.14950166112956811, + "grad_norm": 0.3636849624789689, + "learning_rate": 9.341637010676157e-06, + "loss": 0.5764, + "step": 315 + }, + { + "epoch": 0.15187470336971998, + "grad_norm": 0.36943520728621476, + "learning_rate": 9.489916963226571e-06, + "loss": 0.5814, + "step": 320 + }, + { + "epoch": 0.15424774560987187, + "grad_norm": 0.4124521741636909, + "learning_rate": 9.638196915776987e-06, + "loss": 0.5908, + "step": 325 + }, + { + "epoch": 0.15662078785002373, + "grad_norm": 0.3606093224335012, + "learning_rate": 9.786476868327403e-06, + "loss": 0.5731, + "step": 330 + }, + { + "epoch": 0.1589938300901756, + "grad_norm": 0.35609580379154593, + "learning_rate": 9.934756820877818e-06, + "loss": 0.587, + "step": 335 + }, + { + "epoch": 0.16136687233032748, + "grad_norm": 0.39723326363223765, + "learning_rate": 1.0083036773428232e-05, + "loss": 0.5801, + "step": 340 + }, + { + "epoch": 0.16373991457047934, + "grad_norm": 0.4023803899190287, + "learning_rate": 1.023131672597865e-05, + "loss": 0.5794, + "step": 345 + }, + { + "epoch": 0.16611295681063123, + "grad_norm": 0.355235601755243, + "learning_rate": 1.0379596678529063e-05, + "loss": 0.5856, + "step": 350 + }, + { + "epoch": 0.1684859990507831, + "grad_norm": 0.36810573835290467, + "learning_rate": 1.0527876631079477e-05, + "loss": 0.5754, + "step": 355 + }, + { + "epoch": 0.170859041290935, + "grad_norm": 0.3491478434564828, + "learning_rate": 1.0676156583629894e-05, + "loss": 0.5797, + "step": 360 + }, + { + "epoch": 0.17323208353108685, + "grad_norm": 0.3986661419120226, + "learning_rate": 1.0824436536180308e-05, + "loss": 0.5662, + "step": 365 + }, + { + "epoch": 0.17560512577123874, + "grad_norm": 0.36415521268536455, + "learning_rate": 1.0972716488730724e-05, + "loss": 0.565, + "step": 370 + }, + { + "epoch": 0.1779781680113906, + "grad_norm": 0.408573298265987, + "learning_rate": 1.112099644128114e-05, + "loss": 0.5715, + "step": 375 + }, + { + "epoch": 0.18035121025154247, + "grad_norm": 0.3856990325239515, + "learning_rate": 1.1269276393831553e-05, + "loss": 0.5642, + "step": 380 + }, + { + "epoch": 0.18272425249169436, + "grad_norm": 0.3870854063730815, + "learning_rate": 1.1417556346381969e-05, + "loss": 0.5714, + "step": 385 + }, + { + "epoch": 0.18509729473184622, + "grad_norm": 0.39026689260030334, + "learning_rate": 1.1565836298932385e-05, + "loss": 0.5724, + "step": 390 + }, + { + "epoch": 0.1874703369719981, + "grad_norm": 0.4178752177624149, + "learning_rate": 1.17141162514828e-05, + "loss": 0.5708, + "step": 395 + }, + { + "epoch": 0.18984337921214997, + "grad_norm": 0.36302997422703387, + "learning_rate": 1.1862396204033216e-05, + "loss": 0.5587, + "step": 400 + }, + { + "epoch": 0.19221642145230186, + "grad_norm": 0.4065730727875294, + "learning_rate": 1.2010676156583631e-05, + "loss": 0.5628, + "step": 405 + }, + { + "epoch": 0.19458946369245372, + "grad_norm": 0.3900851892303272, + "learning_rate": 1.2158956109134045e-05, + "loss": 0.5696, + "step": 410 + }, + { + "epoch": 0.1969625059326056, + "grad_norm": 0.3634921063395654, + "learning_rate": 1.2307236061684461e-05, + "loss": 0.5583, + "step": 415 + }, + { + "epoch": 0.19933554817275748, + "grad_norm": 0.38484538645441185, + "learning_rate": 1.2455516014234877e-05, + "loss": 0.5637, + "step": 420 + }, + { + "epoch": 0.20170859041290934, + "grad_norm": 0.3652157185144551, + "learning_rate": 1.2603795966785292e-05, + "loss": 0.5598, + "step": 425 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 0.3764240946359201, + "learning_rate": 1.2752075919335708e-05, + "loss": 0.5585, + "step": 430 + }, + { + "epoch": 0.2064546748932131, + "grad_norm": 0.397140361346716, + "learning_rate": 1.2900355871886122e-05, + "loss": 0.5519, + "step": 435 + }, + { + "epoch": 0.20882771713336498, + "grad_norm": 0.3195457159255324, + "learning_rate": 1.3048635824436536e-05, + "loss": 0.5525, + "step": 440 + }, + { + "epoch": 0.21120075937351684, + "grad_norm": 0.3990567806151324, + "learning_rate": 1.3196915776986951e-05, + "loss": 0.5517, + "step": 445 + }, + { + "epoch": 0.21357380161366873, + "grad_norm": 0.3755761451515465, + "learning_rate": 1.3345195729537365e-05, + "loss": 0.5481, + "step": 450 + }, + { + "epoch": 0.2159468438538206, + "grad_norm": 0.39204890849662255, + "learning_rate": 1.3493475682087784e-05, + "loss": 0.5518, + "step": 455 + }, + { + "epoch": 0.21831988609397246, + "grad_norm": 0.402769328551532, + "learning_rate": 1.3641755634638198e-05, + "loss": 0.552, + "step": 460 + }, + { + "epoch": 0.22069292833412435, + "grad_norm": 0.35378180832247846, + "learning_rate": 1.3790035587188614e-05, + "loss": 0.5519, + "step": 465 + }, + { + "epoch": 0.2230659705742762, + "grad_norm": 0.3910683920907302, + "learning_rate": 1.3938315539739028e-05, + "loss": 0.5498, + "step": 470 + }, + { + "epoch": 0.2254390128144281, + "grad_norm": 0.3687274540533866, + "learning_rate": 1.4086595492289441e-05, + "loss": 0.5571, + "step": 475 + }, + { + "epoch": 0.22781205505457996, + "grad_norm": 0.3852811229003759, + "learning_rate": 1.423487544483986e-05, + "loss": 0.5576, + "step": 480 + }, + { + "epoch": 0.23018509729473186, + "grad_norm": 0.43216217367531373, + "learning_rate": 1.4383155397390274e-05, + "loss": 0.5552, + "step": 485 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 0.399588223506238, + "learning_rate": 1.453143534994069e-05, + "loss": 0.5524, + "step": 490 + }, + { + "epoch": 0.2349311817750356, + "grad_norm": 0.41571436502062936, + "learning_rate": 1.4679715302491104e-05, + "loss": 0.5573, + "step": 495 + }, + { + "epoch": 0.23730422401518747, + "grad_norm": 0.4583867826518887, + "learning_rate": 1.4827995255041518e-05, + "loss": 0.5533, + "step": 500 + }, + { + "epoch": 0.23967726625533933, + "grad_norm": 0.46163850254039046, + "learning_rate": 1.4976275207591933e-05, + "loss": 0.5334, + "step": 505 + }, + { + "epoch": 0.24205030849549122, + "grad_norm": 0.3922807774607747, + "learning_rate": 1.512455516014235e-05, + "loss": 0.5379, + "step": 510 + }, + { + "epoch": 0.24442335073564309, + "grad_norm": 0.4099857472943755, + "learning_rate": 1.5272835112692766e-05, + "loss": 0.5504, + "step": 515 + }, + { + "epoch": 0.24679639297579498, + "grad_norm": 0.38584769809436775, + "learning_rate": 1.5421115065243182e-05, + "loss": 0.5544, + "step": 520 + }, + { + "epoch": 0.24916943521594684, + "grad_norm": 0.3740345284471938, + "learning_rate": 1.5569395017793594e-05, + "loss": 0.5437, + "step": 525 + }, + { + "epoch": 0.2515424774560987, + "grad_norm": 0.41280027539873604, + "learning_rate": 1.571767497034401e-05, + "loss": 0.5488, + "step": 530 + }, + { + "epoch": 0.2539155196962506, + "grad_norm": 0.46800837205401813, + "learning_rate": 1.5865954922894425e-05, + "loss": 0.547, + "step": 535 + }, + { + "epoch": 0.2562885619364025, + "grad_norm": 0.39951340215340997, + "learning_rate": 1.601423487544484e-05, + "loss": 0.5538, + "step": 540 + }, + { + "epoch": 0.2586616041765543, + "grad_norm": 0.4220993329810669, + "learning_rate": 1.6162514827995256e-05, + "loss": 0.5518, + "step": 545 + }, + { + "epoch": 0.2610346464167062, + "grad_norm": 0.4179215801816134, + "learning_rate": 1.6310794780545672e-05, + "loss": 0.5438, + "step": 550 + }, + { + "epoch": 0.2634076886568581, + "grad_norm": 0.3882103885401237, + "learning_rate": 1.6459074733096084e-05, + "loss": 0.5427, + "step": 555 + }, + { + "epoch": 0.26578073089701, + "grad_norm": 0.3591021374743675, + "learning_rate": 1.66073546856465e-05, + "loss": 0.5357, + "step": 560 + }, + { + "epoch": 0.2681537731371618, + "grad_norm": 0.4111039666272371, + "learning_rate": 1.6755634638196915e-05, + "loss": 0.5431, + "step": 565 + }, + { + "epoch": 0.2705268153773137, + "grad_norm": 0.4797830767928753, + "learning_rate": 1.6903914590747334e-05, + "loss": 0.5488, + "step": 570 + }, + { + "epoch": 0.2728998576174656, + "grad_norm": 0.36677458486856146, + "learning_rate": 1.7052194543297747e-05, + "loss": 0.5381, + "step": 575 + }, + { + "epoch": 0.2752728998576175, + "grad_norm": 0.4181265482905293, + "learning_rate": 1.7200474495848162e-05, + "loss": 0.5457, + "step": 580 + }, + { + "epoch": 0.2776459420977693, + "grad_norm": 0.4191407882096967, + "learning_rate": 1.7348754448398578e-05, + "loss": 0.5416, + "step": 585 + }, + { + "epoch": 0.2800189843379212, + "grad_norm": 0.3955716334735947, + "learning_rate": 1.749703440094899e-05, + "loss": 0.5379, + "step": 590 + }, + { + "epoch": 0.2823920265780731, + "grad_norm": 0.4125380671401423, + "learning_rate": 1.7645314353499406e-05, + "loss": 0.537, + "step": 595 + }, + { + "epoch": 0.28476506881822494, + "grad_norm": 0.41657962425019524, + "learning_rate": 1.7793594306049825e-05, + "loss": 0.5528, + "step": 600 + }, + { + "epoch": 0.28713811105837683, + "grad_norm": 0.4542100932585639, + "learning_rate": 1.794187425860024e-05, + "loss": 0.5512, + "step": 605 + }, + { + "epoch": 0.2895111532985287, + "grad_norm": 0.4763380565361265, + "learning_rate": 1.8090154211150652e-05, + "loss": 0.5345, + "step": 610 + }, + { + "epoch": 0.2918841955386806, + "grad_norm": 0.4160842001278535, + "learning_rate": 1.8238434163701068e-05, + "loss": 0.5466, + "step": 615 + }, + { + "epoch": 0.29425723777883245, + "grad_norm": 0.44526984274013437, + "learning_rate": 1.8386714116251484e-05, + "loss": 0.5291, + "step": 620 + }, + { + "epoch": 0.29663028001898434, + "grad_norm": 0.4775247819212695, + "learning_rate": 1.85349940688019e-05, + "loss": 0.5368, + "step": 625 + }, + { + "epoch": 0.29900332225913623, + "grad_norm": 0.43243495806434634, + "learning_rate": 1.8683274021352315e-05, + "loss": 0.5414, + "step": 630 + }, + { + "epoch": 0.30137636449928806, + "grad_norm": 0.39761908198880813, + "learning_rate": 1.883155397390273e-05, + "loss": 0.5458, + "step": 635 + }, + { + "epoch": 0.30374940673943995, + "grad_norm": 0.4210928552565514, + "learning_rate": 1.8979833926453143e-05, + "loss": 0.5328, + "step": 640 + }, + { + "epoch": 0.30612244897959184, + "grad_norm": 0.4325895607757229, + "learning_rate": 1.912811387900356e-05, + "loss": 0.5454, + "step": 645 + }, + { + "epoch": 0.30849549121974373, + "grad_norm": 0.48830110028725376, + "learning_rate": 1.9276393831553974e-05, + "loss": 0.5213, + "step": 650 + }, + { + "epoch": 0.31086853345989557, + "grad_norm": 0.4057114575873381, + "learning_rate": 1.9424673784104393e-05, + "loss": 0.5539, + "step": 655 + }, + { + "epoch": 0.31324157570004746, + "grad_norm": 0.3795257336668363, + "learning_rate": 1.9572953736654805e-05, + "loss": 0.5349, + "step": 660 + }, + { + "epoch": 0.31561461794019935, + "grad_norm": 0.4141072168789327, + "learning_rate": 1.972123368920522e-05, + "loss": 0.5313, + "step": 665 + }, + { + "epoch": 0.3179876601803512, + "grad_norm": 0.46330118265249154, + "learning_rate": 1.9869513641755636e-05, + "loss": 0.5399, + "step": 670 + }, + { + "epoch": 0.3203607024205031, + "grad_norm": 0.5289215797774157, + "learning_rate": 2.001779359430605e-05, + "loss": 0.535, + "step": 675 + }, + { + "epoch": 0.32273374466065496, + "grad_norm": 0.4144365652590653, + "learning_rate": 2.0166073546856464e-05, + "loss": 0.536, + "step": 680 + }, + { + "epoch": 0.32510678690080685, + "grad_norm": 0.5046205801622089, + "learning_rate": 2.0314353499406883e-05, + "loss": 0.5418, + "step": 685 + }, + { + "epoch": 0.3274798291409587, + "grad_norm": 0.44077389334245864, + "learning_rate": 2.04626334519573e-05, + "loss": 0.5309, + "step": 690 + }, + { + "epoch": 0.3298528713811106, + "grad_norm": 0.40640486206145127, + "learning_rate": 2.061091340450771e-05, + "loss": 0.5381, + "step": 695 + }, + { + "epoch": 0.33222591362126247, + "grad_norm": 0.43988475294939433, + "learning_rate": 2.0759193357058127e-05, + "loss": 0.5267, + "step": 700 + }, + { + "epoch": 0.33459895586141436, + "grad_norm": 0.4320836546446297, + "learning_rate": 2.0907473309608542e-05, + "loss": 0.5293, + "step": 705 + }, + { + "epoch": 0.3369719981015662, + "grad_norm": 0.43450452561157477, + "learning_rate": 2.1055753262158954e-05, + "loss": 0.5319, + "step": 710 + }, + { + "epoch": 0.3393450403417181, + "grad_norm": 0.41685269734665076, + "learning_rate": 2.1204033214709373e-05, + "loss": 0.5333, + "step": 715 + }, + { + "epoch": 0.34171808258187, + "grad_norm": 0.4004755313185238, + "learning_rate": 2.135231316725979e-05, + "loss": 0.5148, + "step": 720 + }, + { + "epoch": 0.3440911248220218, + "grad_norm": 0.44725984492713566, + "learning_rate": 2.1500593119810205e-05, + "loss": 0.539, + "step": 725 + }, + { + "epoch": 0.3464641670621737, + "grad_norm": 0.39334044544390223, + "learning_rate": 2.1648873072360617e-05, + "loss": 0.5319, + "step": 730 + }, + { + "epoch": 0.3488372093023256, + "grad_norm": 0.4643667764086214, + "learning_rate": 2.1797153024911032e-05, + "loss": 0.5291, + "step": 735 + }, + { + "epoch": 0.3512102515424775, + "grad_norm": 0.5331374241565268, + "learning_rate": 2.1945432977461448e-05, + "loss": 0.5268, + "step": 740 + }, + { + "epoch": 0.3535832937826293, + "grad_norm": 0.4945637884261665, + "learning_rate": 2.2093712930011864e-05, + "loss": 0.5123, + "step": 745 + }, + { + "epoch": 0.3559563360227812, + "grad_norm": 0.47825936175115275, + "learning_rate": 2.224199288256228e-05, + "loss": 0.5414, + "step": 750 + }, + { + "epoch": 0.3583293782629331, + "grad_norm": 0.4764353162418638, + "learning_rate": 2.2390272835112695e-05, + "loss": 0.5353, + "step": 755 + }, + { + "epoch": 0.36070242050308493, + "grad_norm": 0.45453452651190546, + "learning_rate": 2.2538552787663107e-05, + "loss": 0.5308, + "step": 760 + }, + { + "epoch": 0.3630754627432368, + "grad_norm": 0.42459695280396786, + "learning_rate": 2.2686832740213523e-05, + "loss": 0.5282, + "step": 765 + }, + { + "epoch": 0.3654485049833887, + "grad_norm": 0.46084581904090643, + "learning_rate": 2.2835112692763938e-05, + "loss": 0.5276, + "step": 770 + }, + { + "epoch": 0.3678215472235406, + "grad_norm": 0.4646358279028416, + "learning_rate": 2.2983392645314357e-05, + "loss": 0.5247, + "step": 775 + }, + { + "epoch": 0.37019458946369244, + "grad_norm": 0.4336158725146883, + "learning_rate": 2.313167259786477e-05, + "loss": 0.5349, + "step": 780 + }, + { + "epoch": 0.3725676317038443, + "grad_norm": 0.44527312524667795, + "learning_rate": 2.3279952550415185e-05, + "loss": 0.525, + "step": 785 + }, + { + "epoch": 0.3749406739439962, + "grad_norm": 0.45095383049598037, + "learning_rate": 2.34282325029656e-05, + "loss": 0.5336, + "step": 790 + }, + { + "epoch": 0.37731371618414805, + "grad_norm": 0.43117183410762083, + "learning_rate": 2.3576512455516013e-05, + "loss": 0.5176, + "step": 795 + }, + { + "epoch": 0.37968675842429994, + "grad_norm": 0.4385554613843474, + "learning_rate": 2.3724792408066432e-05, + "loss": 0.5376, + "step": 800 + }, + { + "epoch": 0.38205980066445183, + "grad_norm": 0.4460922074930747, + "learning_rate": 2.3873072360616847e-05, + "loss": 0.5135, + "step": 805 + }, + { + "epoch": 0.3844328429046037, + "grad_norm": 0.4400760051952893, + "learning_rate": 2.4021352313167263e-05, + "loss": 0.5249, + "step": 810 + }, + { + "epoch": 0.38680588514475556, + "grad_norm": 0.40585392016286337, + "learning_rate": 2.4169632265717675e-05, + "loss": 0.5264, + "step": 815 + }, + { + "epoch": 0.38917892738490745, + "grad_norm": 0.4729708518291751, + "learning_rate": 2.431791221826809e-05, + "loss": 0.5225, + "step": 820 + }, + { + "epoch": 0.39155196962505934, + "grad_norm": 0.4419648774782904, + "learning_rate": 2.4466192170818506e-05, + "loss": 0.5263, + "step": 825 + }, + { + "epoch": 0.3939250118652112, + "grad_norm": 0.44684005881380745, + "learning_rate": 2.4614472123368922e-05, + "loss": 0.5221, + "step": 830 + }, + { + "epoch": 0.39629805410536306, + "grad_norm": 0.37965802864124837, + "learning_rate": 2.4762752075919338e-05, + "loss": 0.5142, + "step": 835 + }, + { + "epoch": 0.39867109634551495, + "grad_norm": 0.4887440153622237, + "learning_rate": 2.4911032028469753e-05, + "loss": 0.5192, + "step": 840 + }, + { + "epoch": 0.40104413858566684, + "grad_norm": 0.46456079409720835, + "learning_rate": 2.5059311981020165e-05, + "loss": 0.5131, + "step": 845 + }, + { + "epoch": 0.4034171808258187, + "grad_norm": 0.4643028097400259, + "learning_rate": 2.5207591933570584e-05, + "loss": 0.5171, + "step": 850 + }, + { + "epoch": 0.40579022306597057, + "grad_norm": 0.4699750295999524, + "learning_rate": 2.5355871886120997e-05, + "loss": 0.5238, + "step": 855 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 0.47405431277498733, + "learning_rate": 2.5504151838671416e-05, + "loss": 0.5306, + "step": 860 + }, + { + "epoch": 0.41053630754627435, + "grad_norm": 0.449649042882859, + "learning_rate": 2.5652431791221824e-05, + "loss": 0.5304, + "step": 865 + }, + { + "epoch": 0.4129093497864262, + "grad_norm": 0.4008681165865558, + "learning_rate": 2.5800711743772243e-05, + "loss": 0.5086, + "step": 870 + }, + { + "epoch": 0.4152823920265781, + "grad_norm": 0.43032435925300583, + "learning_rate": 2.5948991696322662e-05, + "loss": 0.5133, + "step": 875 + }, + { + "epoch": 0.41765543426672996, + "grad_norm": 0.43398683679482736, + "learning_rate": 2.609727164887307e-05, + "loss": 0.5321, + "step": 880 + }, + { + "epoch": 0.4200284765068818, + "grad_norm": 0.46312356407922634, + "learning_rate": 2.624555160142349e-05, + "loss": 0.5334, + "step": 885 + }, + { + "epoch": 0.4224015187470337, + "grad_norm": 0.553245980747163, + "learning_rate": 2.6393831553973902e-05, + "loss": 0.5236, + "step": 890 + }, + { + "epoch": 0.4247745609871856, + "grad_norm": 0.6284300504529712, + "learning_rate": 2.654211150652432e-05, + "loss": 0.5065, + "step": 895 + }, + { + "epoch": 0.42714760322733747, + "grad_norm": 0.5303309135122871, + "learning_rate": 2.669039145907473e-05, + "loss": 0.5196, + "step": 900 + }, + { + "epoch": 0.4295206454674893, + "grad_norm": 0.5216001448514943, + "learning_rate": 2.683867141162515e-05, + "loss": 0.5188, + "step": 905 + }, + { + "epoch": 0.4318936877076412, + "grad_norm": 0.5253975589290137, + "learning_rate": 2.6986951364175568e-05, + "loss": 0.5077, + "step": 910 + }, + { + "epoch": 0.4342667299477931, + "grad_norm": 0.48940082359972686, + "learning_rate": 2.7135231316725977e-05, + "loss": 0.5063, + "step": 915 + }, + { + "epoch": 0.4366397721879449, + "grad_norm": 0.5044814127727938, + "learning_rate": 2.7283511269276396e-05, + "loss": 0.5196, + "step": 920 + }, + { + "epoch": 0.4390128144280968, + "grad_norm": 0.4137926997412646, + "learning_rate": 2.7431791221826808e-05, + "loss": 0.511, + "step": 925 + }, + { + "epoch": 0.4413858566682487, + "grad_norm": 0.47779006520306977, + "learning_rate": 2.7580071174377227e-05, + "loss": 0.5099, + "step": 930 + }, + { + "epoch": 0.4437588989084006, + "grad_norm": 0.5235065251727852, + "learning_rate": 2.7728351126927643e-05, + "loss": 0.5141, + "step": 935 + }, + { + "epoch": 0.4461319411485524, + "grad_norm": 0.43424752868321015, + "learning_rate": 2.7876631079478055e-05, + "loss": 0.5067, + "step": 940 + }, + { + "epoch": 0.4485049833887043, + "grad_norm": 0.4729681335087864, + "learning_rate": 2.8024911032028474e-05, + "loss": 0.518, + "step": 945 + }, + { + "epoch": 0.4508780256288562, + "grad_norm": 0.41777430021681183, + "learning_rate": 2.8173190984578883e-05, + "loss": 0.5196, + "step": 950 + }, + { + "epoch": 0.4532510678690081, + "grad_norm": 0.4898399518793486, + "learning_rate": 2.8321470937129302e-05, + "loss": 0.5076, + "step": 955 + }, + { + "epoch": 0.45562411010915993, + "grad_norm": 0.47388433899300964, + "learning_rate": 2.846975088967972e-05, + "loss": 0.5158, + "step": 960 + }, + { + "epoch": 0.4579971523493118, + "grad_norm": 0.41293304779080336, + "learning_rate": 2.861803084223013e-05, + "loss": 0.5316, + "step": 965 + }, + { + "epoch": 0.4603701945894637, + "grad_norm": 0.4300290066560654, + "learning_rate": 2.876631079478055e-05, + "loss": 0.5306, + "step": 970 + }, + { + "epoch": 0.46274323682961555, + "grad_norm": 0.39130027555005736, + "learning_rate": 2.891459074733096e-05, + "loss": 0.5208, + "step": 975 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 0.5064968730142579, + "learning_rate": 2.906287069988138e-05, + "loss": 0.5211, + "step": 980 + }, + { + "epoch": 0.4674893213099193, + "grad_norm": 0.37814067000017126, + "learning_rate": 2.921115065243179e-05, + "loss": 0.5162, + "step": 985 + }, + { + "epoch": 0.4698623635500712, + "grad_norm": 0.3690366042949725, + "learning_rate": 2.9359430604982208e-05, + "loss": 0.5144, + "step": 990 + }, + { + "epoch": 0.47223540579022305, + "grad_norm": 0.4336872130127643, + "learning_rate": 2.9507710557532627e-05, + "loss": 0.5167, + "step": 995 + }, + { + "epoch": 0.47460844803037494, + "grad_norm": 0.4098214132014324, + "learning_rate": 2.9655990510083035e-05, + "loss": 0.5165, + "step": 1000 + }, + { + "epoch": 0.47698149027052683, + "grad_norm": 0.4668275588678632, + "learning_rate": 2.9804270462633454e-05, + "loss": 0.5211, + "step": 1005 + }, + { + "epoch": 0.47935453251067867, + "grad_norm": 0.4664397804681739, + "learning_rate": 2.9952550415183867e-05, + "loss": 0.5055, + "step": 1010 + }, + { + "epoch": 0.48172757475083056, + "grad_norm": 0.4933690064600994, + "learning_rate": 3.0100830367734286e-05, + "loss": 0.523, + "step": 1015 + }, + { + "epoch": 0.48410061699098245, + "grad_norm": 0.4030473846880117, + "learning_rate": 3.02491103202847e-05, + "loss": 0.5135, + "step": 1020 + }, + { + "epoch": 0.48647365923113434, + "grad_norm": 0.4181287471422199, + "learning_rate": 3.0397390272835113e-05, + "loss": 0.5028, + "step": 1025 + }, + { + "epoch": 0.48884670147128617, + "grad_norm": 0.6467043170453629, + "learning_rate": 3.054567022538553e-05, + "loss": 0.5254, + "step": 1030 + }, + { + "epoch": 0.49121974371143806, + "grad_norm": 0.4774630800025195, + "learning_rate": 3.0693950177935945e-05, + "loss": 0.5141, + "step": 1035 + }, + { + "epoch": 0.49359278595158995, + "grad_norm": 0.43692205943495027, + "learning_rate": 3.0842230130486364e-05, + "loss": 0.5367, + "step": 1040 + }, + { + "epoch": 0.4959658281917418, + "grad_norm": 0.43567495937642653, + "learning_rate": 3.099051008303677e-05, + "loss": 0.5151, + "step": 1045 + }, + { + "epoch": 0.4983388704318937, + "grad_norm": 0.4320194414166116, + "learning_rate": 3.113879003558719e-05, + "loss": 0.4929, + "step": 1050 + }, + { + "epoch": 0.5007119126720455, + "grad_norm": 0.42453775761601936, + "learning_rate": 3.128706998813761e-05, + "loss": 0.5107, + "step": 1055 + }, + { + "epoch": 0.5030849549121974, + "grad_norm": 0.4774873576654063, + "learning_rate": 3.143534994068802e-05, + "loss": 0.5128, + "step": 1060 + }, + { + "epoch": 0.5054579971523493, + "grad_norm": 0.42773917832938757, + "learning_rate": 3.158362989323844e-05, + "loss": 0.5098, + "step": 1065 + }, + { + "epoch": 0.5078310393925012, + "grad_norm": 0.4153144561817415, + "learning_rate": 3.173190984578885e-05, + "loss": 0.5124, + "step": 1070 + }, + { + "epoch": 0.5102040816326531, + "grad_norm": 0.42566012561743904, + "learning_rate": 3.188018979833927e-05, + "loss": 0.5114, + "step": 1075 + }, + { + "epoch": 0.512577123872805, + "grad_norm": 0.430210979390748, + "learning_rate": 3.202846975088968e-05, + "loss": 0.5126, + "step": 1080 + }, + { + "epoch": 0.5149501661129569, + "grad_norm": 0.43344398631323494, + "learning_rate": 3.2176749703440094e-05, + "loss": 0.4912, + "step": 1085 + }, + { + "epoch": 0.5173232083531086, + "grad_norm": 0.4612980073496342, + "learning_rate": 3.232502965599051e-05, + "loss": 0.5157, + "step": 1090 + }, + { + "epoch": 0.5196962505932605, + "grad_norm": 0.4167364536029927, + "learning_rate": 3.2473309608540925e-05, + "loss": 0.4999, + "step": 1095 + }, + { + "epoch": 0.5220692928334124, + "grad_norm": 0.4922697662528512, + "learning_rate": 3.2621589561091344e-05, + "loss": 0.513, + "step": 1100 + }, + { + "epoch": 0.5244423350735643, + "grad_norm": 0.4362151292597722, + "learning_rate": 3.276986951364176e-05, + "loss": 0.5207, + "step": 1105 + }, + { + "epoch": 0.5268153773137162, + "grad_norm": 0.4312496565986578, + "learning_rate": 3.291814946619217e-05, + "loss": 0.4999, + "step": 1110 + }, + { + "epoch": 0.5291884195538681, + "grad_norm": 0.4515963302805682, + "learning_rate": 3.306642941874259e-05, + "loss": 0.498, + "step": 1115 + }, + { + "epoch": 0.53156146179402, + "grad_norm": 0.4664094121570051, + "learning_rate": 3.3214709371293e-05, + "loss": 0.4904, + "step": 1120 + }, + { + "epoch": 0.5339345040341718, + "grad_norm": 0.41851013924489594, + "learning_rate": 3.336298932384342e-05, + "loss": 0.5033, + "step": 1125 + }, + { + "epoch": 0.5363075462743236, + "grad_norm": 0.4347062062992873, + "learning_rate": 3.351126927639383e-05, + "loss": 0.5033, + "step": 1130 + }, + { + "epoch": 0.5386805885144755, + "grad_norm": 0.3793368642185312, + "learning_rate": 3.365954922894425e-05, + "loss": 0.5084, + "step": 1135 + }, + { + "epoch": 0.5410536307546274, + "grad_norm": 0.4113679711480408, + "learning_rate": 3.380782918149467e-05, + "loss": 0.4976, + "step": 1140 + }, + { + "epoch": 0.5434266729947793, + "grad_norm": 0.3525763540512438, + "learning_rate": 3.3956109134045074e-05, + "loss": 0.5116, + "step": 1145 + }, + { + "epoch": 0.5457997152349312, + "grad_norm": 0.42225638317596387, + "learning_rate": 3.410438908659549e-05, + "loss": 0.5122, + "step": 1150 + }, + { + "epoch": 0.5481727574750831, + "grad_norm": 0.4106621414535342, + "learning_rate": 3.4252669039145906e-05, + "loss": 0.5016, + "step": 1155 + }, + { + "epoch": 0.550545799715235, + "grad_norm": 0.4009455640356164, + "learning_rate": 3.4400948991696325e-05, + "loss": 0.5012, + "step": 1160 + }, + { + "epoch": 0.5529188419553868, + "grad_norm": 0.5061594418702849, + "learning_rate": 3.4549228944246744e-05, + "loss": 0.5104, + "step": 1165 + }, + { + "epoch": 0.5552918841955387, + "grad_norm": 0.4667635892464623, + "learning_rate": 3.4697508896797156e-05, + "loss": 0.5045, + "step": 1170 + }, + { + "epoch": 0.5576649264356905, + "grad_norm": 0.46576573238656455, + "learning_rate": 3.4845788849347575e-05, + "loss": 0.5043, + "step": 1175 + }, + { + "epoch": 0.5600379686758424, + "grad_norm": 0.40046681057479233, + "learning_rate": 3.499406880189798e-05, + "loss": 0.4953, + "step": 1180 + }, + { + "epoch": 0.5624110109159943, + "grad_norm": 0.3991615333277713, + "learning_rate": 3.51423487544484e-05, + "loss": 0.4931, + "step": 1185 + }, + { + "epoch": 0.5647840531561462, + "grad_norm": 0.4067376854252093, + "learning_rate": 3.529062870699881e-05, + "loss": 0.5059, + "step": 1190 + }, + { + "epoch": 0.5671570953962981, + "grad_norm": 0.42609176195317844, + "learning_rate": 3.543890865954923e-05, + "loss": 0.507, + "step": 1195 + }, + { + "epoch": 0.5695301376364499, + "grad_norm": 0.3902895139169864, + "learning_rate": 3.558718861209965e-05, + "loss": 0.4929, + "step": 1200 + }, + { + "epoch": 0.5719031798766018, + "grad_norm": 0.43067137484602924, + "learning_rate": 3.573546856465006e-05, + "loss": 0.5041, + "step": 1205 + }, + { + "epoch": 0.5742762221167537, + "grad_norm": 0.5364502855050799, + "learning_rate": 3.588374851720048e-05, + "loss": 0.4993, + "step": 1210 + }, + { + "epoch": 0.5766492643569056, + "grad_norm": 0.4694672726832272, + "learning_rate": 3.6032028469750886e-05, + "loss": 0.5009, + "step": 1215 + }, + { + "epoch": 0.5790223065970574, + "grad_norm": 0.4779262317365731, + "learning_rate": 3.6180308422301305e-05, + "loss": 0.508, + "step": 1220 + }, + { + "epoch": 0.5813953488372093, + "grad_norm": 0.4183710201917089, + "learning_rate": 3.6328588374851724e-05, + "loss": 0.4946, + "step": 1225 + }, + { + "epoch": 0.5837683910773612, + "grad_norm": 0.46252095261785975, + "learning_rate": 3.6476868327402136e-05, + "loss": 0.4934, + "step": 1230 + }, + { + "epoch": 0.586141433317513, + "grad_norm": 0.463821047813579, + "learning_rate": 3.6625148279952555e-05, + "loss": 0.511, + "step": 1235 + }, + { + "epoch": 0.5885144755576649, + "grad_norm": 0.4740063625220242, + "learning_rate": 3.677342823250297e-05, + "loss": 0.5089, + "step": 1240 + }, + { + "epoch": 0.5908875177978168, + "grad_norm": 0.48137914911247615, + "learning_rate": 3.6921708185053386e-05, + "loss": 0.5034, + "step": 1245 + }, + { + "epoch": 0.5932605600379687, + "grad_norm": 0.3694084071066886, + "learning_rate": 3.70699881376038e-05, + "loss": 0.4946, + "step": 1250 + }, + { + "epoch": 0.5956336022781206, + "grad_norm": 0.4024287443348156, + "learning_rate": 3.721826809015421e-05, + "loss": 0.5031, + "step": 1255 + }, + { + "epoch": 0.5980066445182725, + "grad_norm": 0.44293740626005007, + "learning_rate": 3.736654804270463e-05, + "loss": 0.5063, + "step": 1260 + }, + { + "epoch": 0.6003796867584243, + "grad_norm": 0.436219411847971, + "learning_rate": 3.751482799525504e-05, + "loss": 0.4915, + "step": 1265 + }, + { + "epoch": 0.6027527289985761, + "grad_norm": 0.5264906441766312, + "learning_rate": 3.766310794780546e-05, + "loss": 0.4963, + "step": 1270 + }, + { + "epoch": 0.605125771238728, + "grad_norm": 0.42713432727143735, + "learning_rate": 3.781138790035587e-05, + "loss": 0.5023, + "step": 1275 + }, + { + "epoch": 0.6074988134788799, + "grad_norm": 0.45987744731262153, + "learning_rate": 3.7959667852906285e-05, + "loss": 0.5133, + "step": 1280 + }, + { + "epoch": 0.6098718557190318, + "grad_norm": 0.5490589332715551, + "learning_rate": 3.8107947805456704e-05, + "loss": 0.4984, + "step": 1285 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 0.4769336237778267, + "learning_rate": 3.825622775800712e-05, + "loss": 0.5073, + "step": 1290 + }, + { + "epoch": 0.6146179401993356, + "grad_norm": 0.4633396314928124, + "learning_rate": 3.8404507710557536e-05, + "loss": 0.5037, + "step": 1295 + }, + { + "epoch": 0.6169909824394875, + "grad_norm": 0.44892326660129966, + "learning_rate": 3.855278766310795e-05, + "loss": 0.4879, + "step": 1300 + }, + { + "epoch": 0.6193640246796392, + "grad_norm": 0.38270644750140365, + "learning_rate": 3.870106761565837e-05, + "loss": 0.4936, + "step": 1305 + }, + { + "epoch": 0.6217370669197911, + "grad_norm": 0.40809667134788635, + "learning_rate": 3.8849347568208786e-05, + "loss": 0.4951, + "step": 1310 + }, + { + "epoch": 0.624110109159943, + "grad_norm": 0.3785162578957824, + "learning_rate": 3.899762752075919e-05, + "loss": 0.4922, + "step": 1315 + }, + { + "epoch": 0.6264831514000949, + "grad_norm": 0.4100098017845109, + "learning_rate": 3.914590747330961e-05, + "loss": 0.4914, + "step": 1320 + }, + { + "epoch": 0.6288561936402468, + "grad_norm": 0.4388263670866574, + "learning_rate": 3.929418742586002e-05, + "loss": 0.4965, + "step": 1325 + }, + { + "epoch": 0.6312292358803987, + "grad_norm": 0.4902138047716282, + "learning_rate": 3.944246737841044e-05, + "loss": 0.5055, + "step": 1330 + }, + { + "epoch": 0.6336022781205506, + "grad_norm": 0.4439322838715264, + "learning_rate": 3.9590747330960854e-05, + "loss": 0.5022, + "step": 1335 + }, + { + "epoch": 0.6359753203607024, + "grad_norm": 0.46033670741125865, + "learning_rate": 3.973902728351127e-05, + "loss": 0.5043, + "step": 1340 + }, + { + "epoch": 0.6383483626008543, + "grad_norm": 0.5463513064711918, + "learning_rate": 3.988730723606169e-05, + "loss": 0.4991, + "step": 1345 + }, + { + "epoch": 0.6407214048410061, + "grad_norm": 0.44390020714840367, + "learning_rate": 4.00355871886121e-05, + "loss": 0.5038, + "step": 1350 + }, + { + "epoch": 0.643094447081158, + "grad_norm": 0.48535371325434296, + "learning_rate": 4.0183867141162516e-05, + "loss": 0.4986, + "step": 1355 + }, + { + "epoch": 0.6454674893213099, + "grad_norm": 0.46831650153539645, + "learning_rate": 4.033214709371293e-05, + "loss": 0.5133, + "step": 1360 + }, + { + "epoch": 0.6478405315614618, + "grad_norm": 0.3993138094326991, + "learning_rate": 4.048042704626335e-05, + "loss": 0.5033, + "step": 1365 + }, + { + "epoch": 0.6502135738016137, + "grad_norm": 0.45553090809849, + "learning_rate": 4.0628706998813766e-05, + "loss": 0.493, + "step": 1370 + }, + { + "epoch": 0.6525866160417655, + "grad_norm": 0.4192770824054018, + "learning_rate": 4.077698695136418e-05, + "loss": 0.4991, + "step": 1375 + }, + { + "epoch": 0.6549596582819174, + "grad_norm": 0.4357882004607306, + "learning_rate": 4.09252669039146e-05, + "loss": 0.5034, + "step": 1380 + }, + { + "epoch": 0.6573327005220693, + "grad_norm": 0.47870327632865933, + "learning_rate": 4.1073546856465e-05, + "loss": 0.5019, + "step": 1385 + }, + { + "epoch": 0.6597057427622212, + "grad_norm": 0.4766720981110986, + "learning_rate": 4.122182680901542e-05, + "loss": 0.4999, + "step": 1390 + }, + { + "epoch": 0.662078785002373, + "grad_norm": 0.4782011695477935, + "learning_rate": 4.1370106761565834e-05, + "loss": 0.491, + "step": 1395 + }, + { + "epoch": 0.6644518272425249, + "grad_norm": 0.42876772812003955, + "learning_rate": 4.151838671411625e-05, + "loss": 0.5018, + "step": 1400 + }, + { + "epoch": 0.6668248694826768, + "grad_norm": 0.3906853084428434, + "learning_rate": 4.166666666666667e-05, + "loss": 0.4902, + "step": 1405 + }, + { + "epoch": 0.6691979117228287, + "grad_norm": 0.4067553950137155, + "learning_rate": 4.1814946619217084e-05, + "loss": 0.4995, + "step": 1410 + }, + { + "epoch": 0.6715709539629805, + "grad_norm": 0.4174505958676777, + "learning_rate": 4.19632265717675e-05, + "loss": 0.4909, + "step": 1415 + }, + { + "epoch": 0.6739439962031324, + "grad_norm": 0.372948864181697, + "learning_rate": 4.211150652431791e-05, + "loss": 0.483, + "step": 1420 + }, + { + "epoch": 0.6763170384432843, + "grad_norm": 0.4022794319606398, + "learning_rate": 4.225978647686833e-05, + "loss": 0.4919, + "step": 1425 + }, + { + "epoch": 0.6786900806834362, + "grad_norm": 0.4850444469813929, + "learning_rate": 4.240806642941875e-05, + "loss": 0.4976, + "step": 1430 + }, + { + "epoch": 0.6810631229235881, + "grad_norm": 0.4302329034376522, + "learning_rate": 4.255634638196916e-05, + "loss": 0.474, + "step": 1435 + }, + { + "epoch": 0.68343616516374, + "grad_norm": 0.47103499427575307, + "learning_rate": 4.270462633451958e-05, + "loss": 0.4965, + "step": 1440 + }, + { + "epoch": 0.6858092074038918, + "grad_norm": 0.4354829462871034, + "learning_rate": 4.285290628706999e-05, + "loss": 0.5035, + "step": 1445 + }, + { + "epoch": 0.6881822496440436, + "grad_norm": 0.4578741554487985, + "learning_rate": 4.300118623962041e-05, + "loss": 0.4904, + "step": 1450 + }, + { + "epoch": 0.6905552918841955, + "grad_norm": 0.41172881805406164, + "learning_rate": 4.314946619217082e-05, + "loss": 0.486, + "step": 1455 + }, + { + "epoch": 0.6929283341243474, + "grad_norm": 0.4693760921820907, + "learning_rate": 4.3297746144721233e-05, + "loss": 0.4931, + "step": 1460 + }, + { + "epoch": 0.6953013763644993, + "grad_norm": 0.4087008924171691, + "learning_rate": 4.344602609727165e-05, + "loss": 0.4904, + "step": 1465 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 0.489739477052162, + "learning_rate": 4.3594306049822065e-05, + "loss": 0.496, + "step": 1470 + }, + { + "epoch": 0.7000474608448031, + "grad_norm": 0.514829696992182, + "learning_rate": 4.3742586002372484e-05, + "loss": 0.4832, + "step": 1475 + }, + { + "epoch": 0.702420503084955, + "grad_norm": 0.4751106223891163, + "learning_rate": 4.3890865954922896e-05, + "loss": 0.4996, + "step": 1480 + }, + { + "epoch": 0.7047935453251067, + "grad_norm": 0.43130028475604437, + "learning_rate": 4.403914590747331e-05, + "loss": 0.4959, + "step": 1485 + }, + { + "epoch": 0.7071665875652586, + "grad_norm": 0.4582566742345348, + "learning_rate": 4.418742586002373e-05, + "loss": 0.4764, + "step": 1490 + }, + { + "epoch": 0.7095396298054105, + "grad_norm": 0.4684774970693439, + "learning_rate": 4.433570581257414e-05, + "loss": 0.5007, + "step": 1495 + }, + { + "epoch": 0.7119126720455624, + "grad_norm": 0.42631578115771, + "learning_rate": 4.448398576512456e-05, + "loss": 0.4858, + "step": 1500 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.4330457302953655, + "learning_rate": 4.463226571767497e-05, + "loss": 0.4917, + "step": 1505 + }, + { + "epoch": 0.7166587565258662, + "grad_norm": 0.45769162245451434, + "learning_rate": 4.478054567022539e-05, + "loss": 0.481, + "step": 1510 + }, + { + "epoch": 0.7190317987660181, + "grad_norm": 0.4333328854805106, + "learning_rate": 4.492882562277581e-05, + "loss": 0.5021, + "step": 1515 + }, + { + "epoch": 0.7214048410061699, + "grad_norm": 0.42353519930742867, + "learning_rate": 4.5077105575326214e-05, + "loss": 0.4855, + "step": 1520 + }, + { + "epoch": 0.7237778832463218, + "grad_norm": 0.45220095995796106, + "learning_rate": 4.522538552787663e-05, + "loss": 0.5037, + "step": 1525 + }, + { + "epoch": 0.7261509254864736, + "grad_norm": 0.48283448636614634, + "learning_rate": 4.5373665480427045e-05, + "loss": 0.4976, + "step": 1530 + }, + { + "epoch": 0.7285239677266255, + "grad_norm": 0.4261739347683797, + "learning_rate": 4.5521945432977464e-05, + "loss": 0.4945, + "step": 1535 + }, + { + "epoch": 0.7308970099667774, + "grad_norm": 0.4570407517085502, + "learning_rate": 4.5670225385527876e-05, + "loss": 0.4993, + "step": 1540 + }, + { + "epoch": 0.7332700522069293, + "grad_norm": 0.4072095483023356, + "learning_rate": 4.5818505338078295e-05, + "loss": 0.4869, + "step": 1545 + }, + { + "epoch": 0.7356430944470812, + "grad_norm": 0.39934905423537975, + "learning_rate": 4.5966785290628714e-05, + "loss": 0.4844, + "step": 1550 + }, + { + "epoch": 0.738016136687233, + "grad_norm": 0.46363002823084976, + "learning_rate": 4.611506524317912e-05, + "loss": 0.4809, + "step": 1555 + }, + { + "epoch": 0.7403891789273849, + "grad_norm": 0.4744777450769252, + "learning_rate": 4.626334519572954e-05, + "loss": 0.4949, + "step": 1560 + }, + { + "epoch": 0.7427622211675368, + "grad_norm": 0.45514047294460086, + "learning_rate": 4.641162514827995e-05, + "loss": 0.4911, + "step": 1565 + }, + { + "epoch": 0.7451352634076887, + "grad_norm": 0.5366478062407083, + "learning_rate": 4.655990510083037e-05, + "loss": 0.4858, + "step": 1570 + }, + { + "epoch": 0.7475083056478405, + "grad_norm": 0.47550493815189937, + "learning_rate": 4.670818505338079e-05, + "loss": 0.4976, + "step": 1575 + }, + { + "epoch": 0.7498813478879924, + "grad_norm": 0.44422738551244634, + "learning_rate": 4.68564650059312e-05, + "loss": 0.5016, + "step": 1580 + }, + { + "epoch": 0.7522543901281443, + "grad_norm": 0.43895417438757645, + "learning_rate": 4.700474495848162e-05, + "loss": 0.4933, + "step": 1585 + }, + { + "epoch": 0.7546274323682961, + "grad_norm": 0.42514339467560874, + "learning_rate": 4.7153024911032026e-05, + "loss": 0.5073, + "step": 1590 + }, + { + "epoch": 0.757000474608448, + "grad_norm": 0.45850669267674515, + "learning_rate": 4.7301304863582445e-05, + "loss": 0.5004, + "step": 1595 + }, + { + "epoch": 0.7593735168485999, + "grad_norm": 0.4046136608716938, + "learning_rate": 4.7449584816132864e-05, + "loss": 0.5027, + "step": 1600 + }, + { + "epoch": 0.7617465590887518, + "grad_norm": 0.5462732310295528, + "learning_rate": 4.7597864768683276e-05, + "loss": 0.4826, + "step": 1605 + }, + { + "epoch": 0.7641196013289037, + "grad_norm": 0.4749157941288522, + "learning_rate": 4.7746144721233695e-05, + "loss": 0.4891, + "step": 1610 + }, + { + "epoch": 0.7664926435690556, + "grad_norm": 0.49370839733365984, + "learning_rate": 4.789442467378411e-05, + "loss": 0.4763, + "step": 1615 + }, + { + "epoch": 0.7688656858092074, + "grad_norm": 0.44172159645454845, + "learning_rate": 4.8042704626334526e-05, + "loss": 0.487, + "step": 1620 + }, + { + "epoch": 0.7712387280493592, + "grad_norm": 0.42681787843984914, + "learning_rate": 4.819098457888493e-05, + "loss": 0.4771, + "step": 1625 + }, + { + "epoch": 0.7736117702895111, + "grad_norm": 0.48258098747358424, + "learning_rate": 4.833926453143535e-05, + "loss": 0.4872, + "step": 1630 + }, + { + "epoch": 0.775984812529663, + "grad_norm": 0.43966131203439074, + "learning_rate": 4.848754448398577e-05, + "loss": 0.4926, + "step": 1635 + }, + { + "epoch": 0.7783578547698149, + "grad_norm": 0.3992475617968615, + "learning_rate": 4.863582443653618e-05, + "loss": 0.4849, + "step": 1640 + }, + { + "epoch": 0.7807308970099668, + "grad_norm": 0.42612453790830057, + "learning_rate": 4.87841043890866e-05, + "loss": 0.4945, + "step": 1645 + }, + { + "epoch": 0.7831039392501187, + "grad_norm": 0.41452315395666933, + "learning_rate": 4.893238434163701e-05, + "loss": 0.4925, + "step": 1650 + }, + { + "epoch": 0.7854769814902706, + "grad_norm": 0.38238809138702173, + "learning_rate": 4.908066429418743e-05, + "loss": 0.4788, + "step": 1655 + }, + { + "epoch": 0.7878500237304225, + "grad_norm": 0.41254286891519243, + "learning_rate": 4.9228944246737844e-05, + "loss": 0.4909, + "step": 1660 + }, + { + "epoch": 0.7902230659705742, + "grad_norm": 0.37197053445816636, + "learning_rate": 4.9377224199288256e-05, + "loss": 0.495, + "step": 1665 + }, + { + "epoch": 0.7925961082107261, + "grad_norm": 0.430632705979827, + "learning_rate": 4.9525504151838675e-05, + "loss": 0.491, + "step": 1670 + }, + { + "epoch": 0.794969150450878, + "grad_norm": 0.40064209585993604, + "learning_rate": 4.967378410438909e-05, + "loss": 0.4921, + "step": 1675 + }, + { + "epoch": 0.7973421926910299, + "grad_norm": 0.430518060749054, + "learning_rate": 4.9822064056939506e-05, + "loss": 0.4935, + "step": 1680 + }, + { + "epoch": 0.7997152349311818, + "grad_norm": 0.4932492624249253, + "learning_rate": 4.997034400948992e-05, + "loss": 0.4807, + "step": 1685 + }, + { + "epoch": 0.8020882771713337, + "grad_norm": 0.43438121525107104, + "learning_rate": 4.998681608437706e-05, + "loss": 0.4878, + "step": 1690 + }, + { + "epoch": 0.8044613194114856, + "grad_norm": 0.46112152411881074, + "learning_rate": 4.9970336189848385e-05, + "loss": 0.484, + "step": 1695 + }, + { + "epoch": 0.8068343616516374, + "grad_norm": 0.40013750571685647, + "learning_rate": 4.995385629531971e-05, + "loss": 0.4843, + "step": 1700 + }, + { + "epoch": 0.8092074038917892, + "grad_norm": 0.41714180186649424, + "learning_rate": 4.9937376400791035e-05, + "loss": 0.4961, + "step": 1705 + }, + { + "epoch": 0.8115804461319411, + "grad_norm": 0.42827906067734717, + "learning_rate": 4.992089650626236e-05, + "loss": 0.4789, + "step": 1710 + }, + { + "epoch": 0.813953488372093, + "grad_norm": 0.3881996049954332, + "learning_rate": 4.990441661173369e-05, + "loss": 0.4807, + "step": 1715 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.43318423447193993, + "learning_rate": 4.9887936717205015e-05, + "loss": 0.4901, + "step": 1720 + }, + { + "epoch": 0.8186995728523968, + "grad_norm": 0.45995959439284567, + "learning_rate": 4.987145682267634e-05, + "loss": 0.4956, + "step": 1725 + }, + { + "epoch": 0.8210726150925487, + "grad_norm": 0.4327317280682721, + "learning_rate": 4.985497692814766e-05, + "loss": 0.4981, + "step": 1730 + }, + { + "epoch": 0.8234456573327005, + "grad_norm": 0.39468016339718287, + "learning_rate": 4.983849703361899e-05, + "loss": 0.4788, + "step": 1735 + }, + { + "epoch": 0.8258186995728524, + "grad_norm": 0.3960324238180488, + "learning_rate": 4.982201713909031e-05, + "loss": 0.4846, + "step": 1740 + }, + { + "epoch": 0.8281917418130043, + "grad_norm": 0.4346163293896913, + "learning_rate": 4.980553724456164e-05, + "loss": 0.4878, + "step": 1745 + }, + { + "epoch": 0.8305647840531561, + "grad_norm": 0.4194398729446732, + "learning_rate": 4.978905735003297e-05, + "loss": 0.4901, + "step": 1750 + }, + { + "epoch": 0.832937826293308, + "grad_norm": 0.4904079924049648, + "learning_rate": 4.977257745550429e-05, + "loss": 0.4832, + "step": 1755 + }, + { + "epoch": 0.8353108685334599, + "grad_norm": 0.47655904171399294, + "learning_rate": 4.975609756097561e-05, + "loss": 0.4841, + "step": 1760 + }, + { + "epoch": 0.8376839107736118, + "grad_norm": 0.4702749406675099, + "learning_rate": 4.973961766644693e-05, + "loss": 0.4846, + "step": 1765 + }, + { + "epoch": 0.8400569530137636, + "grad_norm": 0.4028316462470333, + "learning_rate": 4.972313777191826e-05, + "loss": 0.504, + "step": 1770 + }, + { + "epoch": 0.8424299952539155, + "grad_norm": 0.4079372503034545, + "learning_rate": 4.970665787738959e-05, + "loss": 0.4965, + "step": 1775 + }, + { + "epoch": 0.8448030374940674, + "grad_norm": 0.34613910280534305, + "learning_rate": 4.969017798286091e-05, + "loss": 0.4869, + "step": 1780 + }, + { + "epoch": 0.8471760797342193, + "grad_norm": 0.40770883404300645, + "learning_rate": 4.967369808833224e-05, + "loss": 0.4744, + "step": 1785 + }, + { + "epoch": 0.8495491219743712, + "grad_norm": 0.36151095759456925, + "learning_rate": 4.965721819380356e-05, + "loss": 0.4907, + "step": 1790 + }, + { + "epoch": 0.851922164214523, + "grad_norm": 0.3884180666261444, + "learning_rate": 4.9640738299274885e-05, + "loss": 0.4829, + "step": 1795 + }, + { + "epoch": 0.8542952064546749, + "grad_norm": 0.4356929975057039, + "learning_rate": 4.962425840474621e-05, + "loss": 0.4757, + "step": 1800 + }, + { + "epoch": 0.8566682486948267, + "grad_norm": 0.44767853808766767, + "learning_rate": 4.960777851021754e-05, + "loss": 0.4878, + "step": 1805 + }, + { + "epoch": 0.8590412909349786, + "grad_norm": 0.37318545947237336, + "learning_rate": 4.9591298615688864e-05, + "loss": 0.4843, + "step": 1810 + }, + { + "epoch": 0.8614143331751305, + "grad_norm": 0.3977852281315618, + "learning_rate": 4.9574818721160186e-05, + "loss": 0.4918, + "step": 1815 + }, + { + "epoch": 0.8637873754152824, + "grad_norm": 0.4215654200363493, + "learning_rate": 4.9558338826631515e-05, + "loss": 0.4857, + "step": 1820 + }, + { + "epoch": 0.8661604176554343, + "grad_norm": 0.43979078747958156, + "learning_rate": 4.954185893210284e-05, + "loss": 0.4835, + "step": 1825 + }, + { + "epoch": 0.8685334598955862, + "grad_norm": 0.43491197823988115, + "learning_rate": 4.952537903757416e-05, + "loss": 0.4741, + "step": 1830 + }, + { + "epoch": 0.8709065021357381, + "grad_norm": 0.44392615823303316, + "learning_rate": 4.950889914304549e-05, + "loss": 0.4813, + "step": 1835 + }, + { + "epoch": 0.8732795443758898, + "grad_norm": 0.43100922752383947, + "learning_rate": 4.9492419248516817e-05, + "loss": 0.4841, + "step": 1840 + }, + { + "epoch": 0.8756525866160417, + "grad_norm": 0.4174913370631169, + "learning_rate": 4.947593935398814e-05, + "loss": 0.4789, + "step": 1845 + }, + { + "epoch": 0.8780256288561936, + "grad_norm": 0.392979115113426, + "learning_rate": 4.945945945945946e-05, + "loss": 0.479, + "step": 1850 + }, + { + "epoch": 0.8803986710963455, + "grad_norm": 0.40404243332321443, + "learning_rate": 4.944297956493079e-05, + "loss": 0.476, + "step": 1855 + }, + { + "epoch": 0.8827717133364974, + "grad_norm": 0.4296965099985081, + "learning_rate": 4.942649967040211e-05, + "loss": 0.4887, + "step": 1860 + }, + { + "epoch": 0.8851447555766493, + "grad_norm": 0.4617983500941567, + "learning_rate": 4.941001977587343e-05, + "loss": 0.4794, + "step": 1865 + }, + { + "epoch": 0.8875177978168012, + "grad_norm": 0.5151340863120837, + "learning_rate": 4.939353988134476e-05, + "loss": 0.4784, + "step": 1870 + }, + { + "epoch": 0.889890840056953, + "grad_norm": 0.3948541125046343, + "learning_rate": 4.937705998681609e-05, + "loss": 0.4829, + "step": 1875 + }, + { + "epoch": 0.8922638822971048, + "grad_norm": 0.3988455357233691, + "learning_rate": 4.936058009228741e-05, + "loss": 0.4858, + "step": 1880 + }, + { + "epoch": 0.8946369245372567, + "grad_norm": 0.5779824622540904, + "learning_rate": 4.9344100197758735e-05, + "loss": 0.4868, + "step": 1885 + }, + { + "epoch": 0.8970099667774086, + "grad_norm": 0.46235924309460785, + "learning_rate": 4.932762030323006e-05, + "loss": 0.4797, + "step": 1890 + }, + { + "epoch": 0.8993830090175605, + "grad_norm": 0.4384690065383677, + "learning_rate": 4.9311140408701385e-05, + "loss": 0.4665, + "step": 1895 + }, + { + "epoch": 0.9017560512577124, + "grad_norm": 0.4614371362317558, + "learning_rate": 4.9294660514172714e-05, + "loss": 0.4821, + "step": 1900 + }, + { + "epoch": 0.9041290934978643, + "grad_norm": 0.48070613189906947, + "learning_rate": 4.9278180619644036e-05, + "loss": 0.484, + "step": 1905 + }, + { + "epoch": 0.9065021357380162, + "grad_norm": 0.3616453287007227, + "learning_rate": 4.9261700725115365e-05, + "loss": 0.4732, + "step": 1910 + }, + { + "epoch": 0.908875177978168, + "grad_norm": 0.45800793123659117, + "learning_rate": 4.924522083058669e-05, + "loss": 0.4719, + "step": 1915 + }, + { + "epoch": 0.9112482202183199, + "grad_norm": 0.39945180192493207, + "learning_rate": 4.922874093605801e-05, + "loss": 0.4892, + "step": 1920 + }, + { + "epoch": 0.9136212624584718, + "grad_norm": 0.43064594888957075, + "learning_rate": 4.921226104152933e-05, + "loss": 0.4849, + "step": 1925 + }, + { + "epoch": 0.9159943046986236, + "grad_norm": 0.4112165360082447, + "learning_rate": 4.9195781147000666e-05, + "loss": 0.4791, + "step": 1930 + }, + { + "epoch": 0.9183673469387755, + "grad_norm": 0.4182099108387935, + "learning_rate": 4.917930125247199e-05, + "loss": 0.4723, + "step": 1935 + }, + { + "epoch": 0.9207403891789274, + "grad_norm": 0.4056023864795481, + "learning_rate": 4.916282135794331e-05, + "loss": 0.4733, + "step": 1940 + }, + { + "epoch": 0.9231134314190793, + "grad_norm": 0.39714609078147595, + "learning_rate": 4.914634146341464e-05, + "loss": 0.4771, + "step": 1945 + }, + { + "epoch": 0.9254864736592311, + "grad_norm": 0.38741520931445894, + "learning_rate": 4.912986156888596e-05, + "loss": 0.477, + "step": 1950 + }, + { + "epoch": 0.927859515899383, + "grad_norm": 0.3913791693187267, + "learning_rate": 4.911338167435728e-05, + "loss": 0.4888, + "step": 1955 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.41534302042210236, + "learning_rate": 4.909690177982861e-05, + "loss": 0.4943, + "step": 1960 + }, + { + "epoch": 0.9326056003796868, + "grad_norm": 0.4141837029304029, + "learning_rate": 4.908042188529994e-05, + "loss": 0.4663, + "step": 1965 + }, + { + "epoch": 0.9349786426198387, + "grad_norm": 0.4252527729984004, + "learning_rate": 4.906394199077126e-05, + "loss": 0.4802, + "step": 1970 + }, + { + "epoch": 0.9373516848599905, + "grad_norm": 0.48457763358438466, + "learning_rate": 4.9047462096242584e-05, + "loss": 0.467, + "step": 1975 + }, + { + "epoch": 0.9397247271001424, + "grad_norm": 0.3620109972706721, + "learning_rate": 4.903098220171391e-05, + "loss": 0.4757, + "step": 1980 + }, + { + "epoch": 0.9420977693402942, + "grad_norm": 0.3147898117923985, + "learning_rate": 4.9014502307185235e-05, + "loss": 0.4739, + "step": 1985 + }, + { + "epoch": 0.9444708115804461, + "grad_norm": 0.4053389206239973, + "learning_rate": 4.8998022412656564e-05, + "loss": 0.4785, + "step": 1990 + }, + { + "epoch": 0.946843853820598, + "grad_norm": 0.49906453546942287, + "learning_rate": 4.8981542518127886e-05, + "loss": 0.4847, + "step": 1995 + }, + { + "epoch": 0.9492168960607499, + "grad_norm": 0.4143889656572964, + "learning_rate": 4.8965062623599214e-05, + "loss": 0.4756, + "step": 2000 + }, + { + "epoch": 0.9515899383009018, + "grad_norm": 0.3611004009774898, + "learning_rate": 4.8948582729070536e-05, + "loss": 0.475, + "step": 2005 + }, + { + "epoch": 0.9539629805410537, + "grad_norm": 0.4214026039943159, + "learning_rate": 4.893210283454186e-05, + "loss": 0.4818, + "step": 2010 + }, + { + "epoch": 0.9563360227812056, + "grad_norm": 0.36629951834855184, + "learning_rate": 4.891562294001319e-05, + "loss": 0.4797, + "step": 2015 + }, + { + "epoch": 0.9587090650213573, + "grad_norm": 0.44175416805632334, + "learning_rate": 4.8899143045484516e-05, + "loss": 0.4826, + "step": 2020 + }, + { + "epoch": 0.9610821072615092, + "grad_norm": 0.3907755643057963, + "learning_rate": 4.888266315095584e-05, + "loss": 0.4669, + "step": 2025 + }, + { + "epoch": 0.9634551495016611, + "grad_norm": 0.3775868140213636, + "learning_rate": 4.886618325642716e-05, + "loss": 0.4791, + "step": 2030 + }, + { + "epoch": 0.965828191741813, + "grad_norm": 0.3247164957127407, + "learning_rate": 4.884970336189849e-05, + "loss": 0.4703, + "step": 2035 + }, + { + "epoch": 0.9682012339819649, + "grad_norm": 0.45750054535716583, + "learning_rate": 4.883322346736981e-05, + "loss": 0.4815, + "step": 2040 + }, + { + "epoch": 0.9705742762221168, + "grad_norm": 0.4170164917137097, + "learning_rate": 4.881674357284113e-05, + "loss": 0.4767, + "step": 2045 + }, + { + "epoch": 0.9729473184622687, + "grad_norm": 0.39438784711191727, + "learning_rate": 4.880026367831246e-05, + "loss": 0.4736, + "step": 2050 + }, + { + "epoch": 0.9753203607024205, + "grad_norm": 0.44215942641911193, + "learning_rate": 4.878378378378379e-05, + "loss": 0.4755, + "step": 2055 + }, + { + "epoch": 0.9776934029425723, + "grad_norm": 0.36148419707943424, + "learning_rate": 4.876730388925511e-05, + "loss": 0.4679, + "step": 2060 + }, + { + "epoch": 0.9800664451827242, + "grad_norm": 0.4208681390517646, + "learning_rate": 4.8750823994726434e-05, + "loss": 0.4889, + "step": 2065 + }, + { + "epoch": 0.9824394874228761, + "grad_norm": 0.37064298691379627, + "learning_rate": 4.873434410019776e-05, + "loss": 0.4723, + "step": 2070 + }, + { + "epoch": 0.984812529663028, + "grad_norm": 0.3934372176253987, + "learning_rate": 4.8717864205669084e-05, + "loss": 0.476, + "step": 2075 + }, + { + "epoch": 0.9871855719031799, + "grad_norm": 0.4153163596602145, + "learning_rate": 4.870138431114041e-05, + "loss": 0.486, + "step": 2080 + }, + { + "epoch": 0.9895586141433318, + "grad_norm": 0.42417990135907596, + "learning_rate": 4.8684904416611735e-05, + "loss": 0.4721, + "step": 2085 + }, + { + "epoch": 0.9919316563834836, + "grad_norm": 0.4054259998855423, + "learning_rate": 4.8668424522083064e-05, + "loss": 0.4775, + "step": 2090 + }, + { + "epoch": 0.9943046986236355, + "grad_norm": 0.3925106316284817, + "learning_rate": 4.8651944627554386e-05, + "loss": 0.4855, + "step": 2095 + }, + { + "epoch": 0.9966777408637874, + "grad_norm": 0.38215419387639465, + "learning_rate": 4.863546473302571e-05, + "loss": 0.474, + "step": 2100 + }, + { + "epoch": 0.9990507831039392, + "grad_norm": 0.44987661416808, + "learning_rate": 4.8618984838497037e-05, + "loss": 0.4673, + "step": 2105 + }, + { + "epoch": 1.001423825344091, + "grad_norm": 0.4193613709758678, + "learning_rate": 4.8602504943968365e-05, + "loss": 0.4635, + "step": 2110 + }, + { + "epoch": 1.003796867584243, + "grad_norm": 0.49137378417911376, + "learning_rate": 4.858602504943969e-05, + "loss": 0.4652, + "step": 2115 + }, + { + "epoch": 1.0061699098243948, + "grad_norm": 0.5415885947557014, + "learning_rate": 4.856954515491101e-05, + "loss": 0.4625, + "step": 2120 + }, + { + "epoch": 1.0085429520645468, + "grad_norm": 0.4487095304878435, + "learning_rate": 4.855306526038234e-05, + "loss": 0.4668, + "step": 2125 + }, + { + "epoch": 1.0109159943046986, + "grad_norm": 0.40841710700548844, + "learning_rate": 4.853658536585366e-05, + "loss": 0.4594, + "step": 2130 + }, + { + "epoch": 1.0132890365448506, + "grad_norm": 0.41216780768364514, + "learning_rate": 4.852010547132498e-05, + "loss": 0.4493, + "step": 2135 + }, + { + "epoch": 1.0156620787850024, + "grad_norm": 0.3670269083144018, + "learning_rate": 4.850362557679631e-05, + "loss": 0.4534, + "step": 2140 + }, + { + "epoch": 1.0180351210251541, + "grad_norm": 0.3560878912381435, + "learning_rate": 4.848714568226764e-05, + "loss": 0.4687, + "step": 2145 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 0.3701866830551485, + "learning_rate": 4.847066578773896e-05, + "loss": 0.4532, + "step": 2150 + }, + { + "epoch": 1.022781205505458, + "grad_norm": 0.36497608349106625, + "learning_rate": 4.845418589321028e-05, + "loss": 0.4446, + "step": 2155 + }, + { + "epoch": 1.02515424774561, + "grad_norm": 0.39702578382451115, + "learning_rate": 4.843770599868161e-05, + "loss": 0.4585, + "step": 2160 + }, + { + "epoch": 1.0275272899857617, + "grad_norm": 0.39299364909803075, + "learning_rate": 4.8421226104152934e-05, + "loss": 0.443, + "step": 2165 + }, + { + "epoch": 1.0299003322259137, + "grad_norm": 0.3789813586605377, + "learning_rate": 4.840474620962426e-05, + "loss": 0.456, + "step": 2170 + }, + { + "epoch": 1.0322733744660655, + "grad_norm": 0.3991540699371848, + "learning_rate": 4.8388266315095585e-05, + "loss": 0.4555, + "step": 2175 + }, + { + "epoch": 1.0346464167062173, + "grad_norm": 0.37493756679275436, + "learning_rate": 4.8371786420566913e-05, + "loss": 0.4671, + "step": 2180 + }, + { + "epoch": 1.0370194589463693, + "grad_norm": 0.49405699888949794, + "learning_rate": 4.8355306526038235e-05, + "loss": 0.4676, + "step": 2185 + }, + { + "epoch": 1.039392501186521, + "grad_norm": 0.37495217195888486, + "learning_rate": 4.833882663150956e-05, + "loss": 0.4509, + "step": 2190 + }, + { + "epoch": 1.041765543426673, + "grad_norm": 0.3965007876193761, + "learning_rate": 4.8322346736980886e-05, + "loss": 0.459, + "step": 2195 + }, + { + "epoch": 1.0441385856668248, + "grad_norm": 0.4032316088825582, + "learning_rate": 4.8305866842452215e-05, + "loss": 0.4645, + "step": 2200 + }, + { + "epoch": 1.0465116279069768, + "grad_norm": 0.4186863420216998, + "learning_rate": 4.828938694792354e-05, + "loss": 0.452, + "step": 2205 + }, + { + "epoch": 1.0488846701471286, + "grad_norm": 0.4069973315514433, + "learning_rate": 4.827290705339486e-05, + "loss": 0.4613, + "step": 2210 + }, + { + "epoch": 1.0512577123872804, + "grad_norm": 0.35248672181896196, + "learning_rate": 4.825642715886619e-05, + "loss": 0.4358, + "step": 2215 + }, + { + "epoch": 1.0536307546274324, + "grad_norm": 0.3361254465053039, + "learning_rate": 4.823994726433751e-05, + "loss": 0.4486, + "step": 2220 + }, + { + "epoch": 1.0560037968675842, + "grad_norm": 0.355892491899898, + "learning_rate": 4.822346736980883e-05, + "loss": 0.4446, + "step": 2225 + }, + { + "epoch": 1.0583768391077362, + "grad_norm": 0.44492037765371084, + "learning_rate": 4.820698747528016e-05, + "loss": 0.4416, + "step": 2230 + }, + { + "epoch": 1.060749881347888, + "grad_norm": 0.3461275694194245, + "learning_rate": 4.819050758075149e-05, + "loss": 0.4546, + "step": 2235 + }, + { + "epoch": 1.06312292358804, + "grad_norm": 0.3417534729786961, + "learning_rate": 4.817402768622281e-05, + "loss": 0.457, + "step": 2240 + }, + { + "epoch": 1.0654959658281917, + "grad_norm": 0.3626694880810102, + "learning_rate": 4.815754779169413e-05, + "loss": 0.4518, + "step": 2245 + }, + { + "epoch": 1.0678690080683437, + "grad_norm": 0.3304838091243937, + "learning_rate": 4.814106789716546e-05, + "loss": 0.4499, + "step": 2250 + }, + { + "epoch": 1.0702420503084955, + "grad_norm": 0.4017349090480421, + "learning_rate": 4.8124588002636784e-05, + "loss": 0.4496, + "step": 2255 + }, + { + "epoch": 1.0726150925486473, + "grad_norm": 0.3550830587915998, + "learning_rate": 4.810810810810811e-05, + "loss": 0.4528, + "step": 2260 + }, + { + "epoch": 1.0749881347887993, + "grad_norm": 0.3804598800219582, + "learning_rate": 4.8091628213579434e-05, + "loss": 0.4439, + "step": 2265 + }, + { + "epoch": 1.077361177028951, + "grad_norm": 0.4424849885800182, + "learning_rate": 4.807514831905076e-05, + "loss": 0.4527, + "step": 2270 + }, + { + "epoch": 1.079734219269103, + "grad_norm": 0.3937537143619511, + "learning_rate": 4.8058668424522085e-05, + "loss": 0.441, + "step": 2275 + }, + { + "epoch": 1.0821072615092548, + "grad_norm": 0.3553299250560569, + "learning_rate": 4.804218852999341e-05, + "loss": 0.4615, + "step": 2280 + }, + { + "epoch": 1.0844803037494066, + "grad_norm": 0.41787434323369643, + "learning_rate": 4.8025708635464736e-05, + "loss": 0.4575, + "step": 2285 + }, + { + "epoch": 1.0868533459895586, + "grad_norm": 0.3863961900166699, + "learning_rate": 4.8009228740936064e-05, + "loss": 0.4539, + "step": 2290 + }, + { + "epoch": 1.0892263882297104, + "grad_norm": 0.40806292732353056, + "learning_rate": 4.7992748846407386e-05, + "loss": 0.4486, + "step": 2295 + }, + { + "epoch": 1.0915994304698624, + "grad_norm": 0.4356251338404078, + "learning_rate": 4.797626895187871e-05, + "loss": 0.451, + "step": 2300 + }, + { + "epoch": 1.0939724727100142, + "grad_norm": 0.42288611039854496, + "learning_rate": 4.795978905735004e-05, + "loss": 0.4487, + "step": 2305 + }, + { + "epoch": 1.0963455149501662, + "grad_norm": 0.36438948822628925, + "learning_rate": 4.794330916282136e-05, + "loss": 0.447, + "step": 2310 + }, + { + "epoch": 1.098718557190318, + "grad_norm": 0.36892539402218794, + "learning_rate": 4.792682926829268e-05, + "loss": 0.4387, + "step": 2315 + }, + { + "epoch": 1.10109159943047, + "grad_norm": 0.41154651986339014, + "learning_rate": 4.791034937376401e-05, + "loss": 0.4578, + "step": 2320 + }, + { + "epoch": 1.1034646416706217, + "grad_norm": 0.36094194360005666, + "learning_rate": 4.789386947923534e-05, + "loss": 0.4571, + "step": 2325 + }, + { + "epoch": 1.1058376839107735, + "grad_norm": 0.4324533137146243, + "learning_rate": 4.787738958470666e-05, + "loss": 0.4632, + "step": 2330 + }, + { + "epoch": 1.1082107261509255, + "grad_norm": 0.36749712210236335, + "learning_rate": 4.786090969017798e-05, + "loss": 0.4628, + "step": 2335 + }, + { + "epoch": 1.1105837683910773, + "grad_norm": 0.37623887566142966, + "learning_rate": 4.784442979564931e-05, + "loss": 0.4442, + "step": 2340 + }, + { + "epoch": 1.1129568106312293, + "grad_norm": 0.3666172328927925, + "learning_rate": 4.782794990112063e-05, + "loss": 0.4497, + "step": 2345 + }, + { + "epoch": 1.115329852871381, + "grad_norm": 0.3315527939987629, + "learning_rate": 4.7811470006591955e-05, + "loss": 0.4472, + "step": 2350 + }, + { + "epoch": 1.1177028951115329, + "grad_norm": 0.32975634948265753, + "learning_rate": 4.7794990112063284e-05, + "loss": 0.4421, + "step": 2355 + }, + { + "epoch": 1.1200759373516849, + "grad_norm": 0.3180324343810521, + "learning_rate": 4.777851021753461e-05, + "loss": 0.4446, + "step": 2360 + }, + { + "epoch": 1.1224489795918366, + "grad_norm": 0.3686750703754881, + "learning_rate": 4.7762030323005935e-05, + "loss": 0.452, + "step": 2365 + }, + { + "epoch": 1.1248220218319886, + "grad_norm": 0.36381183965201674, + "learning_rate": 4.7745550428477257e-05, + "loss": 0.4486, + "step": 2370 + }, + { + "epoch": 1.1271950640721404, + "grad_norm": 0.3710632926784022, + "learning_rate": 4.7729070533948585e-05, + "loss": 0.4581, + "step": 2375 + }, + { + "epoch": 1.1295681063122924, + "grad_norm": 0.33214616654115126, + "learning_rate": 4.771259063941991e-05, + "loss": 0.4528, + "step": 2380 + }, + { + "epoch": 1.1319411485524442, + "grad_norm": 0.3937615296505256, + "learning_rate": 4.7696110744891236e-05, + "loss": 0.4497, + "step": 2385 + }, + { + "epoch": 1.1343141907925962, + "grad_norm": 0.35018034542023757, + "learning_rate": 4.767963085036256e-05, + "loss": 0.4488, + "step": 2390 + }, + { + "epoch": 1.136687233032748, + "grad_norm": 0.3822628347970978, + "learning_rate": 4.766315095583389e-05, + "loss": 0.4477, + "step": 2395 + }, + { + "epoch": 1.1390602752728998, + "grad_norm": 0.36211939941331484, + "learning_rate": 4.764667106130521e-05, + "loss": 0.4462, + "step": 2400 + }, + { + "epoch": 1.1414333175130518, + "grad_norm": 0.3782855671578854, + "learning_rate": 4.763019116677653e-05, + "loss": 0.4491, + "step": 2405 + }, + { + "epoch": 1.1438063597532035, + "grad_norm": 0.3973955456773456, + "learning_rate": 4.761371127224786e-05, + "loss": 0.4634, + "step": 2410 + }, + { + "epoch": 1.1461794019933556, + "grad_norm": 0.4761287692916258, + "learning_rate": 4.759723137771919e-05, + "loss": 0.4499, + "step": 2415 + }, + { + "epoch": 1.1485524442335073, + "grad_norm": 0.38303441011638373, + "learning_rate": 4.758075148319051e-05, + "loss": 0.4466, + "step": 2420 + }, + { + "epoch": 1.150925486473659, + "grad_norm": 0.36241073846256083, + "learning_rate": 4.756427158866183e-05, + "loss": 0.4434, + "step": 2425 + }, + { + "epoch": 1.153298528713811, + "grad_norm": 0.3840863658973048, + "learning_rate": 4.754779169413316e-05, + "loss": 0.4389, + "step": 2430 + }, + { + "epoch": 1.155671570953963, + "grad_norm": 0.3705876863297323, + "learning_rate": 4.753131179960448e-05, + "loss": 0.4483, + "step": 2435 + }, + { + "epoch": 1.158044613194115, + "grad_norm": 0.3475011566657756, + "learning_rate": 4.7514831905075805e-05, + "loss": 0.4539, + "step": 2440 + }, + { + "epoch": 1.1604176554342667, + "grad_norm": 0.3512685927485482, + "learning_rate": 4.749835201054714e-05, + "loss": 0.4532, + "step": 2445 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 0.3349952782507381, + "learning_rate": 4.748187211601846e-05, + "loss": 0.4547, + "step": 2450 + }, + { + "epoch": 1.1651637399145705, + "grad_norm": 0.3485373074153633, + "learning_rate": 4.7465392221489784e-05, + "loss": 0.4421, + "step": 2455 + }, + { + "epoch": 1.1675367821547225, + "grad_norm": 0.3923765248881854, + "learning_rate": 4.7448912326961106e-05, + "loss": 0.4553, + "step": 2460 + }, + { + "epoch": 1.1699098243948742, + "grad_norm": 0.33164136192887694, + "learning_rate": 4.7432432432432435e-05, + "loss": 0.4552, + "step": 2465 + }, + { + "epoch": 1.172282866635026, + "grad_norm": 0.38120982139751175, + "learning_rate": 4.741595253790376e-05, + "loss": 0.4408, + "step": 2470 + }, + { + "epoch": 1.174655908875178, + "grad_norm": 0.40555558520082, + "learning_rate": 4.7399472643375086e-05, + "loss": 0.4492, + "step": 2475 + }, + { + "epoch": 1.1770289511153298, + "grad_norm": 0.3447320639937937, + "learning_rate": 4.738299274884641e-05, + "loss": 0.4451, + "step": 2480 + }, + { + "epoch": 1.1794019933554818, + "grad_norm": 0.35431486690839337, + "learning_rate": 4.7366512854317736e-05, + "loss": 0.4549, + "step": 2485 + }, + { + "epoch": 1.1817750355956336, + "grad_norm": 0.3811284068999293, + "learning_rate": 4.735003295978906e-05, + "loss": 0.442, + "step": 2490 + }, + { + "epoch": 1.1841480778357856, + "grad_norm": 0.35166485833650557, + "learning_rate": 4.733355306526038e-05, + "loss": 0.4416, + "step": 2495 + }, + { + "epoch": 1.1865211200759374, + "grad_norm": 0.36317308565764306, + "learning_rate": 4.731707317073171e-05, + "loss": 0.4517, + "step": 2500 + }, + { + "epoch": 1.1888941623160891, + "grad_norm": 0.3778054451781364, + "learning_rate": 4.730059327620304e-05, + "loss": 0.4521, + "step": 2505 + }, + { + "epoch": 1.1912672045562411, + "grad_norm": 0.33645808645679315, + "learning_rate": 4.728411338167436e-05, + "loss": 0.4488, + "step": 2510 + }, + { + "epoch": 1.193640246796393, + "grad_norm": 0.36280222256196604, + "learning_rate": 4.726763348714568e-05, + "loss": 0.4555, + "step": 2515 + }, + { + "epoch": 1.196013289036545, + "grad_norm": 0.4395765954303555, + "learning_rate": 4.725115359261701e-05, + "loss": 0.4548, + "step": 2520 + }, + { + "epoch": 1.1983863312766967, + "grad_norm": 0.35688890321891453, + "learning_rate": 4.723467369808833e-05, + "loss": 0.4533, + "step": 2525 + }, + { + "epoch": 1.2007593735168487, + "grad_norm": 0.36187925496532175, + "learning_rate": 4.7218193803559654e-05, + "loss": 0.4389, + "step": 2530 + }, + { + "epoch": 1.2031324157570005, + "grad_norm": 0.43451648889237754, + "learning_rate": 4.720171390903099e-05, + "loss": 0.4441, + "step": 2535 + }, + { + "epoch": 1.2055054579971523, + "grad_norm": 0.34213080169042154, + "learning_rate": 4.718523401450231e-05, + "loss": 0.4417, + "step": 2540 + }, + { + "epoch": 1.2078785002373043, + "grad_norm": 0.38699861570335814, + "learning_rate": 4.7168754119973634e-05, + "loss": 0.4491, + "step": 2545 + }, + { + "epoch": 1.210251542477456, + "grad_norm": 0.3190876088144051, + "learning_rate": 4.7152274225444956e-05, + "loss": 0.4444, + "step": 2550 + }, + { + "epoch": 1.212624584717608, + "grad_norm": 0.372465946772267, + "learning_rate": 4.7135794330916284e-05, + "loss": 0.4479, + "step": 2555 + }, + { + "epoch": 1.2149976269577598, + "grad_norm": 0.3812437930231937, + "learning_rate": 4.7119314436387606e-05, + "loss": 0.4486, + "step": 2560 + }, + { + "epoch": 1.2173706691979118, + "grad_norm": 0.3490889576293788, + "learning_rate": 4.7102834541858935e-05, + "loss": 0.4452, + "step": 2565 + }, + { + "epoch": 1.2197437114380636, + "grad_norm": 0.3810902672606976, + "learning_rate": 4.7086354647330264e-05, + "loss": 0.4422, + "step": 2570 + }, + { + "epoch": 1.2221167536782156, + "grad_norm": 0.4146297936314819, + "learning_rate": 4.7069874752801586e-05, + "loss": 0.4487, + "step": 2575 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 0.3453467778119927, + "learning_rate": 4.705339485827291e-05, + "loss": 0.4393, + "step": 2580 + }, + { + "epoch": 1.2268628381585192, + "grad_norm": 0.3573026874206105, + "learning_rate": 4.703691496374423e-05, + "loss": 0.4381, + "step": 2585 + }, + { + "epoch": 1.2292358803986712, + "grad_norm": 0.39464943868713054, + "learning_rate": 4.702043506921556e-05, + "loss": 0.4444, + "step": 2590 + }, + { + "epoch": 1.231608922638823, + "grad_norm": 0.34620068408313026, + "learning_rate": 4.700395517468689e-05, + "loss": 0.451, + "step": 2595 + }, + { + "epoch": 1.233981964878975, + "grad_norm": 0.3633364878130708, + "learning_rate": 4.698747528015821e-05, + "loss": 0.4557, + "step": 2600 + }, + { + "epoch": 1.2363550071191267, + "grad_norm": 0.34032391106065135, + "learning_rate": 4.697099538562954e-05, + "loss": 0.4522, + "step": 2605 + }, + { + "epoch": 1.2387280493592785, + "grad_norm": 0.35532877581197614, + "learning_rate": 4.695451549110086e-05, + "loss": 0.4508, + "step": 2610 + }, + { + "epoch": 1.2411010915994305, + "grad_norm": 0.40703888471529526, + "learning_rate": 4.693803559657218e-05, + "loss": 0.4516, + "step": 2615 + }, + { + "epoch": 1.2434741338395823, + "grad_norm": 0.37948594276884134, + "learning_rate": 4.6921555702043504e-05, + "loss": 0.4409, + "step": 2620 + }, + { + "epoch": 1.2458471760797343, + "grad_norm": 0.3973644847789226, + "learning_rate": 4.690507580751484e-05, + "loss": 0.4554, + "step": 2625 + }, + { + "epoch": 1.248220218319886, + "grad_norm": 0.3377033768476747, + "learning_rate": 4.688859591298616e-05, + "loss": 0.443, + "step": 2630 + }, + { + "epoch": 1.2505932605600378, + "grad_norm": 0.38971965881044207, + "learning_rate": 4.687211601845748e-05, + "loss": 0.4532, + "step": 2635 + }, + { + "epoch": 1.2529663028001898, + "grad_norm": 0.38434578579405393, + "learning_rate": 4.685563612392881e-05, + "loss": 0.446, + "step": 2640 + }, + { + "epoch": 1.2553393450403418, + "grad_norm": 0.4304653860982579, + "learning_rate": 4.6839156229400134e-05, + "loss": 0.4449, + "step": 2645 + }, + { + "epoch": 1.2577123872804936, + "grad_norm": 0.5106781586539677, + "learning_rate": 4.6822676334871456e-05, + "loss": 0.4412, + "step": 2650 + }, + { + "epoch": 1.2600854295206454, + "grad_norm": 0.41752146937955764, + "learning_rate": 4.6806196440342785e-05, + "loss": 0.4482, + "step": 2655 + }, + { + "epoch": 1.2624584717607974, + "grad_norm": 0.34149066983854115, + "learning_rate": 4.6789716545814113e-05, + "loss": 0.4477, + "step": 2660 + }, + { + "epoch": 1.2648315140009492, + "grad_norm": 0.39993786457682423, + "learning_rate": 4.6773236651285435e-05, + "loss": 0.4503, + "step": 2665 + }, + { + "epoch": 1.2672045562411012, + "grad_norm": 0.3468512563238135, + "learning_rate": 4.675675675675676e-05, + "loss": 0.4453, + "step": 2670 + }, + { + "epoch": 1.269577598481253, + "grad_norm": 0.35543843103265843, + "learning_rate": 4.6740276862228086e-05, + "loss": 0.4495, + "step": 2675 + }, + { + "epoch": 1.2719506407214047, + "grad_norm": 0.32319950168804634, + "learning_rate": 4.672379696769941e-05, + "loss": 0.4442, + "step": 2680 + }, + { + "epoch": 1.2743236829615567, + "grad_norm": 0.3268087460876734, + "learning_rate": 4.670731707317074e-05, + "loss": 0.4395, + "step": 2685 + }, + { + "epoch": 1.2766967252017085, + "grad_norm": 0.3514179442459635, + "learning_rate": 4.669083717864206e-05, + "loss": 0.4357, + "step": 2690 + }, + { + "epoch": 1.2790697674418605, + "grad_norm": 0.3281627147770551, + "learning_rate": 4.667435728411339e-05, + "loss": 0.4537, + "step": 2695 + }, + { + "epoch": 1.2814428096820123, + "grad_norm": 0.3403162769825748, + "learning_rate": 4.665787738958471e-05, + "loss": 0.4573, + "step": 2700 + }, + { + "epoch": 1.2838158519221643, + "grad_norm": 0.35928126006827393, + "learning_rate": 4.664139749505603e-05, + "loss": 0.4503, + "step": 2705 + }, + { + "epoch": 1.286188894162316, + "grad_norm": 0.35724857308807384, + "learning_rate": 4.662491760052736e-05, + "loss": 0.4474, + "step": 2710 + }, + { + "epoch": 1.288561936402468, + "grad_norm": 0.31796867253801436, + "learning_rate": 4.660843770599869e-05, + "loss": 0.4451, + "step": 2715 + }, + { + "epoch": 1.2909349786426199, + "grad_norm": 0.3371984682640605, + "learning_rate": 4.659195781147001e-05, + "loss": 0.4441, + "step": 2720 + }, + { + "epoch": 1.2933080208827716, + "grad_norm": 0.30929848908032437, + "learning_rate": 4.657547791694133e-05, + "loss": 0.4472, + "step": 2725 + }, + { + "epoch": 1.2956810631229236, + "grad_norm": 0.36812774702044, + "learning_rate": 4.655899802241266e-05, + "loss": 0.4471, + "step": 2730 + }, + { + "epoch": 1.2980541053630754, + "grad_norm": 0.38686552163699595, + "learning_rate": 4.6542518127883984e-05, + "loss": 0.4473, + "step": 2735 + }, + { + "epoch": 1.3004271476032274, + "grad_norm": 0.3741326952303235, + "learning_rate": 4.6526038233355306e-05, + "loss": 0.4469, + "step": 2740 + }, + { + "epoch": 1.3028001898433792, + "grad_norm": 0.3587460998692433, + "learning_rate": 4.6509558338826634e-05, + "loss": 0.4545, + "step": 2745 + }, + { + "epoch": 1.305173232083531, + "grad_norm": 0.44961882787074187, + "learning_rate": 4.649307844429796e-05, + "loss": 0.4412, + "step": 2750 + }, + { + "epoch": 1.307546274323683, + "grad_norm": 0.37920160628524063, + "learning_rate": 4.6476598549769285e-05, + "loss": 0.4371, + "step": 2755 + }, + { + "epoch": 1.3099193165638348, + "grad_norm": 0.3827321594158867, + "learning_rate": 4.646011865524061e-05, + "loss": 0.4468, + "step": 2760 + }, + { + "epoch": 1.3122923588039868, + "grad_norm": 0.32631623510239194, + "learning_rate": 4.6443638760711936e-05, + "loss": 0.4451, + "step": 2765 + }, + { + "epoch": 1.3146654010441385, + "grad_norm": 0.4013707453215059, + "learning_rate": 4.642715886618326e-05, + "loss": 0.4412, + "step": 2770 + }, + { + "epoch": 1.3170384432842905, + "grad_norm": 0.4244907368036369, + "learning_rate": 4.641067897165458e-05, + "loss": 0.4401, + "step": 2775 + }, + { + "epoch": 1.3194114855244423, + "grad_norm": 0.42872188778689607, + "learning_rate": 4.639419907712591e-05, + "loss": 0.4332, + "step": 2780 + }, + { + "epoch": 1.3217845277645943, + "grad_norm": 0.48580726993612605, + "learning_rate": 4.637771918259724e-05, + "loss": 0.4455, + "step": 2785 + }, + { + "epoch": 1.324157570004746, + "grad_norm": 0.3775283672846771, + "learning_rate": 4.636123928806856e-05, + "loss": 0.4431, + "step": 2790 + }, + { + "epoch": 1.3265306122448979, + "grad_norm": 0.3278853954032256, + "learning_rate": 4.634475939353988e-05, + "loss": 0.4446, + "step": 2795 + }, + { + "epoch": 1.3289036544850499, + "grad_norm": 0.39513210608660587, + "learning_rate": 4.632827949901121e-05, + "loss": 0.4566, + "step": 2800 + }, + { + "epoch": 1.3312766967252017, + "grad_norm": 0.3800528307037516, + "learning_rate": 4.631179960448253e-05, + "loss": 0.4544, + "step": 2805 + }, + { + "epoch": 1.3336497389653537, + "grad_norm": 0.38150115266154117, + "learning_rate": 4.629531970995386e-05, + "loss": 0.4538, + "step": 2810 + }, + { + "epoch": 1.3360227812055054, + "grad_norm": 0.33441985338223623, + "learning_rate": 4.627883981542518e-05, + "loss": 0.4436, + "step": 2815 + }, + { + "epoch": 1.3383958234456572, + "grad_norm": 0.3815786734642785, + "learning_rate": 4.626235992089651e-05, + "loss": 0.4496, + "step": 2820 + }, + { + "epoch": 1.3407688656858092, + "grad_norm": 0.3988936237953365, + "learning_rate": 4.624588002636783e-05, + "loss": 0.4533, + "step": 2825 + }, + { + "epoch": 1.3431419079259612, + "grad_norm": 0.35075463921747313, + "learning_rate": 4.6229400131839155e-05, + "loss": 0.4391, + "step": 2830 + }, + { + "epoch": 1.345514950166113, + "grad_norm": 0.3156166701396103, + "learning_rate": 4.6212920237310484e-05, + "loss": 0.4516, + "step": 2835 + }, + { + "epoch": 1.3478879924062648, + "grad_norm": 0.3664991757918029, + "learning_rate": 4.619644034278181e-05, + "loss": 0.4432, + "step": 2840 + }, + { + "epoch": 1.3502610346464168, + "grad_norm": 0.3279078835965106, + "learning_rate": 4.6179960448253135e-05, + "loss": 0.4431, + "step": 2845 + }, + { + "epoch": 1.3526340768865686, + "grad_norm": 0.40055917536772234, + "learning_rate": 4.6163480553724457e-05, + "loss": 0.4532, + "step": 2850 + }, + { + "epoch": 1.3550071191267206, + "grad_norm": 0.31647623715035406, + "learning_rate": 4.6147000659195785e-05, + "loss": 0.4381, + "step": 2855 + }, + { + "epoch": 1.3573801613668723, + "grad_norm": 0.3707584265962824, + "learning_rate": 4.613052076466711e-05, + "loss": 0.436, + "step": 2860 + }, + { + "epoch": 1.3597532036070241, + "grad_norm": 0.33619026923741024, + "learning_rate": 4.611404087013843e-05, + "loss": 0.4457, + "step": 2865 + }, + { + "epoch": 1.3621262458471761, + "grad_norm": 0.3303455509653127, + "learning_rate": 4.609756097560976e-05, + "loss": 0.4356, + "step": 2870 + }, + { + "epoch": 1.364499288087328, + "grad_norm": 0.34865945115724756, + "learning_rate": 4.608108108108109e-05, + "loss": 0.4473, + "step": 2875 + }, + { + "epoch": 1.36687233032748, + "grad_norm": 0.30761057373231604, + "learning_rate": 4.606460118655241e-05, + "loss": 0.4363, + "step": 2880 + }, + { + "epoch": 1.3692453725676317, + "grad_norm": 0.33817704767429624, + "learning_rate": 4.604812129202373e-05, + "loss": 0.448, + "step": 2885 + }, + { + "epoch": 1.3716184148077835, + "grad_norm": 0.3556614483270205, + "learning_rate": 4.603164139749506e-05, + "loss": 0.4423, + "step": 2890 + }, + { + "epoch": 1.3739914570479355, + "grad_norm": 0.3661793291304645, + "learning_rate": 4.601516150296638e-05, + "loss": 0.4434, + "step": 2895 + }, + { + "epoch": 1.3763644992880875, + "grad_norm": 0.3526520997703949, + "learning_rate": 4.599868160843771e-05, + "loss": 0.4471, + "step": 2900 + }, + { + "epoch": 1.3787375415282392, + "grad_norm": 0.39662234942763075, + "learning_rate": 4.598220171390903e-05, + "loss": 0.4342, + "step": 2905 + }, + { + "epoch": 1.381110583768391, + "grad_norm": 0.3904414972386865, + "learning_rate": 4.596572181938036e-05, + "loss": 0.4508, + "step": 2910 + }, + { + "epoch": 1.383483626008543, + "grad_norm": 0.43178291601240315, + "learning_rate": 4.594924192485168e-05, + "loss": 0.4461, + "step": 2915 + }, + { + "epoch": 1.3858566682486948, + "grad_norm": 0.38034005890669653, + "learning_rate": 4.5932762030323005e-05, + "loss": 0.4389, + "step": 2920 + }, + { + "epoch": 1.3882297104888468, + "grad_norm": 0.3353240387126112, + "learning_rate": 4.5916282135794334e-05, + "loss": 0.4431, + "step": 2925 + }, + { + "epoch": 1.3906027527289986, + "grad_norm": 0.51193726081928, + "learning_rate": 4.589980224126566e-05, + "loss": 0.4458, + "step": 2930 + }, + { + "epoch": 1.3929757949691504, + "grad_norm": 0.3965504949073376, + "learning_rate": 4.5883322346736984e-05, + "loss": 0.446, + "step": 2935 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 0.46120016775178524, + "learning_rate": 4.5866842452208306e-05, + "loss": 0.4397, + "step": 2940 + }, + { + "epoch": 1.3977218794494541, + "grad_norm": 0.39651082313246505, + "learning_rate": 4.5850362557679635e-05, + "loss": 0.4373, + "step": 2945 + }, + { + "epoch": 1.4000949216896061, + "grad_norm": 0.3777273682605796, + "learning_rate": 4.583388266315096e-05, + "loss": 0.4453, + "step": 2950 + }, + { + "epoch": 1.402467963929758, + "grad_norm": 0.3326820416443824, + "learning_rate": 4.581740276862228e-05, + "loss": 0.4516, + "step": 2955 + }, + { + "epoch": 1.4048410061699097, + "grad_norm": 0.3471824009917557, + "learning_rate": 4.580092287409361e-05, + "loss": 0.4408, + "step": 2960 + }, + { + "epoch": 1.4072140484100617, + "grad_norm": 0.3143952540299479, + "learning_rate": 4.5784442979564936e-05, + "loss": 0.434, + "step": 2965 + }, + { + "epoch": 1.4095870906502137, + "grad_norm": 0.3146032880425632, + "learning_rate": 4.576796308503626e-05, + "loss": 0.4487, + "step": 2970 + }, + { + "epoch": 1.4119601328903655, + "grad_norm": 0.3066219636097115, + "learning_rate": 4.575148319050758e-05, + "loss": 0.4506, + "step": 2975 + }, + { + "epoch": 1.4143331751305173, + "grad_norm": 0.3628669341746542, + "learning_rate": 4.573500329597891e-05, + "loss": 0.4315, + "step": 2980 + }, + { + "epoch": 1.4167062173706693, + "grad_norm": 0.3287002887092089, + "learning_rate": 4.571852340145023e-05, + "loss": 0.4475, + "step": 2985 + }, + { + "epoch": 1.419079259610821, + "grad_norm": 0.31663174664260235, + "learning_rate": 4.570204350692156e-05, + "loss": 0.4405, + "step": 2990 + }, + { + "epoch": 1.421452301850973, + "grad_norm": 0.309822456152951, + "learning_rate": 4.568556361239288e-05, + "loss": 0.4319, + "step": 2995 + }, + { + "epoch": 1.4238253440911248, + "grad_norm": 0.3229609266405243, + "learning_rate": 4.566908371786421e-05, + "loss": 0.4296, + "step": 3000 + }, + { + "epoch": 1.4261983863312766, + "grad_norm": 0.36250967560518976, + "learning_rate": 4.565260382333553e-05, + "loss": 0.4418, + "step": 3005 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.3544919929967802, + "learning_rate": 4.5636123928806854e-05, + "loss": 0.4379, + "step": 3010 + }, + { + "epoch": 1.4309444708115804, + "grad_norm": 0.3210649959707927, + "learning_rate": 4.561964403427818e-05, + "loss": 0.4383, + "step": 3015 + }, + { + "epoch": 1.4333175130517324, + "grad_norm": 0.3332767169118873, + "learning_rate": 4.560316413974951e-05, + "loss": 0.4509, + "step": 3020 + }, + { + "epoch": 1.4356905552918842, + "grad_norm": 0.37610372068157916, + "learning_rate": 4.5586684245220834e-05, + "loss": 0.4297, + "step": 3025 + }, + { + "epoch": 1.438063597532036, + "grad_norm": 0.34448382878230344, + "learning_rate": 4.5570204350692156e-05, + "loss": 0.4425, + "step": 3030 + }, + { + "epoch": 1.440436639772188, + "grad_norm": 0.36570450965301904, + "learning_rate": 4.5553724456163485e-05, + "loss": 0.4159, + "step": 3035 + }, + { + "epoch": 1.44280968201234, + "grad_norm": 0.36809319750693353, + "learning_rate": 4.5537244561634806e-05, + "loss": 0.4321, + "step": 3040 + }, + { + "epoch": 1.4451827242524917, + "grad_norm": 0.3819167899834458, + "learning_rate": 4.552076466710613e-05, + "loss": 0.4501, + "step": 3045 + }, + { + "epoch": 1.4475557664926435, + "grad_norm": 0.3627117246094779, + "learning_rate": 4.550428477257746e-05, + "loss": 0.4355, + "step": 3050 + }, + { + "epoch": 1.4499288087327955, + "grad_norm": 0.35025867193204685, + "learning_rate": 4.5487804878048786e-05, + "loss": 0.4473, + "step": 3055 + }, + { + "epoch": 1.4523018509729473, + "grad_norm": 0.3779696377596133, + "learning_rate": 4.547132498352011e-05, + "loss": 0.4215, + "step": 3060 + }, + { + "epoch": 1.4546748932130993, + "grad_norm": 0.3430221345261885, + "learning_rate": 4.545484508899143e-05, + "loss": 0.4529, + "step": 3065 + }, + { + "epoch": 1.457047935453251, + "grad_norm": 0.3620070356768351, + "learning_rate": 4.543836519446276e-05, + "loss": 0.4371, + "step": 3070 + }, + { + "epoch": 1.4594209776934028, + "grad_norm": 0.33035401919242563, + "learning_rate": 4.542188529993408e-05, + "loss": 0.4473, + "step": 3075 + }, + { + "epoch": 1.4617940199335548, + "grad_norm": 0.36067462057711774, + "learning_rate": 4.540540540540541e-05, + "loss": 0.4393, + "step": 3080 + }, + { + "epoch": 1.4641670621737066, + "grad_norm": 0.30757073072992647, + "learning_rate": 4.538892551087673e-05, + "loss": 0.436, + "step": 3085 + }, + { + "epoch": 1.4665401044138586, + "grad_norm": 0.33386511908192595, + "learning_rate": 4.537244561634806e-05, + "loss": 0.4333, + "step": 3090 + }, + { + "epoch": 1.4689131466540104, + "grad_norm": 0.3181978439488915, + "learning_rate": 4.535596572181938e-05, + "loss": 0.4437, + "step": 3095 + }, + { + "epoch": 1.4712861888941622, + "grad_norm": 0.3585710663667612, + "learning_rate": 4.5339485827290704e-05, + "loss": 0.4415, + "step": 3100 + }, + { + "epoch": 1.4736592311343142, + "grad_norm": 0.4178275268545838, + "learning_rate": 4.532300593276203e-05, + "loss": 0.4406, + "step": 3105 + }, + { + "epoch": 1.4760322733744662, + "grad_norm": 0.3577703839853541, + "learning_rate": 4.530652603823336e-05, + "loss": 0.4413, + "step": 3110 + }, + { + "epoch": 1.478405315614618, + "grad_norm": 0.36751774308141416, + "learning_rate": 4.5290046143704683e-05, + "loss": 0.4478, + "step": 3115 + }, + { + "epoch": 1.4807783578547697, + "grad_norm": 0.39722178548560655, + "learning_rate": 4.5273566249176005e-05, + "loss": 0.4292, + "step": 3120 + }, + { + "epoch": 1.4831514000949217, + "grad_norm": 0.33033917125837775, + "learning_rate": 4.5257086354647334e-05, + "loss": 0.4328, + "step": 3125 + }, + { + "epoch": 1.4855244423350735, + "grad_norm": 0.34779370161895246, + "learning_rate": 4.5240606460118656e-05, + "loss": 0.4381, + "step": 3130 + }, + { + "epoch": 1.4878974845752255, + "grad_norm": 0.3260583334588898, + "learning_rate": 4.522412656558998e-05, + "loss": 0.4478, + "step": 3135 + }, + { + "epoch": 1.4902705268153773, + "grad_norm": 0.3395616703128198, + "learning_rate": 4.520764667106131e-05, + "loss": 0.4459, + "step": 3140 + }, + { + "epoch": 1.492643569055529, + "grad_norm": 0.3704953951052309, + "learning_rate": 4.5191166776532636e-05, + "loss": 0.4491, + "step": 3145 + }, + { + "epoch": 1.495016611295681, + "grad_norm": 0.34992524861046714, + "learning_rate": 4.517468688200396e-05, + "loss": 0.4414, + "step": 3150 + }, + { + "epoch": 1.4973896535358329, + "grad_norm": 0.3339263123887668, + "learning_rate": 4.515820698747528e-05, + "loss": 0.4357, + "step": 3155 + }, + { + "epoch": 1.4997626957759849, + "grad_norm": 0.3910551767193392, + "learning_rate": 4.514172709294661e-05, + "loss": 0.4485, + "step": 3160 + }, + { + "epoch": 1.5021357380161366, + "grad_norm": 0.3530439693221716, + "learning_rate": 4.512524719841793e-05, + "loss": 0.4525, + "step": 3165 + }, + { + "epoch": 1.5045087802562884, + "grad_norm": 0.38223482294329, + "learning_rate": 4.510876730388926e-05, + "loss": 0.4402, + "step": 3170 + }, + { + "epoch": 1.5068818224964404, + "grad_norm": 0.3632150847447156, + "learning_rate": 4.509228740936058e-05, + "loss": 0.4392, + "step": 3175 + }, + { + "epoch": 1.5092548647365924, + "grad_norm": 0.3430051563232184, + "learning_rate": 4.507580751483191e-05, + "loss": 0.4332, + "step": 3180 + }, + { + "epoch": 1.5116279069767442, + "grad_norm": 0.3728779425352776, + "learning_rate": 4.505932762030323e-05, + "loss": 0.4401, + "step": 3185 + }, + { + "epoch": 1.514000949216896, + "grad_norm": 0.34238307437873466, + "learning_rate": 4.5042847725774554e-05, + "loss": 0.4331, + "step": 3190 + }, + { + "epoch": 1.516373991457048, + "grad_norm": 0.3410568303847029, + "learning_rate": 4.502636783124588e-05, + "loss": 0.4336, + "step": 3195 + }, + { + "epoch": 1.5187470336971998, + "grad_norm": 0.3162200417120553, + "learning_rate": 4.500988793671721e-05, + "loss": 0.4375, + "step": 3200 + }, + { + "epoch": 1.5211200759373518, + "grad_norm": 0.40047797140595465, + "learning_rate": 4.499340804218853e-05, + "loss": 0.4495, + "step": 3205 + }, + { + "epoch": 1.5234931181775035, + "grad_norm": 0.33328828885893386, + "learning_rate": 4.4976928147659855e-05, + "loss": 0.4479, + "step": 3210 + }, + { + "epoch": 1.5258661604176553, + "grad_norm": 0.3410662252172626, + "learning_rate": 4.4960448253131184e-05, + "loss": 0.4433, + "step": 3215 + }, + { + "epoch": 1.5282392026578073, + "grad_norm": 0.34052270452269834, + "learning_rate": 4.4943968358602506e-05, + "loss": 0.4333, + "step": 3220 + }, + { + "epoch": 1.5306122448979593, + "grad_norm": 0.3316228490573878, + "learning_rate": 4.492748846407383e-05, + "loss": 0.4347, + "step": 3225 + }, + { + "epoch": 1.532985287138111, + "grad_norm": 0.31708699392712175, + "learning_rate": 4.4911008569545156e-05, + "loss": 0.4499, + "step": 3230 + }, + { + "epoch": 1.5353583293782629, + "grad_norm": 0.3723255939857838, + "learning_rate": 4.4894528675016485e-05, + "loss": 0.455, + "step": 3235 + }, + { + "epoch": 1.5377313716184147, + "grad_norm": 0.37885430116795515, + "learning_rate": 4.487804878048781e-05, + "loss": 0.4336, + "step": 3240 + }, + { + "epoch": 1.5401044138585667, + "grad_norm": 0.32265692469621304, + "learning_rate": 4.486156888595913e-05, + "loss": 0.4463, + "step": 3245 + }, + { + "epoch": 1.5424774560987187, + "grad_norm": 0.33703118224530576, + "learning_rate": 4.484508899143046e-05, + "loss": 0.4561, + "step": 3250 + }, + { + "epoch": 1.5448504983388704, + "grad_norm": 0.3545866165063668, + "learning_rate": 4.482860909690178e-05, + "loss": 0.4405, + "step": 3255 + }, + { + "epoch": 1.5472235405790222, + "grad_norm": 0.3634160105784248, + "learning_rate": 4.48121292023731e-05, + "loss": 0.4413, + "step": 3260 + }, + { + "epoch": 1.5495965828191742, + "grad_norm": 0.408414309681026, + "learning_rate": 4.479564930784444e-05, + "loss": 0.4301, + "step": 3265 + }, + { + "epoch": 1.551969625059326, + "grad_norm": 0.32701246963340563, + "learning_rate": 4.477916941331576e-05, + "loss": 0.4481, + "step": 3270 + }, + { + "epoch": 1.554342667299478, + "grad_norm": 0.3476000938053823, + "learning_rate": 4.476268951878708e-05, + "loss": 0.4432, + "step": 3275 + }, + { + "epoch": 1.5567157095396298, + "grad_norm": 0.33151554988945997, + "learning_rate": 4.47462096242584e-05, + "loss": 0.4457, + "step": 3280 + }, + { + "epoch": 1.5590887517797816, + "grad_norm": 0.31801413910883736, + "learning_rate": 4.472972972972973e-05, + "loss": 0.4374, + "step": 3285 + }, + { + "epoch": 1.5614617940199336, + "grad_norm": 0.2903705861031492, + "learning_rate": 4.4713249835201054e-05, + "loss": 0.4477, + "step": 3290 + }, + { + "epoch": 1.5638348362600856, + "grad_norm": 0.3703633628016247, + "learning_rate": 4.469676994067238e-05, + "loss": 0.4331, + "step": 3295 + }, + { + "epoch": 1.5662078785002373, + "grad_norm": 0.3239471916944658, + "learning_rate": 4.468029004614371e-05, + "loss": 0.4228, + "step": 3300 + }, + { + "epoch": 1.5685809207403891, + "grad_norm": 0.3134485928994517, + "learning_rate": 4.466381015161503e-05, + "loss": 0.4315, + "step": 3305 + }, + { + "epoch": 1.570953962980541, + "grad_norm": 0.3728347603886756, + "learning_rate": 4.4647330257086355e-05, + "loss": 0.4334, + "step": 3310 + }, + { + "epoch": 1.573327005220693, + "grad_norm": 0.33493232557389324, + "learning_rate": 4.463085036255768e-05, + "loss": 0.446, + "step": 3315 + }, + { + "epoch": 1.575700047460845, + "grad_norm": 0.3438228638174042, + "learning_rate": 4.4614370468029006e-05, + "loss": 0.4316, + "step": 3320 + }, + { + "epoch": 1.5780730897009967, + "grad_norm": 0.34784822060770065, + "learning_rate": 4.4597890573500335e-05, + "loss": 0.4329, + "step": 3325 + }, + { + "epoch": 1.5804461319411485, + "grad_norm": 0.3112269110958962, + "learning_rate": 4.458141067897166e-05, + "loss": 0.4426, + "step": 3330 + }, + { + "epoch": 1.5828191741813005, + "grad_norm": 0.3442881544675051, + "learning_rate": 4.456493078444298e-05, + "loss": 0.4447, + "step": 3335 + }, + { + "epoch": 1.5851922164214522, + "grad_norm": 0.4167440276139508, + "learning_rate": 4.454845088991431e-05, + "loss": 0.4356, + "step": 3340 + }, + { + "epoch": 1.5875652586616043, + "grad_norm": 0.32125912347931557, + "learning_rate": 4.453197099538563e-05, + "loss": 0.4318, + "step": 3345 + }, + { + "epoch": 1.589938300901756, + "grad_norm": 0.3560780585960654, + "learning_rate": 4.451549110085695e-05, + "loss": 0.435, + "step": 3350 + }, + { + "epoch": 1.5923113431419078, + "grad_norm": 0.3302485469266685, + "learning_rate": 4.449901120632829e-05, + "loss": 0.4346, + "step": 3355 + }, + { + "epoch": 1.5946843853820598, + "grad_norm": 0.3293345646371729, + "learning_rate": 4.448253131179961e-05, + "loss": 0.4413, + "step": 3360 + }, + { + "epoch": 1.5970574276222118, + "grad_norm": 0.321005934950186, + "learning_rate": 4.446605141727093e-05, + "loss": 0.4352, + "step": 3365 + }, + { + "epoch": 1.5994304698623636, + "grad_norm": 0.3523217083659719, + "learning_rate": 4.444957152274225e-05, + "loss": 0.4416, + "step": 3370 + }, + { + "epoch": 1.6018035121025154, + "grad_norm": 0.29572359628628075, + "learning_rate": 4.443309162821358e-05, + "loss": 0.438, + "step": 3375 + }, + { + "epoch": 1.6041765543426671, + "grad_norm": 0.32966688691159146, + "learning_rate": 4.4416611733684903e-05, + "loss": 0.4307, + "step": 3380 + }, + { + "epoch": 1.6065495965828192, + "grad_norm": 0.3592731312863687, + "learning_rate": 4.440013183915623e-05, + "loss": 0.4356, + "step": 3385 + }, + { + "epoch": 1.6089226388229712, + "grad_norm": 0.34847548415457347, + "learning_rate": 4.438365194462756e-05, + "loss": 0.4395, + "step": 3390 + }, + { + "epoch": 1.611295681063123, + "grad_norm": 0.3748117045568233, + "learning_rate": 4.436717205009888e-05, + "loss": 0.4389, + "step": 3395 + }, + { + "epoch": 1.6136687233032747, + "grad_norm": 0.3260137927755355, + "learning_rate": 4.4350692155570205e-05, + "loss": 0.4435, + "step": 3400 + }, + { + "epoch": 1.6160417655434267, + "grad_norm": 0.3697317167541188, + "learning_rate": 4.433421226104153e-05, + "loss": 0.4382, + "step": 3405 + }, + { + "epoch": 1.6184148077835787, + "grad_norm": 0.3609257841671695, + "learning_rate": 4.4317732366512856e-05, + "loss": 0.4379, + "step": 3410 + }, + { + "epoch": 1.6207878500237305, + "grad_norm": 0.36243381378277095, + "learning_rate": 4.4301252471984184e-05, + "loss": 0.4313, + "step": 3415 + }, + { + "epoch": 1.6231608922638823, + "grad_norm": 0.31233333076184994, + "learning_rate": 4.4284772577455506e-05, + "loss": 0.4276, + "step": 3420 + }, + { + "epoch": 1.625533934504034, + "grad_norm": 0.3210998682507186, + "learning_rate": 4.4268292682926835e-05, + "loss": 0.4433, + "step": 3425 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 0.32081062959752926, + "learning_rate": 4.425181278839816e-05, + "loss": 0.4329, + "step": 3430 + }, + { + "epoch": 1.630280018984338, + "grad_norm": 0.346703790753863, + "learning_rate": 4.423533289386948e-05, + "loss": 0.4404, + "step": 3435 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 0.36090096210944195, + "learning_rate": 4.42188529993408e-05, + "loss": 0.4328, + "step": 3440 + }, + { + "epoch": 1.6350261034646416, + "grad_norm": 0.36587458049272015, + "learning_rate": 4.4202373104812136e-05, + "loss": 0.4321, + "step": 3445 + }, + { + "epoch": 1.6373991457047934, + "grad_norm": 0.41967762774669937, + "learning_rate": 4.418589321028346e-05, + "loss": 0.4397, + "step": 3450 + }, + { + "epoch": 1.6397721879449454, + "grad_norm": 0.31333581474972855, + "learning_rate": 4.416941331575478e-05, + "loss": 0.4284, + "step": 3455 + }, + { + "epoch": 1.6421452301850974, + "grad_norm": 0.36974664792699674, + "learning_rate": 4.415293342122611e-05, + "loss": 0.4377, + "step": 3460 + }, + { + "epoch": 1.6445182724252492, + "grad_norm": 0.4120986048725969, + "learning_rate": 4.413645352669743e-05, + "loss": 0.4393, + "step": 3465 + }, + { + "epoch": 1.646891314665401, + "grad_norm": 0.3658646852839157, + "learning_rate": 4.411997363216875e-05, + "loss": 0.427, + "step": 3470 + }, + { + "epoch": 1.649264356905553, + "grad_norm": 0.35054351286559776, + "learning_rate": 4.410349373764008e-05, + "loss": 0.429, + "step": 3475 + }, + { + "epoch": 1.651637399145705, + "grad_norm": 0.3282569501268787, + "learning_rate": 4.408701384311141e-05, + "loss": 0.4402, + "step": 3480 + }, + { + "epoch": 1.6540104413858567, + "grad_norm": 0.3307590066142975, + "learning_rate": 4.407053394858273e-05, + "loss": 0.436, + "step": 3485 + }, + { + "epoch": 1.6563834836260085, + "grad_norm": 0.3240065338494159, + "learning_rate": 4.4054054054054054e-05, + "loss": 0.4292, + "step": 3490 + }, + { + "epoch": 1.6587565258661603, + "grad_norm": 0.32305765002484, + "learning_rate": 4.403757415952538e-05, + "loss": 0.4351, + "step": 3495 + }, + { + "epoch": 1.6611295681063123, + "grad_norm": 0.3223885233695426, + "learning_rate": 4.4021094264996705e-05, + "loss": 0.4232, + "step": 3500 + }, + { + "epoch": 1.6635026103464643, + "grad_norm": 0.337413611821002, + "learning_rate": 4.4004614370468034e-05, + "loss": 0.4317, + "step": 3505 + }, + { + "epoch": 1.665875652586616, + "grad_norm": 0.3470632637225415, + "learning_rate": 4.3988134475939356e-05, + "loss": 0.4443, + "step": 3510 + }, + { + "epoch": 1.6682486948267679, + "grad_norm": 0.3262010525878894, + "learning_rate": 4.3971654581410685e-05, + "loss": 0.445, + "step": 3515 + }, + { + "epoch": 1.6706217370669196, + "grad_norm": 0.32796957208465005, + "learning_rate": 4.3955174686882007e-05, + "loss": 0.4242, + "step": 3520 + }, + { + "epoch": 1.6729947793070716, + "grad_norm": 0.31140816464844756, + "learning_rate": 4.393869479235333e-05, + "loss": 0.4361, + "step": 3525 + }, + { + "epoch": 1.6753678215472236, + "grad_norm": 0.35178884999519466, + "learning_rate": 4.392221489782466e-05, + "loss": 0.4448, + "step": 3530 + }, + { + "epoch": 1.6777408637873754, + "grad_norm": 0.2978106651492192, + "learning_rate": 4.3905735003295986e-05, + "loss": 0.4332, + "step": 3535 + }, + { + "epoch": 1.6801139060275272, + "grad_norm": 0.3105699293854891, + "learning_rate": 4.388925510876731e-05, + "loss": 0.4349, + "step": 3540 + }, + { + "epoch": 1.6824869482676792, + "grad_norm": 0.32037184103392813, + "learning_rate": 4.387277521423863e-05, + "loss": 0.4453, + "step": 3545 + }, + { + "epoch": 1.6848599905078312, + "grad_norm": 0.31730353578402365, + "learning_rate": 4.385629531970996e-05, + "loss": 0.4267, + "step": 3550 + }, + { + "epoch": 1.687233032747983, + "grad_norm": 0.3272536421980602, + "learning_rate": 4.383981542518128e-05, + "loss": 0.4492, + "step": 3555 + }, + { + "epoch": 1.6896060749881348, + "grad_norm": 0.3256159703591963, + "learning_rate": 4.38233355306526e-05, + "loss": 0.4289, + "step": 3560 + }, + { + "epoch": 1.6919791172282865, + "grad_norm": 0.32176177858936256, + "learning_rate": 4.380685563612393e-05, + "loss": 0.4372, + "step": 3565 + }, + { + "epoch": 1.6943521594684385, + "grad_norm": 0.38588529574809016, + "learning_rate": 4.379037574159526e-05, + "loss": 0.4252, + "step": 3570 + }, + { + "epoch": 1.6967252017085905, + "grad_norm": 0.38130731424007364, + "learning_rate": 4.377389584706658e-05, + "loss": 0.4382, + "step": 3575 + }, + { + "epoch": 1.6990982439487423, + "grad_norm": 0.3230510275225513, + "learning_rate": 4.3757415952537904e-05, + "loss": 0.4388, + "step": 3580 + }, + { + "epoch": 1.701471286188894, + "grad_norm": 0.3089559086041697, + "learning_rate": 4.374093605800923e-05, + "loss": 0.4347, + "step": 3585 + }, + { + "epoch": 1.7038443284290459, + "grad_norm": 0.29837060783837027, + "learning_rate": 4.3724456163480555e-05, + "loss": 0.4419, + "step": 3590 + }, + { + "epoch": 1.7062173706691979, + "grad_norm": 0.3250320671579211, + "learning_rate": 4.3707976268951883e-05, + "loss": 0.4257, + "step": 3595 + }, + { + "epoch": 1.7085904129093499, + "grad_norm": 0.3454930835845931, + "learning_rate": 4.3691496374423205e-05, + "loss": 0.4394, + "step": 3600 + }, + { + "epoch": 1.7109634551495017, + "grad_norm": 0.3027234808853718, + "learning_rate": 4.3675016479894534e-05, + "loss": 0.436, + "step": 3605 + }, + { + "epoch": 1.7133364973896534, + "grad_norm": 0.31906382844385844, + "learning_rate": 4.3658536585365856e-05, + "loss": 0.4386, + "step": 3610 + }, + { + "epoch": 1.7157095396298054, + "grad_norm": 0.2959858541005958, + "learning_rate": 4.364205669083718e-05, + "loss": 0.4379, + "step": 3615 + }, + { + "epoch": 1.7180825818699574, + "grad_norm": 0.2949666310251056, + "learning_rate": 4.362557679630851e-05, + "loss": 0.4306, + "step": 3620 + }, + { + "epoch": 1.7204556241101092, + "grad_norm": 0.3543343347553355, + "learning_rate": 4.3609096901779836e-05, + "loss": 0.4409, + "step": 3625 + }, + { + "epoch": 1.722828666350261, + "grad_norm": 0.35142921540506766, + "learning_rate": 4.359261700725116e-05, + "loss": 0.4413, + "step": 3630 + }, + { + "epoch": 1.7252017085904128, + "grad_norm": 0.35274268753325677, + "learning_rate": 4.357613711272248e-05, + "loss": 0.4371, + "step": 3635 + }, + { + "epoch": 1.7275747508305648, + "grad_norm": 0.37488862642960935, + "learning_rate": 4.355965721819381e-05, + "loss": 0.4463, + "step": 3640 + }, + { + "epoch": 1.7299477930707168, + "grad_norm": 0.2849740232385117, + "learning_rate": 4.354317732366513e-05, + "loss": 0.4448, + "step": 3645 + }, + { + "epoch": 1.7323208353108686, + "grad_norm": 0.3391828964508724, + "learning_rate": 4.352669742913645e-05, + "loss": 0.4326, + "step": 3650 + }, + { + "epoch": 1.7346938775510203, + "grad_norm": 0.30659444718814477, + "learning_rate": 4.351021753460778e-05, + "loss": 0.4242, + "step": 3655 + }, + { + "epoch": 1.7370669197911723, + "grad_norm": 0.3289532334962945, + "learning_rate": 4.349373764007911e-05, + "loss": 0.4507, + "step": 3660 + }, + { + "epoch": 1.7394399620313241, + "grad_norm": 0.31091094238923156, + "learning_rate": 4.347725774555043e-05, + "loss": 0.4425, + "step": 3665 + }, + { + "epoch": 1.7418130042714761, + "grad_norm": 0.35150140572549693, + "learning_rate": 4.3460777851021754e-05, + "loss": 0.4282, + "step": 3670 + }, + { + "epoch": 1.744186046511628, + "grad_norm": 0.41132185493998913, + "learning_rate": 4.344429795649308e-05, + "loss": 0.4353, + "step": 3675 + }, + { + "epoch": 1.7465590887517797, + "grad_norm": 0.32958670293320913, + "learning_rate": 4.3427818061964404e-05, + "loss": 0.434, + "step": 3680 + }, + { + "epoch": 1.7489321309919317, + "grad_norm": 0.3445915127124403, + "learning_rate": 4.3411338167435726e-05, + "loss": 0.4454, + "step": 3685 + }, + { + "epoch": 1.7513051732320837, + "grad_norm": 0.32004129982984514, + "learning_rate": 4.3394858272907055e-05, + "loss": 0.4266, + "step": 3690 + }, + { + "epoch": 1.7536782154722355, + "grad_norm": 0.43317620560904196, + "learning_rate": 4.3378378378378384e-05, + "loss": 0.4389, + "step": 3695 + }, + { + "epoch": 1.7560512577123872, + "grad_norm": 0.3514855320711862, + "learning_rate": 4.3361898483849706e-05, + "loss": 0.4315, + "step": 3700 + }, + { + "epoch": 1.758424299952539, + "grad_norm": 0.43030646319228527, + "learning_rate": 4.334541858932103e-05, + "loss": 0.4464, + "step": 3705 + }, + { + "epoch": 1.760797342192691, + "grad_norm": 0.38131914972559794, + "learning_rate": 4.3328938694792356e-05, + "loss": 0.4339, + "step": 3710 + }, + { + "epoch": 1.763170384432843, + "grad_norm": 0.36619537431285953, + "learning_rate": 4.331245880026368e-05, + "loss": 0.45, + "step": 3715 + }, + { + "epoch": 1.7655434266729948, + "grad_norm": 0.344907852237113, + "learning_rate": 4.329597890573501e-05, + "loss": 0.4318, + "step": 3720 + }, + { + "epoch": 1.7679164689131466, + "grad_norm": 0.3684234177771228, + "learning_rate": 4.327949901120633e-05, + "loss": 0.4293, + "step": 3725 + }, + { + "epoch": 1.7702895111532986, + "grad_norm": 0.37640139251060206, + "learning_rate": 4.326301911667766e-05, + "loss": 0.4448, + "step": 3730 + }, + { + "epoch": 1.7726625533934504, + "grad_norm": 0.3776417578153282, + "learning_rate": 4.324653922214898e-05, + "loss": 0.4447, + "step": 3735 + }, + { + "epoch": 1.7750355956336024, + "grad_norm": 0.322168717612925, + "learning_rate": 4.32300593276203e-05, + "loss": 0.4288, + "step": 3740 + }, + { + "epoch": 1.7774086378737541, + "grad_norm": 0.30661489872024916, + "learning_rate": 4.321357943309163e-05, + "loss": 0.4265, + "step": 3745 + }, + { + "epoch": 1.779781680113906, + "grad_norm": 0.3251814150171211, + "learning_rate": 4.319709953856296e-05, + "loss": 0.443, + "step": 3750 + }, + { + "epoch": 1.782154722354058, + "grad_norm": 0.3752034801612459, + "learning_rate": 4.318061964403428e-05, + "loss": 0.4297, + "step": 3755 + }, + { + "epoch": 1.78452776459421, + "grad_norm": 0.31355202207664257, + "learning_rate": 4.31641397495056e-05, + "loss": 0.4255, + "step": 3760 + }, + { + "epoch": 1.7869008068343617, + "grad_norm": 0.30846841189404245, + "learning_rate": 4.314765985497693e-05, + "loss": 0.44, + "step": 3765 + }, + { + "epoch": 1.7892738490745135, + "grad_norm": 0.32015277865529973, + "learning_rate": 4.3131179960448254e-05, + "loss": 0.4383, + "step": 3770 + }, + { + "epoch": 1.7916468913146653, + "grad_norm": 0.333877306713241, + "learning_rate": 4.3114700065919576e-05, + "loss": 0.4387, + "step": 3775 + }, + { + "epoch": 1.7940199335548173, + "grad_norm": 0.3053959053116733, + "learning_rate": 4.3098220171390905e-05, + "loss": 0.4405, + "step": 3780 + }, + { + "epoch": 1.7963929757949693, + "grad_norm": 0.3403774783717389, + "learning_rate": 4.308174027686223e-05, + "loss": 0.4263, + "step": 3785 + }, + { + "epoch": 1.798766018035121, + "grad_norm": 0.3398769681512224, + "learning_rate": 4.3065260382333555e-05, + "loss": 0.4454, + "step": 3790 + }, + { + "epoch": 1.8011390602752728, + "grad_norm": 0.34322908829970167, + "learning_rate": 4.304878048780488e-05, + "loss": 0.4266, + "step": 3795 + }, + { + "epoch": 1.8035121025154248, + "grad_norm": 0.3199157464990695, + "learning_rate": 4.3032300593276206e-05, + "loss": 0.4283, + "step": 3800 + }, + { + "epoch": 1.8058851447555766, + "grad_norm": 0.34154579072632674, + "learning_rate": 4.301582069874753e-05, + "loss": 0.4247, + "step": 3805 + }, + { + "epoch": 1.8082581869957286, + "grad_norm": 0.3322440468308872, + "learning_rate": 4.299934080421886e-05, + "loss": 0.4324, + "step": 3810 + }, + { + "epoch": 1.8106312292358804, + "grad_norm": 0.36499792027673067, + "learning_rate": 4.298286090969018e-05, + "loss": 0.4301, + "step": 3815 + }, + { + "epoch": 1.8130042714760322, + "grad_norm": 0.32941982857702096, + "learning_rate": 4.296638101516151e-05, + "loss": 0.437, + "step": 3820 + }, + { + "epoch": 1.8153773137161842, + "grad_norm": 0.3565437704673514, + "learning_rate": 4.294990112063283e-05, + "loss": 0.4316, + "step": 3825 + }, + { + "epoch": 1.8177503559563362, + "grad_norm": 0.33139195406338673, + "learning_rate": 4.293342122610415e-05, + "loss": 0.429, + "step": 3830 + }, + { + "epoch": 1.820123398196488, + "grad_norm": 0.31210360850320407, + "learning_rate": 4.291694133157548e-05, + "loss": 0.45, + "step": 3835 + }, + { + "epoch": 1.8224964404366397, + "grad_norm": 0.35695980662145915, + "learning_rate": 4.290046143704681e-05, + "loss": 0.4373, + "step": 3840 + }, + { + "epoch": 1.8248694826767915, + "grad_norm": 0.32250029941074715, + "learning_rate": 4.288398154251813e-05, + "loss": 0.4341, + "step": 3845 + }, + { + "epoch": 1.8272425249169435, + "grad_norm": 0.337000441449131, + "learning_rate": 4.286750164798945e-05, + "loss": 0.4207, + "step": 3850 + }, + { + "epoch": 1.8296155671570955, + "grad_norm": 0.3255725341893471, + "learning_rate": 4.285102175346078e-05, + "loss": 0.442, + "step": 3855 + }, + { + "epoch": 1.8319886093972473, + "grad_norm": 0.30257438616222176, + "learning_rate": 4.2834541858932103e-05, + "loss": 0.4376, + "step": 3860 + }, + { + "epoch": 1.834361651637399, + "grad_norm": 0.3680823170879787, + "learning_rate": 4.2818061964403425e-05, + "loss": 0.4435, + "step": 3865 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 0.31805336435633363, + "learning_rate": 4.2801582069874754e-05, + "loss": 0.4349, + "step": 3870 + }, + { + "epoch": 1.8391077361177028, + "grad_norm": 0.3650256435142743, + "learning_rate": 4.278510217534608e-05, + "loss": 0.4295, + "step": 3875 + }, + { + "epoch": 1.8414807783578548, + "grad_norm": 0.30751517848755483, + "learning_rate": 4.2768622280817405e-05, + "loss": 0.429, + "step": 3880 + }, + { + "epoch": 1.8438538205980066, + "grad_norm": 0.34134168198384085, + "learning_rate": 4.275214238628873e-05, + "loss": 0.4303, + "step": 3885 + }, + { + "epoch": 1.8462268628381584, + "grad_norm": 0.2858392481217053, + "learning_rate": 4.2735662491760056e-05, + "loss": 0.439, + "step": 3890 + }, + { + "epoch": 1.8485999050783104, + "grad_norm": 0.33533162008481004, + "learning_rate": 4.271918259723138e-05, + "loss": 0.4401, + "step": 3895 + }, + { + "epoch": 1.8509729473184624, + "grad_norm": 0.30245962917959723, + "learning_rate": 4.2702702702702706e-05, + "loss": 0.4287, + "step": 3900 + }, + { + "epoch": 1.8533459895586142, + "grad_norm": 0.3185941237962249, + "learning_rate": 4.268622280817403e-05, + "loss": 0.433, + "step": 3905 + }, + { + "epoch": 1.855719031798766, + "grad_norm": 0.32585555665393295, + "learning_rate": 4.266974291364536e-05, + "loss": 0.4435, + "step": 3910 + }, + { + "epoch": 1.8580920740389177, + "grad_norm": 0.34384746837184893, + "learning_rate": 4.265326301911668e-05, + "loss": 0.4379, + "step": 3915 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 0.2823089374851849, + "learning_rate": 4.2636783124588e-05, + "loss": 0.4284, + "step": 3920 + }, + { + "epoch": 1.8628381585192217, + "grad_norm": 0.34115093916336936, + "learning_rate": 4.262030323005933e-05, + "loss": 0.437, + "step": 3925 + }, + { + "epoch": 1.8652112007593735, + "grad_norm": 0.34808797849224793, + "learning_rate": 4.260382333553066e-05, + "loss": 0.4437, + "step": 3930 + }, + { + "epoch": 1.8675842429995253, + "grad_norm": 0.3308870766634637, + "learning_rate": 4.258734344100198e-05, + "loss": 0.4429, + "step": 3935 + }, + { + "epoch": 1.8699572852396773, + "grad_norm": 0.29632418013400047, + "learning_rate": 4.25708635464733e-05, + "loss": 0.4351, + "step": 3940 + }, + { + "epoch": 1.8723303274798293, + "grad_norm": 0.3085521963346963, + "learning_rate": 4.255438365194463e-05, + "loss": 0.4495, + "step": 3945 + }, + { + "epoch": 1.874703369719981, + "grad_norm": 0.29586494144083714, + "learning_rate": 4.253790375741595e-05, + "loss": 0.4345, + "step": 3950 + }, + { + "epoch": 1.8770764119601329, + "grad_norm": 0.3009976877203374, + "learning_rate": 4.2521423862887275e-05, + "loss": 0.428, + "step": 3955 + }, + { + "epoch": 1.8794494542002846, + "grad_norm": 0.3264758916830044, + "learning_rate": 4.2504943968358604e-05, + "loss": 0.4358, + "step": 3960 + }, + { + "epoch": 1.8818224964404366, + "grad_norm": 0.31922199769969156, + "learning_rate": 4.248846407382993e-05, + "loss": 0.4252, + "step": 3965 + }, + { + "epoch": 1.8841955386805886, + "grad_norm": 0.30377051284846174, + "learning_rate": 4.2471984179301254e-05, + "loss": 0.4446, + "step": 3970 + }, + { + "epoch": 1.8865685809207404, + "grad_norm": 0.2941521376388604, + "learning_rate": 4.2455504284772576e-05, + "loss": 0.4268, + "step": 3975 + }, + { + "epoch": 1.8889416231608922, + "grad_norm": 0.29712552866669983, + "learning_rate": 4.2439024390243905e-05, + "loss": 0.4403, + "step": 3980 + }, + { + "epoch": 1.891314665401044, + "grad_norm": 0.3311131925302983, + "learning_rate": 4.242254449571523e-05, + "loss": 0.4277, + "step": 3985 + }, + { + "epoch": 1.893687707641196, + "grad_norm": 0.30637050220064843, + "learning_rate": 4.2406064601186556e-05, + "loss": 0.4288, + "step": 3990 + }, + { + "epoch": 1.896060749881348, + "grad_norm": 0.30560998330707456, + "learning_rate": 4.238958470665788e-05, + "loss": 0.4479, + "step": 3995 + }, + { + "epoch": 1.8984337921214998, + "grad_norm": 0.3035407328145707, + "learning_rate": 4.2373104812129207e-05, + "loss": 0.4291, + "step": 4000 + }, + { + "epoch": 1.9008068343616515, + "grad_norm": 0.3208333923104267, + "learning_rate": 4.235662491760053e-05, + "loss": 0.4446, + "step": 4005 + }, + { + "epoch": 1.9031798766018035, + "grad_norm": 0.3556107427521709, + "learning_rate": 4.234014502307185e-05, + "loss": 0.4202, + "step": 4010 + }, + { + "epoch": 1.9055529188419555, + "grad_norm": 0.36691562084046586, + "learning_rate": 4.232366512854318e-05, + "loss": 0.4292, + "step": 4015 + }, + { + "epoch": 1.9079259610821073, + "grad_norm": 0.31989759841188165, + "learning_rate": 4.230718523401451e-05, + "loss": 0.4363, + "step": 4020 + }, + { + "epoch": 1.910299003322259, + "grad_norm": 0.3230520193155397, + "learning_rate": 4.229070533948583e-05, + "loss": 0.4303, + "step": 4025 + }, + { + "epoch": 1.9126720455624109, + "grad_norm": 0.3585817855207104, + "learning_rate": 4.227422544495715e-05, + "loss": 0.4392, + "step": 4030 + }, + { + "epoch": 1.9150450878025629, + "grad_norm": 0.313815791852119, + "learning_rate": 4.225774555042848e-05, + "loss": 0.4477, + "step": 4035 + }, + { + "epoch": 1.9174181300427149, + "grad_norm": 0.3273607272712406, + "learning_rate": 4.22412656558998e-05, + "loss": 0.429, + "step": 4040 + }, + { + "epoch": 1.9197911722828667, + "grad_norm": 0.32304729870680876, + "learning_rate": 4.2224785761371125e-05, + "loss": 0.4267, + "step": 4045 + }, + { + "epoch": 1.9221642145230184, + "grad_norm": 0.28426766552053906, + "learning_rate": 4.220830586684246e-05, + "loss": 0.4288, + "step": 4050 + }, + { + "epoch": 1.9245372567631702, + "grad_norm": 0.33446734221777075, + "learning_rate": 4.219182597231378e-05, + "loss": 0.4347, + "step": 4055 + }, + { + "epoch": 1.9269102990033222, + "grad_norm": 0.29303018273099024, + "learning_rate": 4.2175346077785104e-05, + "loss": 0.4236, + "step": 4060 + }, + { + "epoch": 1.9292833412434742, + "grad_norm": 0.3307780362651841, + "learning_rate": 4.2158866183256426e-05, + "loss": 0.432, + "step": 4065 + }, + { + "epoch": 1.931656383483626, + "grad_norm": 0.34281192543863764, + "learning_rate": 4.2142386288727755e-05, + "loss": 0.4354, + "step": 4070 + }, + { + "epoch": 1.9340294257237778, + "grad_norm": 0.31335293297920025, + "learning_rate": 4.212590639419908e-05, + "loss": 0.4335, + "step": 4075 + }, + { + "epoch": 1.9364024679639298, + "grad_norm": 0.3960873119581401, + "learning_rate": 4.2109426499670405e-05, + "loss": 0.4427, + "step": 4080 + }, + { + "epoch": 1.9387755102040818, + "grad_norm": 0.4208229202132566, + "learning_rate": 4.2092946605141734e-05, + "loss": 0.4317, + "step": 4085 + }, + { + "epoch": 1.9411485524442336, + "grad_norm": 0.36854850778772164, + "learning_rate": 4.2076466710613056e-05, + "loss": 0.4314, + "step": 4090 + }, + { + "epoch": 1.9435215946843853, + "grad_norm": 0.3481797544625902, + "learning_rate": 4.205998681608438e-05, + "loss": 0.4299, + "step": 4095 + }, + { + "epoch": 1.9458946369245371, + "grad_norm": 0.3433910012631487, + "learning_rate": 4.20435069215557e-05, + "loss": 0.4269, + "step": 4100 + }, + { + "epoch": 1.9482676791646891, + "grad_norm": 0.31621309899418354, + "learning_rate": 4.202702702702703e-05, + "loss": 0.4377, + "step": 4105 + }, + { + "epoch": 1.9506407214048411, + "grad_norm": 0.311606472072901, + "learning_rate": 4.201054713249835e-05, + "loss": 0.4255, + "step": 4110 + }, + { + "epoch": 1.953013763644993, + "grad_norm": 0.3065769765454056, + "learning_rate": 4.199406723796968e-05, + "loss": 0.4247, + "step": 4115 + }, + { + "epoch": 1.9553868058851447, + "grad_norm": 0.3010718024595711, + "learning_rate": 4.197758734344101e-05, + "loss": 0.4248, + "step": 4120 + }, + { + "epoch": 1.9577598481252965, + "grad_norm": 0.3024714758476704, + "learning_rate": 4.196110744891233e-05, + "loss": 0.4232, + "step": 4125 + }, + { + "epoch": 1.9601328903654485, + "grad_norm": 0.3176704018721933, + "learning_rate": 4.194462755438365e-05, + "loss": 0.4137, + "step": 4130 + }, + { + "epoch": 1.9625059326056005, + "grad_norm": 0.294504245367282, + "learning_rate": 4.1928147659854974e-05, + "loss": 0.4269, + "step": 4135 + }, + { + "epoch": 1.9648789748457522, + "grad_norm": 0.34356740739350733, + "learning_rate": 4.19116677653263e-05, + "loss": 0.4283, + "step": 4140 + }, + { + "epoch": 1.967252017085904, + "grad_norm": 0.29407247470655384, + "learning_rate": 4.189518787079763e-05, + "loss": 0.4294, + "step": 4145 + }, + { + "epoch": 1.969625059326056, + "grad_norm": 0.311433160474579, + "learning_rate": 4.1878707976268954e-05, + "loss": 0.4218, + "step": 4150 + }, + { + "epoch": 1.971998101566208, + "grad_norm": 0.3195576875313136, + "learning_rate": 4.186222808174028e-05, + "loss": 0.4271, + "step": 4155 + }, + { + "epoch": 1.9743711438063598, + "grad_norm": 0.42411828301446697, + "learning_rate": 4.1845748187211604e-05, + "loss": 0.4137, + "step": 4160 + }, + { + "epoch": 1.9767441860465116, + "grad_norm": 0.3064812597514605, + "learning_rate": 4.1829268292682926e-05, + "loss": 0.4107, + "step": 4165 + }, + { + "epoch": 1.9791172282866634, + "grad_norm": 0.3171569773543572, + "learning_rate": 4.181278839815425e-05, + "loss": 0.4322, + "step": 4170 + }, + { + "epoch": 1.9814902705268154, + "grad_norm": 0.33004544450390044, + "learning_rate": 4.1796308503625584e-05, + "loss": 0.4377, + "step": 4175 + }, + { + "epoch": 1.9838633127669674, + "grad_norm": 0.3035461533210494, + "learning_rate": 4.1779828609096906e-05, + "loss": 0.4339, + "step": 4180 + }, + { + "epoch": 1.9862363550071191, + "grad_norm": 0.3140559622582693, + "learning_rate": 4.176334871456823e-05, + "loss": 0.4293, + "step": 4185 + }, + { + "epoch": 1.988609397247271, + "grad_norm": 0.2951404589313509, + "learning_rate": 4.174686882003955e-05, + "loss": 0.4164, + "step": 4190 + }, + { + "epoch": 1.990982439487423, + "grad_norm": 0.32407144376898045, + "learning_rate": 4.173038892551088e-05, + "loss": 0.4303, + "step": 4195 + }, + { + "epoch": 1.9933554817275747, + "grad_norm": 0.28656671019690244, + "learning_rate": 4.17139090309822e-05, + "loss": 0.4242, + "step": 4200 + }, + { + "epoch": 1.9957285239677267, + "grad_norm": 0.3290906996760554, + "learning_rate": 4.169742913645353e-05, + "loss": 0.4319, + "step": 4205 + }, + { + "epoch": 1.9981015662078785, + "grad_norm": 0.31987730774416256, + "learning_rate": 4.168094924192486e-05, + "loss": 0.4371, + "step": 4210 + }, + { + "epoch": 2.0004746084480303, + "grad_norm": 0.3275847389105153, + "learning_rate": 4.166446934739618e-05, + "loss": 0.423, + "step": 4215 + }, + { + "epoch": 2.002847650688182, + "grad_norm": 0.3430461851251968, + "learning_rate": 4.16479894528675e-05, + "loss": 0.3951, + "step": 4220 + }, + { + "epoch": 2.0052206929283343, + "grad_norm": 0.31795226104030255, + "learning_rate": 4.1631509558338824e-05, + "loss": 0.3993, + "step": 4225 + }, + { + "epoch": 2.007593735168486, + "grad_norm": 0.3369379994207098, + "learning_rate": 4.161502966381015e-05, + "loss": 0.3954, + "step": 4230 + }, + { + "epoch": 2.009966777408638, + "grad_norm": 0.31759330004137687, + "learning_rate": 4.159854976928148e-05, + "loss": 0.3915, + "step": 4235 + }, + { + "epoch": 2.0123398196487896, + "grad_norm": 0.3086970613807332, + "learning_rate": 4.15820698747528e-05, + "loss": 0.4018, + "step": 4240 + }, + { + "epoch": 2.014712861888942, + "grad_norm": 0.32055687597168575, + "learning_rate": 4.156558998022413e-05, + "loss": 0.386, + "step": 4245 + }, + { + "epoch": 2.0170859041290936, + "grad_norm": 0.3053086706958724, + "learning_rate": 4.1549110085695454e-05, + "loss": 0.3923, + "step": 4250 + }, + { + "epoch": 2.0194589463692454, + "grad_norm": 0.3155130688396081, + "learning_rate": 4.1532630191166776e-05, + "loss": 0.4026, + "step": 4255 + }, + { + "epoch": 2.021831988609397, + "grad_norm": 0.34880223156210866, + "learning_rate": 4.15161502966381e-05, + "loss": 0.395, + "step": 4260 + }, + { + "epoch": 2.024205030849549, + "grad_norm": 0.29129443799888455, + "learning_rate": 4.149967040210943e-05, + "loss": 0.403, + "step": 4265 + }, + { + "epoch": 2.026578073089701, + "grad_norm": 0.29901970807366324, + "learning_rate": 4.1483190507580755e-05, + "loss": 0.3926, + "step": 4270 + }, + { + "epoch": 2.028951115329853, + "grad_norm": 0.3408176215763249, + "learning_rate": 4.146671061305208e-05, + "loss": 0.3978, + "step": 4275 + }, + { + "epoch": 2.0313241575700047, + "grad_norm": 0.3609397947861268, + "learning_rate": 4.1450230718523406e-05, + "loss": 0.4014, + "step": 4280 + }, + { + "epoch": 2.0336971998101565, + "grad_norm": 0.3719243638816488, + "learning_rate": 4.143375082399473e-05, + "loss": 0.4042, + "step": 4285 + }, + { + "epoch": 2.0360702420503083, + "grad_norm": 0.3935532101732094, + "learning_rate": 4.141727092946605e-05, + "loss": 0.3983, + "step": 4290 + }, + { + "epoch": 2.0384432842904605, + "grad_norm": 0.3348390271849565, + "learning_rate": 4.140079103493738e-05, + "loss": 0.3994, + "step": 4295 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 0.3577514995941809, + "learning_rate": 4.138431114040871e-05, + "loss": 0.3978, + "step": 4300 + }, + { + "epoch": 2.043189368770764, + "grad_norm": 0.40215151295220103, + "learning_rate": 4.136783124588003e-05, + "loss": 0.3813, + "step": 4305 + }, + { + "epoch": 2.045562411010916, + "grad_norm": 0.34767032886985416, + "learning_rate": 4.135135135135135e-05, + "loss": 0.4047, + "step": 4310 + }, + { + "epoch": 2.047935453251068, + "grad_norm": 0.2750683156548067, + "learning_rate": 4.133487145682268e-05, + "loss": 0.3923, + "step": 4315 + }, + { + "epoch": 2.05030849549122, + "grad_norm": 0.29300699704551647, + "learning_rate": 4.1318391562294e-05, + "loss": 0.3958, + "step": 4320 + }, + { + "epoch": 2.0526815377313716, + "grad_norm": 0.2910845452030118, + "learning_rate": 4.130191166776533e-05, + "loss": 0.3927, + "step": 4325 + }, + { + "epoch": 2.0550545799715234, + "grad_norm": 0.3050300620359581, + "learning_rate": 4.128543177323665e-05, + "loss": 0.3952, + "step": 4330 + }, + { + "epoch": 2.057427622211675, + "grad_norm": 0.3032406339935657, + "learning_rate": 4.126895187870798e-05, + "loss": 0.4002, + "step": 4335 + }, + { + "epoch": 2.0598006644518274, + "grad_norm": 0.33874264878148125, + "learning_rate": 4.1252471984179304e-05, + "loss": 0.4081, + "step": 4340 + }, + { + "epoch": 2.062173706691979, + "grad_norm": 0.29131240306300665, + "learning_rate": 4.1235992089650625e-05, + "loss": 0.3983, + "step": 4345 + }, + { + "epoch": 2.064546748932131, + "grad_norm": 0.3045720256391466, + "learning_rate": 4.1219512195121954e-05, + "loss": 0.3949, + "step": 4350 + }, + { + "epoch": 2.0669197911722827, + "grad_norm": 0.33252765316273414, + "learning_rate": 4.120303230059328e-05, + "loss": 0.4011, + "step": 4355 + }, + { + "epoch": 2.0692928334124345, + "grad_norm": 0.30125594236121056, + "learning_rate": 4.1186552406064605e-05, + "loss": 0.3946, + "step": 4360 + }, + { + "epoch": 2.0716658756525868, + "grad_norm": 0.3313638765574868, + "learning_rate": 4.117007251153593e-05, + "loss": 0.3982, + "step": 4365 + }, + { + "epoch": 2.0740389178927385, + "grad_norm": 0.31309640563733265, + "learning_rate": 4.1153592617007256e-05, + "loss": 0.4036, + "step": 4370 + }, + { + "epoch": 2.0764119601328903, + "grad_norm": 0.2788734110907511, + "learning_rate": 4.113711272247858e-05, + "loss": 0.3961, + "step": 4375 + }, + { + "epoch": 2.078785002373042, + "grad_norm": 0.30531227815345224, + "learning_rate": 4.11206328279499e-05, + "loss": 0.3931, + "step": 4380 + }, + { + "epoch": 2.0811580446131943, + "grad_norm": 0.31576243737161314, + "learning_rate": 4.110415293342123e-05, + "loss": 0.3981, + "step": 4385 + }, + { + "epoch": 2.083531086853346, + "grad_norm": 0.32535622664211983, + "learning_rate": 4.108767303889256e-05, + "loss": 0.3986, + "step": 4390 + }, + { + "epoch": 2.085904129093498, + "grad_norm": 0.2995461235152468, + "learning_rate": 4.107119314436388e-05, + "loss": 0.3968, + "step": 4395 + }, + { + "epoch": 2.0882771713336497, + "grad_norm": 0.32113879157726444, + "learning_rate": 4.10547132498352e-05, + "loss": 0.3951, + "step": 4400 + }, + { + "epoch": 2.0906502135738014, + "grad_norm": 0.29818220865571354, + "learning_rate": 4.103823335530653e-05, + "loss": 0.4006, + "step": 4405 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 0.31022449991528045, + "learning_rate": 4.102175346077785e-05, + "loss": 0.4146, + "step": 4410 + }, + { + "epoch": 2.0953962980541054, + "grad_norm": 0.31264585754759494, + "learning_rate": 4.100527356624918e-05, + "loss": 0.3881, + "step": 4415 + }, + { + "epoch": 2.097769340294257, + "grad_norm": 0.3320572485569992, + "learning_rate": 4.09887936717205e-05, + "loss": 0.3951, + "step": 4420 + }, + { + "epoch": 2.100142382534409, + "grad_norm": 0.30469930036014675, + "learning_rate": 4.097231377719183e-05, + "loss": 0.3942, + "step": 4425 + }, + { + "epoch": 2.1025154247745608, + "grad_norm": 0.31824892654547454, + "learning_rate": 4.095583388266315e-05, + "loss": 0.3955, + "step": 4430 + }, + { + "epoch": 2.104888467014713, + "grad_norm": 0.3400483423949204, + "learning_rate": 4.0939353988134475e-05, + "loss": 0.3992, + "step": 4435 + }, + { + "epoch": 2.1072615092548648, + "grad_norm": 0.3715815634547762, + "learning_rate": 4.0922874093605804e-05, + "loss": 0.4049, + "step": 4440 + }, + { + "epoch": 2.1096345514950166, + "grad_norm": 0.3359736671710263, + "learning_rate": 4.090639419907713e-05, + "loss": 0.3952, + "step": 4445 + }, + { + "epoch": 2.1120075937351683, + "grad_norm": 0.2973657338209733, + "learning_rate": 4.0889914304548455e-05, + "loss": 0.3908, + "step": 4450 + }, + { + "epoch": 2.1143806359753206, + "grad_norm": 0.29198663248881335, + "learning_rate": 4.0873434410019776e-05, + "loss": 0.4021, + "step": 4455 + }, + { + "epoch": 2.1167536782154723, + "grad_norm": 0.3200752251026872, + "learning_rate": 4.0856954515491105e-05, + "loss": 0.4016, + "step": 4460 + }, + { + "epoch": 2.119126720455624, + "grad_norm": 0.2959432438654945, + "learning_rate": 4.084047462096243e-05, + "loss": 0.3981, + "step": 4465 + }, + { + "epoch": 2.121499762695776, + "grad_norm": 0.2794113242426709, + "learning_rate": 4.082399472643375e-05, + "loss": 0.392, + "step": 4470 + }, + { + "epoch": 2.1238728049359277, + "grad_norm": 0.2686588345385481, + "learning_rate": 4.080751483190508e-05, + "loss": 0.3994, + "step": 4475 + }, + { + "epoch": 2.12624584717608, + "grad_norm": 0.31624785660796695, + "learning_rate": 4.079103493737641e-05, + "loss": 0.3905, + "step": 4480 + }, + { + "epoch": 2.1286188894162317, + "grad_norm": 0.34608470207120967, + "learning_rate": 4.077455504284773e-05, + "loss": 0.3856, + "step": 4485 + }, + { + "epoch": 2.1309919316563835, + "grad_norm": 0.3237211155599076, + "learning_rate": 4.075807514831905e-05, + "loss": 0.3937, + "step": 4490 + }, + { + "epoch": 2.1333649738965352, + "grad_norm": 0.34613008517391597, + "learning_rate": 4.074159525379038e-05, + "loss": 0.4034, + "step": 4495 + }, + { + "epoch": 2.1357380161366875, + "grad_norm": 0.302777965876585, + "learning_rate": 4.07251153592617e-05, + "loss": 0.4144, + "step": 4500 + }, + { + "epoch": 2.1381110583768392, + "grad_norm": 0.31515678831866845, + "learning_rate": 4.070863546473303e-05, + "loss": 0.4009, + "step": 4505 + }, + { + "epoch": 2.140484100616991, + "grad_norm": 0.3172179296718023, + "learning_rate": 4.069215557020435e-05, + "loss": 0.3951, + "step": 4510 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.3002870328759794, + "learning_rate": 4.067567567567568e-05, + "loss": 0.3911, + "step": 4515 + }, + { + "epoch": 2.1452301850972946, + "grad_norm": 0.3210644782524601, + "learning_rate": 4.0659195781147e-05, + "loss": 0.4126, + "step": 4520 + }, + { + "epoch": 2.147603227337447, + "grad_norm": 0.3002235080463189, + "learning_rate": 4.0642715886618325e-05, + "loss": 0.391, + "step": 4525 + }, + { + "epoch": 2.1499762695775986, + "grad_norm": 0.3055503052038052, + "learning_rate": 4.0626235992089653e-05, + "loss": 0.3998, + "step": 4530 + }, + { + "epoch": 2.1523493118177504, + "grad_norm": 0.2928159430727094, + "learning_rate": 4.060975609756098e-05, + "loss": 0.399, + "step": 4535 + }, + { + "epoch": 2.154722354057902, + "grad_norm": 0.29318130384165625, + "learning_rate": 4.0593276203032304e-05, + "loss": 0.4073, + "step": 4540 + }, + { + "epoch": 2.157095396298054, + "grad_norm": 0.3142769620903918, + "learning_rate": 4.0576796308503626e-05, + "loss": 0.406, + "step": 4545 + }, + { + "epoch": 2.159468438538206, + "grad_norm": 0.32659681624704767, + "learning_rate": 4.0560316413974955e-05, + "loss": 0.397, + "step": 4550 + }, + { + "epoch": 2.161841480778358, + "grad_norm": 0.3222559071032518, + "learning_rate": 4.054383651944628e-05, + "loss": 0.3996, + "step": 4555 + }, + { + "epoch": 2.1642145230185097, + "grad_norm": 0.37853681502991493, + "learning_rate": 4.05273566249176e-05, + "loss": 0.3936, + "step": 4560 + }, + { + "epoch": 2.1665875652586615, + "grad_norm": 0.36471400004904986, + "learning_rate": 4.051087673038893e-05, + "loss": 0.4035, + "step": 4565 + }, + { + "epoch": 2.1689606074988133, + "grad_norm": 0.33614397818380193, + "learning_rate": 4.0494396835860256e-05, + "loss": 0.4048, + "step": 4570 + }, + { + "epoch": 2.1713336497389655, + "grad_norm": 0.29796619687703285, + "learning_rate": 4.047791694133158e-05, + "loss": 0.3932, + "step": 4575 + }, + { + "epoch": 2.1737066919791173, + "grad_norm": 0.31600842458769646, + "learning_rate": 4.04614370468029e-05, + "loss": 0.3958, + "step": 4580 + }, + { + "epoch": 2.176079734219269, + "grad_norm": 0.35758676973109116, + "learning_rate": 4.044495715227423e-05, + "loss": 0.4125, + "step": 4585 + }, + { + "epoch": 2.178452776459421, + "grad_norm": 0.28657755003540486, + "learning_rate": 4.042847725774555e-05, + "loss": 0.3983, + "step": 4590 + }, + { + "epoch": 2.180825818699573, + "grad_norm": 0.33200876622649683, + "learning_rate": 4.041199736321687e-05, + "loss": 0.3972, + "step": 4595 + }, + { + "epoch": 2.183198860939725, + "grad_norm": 0.2962009752237219, + "learning_rate": 4.03955174686882e-05, + "loss": 0.3899, + "step": 4600 + }, + { + "epoch": 2.1855719031798766, + "grad_norm": 0.33649720438326586, + "learning_rate": 4.037903757415953e-05, + "loss": 0.3956, + "step": 4605 + }, + { + "epoch": 2.1879449454200284, + "grad_norm": 0.38387885562405477, + "learning_rate": 4.036255767963085e-05, + "loss": 0.3985, + "step": 4610 + }, + { + "epoch": 2.19031798766018, + "grad_norm": 0.36007156013681846, + "learning_rate": 4.0346077785102174e-05, + "loss": 0.392, + "step": 4615 + }, + { + "epoch": 2.1926910299003324, + "grad_norm": 0.32491851263431176, + "learning_rate": 4.03295978905735e-05, + "loss": 0.4047, + "step": 4620 + }, + { + "epoch": 2.195064072140484, + "grad_norm": 0.2906383863017801, + "learning_rate": 4.0313117996044825e-05, + "loss": 0.4082, + "step": 4625 + }, + { + "epoch": 2.197437114380636, + "grad_norm": 0.301924213741903, + "learning_rate": 4.0296638101516154e-05, + "loss": 0.3907, + "step": 4630 + }, + { + "epoch": 2.1998101566207877, + "grad_norm": 0.3357284512683552, + "learning_rate": 4.0280158206987476e-05, + "loss": 0.4214, + "step": 4635 + }, + { + "epoch": 2.20218319886094, + "grad_norm": 0.3064824526433963, + "learning_rate": 4.0263678312458804e-05, + "loss": 0.3947, + "step": 4640 + }, + { + "epoch": 2.2045562411010917, + "grad_norm": 0.32873455421097264, + "learning_rate": 4.0247198417930126e-05, + "loss": 0.4004, + "step": 4645 + }, + { + "epoch": 2.2069292833412435, + "grad_norm": 0.3035492958998065, + "learning_rate": 4.023071852340145e-05, + "loss": 0.3942, + "step": 4650 + }, + { + "epoch": 2.2093023255813953, + "grad_norm": 0.3031850378049543, + "learning_rate": 4.021423862887278e-05, + "loss": 0.3939, + "step": 4655 + }, + { + "epoch": 2.211675367821547, + "grad_norm": 0.3605101529289465, + "learning_rate": 4.0197758734344106e-05, + "loss": 0.4041, + "step": 4660 + }, + { + "epoch": 2.2140484100616993, + "grad_norm": 0.31993937279816204, + "learning_rate": 4.018127883981543e-05, + "loss": 0.3953, + "step": 4665 + }, + { + "epoch": 2.216421452301851, + "grad_norm": 0.33641824814259075, + "learning_rate": 4.016479894528675e-05, + "loss": 0.4066, + "step": 4670 + }, + { + "epoch": 2.218794494542003, + "grad_norm": 0.3230884470123325, + "learning_rate": 4.014831905075808e-05, + "loss": 0.4043, + "step": 4675 + }, + { + "epoch": 2.2211675367821546, + "grad_norm": 0.36194859842681065, + "learning_rate": 4.01318391562294e-05, + "loss": 0.3995, + "step": 4680 + }, + { + "epoch": 2.2235405790223064, + "grad_norm": 0.28903704908671457, + "learning_rate": 4.011535926170072e-05, + "loss": 0.4042, + "step": 4685 + }, + { + "epoch": 2.2259136212624586, + "grad_norm": 0.33712851498775315, + "learning_rate": 4.009887936717205e-05, + "loss": 0.4041, + "step": 4690 + }, + { + "epoch": 2.2282866635026104, + "grad_norm": 0.2971284900912896, + "learning_rate": 4.008239947264338e-05, + "loss": 0.3882, + "step": 4695 + }, + { + "epoch": 2.230659705742762, + "grad_norm": 0.3543524322396388, + "learning_rate": 4.00659195781147e-05, + "loss": 0.3937, + "step": 4700 + }, + { + "epoch": 2.233032747982914, + "grad_norm": 0.33498384126843506, + "learning_rate": 4.0049439683586024e-05, + "loss": 0.3985, + "step": 4705 + }, + { + "epoch": 2.2354057902230657, + "grad_norm": 0.29213319067182436, + "learning_rate": 4.003295978905735e-05, + "loss": 0.4105, + "step": 4710 + }, + { + "epoch": 2.237778832463218, + "grad_norm": 0.3208030218583873, + "learning_rate": 4.0016479894528675e-05, + "loss": 0.4014, + "step": 4715 + }, + { + "epoch": 2.2401518747033697, + "grad_norm": 0.3141725758437314, + "learning_rate": 4e-05, + "loss": 0.4062, + "step": 4720 + }, + { + "epoch": 2.2425249169435215, + "grad_norm": 0.3025527419028678, + "learning_rate": 3.9983520105471325e-05, + "loss": 0.4066, + "step": 4725 + }, + { + "epoch": 2.2448979591836733, + "grad_norm": 0.31600519578056824, + "learning_rate": 3.9967040210942654e-05, + "loss": 0.4099, + "step": 4730 + }, + { + "epoch": 2.2472710014238255, + "grad_norm": 0.32871714061636537, + "learning_rate": 3.9950560316413976e-05, + "loss": 0.3955, + "step": 4735 + }, + { + "epoch": 2.2496440436639773, + "grad_norm": 0.27049592372863784, + "learning_rate": 3.99340804218853e-05, + "loss": 0.3958, + "step": 4740 + }, + { + "epoch": 2.252017085904129, + "grad_norm": 0.3089430649045912, + "learning_rate": 3.991760052735663e-05, + "loss": 0.4004, + "step": 4745 + }, + { + "epoch": 2.254390128144281, + "grad_norm": 0.37788601040216463, + "learning_rate": 3.9901120632827955e-05, + "loss": 0.403, + "step": 4750 + }, + { + "epoch": 2.256763170384433, + "grad_norm": 0.3132978947438683, + "learning_rate": 3.988464073829928e-05, + "loss": 0.3914, + "step": 4755 + }, + { + "epoch": 2.259136212624585, + "grad_norm": 0.2884074572586541, + "learning_rate": 3.98681608437706e-05, + "loss": 0.3944, + "step": 4760 + }, + { + "epoch": 2.2615092548647366, + "grad_norm": 0.36011476066901865, + "learning_rate": 3.985168094924193e-05, + "loss": 0.4052, + "step": 4765 + }, + { + "epoch": 2.2638822971048884, + "grad_norm": 0.32212054107045907, + "learning_rate": 3.983520105471325e-05, + "loss": 0.3919, + "step": 4770 + }, + { + "epoch": 2.26625533934504, + "grad_norm": 0.28947302147038495, + "learning_rate": 3.981872116018457e-05, + "loss": 0.3946, + "step": 4775 + }, + { + "epoch": 2.2686283815851924, + "grad_norm": 0.3294797568694005, + "learning_rate": 3.98022412656559e-05, + "loss": 0.3976, + "step": 4780 + }, + { + "epoch": 2.271001423825344, + "grad_norm": 0.2857645392867982, + "learning_rate": 3.978576137112723e-05, + "loss": 0.3888, + "step": 4785 + }, + { + "epoch": 2.273374466065496, + "grad_norm": 0.2980692425363247, + "learning_rate": 3.976928147659855e-05, + "loss": 0.4112, + "step": 4790 + }, + { + "epoch": 2.2757475083056478, + "grad_norm": 0.31943410557444235, + "learning_rate": 3.9752801582069873e-05, + "loss": 0.3983, + "step": 4795 + }, + { + "epoch": 2.2781205505457995, + "grad_norm": 0.30360489214317843, + "learning_rate": 3.97363216875412e-05, + "loss": 0.3968, + "step": 4800 + }, + { + "epoch": 2.2804935927859518, + "grad_norm": 0.2722546072078695, + "learning_rate": 3.9719841793012524e-05, + "loss": 0.4043, + "step": 4805 + }, + { + "epoch": 2.2828666350261035, + "grad_norm": 0.2814273025478308, + "learning_rate": 3.970336189848385e-05, + "loss": 0.3973, + "step": 4810 + }, + { + "epoch": 2.2852396772662553, + "grad_norm": 0.3072769430537086, + "learning_rate": 3.9686882003955175e-05, + "loss": 0.4, + "step": 4815 + }, + { + "epoch": 2.287612719506407, + "grad_norm": 0.30958645202016843, + "learning_rate": 3.9670402109426504e-05, + "loss": 0.3887, + "step": 4820 + }, + { + "epoch": 2.289985761746559, + "grad_norm": 0.3071494748836781, + "learning_rate": 3.9653922214897826e-05, + "loss": 0.4034, + "step": 4825 + }, + { + "epoch": 2.292358803986711, + "grad_norm": 0.33431219356786396, + "learning_rate": 3.963744232036915e-05, + "loss": 0.3918, + "step": 4830 + }, + { + "epoch": 2.294731846226863, + "grad_norm": 0.2979271969749279, + "learning_rate": 3.9620962425840476e-05, + "loss": 0.4155, + "step": 4835 + }, + { + "epoch": 2.2971048884670147, + "grad_norm": 0.3167168058066888, + "learning_rate": 3.9604482531311805e-05, + "loss": 0.3974, + "step": 4840 + }, + { + "epoch": 2.2994779307071664, + "grad_norm": 0.2666428853245294, + "learning_rate": 3.958800263678313e-05, + "loss": 0.3944, + "step": 4845 + }, + { + "epoch": 2.301850972947318, + "grad_norm": 0.3048030132681787, + "learning_rate": 3.957152274225445e-05, + "loss": 0.3982, + "step": 4850 + }, + { + "epoch": 2.3042240151874704, + "grad_norm": 0.2981064067330076, + "learning_rate": 3.955504284772578e-05, + "loss": 0.3945, + "step": 4855 + }, + { + "epoch": 2.306597057427622, + "grad_norm": 0.3080741897331429, + "learning_rate": 3.95385629531971e-05, + "loss": 0.3957, + "step": 4860 + }, + { + "epoch": 2.308970099667774, + "grad_norm": 0.29812392739254856, + "learning_rate": 3.952208305866842e-05, + "loss": 0.4034, + "step": 4865 + }, + { + "epoch": 2.311343141907926, + "grad_norm": 0.2722953053235835, + "learning_rate": 3.950560316413976e-05, + "loss": 0.3931, + "step": 4870 + }, + { + "epoch": 2.313716184148078, + "grad_norm": 0.3072303656352438, + "learning_rate": 3.948912326961108e-05, + "loss": 0.3907, + "step": 4875 + }, + { + "epoch": 2.31608922638823, + "grad_norm": 0.2953943899775401, + "learning_rate": 3.94726433750824e-05, + "loss": 0.4092, + "step": 4880 + }, + { + "epoch": 2.3184622686283816, + "grad_norm": 0.2978068531169368, + "learning_rate": 3.945616348055372e-05, + "loss": 0.4024, + "step": 4885 + }, + { + "epoch": 2.3208353108685333, + "grad_norm": 0.3781797543064686, + "learning_rate": 3.943968358602505e-05, + "loss": 0.4029, + "step": 4890 + }, + { + "epoch": 2.3232083531086856, + "grad_norm": 0.35156104716212655, + "learning_rate": 3.9423203691496374e-05, + "loss": 0.4001, + "step": 4895 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 0.3117244393486639, + "learning_rate": 3.94067237969677e-05, + "loss": 0.4047, + "step": 4900 + }, + { + "epoch": 2.327954437588989, + "grad_norm": 0.3257088527330816, + "learning_rate": 3.939024390243903e-05, + "loss": 0.3923, + "step": 4905 + }, + { + "epoch": 2.330327479829141, + "grad_norm": 0.3275749245890093, + "learning_rate": 3.937376400791035e-05, + "loss": 0.4054, + "step": 4910 + }, + { + "epoch": 2.3327005220692927, + "grad_norm": 0.33163339059070723, + "learning_rate": 3.9357284113381675e-05, + "loss": 0.3915, + "step": 4915 + }, + { + "epoch": 2.335073564309445, + "grad_norm": 0.3124181370098426, + "learning_rate": 3.9340804218853e-05, + "loss": 0.3993, + "step": 4920 + }, + { + "epoch": 2.3374466065495967, + "grad_norm": 0.3142047009588773, + "learning_rate": 3.9324324324324326e-05, + "loss": 0.4116, + "step": 4925 + }, + { + "epoch": 2.3398196487897485, + "grad_norm": 0.30165248237603437, + "learning_rate": 3.9307844429795655e-05, + "loss": 0.4021, + "step": 4930 + }, + { + "epoch": 2.3421926910299002, + "grad_norm": 0.28278755123224664, + "learning_rate": 3.9291364535266977e-05, + "loss": 0.4029, + "step": 4935 + }, + { + "epoch": 2.344565733270052, + "grad_norm": 0.26886552386155693, + "learning_rate": 3.9274884640738305e-05, + "loss": 0.3907, + "step": 4940 + }, + { + "epoch": 2.3469387755102042, + "grad_norm": 0.42094950860123626, + "learning_rate": 3.925840474620963e-05, + "loss": 0.4062, + "step": 4945 + }, + { + "epoch": 2.349311817750356, + "grad_norm": 0.31529927823379533, + "learning_rate": 3.924192485168095e-05, + "loss": 0.3902, + "step": 4950 + }, + { + "epoch": 2.351684859990508, + "grad_norm": 0.30104571701735866, + "learning_rate": 3.922544495715227e-05, + "loss": 0.4027, + "step": 4955 + }, + { + "epoch": 2.3540579022306596, + "grad_norm": 0.3698861461415233, + "learning_rate": 3.920896506262361e-05, + "loss": 0.4022, + "step": 4960 + }, + { + "epoch": 2.3564309444708114, + "grad_norm": 0.3197605150415797, + "learning_rate": 3.919248516809493e-05, + "loss": 0.3924, + "step": 4965 + }, + { + "epoch": 2.3588039867109636, + "grad_norm": 0.3751635726575327, + "learning_rate": 3.917600527356625e-05, + "loss": 0.4051, + "step": 4970 + }, + { + "epoch": 2.3611770289511154, + "grad_norm": 0.30508492987810315, + "learning_rate": 3.915952537903758e-05, + "loss": 0.4032, + "step": 4975 + }, + { + "epoch": 2.363550071191267, + "grad_norm": 0.31110367684588, + "learning_rate": 3.91430454845089e-05, + "loss": 0.4096, + "step": 4980 + }, + { + "epoch": 2.365923113431419, + "grad_norm": 0.35435870670629366, + "learning_rate": 3.912656558998022e-05, + "loss": 0.4097, + "step": 4985 + }, + { + "epoch": 2.368296155671571, + "grad_norm": 0.34701353940944757, + "learning_rate": 3.9110085695451545e-05, + "loss": 0.3991, + "step": 4990 + }, + { + "epoch": 2.370669197911723, + "grad_norm": 0.2888962285961819, + "learning_rate": 3.909360580092288e-05, + "loss": 0.4026, + "step": 4995 + }, + { + "epoch": 2.3730422401518747, + "grad_norm": 0.3249941910922877, + "learning_rate": 3.90771259063942e-05, + "loss": 0.3993, + "step": 5000 + }, + { + "epoch": 2.3754152823920265, + "grad_norm": 0.34765968337216363, + "learning_rate": 3.9060646011865525e-05, + "loss": 0.3972, + "step": 5005 + }, + { + "epoch": 2.3777883246321783, + "grad_norm": 0.3085486444197968, + "learning_rate": 3.904416611733685e-05, + "loss": 0.3984, + "step": 5010 + }, + { + "epoch": 2.3801613668723305, + "grad_norm": 0.3077189514430981, + "learning_rate": 3.9027686222808175e-05, + "loss": 0.3996, + "step": 5015 + }, + { + "epoch": 2.3825344091124823, + "grad_norm": 0.26731054229641715, + "learning_rate": 3.90112063282795e-05, + "loss": 0.3892, + "step": 5020 + }, + { + "epoch": 2.384907451352634, + "grad_norm": 0.3186247088878084, + "learning_rate": 3.8994726433750826e-05, + "loss": 0.3926, + "step": 5025 + }, + { + "epoch": 2.387280493592786, + "grad_norm": 0.2921759032261509, + "learning_rate": 3.8978246539222155e-05, + "loss": 0.3939, + "step": 5030 + }, + { + "epoch": 2.389653535832938, + "grad_norm": 0.29315805927351335, + "learning_rate": 3.896176664469348e-05, + "loss": 0.3933, + "step": 5035 + }, + { + "epoch": 2.39202657807309, + "grad_norm": 0.3087294270042965, + "learning_rate": 3.89452867501648e-05, + "loss": 0.4117, + "step": 5040 + }, + { + "epoch": 2.3943996203132416, + "grad_norm": 0.32498346290264973, + "learning_rate": 3.892880685563612e-05, + "loss": 0.4047, + "step": 5045 + }, + { + "epoch": 2.3967726625533934, + "grad_norm": 0.35056141583576417, + "learning_rate": 3.891232696110745e-05, + "loss": 0.3922, + "step": 5050 + }, + { + "epoch": 2.399145704793545, + "grad_norm": 0.28845012275244897, + "learning_rate": 3.889584706657878e-05, + "loss": 0.4064, + "step": 5055 + }, + { + "epoch": 2.4015187470336974, + "grad_norm": 0.27219238217726177, + "learning_rate": 3.88793671720501e-05, + "loss": 0.3986, + "step": 5060 + }, + { + "epoch": 2.403891789273849, + "grad_norm": 0.2815775080855697, + "learning_rate": 3.886288727752143e-05, + "loss": 0.395, + "step": 5065 + }, + { + "epoch": 2.406264831514001, + "grad_norm": 0.2828988496053846, + "learning_rate": 3.884640738299275e-05, + "loss": 0.392, + "step": 5070 + }, + { + "epoch": 2.4086378737541527, + "grad_norm": 0.3042081180934621, + "learning_rate": 3.882992748846407e-05, + "loss": 0.3915, + "step": 5075 + }, + { + "epoch": 2.4110109159943045, + "grad_norm": 0.3017494362802122, + "learning_rate": 3.8813447593935395e-05, + "loss": 0.4003, + "step": 5080 + }, + { + "epoch": 2.4133839582344567, + "grad_norm": 0.3528005709221205, + "learning_rate": 3.879696769940673e-05, + "loss": 0.4081, + "step": 5085 + }, + { + "epoch": 2.4157570004746085, + "grad_norm": 0.35179393958842764, + "learning_rate": 3.878048780487805e-05, + "loss": 0.3906, + "step": 5090 + }, + { + "epoch": 2.4181300427147603, + "grad_norm": 0.2902278009175582, + "learning_rate": 3.8764007910349374e-05, + "loss": 0.3928, + "step": 5095 + }, + { + "epoch": 2.420503084954912, + "grad_norm": 0.37247699782569477, + "learning_rate": 3.87475280158207e-05, + "loss": 0.3951, + "step": 5100 + }, + { + "epoch": 2.422876127195064, + "grad_norm": 0.30409666547285574, + "learning_rate": 3.8731048121292025e-05, + "loss": 0.4005, + "step": 5105 + }, + { + "epoch": 2.425249169435216, + "grad_norm": 0.280732359331487, + "learning_rate": 3.871456822676335e-05, + "loss": 0.396, + "step": 5110 + }, + { + "epoch": 2.427622211675368, + "grad_norm": 0.2933555345394685, + "learning_rate": 3.8698088332234676e-05, + "loss": 0.4152, + "step": 5115 + }, + { + "epoch": 2.4299952539155196, + "grad_norm": 0.323616656484218, + "learning_rate": 3.8681608437706004e-05, + "loss": 0.3985, + "step": 5120 + }, + { + "epoch": 2.4323682961556714, + "grad_norm": 0.306483516568932, + "learning_rate": 3.8665128543177326e-05, + "loss": 0.3967, + "step": 5125 + }, + { + "epoch": 2.4347413383958236, + "grad_norm": 0.3009729428346335, + "learning_rate": 3.864864864864865e-05, + "loss": 0.4015, + "step": 5130 + }, + { + "epoch": 2.4371143806359754, + "grad_norm": 0.313847044430982, + "learning_rate": 3.863216875411998e-05, + "loss": 0.3936, + "step": 5135 + }, + { + "epoch": 2.439487422876127, + "grad_norm": 0.2863696925073533, + "learning_rate": 3.86156888595913e-05, + "loss": 0.397, + "step": 5140 + }, + { + "epoch": 2.441860465116279, + "grad_norm": 0.26737534072672386, + "learning_rate": 3.859920896506263e-05, + "loss": 0.3972, + "step": 5145 + }, + { + "epoch": 2.444233507356431, + "grad_norm": 0.32134315180670153, + "learning_rate": 3.858272907053395e-05, + "loss": 0.4066, + "step": 5150 + }, + { + "epoch": 2.446606549596583, + "grad_norm": 0.36702686761031933, + "learning_rate": 3.856624917600528e-05, + "loss": 0.4109, + "step": 5155 + }, + { + "epoch": 2.4489795918367347, + "grad_norm": 0.3207300116467373, + "learning_rate": 3.85497692814766e-05, + "loss": 0.4027, + "step": 5160 + }, + { + "epoch": 2.4513526340768865, + "grad_norm": 0.31337290873685997, + "learning_rate": 3.853328938694792e-05, + "loss": 0.3985, + "step": 5165 + }, + { + "epoch": 2.4537256763170383, + "grad_norm": 0.2922708254595613, + "learning_rate": 3.851680949241925e-05, + "loss": 0.3963, + "step": 5170 + }, + { + "epoch": 2.4560987185571905, + "grad_norm": 0.3252296937284064, + "learning_rate": 3.850032959789058e-05, + "loss": 0.3937, + "step": 5175 + }, + { + "epoch": 2.4584717607973423, + "grad_norm": 0.29250109040842937, + "learning_rate": 3.84838497033619e-05, + "loss": 0.3977, + "step": 5180 + }, + { + "epoch": 2.460844803037494, + "grad_norm": 0.35396023413338845, + "learning_rate": 3.8467369808833224e-05, + "loss": 0.3986, + "step": 5185 + }, + { + "epoch": 2.463217845277646, + "grad_norm": 0.3211890996451017, + "learning_rate": 3.845088991430455e-05, + "loss": 0.3994, + "step": 5190 + }, + { + "epoch": 2.4655908875177976, + "grad_norm": 0.2584415962902095, + "learning_rate": 3.8434410019775875e-05, + "loss": 0.3931, + "step": 5195 + }, + { + "epoch": 2.46796392975795, + "grad_norm": 0.26401609487740696, + "learning_rate": 3.8417930125247197e-05, + "loss": 0.3942, + "step": 5200 + }, + { + "epoch": 2.4703369719981016, + "grad_norm": 0.28399694674431925, + "learning_rate": 3.8401450230718525e-05, + "loss": 0.3993, + "step": 5205 + }, + { + "epoch": 2.4727100142382534, + "grad_norm": 0.3367303292773929, + "learning_rate": 3.8384970336189854e-05, + "loss": 0.4035, + "step": 5210 + }, + { + "epoch": 2.475083056478405, + "grad_norm": 0.30888041235896846, + "learning_rate": 3.8368490441661176e-05, + "loss": 0.3945, + "step": 5215 + }, + { + "epoch": 2.477456098718557, + "grad_norm": 0.27214719356728234, + "learning_rate": 3.83520105471325e-05, + "loss": 0.3965, + "step": 5220 + }, + { + "epoch": 2.479829140958709, + "grad_norm": 0.33047923111077443, + "learning_rate": 3.833553065260383e-05, + "loss": 0.3871, + "step": 5225 + }, + { + "epoch": 2.482202183198861, + "grad_norm": 0.29263659352242644, + "learning_rate": 3.831905075807515e-05, + "loss": 0.3939, + "step": 5230 + }, + { + "epoch": 2.4845752254390128, + "grad_norm": 0.2709400641563632, + "learning_rate": 3.830257086354648e-05, + "loss": 0.3931, + "step": 5235 + }, + { + "epoch": 2.4869482676791645, + "grad_norm": 0.3262914777888245, + "learning_rate": 3.82860909690178e-05, + "loss": 0.4016, + "step": 5240 + }, + { + "epoch": 2.4893213099193163, + "grad_norm": 0.29436329024348196, + "learning_rate": 3.826961107448913e-05, + "loss": 0.4003, + "step": 5245 + }, + { + "epoch": 2.4916943521594686, + "grad_norm": 0.2748458058335282, + "learning_rate": 3.825313117996045e-05, + "loss": 0.3914, + "step": 5250 + }, + { + "epoch": 2.4940673943996203, + "grad_norm": 0.28242886484396035, + "learning_rate": 3.823665128543177e-05, + "loss": 0.3867, + "step": 5255 + }, + { + "epoch": 2.496440436639772, + "grad_norm": 0.31852154183056797, + "learning_rate": 3.82201713909031e-05, + "loss": 0.3985, + "step": 5260 + }, + { + "epoch": 2.498813478879924, + "grad_norm": 0.29028210845380276, + "learning_rate": 3.820369149637443e-05, + "loss": 0.3937, + "step": 5265 + }, + { + "epoch": 2.5011865211200757, + "grad_norm": 0.28546033009829075, + "learning_rate": 3.818721160184575e-05, + "loss": 0.3974, + "step": 5270 + }, + { + "epoch": 2.503559563360228, + "grad_norm": 0.27265159348087087, + "learning_rate": 3.8170731707317073e-05, + "loss": 0.3936, + "step": 5275 + }, + { + "epoch": 2.5059326056003797, + "grad_norm": 0.3356799038696626, + "learning_rate": 3.81542518127884e-05, + "loss": 0.399, + "step": 5280 + }, + { + "epoch": 2.5083056478405314, + "grad_norm": 0.2972010693401072, + "learning_rate": 3.8137771918259724e-05, + "loss": 0.4015, + "step": 5285 + }, + { + "epoch": 2.5106786900806837, + "grad_norm": 0.2957605749457893, + "learning_rate": 3.8121292023731046e-05, + "loss": 0.4019, + "step": 5290 + }, + { + "epoch": 2.5130517323208355, + "grad_norm": 0.272905574915861, + "learning_rate": 3.8104812129202375e-05, + "loss": 0.3911, + "step": 5295 + }, + { + "epoch": 2.5154247745609872, + "grad_norm": 0.28237916678735425, + "learning_rate": 3.8088332234673704e-05, + "loss": 0.4084, + "step": 5300 + }, + { + "epoch": 2.517797816801139, + "grad_norm": 0.27926596626802846, + "learning_rate": 3.8071852340145026e-05, + "loss": 0.3893, + "step": 5305 + }, + { + "epoch": 2.520170859041291, + "grad_norm": 0.37153937968917233, + "learning_rate": 3.805537244561635e-05, + "loss": 0.3997, + "step": 5310 + }, + { + "epoch": 2.522543901281443, + "grad_norm": 0.31583873012366565, + "learning_rate": 3.8038892551087676e-05, + "loss": 0.3958, + "step": 5315 + }, + { + "epoch": 2.524916943521595, + "grad_norm": 0.2731056099152705, + "learning_rate": 3.8022412656559e-05, + "loss": 0.3855, + "step": 5320 + }, + { + "epoch": 2.5272899857617466, + "grad_norm": 0.3768183790368408, + "learning_rate": 3.800593276203033e-05, + "loss": 0.4006, + "step": 5325 + }, + { + "epoch": 2.5296630280018984, + "grad_norm": 0.30739330277780336, + "learning_rate": 3.798945286750165e-05, + "loss": 0.3975, + "step": 5330 + }, + { + "epoch": 2.53203607024205, + "grad_norm": 0.3011517055099788, + "learning_rate": 3.797297297297298e-05, + "loss": 0.3926, + "step": 5335 + }, + { + "epoch": 2.5344091124822024, + "grad_norm": 0.3435817144544524, + "learning_rate": 3.79564930784443e-05, + "loss": 0.3903, + "step": 5340 + }, + { + "epoch": 2.536782154722354, + "grad_norm": 0.31242847876644325, + "learning_rate": 3.794001318391562e-05, + "loss": 0.397, + "step": 5345 + }, + { + "epoch": 2.539155196962506, + "grad_norm": 0.2765638804810798, + "learning_rate": 3.792353328938695e-05, + "loss": 0.3987, + "step": 5350 + }, + { + "epoch": 2.5415282392026577, + "grad_norm": 0.2896422027894726, + "learning_rate": 3.790705339485828e-05, + "loss": 0.3922, + "step": 5355 + }, + { + "epoch": 2.5439012814428095, + "grad_norm": 0.28509307783097904, + "learning_rate": 3.78905735003296e-05, + "loss": 0.3957, + "step": 5360 + }, + { + "epoch": 2.5462743236829617, + "grad_norm": 0.30298793610609553, + "learning_rate": 3.787409360580092e-05, + "loss": 0.4058, + "step": 5365 + }, + { + "epoch": 2.5486473659231135, + "grad_norm": 0.3224848208820257, + "learning_rate": 3.785761371127225e-05, + "loss": 0.3996, + "step": 5370 + }, + { + "epoch": 2.5510204081632653, + "grad_norm": 0.282091518753708, + "learning_rate": 3.7841133816743574e-05, + "loss": 0.3994, + "step": 5375 + }, + { + "epoch": 2.553393450403417, + "grad_norm": 0.29823425353581534, + "learning_rate": 3.7824653922214896e-05, + "loss": 0.4069, + "step": 5380 + }, + { + "epoch": 2.555766492643569, + "grad_norm": 0.300944696219973, + "learning_rate": 3.7808174027686224e-05, + "loss": 0.3911, + "step": 5385 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 0.29390361792017805, + "learning_rate": 3.779169413315755e-05, + "loss": 0.4052, + "step": 5390 + }, + { + "epoch": 2.560512577123873, + "grad_norm": 0.29993386103686176, + "learning_rate": 3.7775214238628875e-05, + "loss": 0.4018, + "step": 5395 + }, + { + "epoch": 2.5628856193640246, + "grad_norm": 0.29753987509358304, + "learning_rate": 3.77587343441002e-05, + "loss": 0.3994, + "step": 5400 + }, + { + "epoch": 2.565258661604177, + "grad_norm": 0.2934775698895535, + "learning_rate": 3.7742254449571526e-05, + "loss": 0.3981, + "step": 5405 + }, + { + "epoch": 2.5676317038443286, + "grad_norm": 0.2664929167848885, + "learning_rate": 3.772577455504285e-05, + "loss": 0.3953, + "step": 5410 + }, + { + "epoch": 2.5700047460844804, + "grad_norm": 0.29617034517038493, + "learning_rate": 3.7709294660514177e-05, + "loss": 0.3977, + "step": 5415 + }, + { + "epoch": 2.572377788324632, + "grad_norm": 0.27477711894681983, + "learning_rate": 3.76928147659855e-05, + "loss": 0.4075, + "step": 5420 + }, + { + "epoch": 2.574750830564784, + "grad_norm": 0.2793811660441871, + "learning_rate": 3.767633487145683e-05, + "loss": 0.4183, + "step": 5425 + }, + { + "epoch": 2.577123872804936, + "grad_norm": 0.27450122062079324, + "learning_rate": 3.765985497692815e-05, + "loss": 0.3915, + "step": 5430 + }, + { + "epoch": 2.579496915045088, + "grad_norm": 0.3573247145791366, + "learning_rate": 3.764337508239947e-05, + "loss": 0.3945, + "step": 5435 + }, + { + "epoch": 2.5818699572852397, + "grad_norm": 0.28247103458185957, + "learning_rate": 3.76268951878708e-05, + "loss": 0.399, + "step": 5440 + }, + { + "epoch": 2.5842429995253915, + "grad_norm": 0.2876324759842524, + "learning_rate": 3.761041529334212e-05, + "loss": 0.4038, + "step": 5445 + }, + { + "epoch": 2.5866160417655433, + "grad_norm": 0.3112113283949458, + "learning_rate": 3.759393539881345e-05, + "loss": 0.402, + "step": 5450 + }, + { + "epoch": 2.5889890840056955, + "grad_norm": 0.31528015579407104, + "learning_rate": 3.757745550428477e-05, + "loss": 0.3982, + "step": 5455 + }, + { + "epoch": 2.5913621262458473, + "grad_norm": 0.30938626623835663, + "learning_rate": 3.75609756097561e-05, + "loss": 0.3918, + "step": 5460 + }, + { + "epoch": 2.593735168485999, + "grad_norm": 0.3467122678244832, + "learning_rate": 3.754449571522742e-05, + "loss": 0.3987, + "step": 5465 + }, + { + "epoch": 2.596108210726151, + "grad_norm": 0.28266383760596386, + "learning_rate": 3.7528015820698745e-05, + "loss": 0.4074, + "step": 5470 + }, + { + "epoch": 2.5984812529663026, + "grad_norm": 0.2868750202066412, + "learning_rate": 3.7511535926170074e-05, + "loss": 0.3965, + "step": 5475 + }, + { + "epoch": 2.600854295206455, + "grad_norm": 0.31413471754275357, + "learning_rate": 3.74950560316414e-05, + "loss": 0.3937, + "step": 5480 + }, + { + "epoch": 2.6032273374466066, + "grad_norm": 0.29699630586939785, + "learning_rate": 3.7478576137112725e-05, + "loss": 0.3957, + "step": 5485 + }, + { + "epoch": 2.6056003796867584, + "grad_norm": 0.32372906494037307, + "learning_rate": 3.746209624258405e-05, + "loss": 0.4007, + "step": 5490 + }, + { + "epoch": 2.60797342192691, + "grad_norm": 0.301013219974074, + "learning_rate": 3.7445616348055375e-05, + "loss": 0.3912, + "step": 5495 + }, + { + "epoch": 2.610346464167062, + "grad_norm": 0.27931475035510633, + "learning_rate": 3.74291364535267e-05, + "loss": 0.4038, + "step": 5500 + }, + { + "epoch": 2.612719506407214, + "grad_norm": 0.3167218468981087, + "learning_rate": 3.741265655899802e-05, + "loss": 0.3934, + "step": 5505 + }, + { + "epoch": 2.615092548647366, + "grad_norm": 0.26252833115757407, + "learning_rate": 3.739617666446935e-05, + "loss": 0.3923, + "step": 5510 + }, + { + "epoch": 2.6174655908875177, + "grad_norm": 0.28946060156900894, + "learning_rate": 3.737969676994068e-05, + "loss": 0.394, + "step": 5515 + }, + { + "epoch": 2.6198386331276695, + "grad_norm": 0.32840593113506095, + "learning_rate": 3.7363216875412e-05, + "loss": 0.4049, + "step": 5520 + }, + { + "epoch": 2.6222116753678213, + "grad_norm": 0.2760304417287925, + "learning_rate": 3.734673698088332e-05, + "loss": 0.3908, + "step": 5525 + }, + { + "epoch": 2.6245847176079735, + "grad_norm": 0.2903800844923619, + "learning_rate": 3.733025708635465e-05, + "loss": 0.391, + "step": 5530 + }, + { + "epoch": 2.6269577598481253, + "grad_norm": 0.2609052905479032, + "learning_rate": 3.731377719182597e-05, + "loss": 0.3964, + "step": 5535 + }, + { + "epoch": 2.629330802088277, + "grad_norm": 0.3058321290102913, + "learning_rate": 3.72972972972973e-05, + "loss": 0.3944, + "step": 5540 + }, + { + "epoch": 2.6317038443284293, + "grad_norm": 0.28298391843169657, + "learning_rate": 3.728081740276862e-05, + "loss": 0.4011, + "step": 5545 + }, + { + "epoch": 2.634076886568581, + "grad_norm": 0.30339373840568173, + "learning_rate": 3.726433750823995e-05, + "loss": 0.3968, + "step": 5550 + }, + { + "epoch": 2.636449928808733, + "grad_norm": 0.30120866583838674, + "learning_rate": 3.724785761371127e-05, + "loss": 0.3887, + "step": 5555 + }, + { + "epoch": 2.6388229710488846, + "grad_norm": 0.2879528272962737, + "learning_rate": 3.7231377719182595e-05, + "loss": 0.3956, + "step": 5560 + }, + { + "epoch": 2.6411960132890364, + "grad_norm": 0.3347208496438643, + "learning_rate": 3.7214897824653924e-05, + "loss": 0.4012, + "step": 5565 + }, + { + "epoch": 2.6435690555291886, + "grad_norm": 0.3102800348566766, + "learning_rate": 3.719841793012525e-05, + "loss": 0.401, + "step": 5570 + }, + { + "epoch": 2.6459420977693404, + "grad_norm": 0.26891642104446484, + "learning_rate": 3.7181938035596574e-05, + "loss": 0.3881, + "step": 5575 + }, + { + "epoch": 2.648315140009492, + "grad_norm": 0.33916481153225997, + "learning_rate": 3.7165458141067896e-05, + "loss": 0.4011, + "step": 5580 + }, + { + "epoch": 2.650688182249644, + "grad_norm": 0.29561487970194195, + "learning_rate": 3.7148978246539225e-05, + "loss": 0.3951, + "step": 5585 + }, + { + "epoch": 2.6530612244897958, + "grad_norm": 0.2928235956047956, + "learning_rate": 3.713249835201055e-05, + "loss": 0.387, + "step": 5590 + }, + { + "epoch": 2.655434266729948, + "grad_norm": 0.2895090809948617, + "learning_rate": 3.711601845748187e-05, + "loss": 0.3944, + "step": 5595 + }, + { + "epoch": 2.6578073089700998, + "grad_norm": 0.30335858124781884, + "learning_rate": 3.70995385629532e-05, + "loss": 0.3962, + "step": 5600 + }, + { + "epoch": 2.6601803512102515, + "grad_norm": 0.29186225397544474, + "learning_rate": 3.7083058668424526e-05, + "loss": 0.4013, + "step": 5605 + }, + { + "epoch": 2.6625533934504033, + "grad_norm": 0.32052896292512023, + "learning_rate": 3.706657877389585e-05, + "loss": 0.3938, + "step": 5610 + }, + { + "epoch": 2.664926435690555, + "grad_norm": 0.31050811662929173, + "learning_rate": 3.705009887936717e-05, + "loss": 0.3984, + "step": 5615 + }, + { + "epoch": 2.6672994779307073, + "grad_norm": 0.3568190232038088, + "learning_rate": 3.70336189848385e-05, + "loss": 0.3931, + "step": 5620 + }, + { + "epoch": 2.669672520170859, + "grad_norm": 0.3725813623702858, + "learning_rate": 3.701713909030982e-05, + "loss": 0.3942, + "step": 5625 + }, + { + "epoch": 2.672045562411011, + "grad_norm": 0.2885113923355964, + "learning_rate": 3.700065919578115e-05, + "loss": 0.4044, + "step": 5630 + }, + { + "epoch": 2.6744186046511627, + "grad_norm": 0.27558707118066267, + "learning_rate": 3.698417930125247e-05, + "loss": 0.3966, + "step": 5635 + }, + { + "epoch": 2.6767916468913144, + "grad_norm": 0.2762473881218796, + "learning_rate": 3.69676994067238e-05, + "loss": 0.402, + "step": 5640 + }, + { + "epoch": 2.6791646891314667, + "grad_norm": 0.3375455561895496, + "learning_rate": 3.695121951219512e-05, + "loss": 0.4037, + "step": 5645 + }, + { + "epoch": 2.6815377313716184, + "grad_norm": 0.32146060472555127, + "learning_rate": 3.6934739617666444e-05, + "loss": 0.4108, + "step": 5650 + }, + { + "epoch": 2.68391077361177, + "grad_norm": 0.30927156651065507, + "learning_rate": 3.691825972313777e-05, + "loss": 0.3929, + "step": 5655 + }, + { + "epoch": 2.6862838158519224, + "grad_norm": 0.3436573727679062, + "learning_rate": 3.69017798286091e-05, + "loss": 0.4056, + "step": 5660 + }, + { + "epoch": 2.6886568580920738, + "grad_norm": 0.2779754646227213, + "learning_rate": 3.6885299934080424e-05, + "loss": 0.4133, + "step": 5665 + }, + { + "epoch": 2.691029900332226, + "grad_norm": 0.3054861923681235, + "learning_rate": 3.6868820039551746e-05, + "loss": 0.4061, + "step": 5670 + }, + { + "epoch": 2.693402942572378, + "grad_norm": 0.29610526506791496, + "learning_rate": 3.6852340145023075e-05, + "loss": 0.4023, + "step": 5675 + }, + { + "epoch": 2.6957759848125296, + "grad_norm": 0.3227043878058561, + "learning_rate": 3.6835860250494397e-05, + "loss": 0.4031, + "step": 5680 + }, + { + "epoch": 2.698149027052682, + "grad_norm": 0.3170791598617777, + "learning_rate": 3.681938035596572e-05, + "loss": 0.3978, + "step": 5685 + }, + { + "epoch": 2.7005220692928336, + "grad_norm": 0.3422198354154997, + "learning_rate": 3.6802900461437054e-05, + "loss": 0.397, + "step": 5690 + }, + { + "epoch": 2.7028951115329853, + "grad_norm": 0.30376267244554883, + "learning_rate": 3.6786420566908376e-05, + "loss": 0.4014, + "step": 5695 + }, + { + "epoch": 2.705268153773137, + "grad_norm": 0.33130817633765686, + "learning_rate": 3.67699406723797e-05, + "loss": 0.3933, + "step": 5700 + }, + { + "epoch": 2.707641196013289, + "grad_norm": 0.2798728004555241, + "learning_rate": 3.675346077785102e-05, + "loss": 0.4036, + "step": 5705 + }, + { + "epoch": 2.710014238253441, + "grad_norm": 0.3061318480391815, + "learning_rate": 3.673698088332235e-05, + "loss": 0.3976, + "step": 5710 + }, + { + "epoch": 2.712387280493593, + "grad_norm": 0.2923845369803522, + "learning_rate": 3.672050098879367e-05, + "loss": 0.399, + "step": 5715 + }, + { + "epoch": 2.7147603227337447, + "grad_norm": 0.26513011574496725, + "learning_rate": 3.6704021094265e-05, + "loss": 0.3985, + "step": 5720 + }, + { + "epoch": 2.7171333649738965, + "grad_norm": 0.2854333269210002, + "learning_rate": 3.668754119973633e-05, + "loss": 0.3976, + "step": 5725 + }, + { + "epoch": 2.7195064072140482, + "grad_norm": 0.28574293335918155, + "learning_rate": 3.667106130520765e-05, + "loss": 0.4051, + "step": 5730 + }, + { + "epoch": 2.7218794494542005, + "grad_norm": 0.32795412035915666, + "learning_rate": 3.665458141067897e-05, + "loss": 0.4033, + "step": 5735 + }, + { + "epoch": 2.7242524916943522, + "grad_norm": 0.27524829252473976, + "learning_rate": 3.6638101516150294e-05, + "loss": 0.4021, + "step": 5740 + }, + { + "epoch": 2.726625533934504, + "grad_norm": 0.29585921913664487, + "learning_rate": 3.662162162162162e-05, + "loss": 0.3977, + "step": 5745 + }, + { + "epoch": 2.728998576174656, + "grad_norm": 0.2847932585153143, + "learning_rate": 3.660514172709295e-05, + "loss": 0.3862, + "step": 5750 + }, + { + "epoch": 2.7313716184148076, + "grad_norm": 0.28415866556374475, + "learning_rate": 3.6588661832564274e-05, + "loss": 0.3934, + "step": 5755 + }, + { + "epoch": 2.73374466065496, + "grad_norm": 0.2962063403810216, + "learning_rate": 3.65721819380356e-05, + "loss": 0.3945, + "step": 5760 + }, + { + "epoch": 2.7361177028951116, + "grad_norm": 0.2767972102087323, + "learning_rate": 3.6555702043506924e-05, + "loss": 0.3974, + "step": 5765 + }, + { + "epoch": 2.7384907451352634, + "grad_norm": 0.28750058669828193, + "learning_rate": 3.6539222148978246e-05, + "loss": 0.3917, + "step": 5770 + }, + { + "epoch": 2.740863787375415, + "grad_norm": 0.27899438070834937, + "learning_rate": 3.652274225444957e-05, + "loss": 0.3901, + "step": 5775 + }, + { + "epoch": 2.743236829615567, + "grad_norm": 0.2582168547215116, + "learning_rate": 3.6506262359920904e-05, + "loss": 0.381, + "step": 5780 + }, + { + "epoch": 2.745609871855719, + "grad_norm": 0.31055960130203264, + "learning_rate": 3.6489782465392226e-05, + "loss": 0.3981, + "step": 5785 + }, + { + "epoch": 2.747982914095871, + "grad_norm": 0.2959218424956789, + "learning_rate": 3.647330257086355e-05, + "loss": 0.3981, + "step": 5790 + }, + { + "epoch": 2.7503559563360227, + "grad_norm": 0.3577492520627772, + "learning_rate": 3.6456822676334876e-05, + "loss": 0.3977, + "step": 5795 + }, + { + "epoch": 2.752728998576175, + "grad_norm": 0.3152779814045453, + "learning_rate": 3.64403427818062e-05, + "loss": 0.4017, + "step": 5800 + }, + { + "epoch": 2.7551020408163263, + "grad_norm": 0.36279558137604123, + "learning_rate": 3.642386288727752e-05, + "loss": 0.4071, + "step": 5805 + }, + { + "epoch": 2.7574750830564785, + "grad_norm": 0.3378772970485606, + "learning_rate": 3.640738299274885e-05, + "loss": 0.4064, + "step": 5810 + }, + { + "epoch": 2.7598481252966303, + "grad_norm": 0.27027271355933785, + "learning_rate": 3.639090309822018e-05, + "loss": 0.3984, + "step": 5815 + }, + { + "epoch": 2.762221167536782, + "grad_norm": 0.2827685961802921, + "learning_rate": 3.63744232036915e-05, + "loss": 0.3846, + "step": 5820 + }, + { + "epoch": 2.7645942097769343, + "grad_norm": 0.2887466993612592, + "learning_rate": 3.635794330916282e-05, + "loss": 0.4002, + "step": 5825 + }, + { + "epoch": 2.766967252017086, + "grad_norm": 0.3288705819474495, + "learning_rate": 3.634146341463415e-05, + "loss": 0.3964, + "step": 5830 + }, + { + "epoch": 2.769340294257238, + "grad_norm": 0.27879997389978917, + "learning_rate": 3.632498352010547e-05, + "loss": 0.4053, + "step": 5835 + }, + { + "epoch": 2.7717133364973896, + "grad_norm": 0.32377640490965676, + "learning_rate": 3.63085036255768e-05, + "loss": 0.4023, + "step": 5840 + }, + { + "epoch": 2.7740863787375414, + "grad_norm": 0.3440054004381142, + "learning_rate": 3.629202373104812e-05, + "loss": 0.3837, + "step": 5845 + }, + { + "epoch": 2.7764594209776936, + "grad_norm": 0.29661164628708325, + "learning_rate": 3.627554383651945e-05, + "loss": 0.3894, + "step": 5850 + }, + { + "epoch": 2.7788324632178454, + "grad_norm": 0.29500840209014373, + "learning_rate": 3.6259063941990774e-05, + "loss": 0.393, + "step": 5855 + }, + { + "epoch": 2.781205505457997, + "grad_norm": 0.3280901444453548, + "learning_rate": 3.6242584047462096e-05, + "loss": 0.386, + "step": 5860 + }, + { + "epoch": 2.783578547698149, + "grad_norm": 0.292766023442049, + "learning_rate": 3.622610415293342e-05, + "loss": 0.3991, + "step": 5865 + }, + { + "epoch": 2.7859515899383007, + "grad_norm": 0.3169659417426221, + "learning_rate": 3.620962425840475e-05, + "loss": 0.3868, + "step": 5870 + }, + { + "epoch": 2.788324632178453, + "grad_norm": 0.30852944728897325, + "learning_rate": 3.6193144363876075e-05, + "loss": 0.3907, + "step": 5875 + }, + { + "epoch": 2.7906976744186047, + "grad_norm": 0.35575316786102795, + "learning_rate": 3.61766644693474e-05, + "loss": 0.3931, + "step": 5880 + }, + { + "epoch": 2.7930707166587565, + "grad_norm": 0.35753337487668563, + "learning_rate": 3.6160184574818726e-05, + "loss": 0.3989, + "step": 5885 + }, + { + "epoch": 2.7954437588989083, + "grad_norm": 0.3165284177360715, + "learning_rate": 3.614370468029005e-05, + "loss": 0.401, + "step": 5890 + }, + { + "epoch": 2.79781680113906, + "grad_norm": 0.3190436000095109, + "learning_rate": 3.612722478576137e-05, + "loss": 0.3925, + "step": 5895 + }, + { + "epoch": 2.8001898433792123, + "grad_norm": 0.28890365001210155, + "learning_rate": 3.611074489123269e-05, + "loss": 0.404, + "step": 5900 + }, + { + "epoch": 2.802562885619364, + "grad_norm": 0.28036907160277846, + "learning_rate": 3.609426499670403e-05, + "loss": 0.3868, + "step": 5905 + }, + { + "epoch": 2.804935927859516, + "grad_norm": 0.2810227351098721, + "learning_rate": 3.607778510217535e-05, + "loss": 0.4013, + "step": 5910 + }, + { + "epoch": 2.8073089700996676, + "grad_norm": 0.2689471717813572, + "learning_rate": 3.606130520764667e-05, + "loss": 0.3963, + "step": 5915 + }, + { + "epoch": 2.8096820123398194, + "grad_norm": 0.2971577327693362, + "learning_rate": 3.6044825313118e-05, + "loss": 0.4016, + "step": 5920 + }, + { + "epoch": 2.8120550545799716, + "grad_norm": 0.31059297670439073, + "learning_rate": 3.602834541858932e-05, + "loss": 0.4018, + "step": 5925 + }, + { + "epoch": 2.8144280968201234, + "grad_norm": 0.32501255091024195, + "learning_rate": 3.6011865524060644e-05, + "loss": 0.3914, + "step": 5930 + }, + { + "epoch": 2.816801139060275, + "grad_norm": 0.34167242058956476, + "learning_rate": 3.599538562953197e-05, + "loss": 0.3914, + "step": 5935 + }, + { + "epoch": 2.8191741813004274, + "grad_norm": 0.3113626028622469, + "learning_rate": 3.59789057350033e-05, + "loss": 0.3957, + "step": 5940 + }, + { + "epoch": 2.821547223540579, + "grad_norm": 0.2886626913621029, + "learning_rate": 3.5962425840474623e-05, + "loss": 0.3967, + "step": 5945 + }, + { + "epoch": 2.823920265780731, + "grad_norm": 0.31170108856992246, + "learning_rate": 3.5945945945945945e-05, + "loss": 0.4014, + "step": 5950 + }, + { + "epoch": 2.8262933080208827, + "grad_norm": 0.3160931155371558, + "learning_rate": 3.5929466051417274e-05, + "loss": 0.4023, + "step": 5955 + }, + { + "epoch": 2.8286663502610345, + "grad_norm": 0.3289479935236273, + "learning_rate": 3.5912986156888596e-05, + "loss": 0.4046, + "step": 5960 + }, + { + "epoch": 2.8310393925011867, + "grad_norm": 0.2879355327332094, + "learning_rate": 3.5896506262359925e-05, + "loss": 0.3921, + "step": 5965 + }, + { + "epoch": 2.8334124347413385, + "grad_norm": 0.27618051307732017, + "learning_rate": 3.588002636783125e-05, + "loss": 0.388, + "step": 5970 + }, + { + "epoch": 2.8357854769814903, + "grad_norm": 0.26395370690500985, + "learning_rate": 3.5863546473302576e-05, + "loss": 0.402, + "step": 5975 + }, + { + "epoch": 2.838158519221642, + "grad_norm": 0.28871370467444873, + "learning_rate": 3.58470665787739e-05, + "loss": 0.4003, + "step": 5980 + }, + { + "epoch": 2.840531561461794, + "grad_norm": 0.31786306691127303, + "learning_rate": 3.583058668424522e-05, + "loss": 0.3907, + "step": 5985 + }, + { + "epoch": 2.842904603701946, + "grad_norm": 0.2914003501659342, + "learning_rate": 3.581410678971655e-05, + "loss": 0.399, + "step": 5990 + }, + { + "epoch": 2.845277645942098, + "grad_norm": 0.3135394955455369, + "learning_rate": 3.579762689518788e-05, + "loss": 0.4037, + "step": 5995 + }, + { + "epoch": 2.8476506881822496, + "grad_norm": 0.31574899190025485, + "learning_rate": 3.57811470006592e-05, + "loss": 0.4, + "step": 6000 + }, + { + "epoch": 2.8500237304224014, + "grad_norm": 0.31383866008386185, + "learning_rate": 3.576466710613052e-05, + "loss": 0.3998, + "step": 6005 + }, + { + "epoch": 2.852396772662553, + "grad_norm": 0.25984291786635255, + "learning_rate": 3.574818721160185e-05, + "loss": 0.3978, + "step": 6010 + }, + { + "epoch": 2.8547698149027054, + "grad_norm": 0.29036363618578814, + "learning_rate": 3.573170731707317e-05, + "loss": 0.3948, + "step": 6015 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.2476713923903881, + "learning_rate": 3.5715227422544494e-05, + "loss": 0.3982, + "step": 6020 + }, + { + "epoch": 2.859515899383009, + "grad_norm": 0.2769833602703935, + "learning_rate": 3.569874752801582e-05, + "loss": 0.399, + "step": 6025 + }, + { + "epoch": 2.8618889416231608, + "grad_norm": 0.28026820640769307, + "learning_rate": 3.568226763348715e-05, + "loss": 0.3876, + "step": 6030 + }, + { + "epoch": 2.8642619838633125, + "grad_norm": 0.3454650448595783, + "learning_rate": 3.566578773895847e-05, + "loss": 0.4008, + "step": 6035 + }, + { + "epoch": 2.8666350261034648, + "grad_norm": 0.29768637866149844, + "learning_rate": 3.5649307844429795e-05, + "loss": 0.4007, + "step": 6040 + }, + { + "epoch": 2.8690080683436165, + "grad_norm": 0.2941507907351043, + "learning_rate": 3.5632827949901124e-05, + "loss": 0.4018, + "step": 6045 + }, + { + "epoch": 2.8713811105837683, + "grad_norm": 0.29246105282408547, + "learning_rate": 3.5616348055372446e-05, + "loss": 0.3905, + "step": 6050 + }, + { + "epoch": 2.87375415282392, + "grad_norm": 0.2935399441228513, + "learning_rate": 3.5599868160843774e-05, + "loss": 0.4021, + "step": 6055 + }, + { + "epoch": 2.876127195064072, + "grad_norm": 0.29133923458908406, + "learning_rate": 3.5583388266315096e-05, + "loss": 0.4003, + "step": 6060 + }, + { + "epoch": 2.878500237304224, + "grad_norm": 0.33286923718735917, + "learning_rate": 3.5566908371786425e-05, + "loss": 0.3914, + "step": 6065 + }, + { + "epoch": 2.880873279544376, + "grad_norm": 0.2979830995989918, + "learning_rate": 3.555042847725775e-05, + "loss": 0.4011, + "step": 6070 + }, + { + "epoch": 2.8832463217845277, + "grad_norm": 0.30512012228589647, + "learning_rate": 3.553394858272907e-05, + "loss": 0.3998, + "step": 6075 + }, + { + "epoch": 2.88561936402468, + "grad_norm": 0.3134207071743758, + "learning_rate": 3.55174686882004e-05, + "loss": 0.3922, + "step": 6080 + }, + { + "epoch": 2.8879924062648317, + "grad_norm": 0.27945432233633494, + "learning_rate": 3.5500988793671727e-05, + "loss": 0.3984, + "step": 6085 + }, + { + "epoch": 2.8903654485049834, + "grad_norm": 0.267653844954846, + "learning_rate": 3.548450889914305e-05, + "loss": 0.3897, + "step": 6090 + }, + { + "epoch": 2.8927384907451352, + "grad_norm": 0.30218320544854094, + "learning_rate": 3.546802900461437e-05, + "loss": 0.3944, + "step": 6095 + }, + { + "epoch": 2.895111532985287, + "grad_norm": 0.2754091741687095, + "learning_rate": 3.54515491100857e-05, + "loss": 0.3963, + "step": 6100 + }, + { + "epoch": 2.8974845752254392, + "grad_norm": 0.2856124032064029, + "learning_rate": 3.543506921555702e-05, + "loss": 0.4046, + "step": 6105 + }, + { + "epoch": 2.899857617465591, + "grad_norm": 0.287680276567761, + "learning_rate": 3.541858932102834e-05, + "loss": 0.3973, + "step": 6110 + }, + { + "epoch": 2.902230659705743, + "grad_norm": 0.25184493160031785, + "learning_rate": 3.540210942649967e-05, + "loss": 0.3996, + "step": 6115 + }, + { + "epoch": 2.9046037019458946, + "grad_norm": 0.28748804018079427, + "learning_rate": 3.5385629531971e-05, + "loss": 0.397, + "step": 6120 + }, + { + "epoch": 2.9069767441860463, + "grad_norm": 0.276433501778736, + "learning_rate": 3.536914963744232e-05, + "loss": 0.3986, + "step": 6125 + }, + { + "epoch": 2.9093497864261986, + "grad_norm": 0.3192642804829344, + "learning_rate": 3.5352669742913645e-05, + "loss": 0.3936, + "step": 6130 + }, + { + "epoch": 2.9117228286663503, + "grad_norm": 0.27632320119088044, + "learning_rate": 3.533618984838497e-05, + "loss": 0.4021, + "step": 6135 + }, + { + "epoch": 2.914095870906502, + "grad_norm": 0.2836417572567318, + "learning_rate": 3.5319709953856295e-05, + "loss": 0.3953, + "step": 6140 + }, + { + "epoch": 2.916468913146654, + "grad_norm": 0.2977682016568343, + "learning_rate": 3.5303230059327624e-05, + "loss": 0.401, + "step": 6145 + }, + { + "epoch": 2.9188419553868057, + "grad_norm": 0.34977219898365947, + "learning_rate": 3.5286750164798946e-05, + "loss": 0.3954, + "step": 6150 + }, + { + "epoch": 2.921214997626958, + "grad_norm": 0.32232865127900834, + "learning_rate": 3.5270270270270275e-05, + "loss": 0.3981, + "step": 6155 + }, + { + "epoch": 2.9235880398671097, + "grad_norm": 0.29977171469203534, + "learning_rate": 3.52537903757416e-05, + "loss": 0.3981, + "step": 6160 + }, + { + "epoch": 2.9259610821072615, + "grad_norm": 0.274540181260087, + "learning_rate": 3.523731048121292e-05, + "loss": 0.3926, + "step": 6165 + }, + { + "epoch": 2.9283341243474132, + "grad_norm": 0.2998512086435193, + "learning_rate": 3.522083058668425e-05, + "loss": 0.3934, + "step": 6170 + }, + { + "epoch": 2.930707166587565, + "grad_norm": 0.2869597177384944, + "learning_rate": 3.5204350692155576e-05, + "loss": 0.3891, + "step": 6175 + }, + { + "epoch": 2.9330802088277173, + "grad_norm": 0.26676207514436656, + "learning_rate": 3.51878707976269e-05, + "loss": 0.3826, + "step": 6180 + }, + { + "epoch": 2.935453251067869, + "grad_norm": 0.2987182356737427, + "learning_rate": 3.517139090309822e-05, + "loss": 0.3966, + "step": 6185 + }, + { + "epoch": 2.937826293308021, + "grad_norm": 0.2947686835857011, + "learning_rate": 3.515491100856955e-05, + "loss": 0.3982, + "step": 6190 + }, + { + "epoch": 2.940199335548173, + "grad_norm": 0.3048421062424294, + "learning_rate": 3.513843111404087e-05, + "loss": 0.4028, + "step": 6195 + }, + { + "epoch": 2.9425723777883244, + "grad_norm": 0.30992317852062135, + "learning_rate": 3.512195121951219e-05, + "loss": 0.3956, + "step": 6200 + }, + { + "epoch": 2.9449454200284766, + "grad_norm": 0.3330382964166558, + "learning_rate": 3.510547132498352e-05, + "loss": 0.3982, + "step": 6205 + }, + { + "epoch": 2.9473184622686284, + "grad_norm": 0.2934899206522683, + "learning_rate": 3.508899143045485e-05, + "loss": 0.3945, + "step": 6210 + }, + { + "epoch": 2.94969150450878, + "grad_norm": 0.3012463229414802, + "learning_rate": 3.507251153592617e-05, + "loss": 0.3976, + "step": 6215 + }, + { + "epoch": 2.9520645467489324, + "grad_norm": 0.2978206090165989, + "learning_rate": 3.5056031641397494e-05, + "loss": 0.3845, + "step": 6220 + }, + { + "epoch": 2.954437588989084, + "grad_norm": 0.28009580455694094, + "learning_rate": 3.503955174686882e-05, + "loss": 0.4061, + "step": 6225 + }, + { + "epoch": 2.956810631229236, + "grad_norm": 0.2866851179269444, + "learning_rate": 3.5023071852340145e-05, + "loss": 0.3982, + "step": 6230 + }, + { + "epoch": 2.9591836734693877, + "grad_norm": 0.35434699566076944, + "learning_rate": 3.5006591957811474e-05, + "loss": 0.3877, + "step": 6235 + }, + { + "epoch": 2.9615567157095395, + "grad_norm": 0.33390454486719406, + "learning_rate": 3.4990112063282796e-05, + "loss": 0.3947, + "step": 6240 + }, + { + "epoch": 2.9639297579496917, + "grad_norm": 0.3021380980458449, + "learning_rate": 3.4973632168754124e-05, + "loss": 0.4037, + "step": 6245 + }, + { + "epoch": 2.9663028001898435, + "grad_norm": 0.29302566508727823, + "learning_rate": 3.4957152274225446e-05, + "loss": 0.4028, + "step": 6250 + }, + { + "epoch": 2.9686758424299953, + "grad_norm": 0.26505401858693045, + "learning_rate": 3.494067237969677e-05, + "loss": 0.3945, + "step": 6255 + }, + { + "epoch": 2.971048884670147, + "grad_norm": 0.28241093057504135, + "learning_rate": 3.49241924851681e-05, + "loss": 0.3963, + "step": 6260 + }, + { + "epoch": 2.973421926910299, + "grad_norm": 0.2684801550204648, + "learning_rate": 3.4907712590639426e-05, + "loss": 0.3946, + "step": 6265 + }, + { + "epoch": 2.975794969150451, + "grad_norm": 0.28424140468802495, + "learning_rate": 3.489123269611075e-05, + "loss": 0.3883, + "step": 6270 + }, + { + "epoch": 2.978168011390603, + "grad_norm": 0.3216735552026381, + "learning_rate": 3.487475280158207e-05, + "loss": 0.4008, + "step": 6275 + }, + { + "epoch": 2.9805410536307546, + "grad_norm": 0.2840337828281333, + "learning_rate": 3.48582729070534e-05, + "loss": 0.3964, + "step": 6280 + }, + { + "epoch": 2.9829140958709064, + "grad_norm": 0.2856187987758529, + "learning_rate": 3.484179301252472e-05, + "loss": 0.4071, + "step": 6285 + }, + { + "epoch": 2.985287138111058, + "grad_norm": 0.34911139172490885, + "learning_rate": 3.482531311799604e-05, + "loss": 0.3977, + "step": 6290 + }, + { + "epoch": 2.9876601803512104, + "grad_norm": 0.3029562646129297, + "learning_rate": 3.480883322346737e-05, + "loss": 0.4033, + "step": 6295 + }, + { + "epoch": 2.990033222591362, + "grad_norm": 0.3230508749643729, + "learning_rate": 3.47923533289387e-05, + "loss": 0.3871, + "step": 6300 + }, + { + "epoch": 2.992406264831514, + "grad_norm": 0.29060938916490675, + "learning_rate": 3.477587343441002e-05, + "loss": 0.3863, + "step": 6305 + }, + { + "epoch": 2.9947793070716657, + "grad_norm": 0.26090961173252225, + "learning_rate": 3.4759393539881344e-05, + "loss": 0.3889, + "step": 6310 + }, + { + "epoch": 2.9971523493118175, + "grad_norm": 0.2698428503697563, + "learning_rate": 3.474291364535267e-05, + "loss": 0.3934, + "step": 6315 + }, + { + "epoch": 2.9995253915519697, + "grad_norm": 0.27004877866446114, + "learning_rate": 3.4726433750823994e-05, + "loss": 0.3958, + "step": 6320 + }, + { + "epoch": 3.0018984337921215, + "grad_norm": 0.30216421253969433, + "learning_rate": 3.470995385629532e-05, + "loss": 0.3653, + "step": 6325 + }, + { + "epoch": 3.0042714760322733, + "grad_norm": 0.28988680208351986, + "learning_rate": 3.4693473961766645e-05, + "loss": 0.363, + "step": 6330 + }, + { + "epoch": 3.006644518272425, + "grad_norm": 0.3109792745623253, + "learning_rate": 3.4676994067237974e-05, + "loss": 0.3634, + "step": 6335 + }, + { + "epoch": 3.0090175605125773, + "grad_norm": 0.2820990497321511, + "learning_rate": 3.4660514172709296e-05, + "loss": 0.3538, + "step": 6340 + }, + { + "epoch": 3.011390602752729, + "grad_norm": 0.2976540838861252, + "learning_rate": 3.464403427818062e-05, + "loss": 0.3633, + "step": 6345 + }, + { + "epoch": 3.013763644992881, + "grad_norm": 0.26868779546592453, + "learning_rate": 3.4627554383651947e-05, + "loss": 0.3641, + "step": 6350 + }, + { + "epoch": 3.0161366872330326, + "grad_norm": 0.27264957236885934, + "learning_rate": 3.461107448912327e-05, + "loss": 0.356, + "step": 6355 + }, + { + "epoch": 3.0185097294731844, + "grad_norm": 0.28380426638672024, + "learning_rate": 3.45945945945946e-05, + "loss": 0.3617, + "step": 6360 + }, + { + "epoch": 3.0208827717133366, + "grad_norm": 0.29203948503020705, + "learning_rate": 3.457811470006592e-05, + "loss": 0.3578, + "step": 6365 + }, + { + "epoch": 3.0232558139534884, + "grad_norm": 0.274563328125401, + "learning_rate": 3.456163480553725e-05, + "loss": 0.3594, + "step": 6370 + }, + { + "epoch": 3.02562885619364, + "grad_norm": 0.2807154219407433, + "learning_rate": 3.454515491100857e-05, + "loss": 0.3605, + "step": 6375 + }, + { + "epoch": 3.028001898433792, + "grad_norm": 0.301205524355929, + "learning_rate": 3.452867501647989e-05, + "loss": 0.3563, + "step": 6380 + }, + { + "epoch": 3.030374940673944, + "grad_norm": 0.28174163034647826, + "learning_rate": 3.451219512195122e-05, + "loss": 0.359, + "step": 6385 + }, + { + "epoch": 3.032747982914096, + "grad_norm": 0.28634024730061064, + "learning_rate": 3.449571522742255e-05, + "loss": 0.3575, + "step": 6390 + }, + { + "epoch": 3.0351210251542478, + "grad_norm": 0.3020214063224101, + "learning_rate": 3.447923533289387e-05, + "loss": 0.3659, + "step": 6395 + }, + { + "epoch": 3.0374940673943995, + "grad_norm": 0.2788544012670729, + "learning_rate": 3.446275543836519e-05, + "loss": 0.3597, + "step": 6400 + }, + { + "epoch": 3.0398671096345513, + "grad_norm": 0.26561236104971037, + "learning_rate": 3.444627554383652e-05, + "loss": 0.3624, + "step": 6405 + }, + { + "epoch": 3.0422401518747035, + "grad_norm": 0.27158043791232617, + "learning_rate": 3.4429795649307844e-05, + "loss": 0.3666, + "step": 6410 + }, + { + "epoch": 3.0446131941148553, + "grad_norm": 0.28861871573593967, + "learning_rate": 3.4413315754779166e-05, + "loss": 0.3474, + "step": 6415 + }, + { + "epoch": 3.046986236355007, + "grad_norm": 0.2848167906807079, + "learning_rate": 3.4396835860250495e-05, + "loss": 0.3569, + "step": 6420 + }, + { + "epoch": 3.049359278595159, + "grad_norm": 0.26876388401377815, + "learning_rate": 3.4380355965721823e-05, + "loss": 0.3574, + "step": 6425 + }, + { + "epoch": 3.0517323208353107, + "grad_norm": 0.25485539527593415, + "learning_rate": 3.4363876071193145e-05, + "loss": 0.3604, + "step": 6430 + }, + { + "epoch": 3.054105363075463, + "grad_norm": 0.3291900618553336, + "learning_rate": 3.434739617666447e-05, + "loss": 0.3635, + "step": 6435 + }, + { + "epoch": 3.0564784053156147, + "grad_norm": 0.31317808207362247, + "learning_rate": 3.4330916282135796e-05, + "loss": 0.3557, + "step": 6440 + }, + { + "epoch": 3.0588514475557664, + "grad_norm": 0.297858274471182, + "learning_rate": 3.431443638760712e-05, + "loss": 0.3638, + "step": 6445 + }, + { + "epoch": 3.061224489795918, + "grad_norm": 0.2773893427861958, + "learning_rate": 3.429795649307845e-05, + "loss": 0.3732, + "step": 6450 + }, + { + "epoch": 3.0635975320360704, + "grad_norm": 0.2702693168459655, + "learning_rate": 3.428147659854977e-05, + "loss": 0.3632, + "step": 6455 + }, + { + "epoch": 3.065970574276222, + "grad_norm": 0.2687732032077547, + "learning_rate": 3.42649967040211e-05, + "loss": 0.3575, + "step": 6460 + }, + { + "epoch": 3.068343616516374, + "grad_norm": 0.2663470999002388, + "learning_rate": 3.424851680949242e-05, + "loss": 0.3698, + "step": 6465 + }, + { + "epoch": 3.0707166587565258, + "grad_norm": 0.33880537146255146, + "learning_rate": 3.423203691496374e-05, + "loss": 0.3562, + "step": 6470 + }, + { + "epoch": 3.0730897009966776, + "grad_norm": 0.30600814766202367, + "learning_rate": 3.421555702043507e-05, + "loss": 0.3574, + "step": 6475 + }, + { + "epoch": 3.0754627432368298, + "grad_norm": 0.312783178663751, + "learning_rate": 3.41990771259064e-05, + "loss": 0.3765, + "step": 6480 + }, + { + "epoch": 3.0778357854769816, + "grad_norm": 0.2836938037237224, + "learning_rate": 3.418259723137772e-05, + "loss": 0.3556, + "step": 6485 + }, + { + "epoch": 3.0802088277171333, + "grad_norm": 0.3189242861546878, + "learning_rate": 3.416611733684904e-05, + "loss": 0.3502, + "step": 6490 + }, + { + "epoch": 3.082581869957285, + "grad_norm": 0.33860101770758383, + "learning_rate": 3.414963744232037e-05, + "loss": 0.3546, + "step": 6495 + }, + { + "epoch": 3.0849549121974373, + "grad_norm": 0.32660230074912155, + "learning_rate": 3.4133157547791694e-05, + "loss": 0.3633, + "step": 6500 + }, + { + "epoch": 3.087327954437589, + "grad_norm": 0.2730932078134112, + "learning_rate": 3.4116677653263016e-05, + "loss": 0.3602, + "step": 6505 + }, + { + "epoch": 3.089700996677741, + "grad_norm": 0.2722750916885425, + "learning_rate": 3.410019775873435e-05, + "loss": 0.3722, + "step": 6510 + }, + { + "epoch": 3.0920740389178927, + "grad_norm": 0.34243534054027763, + "learning_rate": 3.408371786420567e-05, + "loss": 0.3749, + "step": 6515 + }, + { + "epoch": 3.0944470811580445, + "grad_norm": 0.2836619282400383, + "learning_rate": 3.4067237969676995e-05, + "loss": 0.3667, + "step": 6520 + }, + { + "epoch": 3.0968201233981967, + "grad_norm": 0.28401267590229456, + "learning_rate": 3.405075807514832e-05, + "loss": 0.3637, + "step": 6525 + }, + { + "epoch": 3.0991931656383485, + "grad_norm": 0.2682547479261168, + "learning_rate": 3.4034278180619646e-05, + "loss": 0.3605, + "step": 6530 + }, + { + "epoch": 3.1015662078785002, + "grad_norm": 0.2738366307788057, + "learning_rate": 3.401779828609097e-05, + "loss": 0.372, + "step": 6535 + }, + { + "epoch": 3.103939250118652, + "grad_norm": 0.2761193300764729, + "learning_rate": 3.4001318391562296e-05, + "loss": 0.3575, + "step": 6540 + }, + { + "epoch": 3.106312292358804, + "grad_norm": 0.3074657677386684, + "learning_rate": 3.3984838497033625e-05, + "loss": 0.3539, + "step": 6545 + }, + { + "epoch": 3.108685334598956, + "grad_norm": 0.2806037172455145, + "learning_rate": 3.396835860250495e-05, + "loss": 0.3701, + "step": 6550 + }, + { + "epoch": 3.111058376839108, + "grad_norm": 0.2797372408312975, + "learning_rate": 3.395187870797627e-05, + "loss": 0.3639, + "step": 6555 + }, + { + "epoch": 3.1134314190792596, + "grad_norm": 0.2630475165999282, + "learning_rate": 3.393539881344759e-05, + "loss": 0.3541, + "step": 6560 + }, + { + "epoch": 3.1158044613194114, + "grad_norm": 0.29891273025965354, + "learning_rate": 3.391891891891892e-05, + "loss": 0.3578, + "step": 6565 + }, + { + "epoch": 3.118177503559563, + "grad_norm": 0.28516654816762466, + "learning_rate": 3.390243902439025e-05, + "loss": 0.3636, + "step": 6570 + }, + { + "epoch": 3.1205505457997154, + "grad_norm": 0.31215512396372347, + "learning_rate": 3.388595912986157e-05, + "loss": 0.3633, + "step": 6575 + }, + { + "epoch": 3.122923588039867, + "grad_norm": 0.28277333768132995, + "learning_rate": 3.38694792353329e-05, + "loss": 0.3572, + "step": 6580 + }, + { + "epoch": 3.125296630280019, + "grad_norm": 0.2940986474925492, + "learning_rate": 3.385299934080422e-05, + "loss": 0.361, + "step": 6585 + }, + { + "epoch": 3.1276696725201707, + "grad_norm": 0.24905334474864152, + "learning_rate": 3.383651944627554e-05, + "loss": 0.3621, + "step": 6590 + }, + { + "epoch": 3.130042714760323, + "grad_norm": 0.27287873691020853, + "learning_rate": 3.3820039551746865e-05, + "loss": 0.3542, + "step": 6595 + }, + { + "epoch": 3.1324157570004747, + "grad_norm": 0.2742028206137696, + "learning_rate": 3.38035596572182e-05, + "loss": 0.3659, + "step": 6600 + }, + { + "epoch": 3.1347887992406265, + "grad_norm": 0.2770428044707851, + "learning_rate": 3.378707976268952e-05, + "loss": 0.3621, + "step": 6605 + }, + { + "epoch": 3.1371618414807783, + "grad_norm": 0.2907067477276018, + "learning_rate": 3.3770599868160845e-05, + "loss": 0.3532, + "step": 6610 + }, + { + "epoch": 3.13953488372093, + "grad_norm": 0.27522076292533143, + "learning_rate": 3.375411997363217e-05, + "loss": 0.3603, + "step": 6615 + }, + { + "epoch": 3.1419079259610823, + "grad_norm": 0.2800131602192635, + "learning_rate": 3.3737640079103495e-05, + "loss": 0.3611, + "step": 6620 + }, + { + "epoch": 3.144280968201234, + "grad_norm": 0.2621097453650788, + "learning_rate": 3.372116018457482e-05, + "loss": 0.3543, + "step": 6625 + }, + { + "epoch": 3.146654010441386, + "grad_norm": 0.2886814973350028, + "learning_rate": 3.3704680290046146e-05, + "loss": 0.3657, + "step": 6630 + }, + { + "epoch": 3.1490270526815376, + "grad_norm": 0.293195874942286, + "learning_rate": 3.3688200395517475e-05, + "loss": 0.3611, + "step": 6635 + }, + { + "epoch": 3.15140009492169, + "grad_norm": 0.2899275112360174, + "learning_rate": 3.36717205009888e-05, + "loss": 0.3577, + "step": 6640 + }, + { + "epoch": 3.1537731371618416, + "grad_norm": 0.31714426391095113, + "learning_rate": 3.365524060646012e-05, + "loss": 0.3639, + "step": 6645 + }, + { + "epoch": 3.1561461794019934, + "grad_norm": 0.286774001648739, + "learning_rate": 3.363876071193145e-05, + "loss": 0.3577, + "step": 6650 + }, + { + "epoch": 3.158519221642145, + "grad_norm": 0.2872770137407236, + "learning_rate": 3.362228081740277e-05, + "loss": 0.3639, + "step": 6655 + }, + { + "epoch": 3.160892263882297, + "grad_norm": 0.32265182208425974, + "learning_rate": 3.36058009228741e-05, + "loss": 0.3617, + "step": 6660 + }, + { + "epoch": 3.163265306122449, + "grad_norm": 0.2934072794392843, + "learning_rate": 3.358932102834542e-05, + "loss": 0.3648, + "step": 6665 + }, + { + "epoch": 3.165638348362601, + "grad_norm": 0.2800226042486211, + "learning_rate": 3.357284113381675e-05, + "loss": 0.357, + "step": 6670 + }, + { + "epoch": 3.1680113906027527, + "grad_norm": 0.3019711736283545, + "learning_rate": 3.355636123928807e-05, + "loss": 0.3676, + "step": 6675 + }, + { + "epoch": 3.1703844328429045, + "grad_norm": 0.2992423592228816, + "learning_rate": 3.353988134475939e-05, + "loss": 0.368, + "step": 6680 + }, + { + "epoch": 3.1727574750830563, + "grad_norm": 0.2886431545445516, + "learning_rate": 3.3523401450230715e-05, + "loss": 0.3579, + "step": 6685 + }, + { + "epoch": 3.1751305173232085, + "grad_norm": 0.33748264071370193, + "learning_rate": 3.350692155570205e-05, + "loss": 0.3594, + "step": 6690 + }, + { + "epoch": 3.1775035595633603, + "grad_norm": 0.26754960641662284, + "learning_rate": 3.349044166117337e-05, + "loss": 0.3653, + "step": 6695 + }, + { + "epoch": 3.179876601803512, + "grad_norm": 0.2926809674191939, + "learning_rate": 3.3473961766644694e-05, + "loss": 0.3633, + "step": 6700 + }, + { + "epoch": 3.182249644043664, + "grad_norm": 0.28431887572530185, + "learning_rate": 3.345748187211602e-05, + "loss": 0.3617, + "step": 6705 + }, + { + "epoch": 3.1846226862838156, + "grad_norm": 0.2713601826992011, + "learning_rate": 3.3441001977587345e-05, + "loss": 0.3687, + "step": 6710 + }, + { + "epoch": 3.186995728523968, + "grad_norm": 0.26221383864920755, + "learning_rate": 3.342452208305867e-05, + "loss": 0.3627, + "step": 6715 + }, + { + "epoch": 3.1893687707641196, + "grad_norm": 0.26626980704661585, + "learning_rate": 3.3408042188529996e-05, + "loss": 0.3636, + "step": 6720 + }, + { + "epoch": 3.1917418130042714, + "grad_norm": 0.25959853064930355, + "learning_rate": 3.3391562294001324e-05, + "loss": 0.3674, + "step": 6725 + }, + { + "epoch": 3.194114855244423, + "grad_norm": 0.2695756746732414, + "learning_rate": 3.3375082399472646e-05, + "loss": 0.3586, + "step": 6730 + }, + { + "epoch": 3.1964878974845754, + "grad_norm": 0.2763840096411635, + "learning_rate": 3.335860250494397e-05, + "loss": 0.3512, + "step": 6735 + }, + { + "epoch": 3.198860939724727, + "grad_norm": 0.26731395047321244, + "learning_rate": 3.33421226104153e-05, + "loss": 0.3556, + "step": 6740 + }, + { + "epoch": 3.201233981964879, + "grad_norm": 0.39776160866832627, + "learning_rate": 3.332564271588662e-05, + "loss": 0.3602, + "step": 6745 + }, + { + "epoch": 3.2036070242050307, + "grad_norm": 0.30685957654409907, + "learning_rate": 3.330916282135795e-05, + "loss": 0.3526, + "step": 6750 + }, + { + "epoch": 3.2059800664451825, + "grad_norm": 0.2994030264993999, + "learning_rate": 3.329268292682927e-05, + "loss": 0.3576, + "step": 6755 + }, + { + "epoch": 3.2083531086853347, + "grad_norm": 0.3074777697651586, + "learning_rate": 3.32762030323006e-05, + "loss": 0.3624, + "step": 6760 + }, + { + "epoch": 3.2107261509254865, + "grad_norm": 0.28774553052042035, + "learning_rate": 3.325972313777192e-05, + "loss": 0.3716, + "step": 6765 + }, + { + "epoch": 3.2130991931656383, + "grad_norm": 0.27434735893864004, + "learning_rate": 3.324324324324324e-05, + "loss": 0.3662, + "step": 6770 + }, + { + "epoch": 3.21547223540579, + "grad_norm": 0.27005587801896996, + "learning_rate": 3.322676334871457e-05, + "loss": 0.3568, + "step": 6775 + }, + { + "epoch": 3.2178452776459423, + "grad_norm": 0.3237517954256803, + "learning_rate": 3.321028345418589e-05, + "loss": 0.3566, + "step": 6780 + }, + { + "epoch": 3.220218319886094, + "grad_norm": 0.3390601567254222, + "learning_rate": 3.319380355965722e-05, + "loss": 0.374, + "step": 6785 + }, + { + "epoch": 3.222591362126246, + "grad_norm": 0.3173132209287459, + "learning_rate": 3.3177323665128544e-05, + "loss": 0.3611, + "step": 6790 + }, + { + "epoch": 3.2249644043663976, + "grad_norm": 0.298128895897468, + "learning_rate": 3.316084377059987e-05, + "loss": 0.3683, + "step": 6795 + }, + { + "epoch": 3.2273374466065494, + "grad_norm": 0.2940188060936306, + "learning_rate": 3.3144363876071194e-05, + "loss": 0.3595, + "step": 6800 + }, + { + "epoch": 3.2297104888467016, + "grad_norm": 0.2742824461200737, + "learning_rate": 3.3127883981542516e-05, + "loss": 0.3628, + "step": 6805 + }, + { + "epoch": 3.2320835310868534, + "grad_norm": 0.266488622519178, + "learning_rate": 3.3111404087013845e-05, + "loss": 0.3689, + "step": 6810 + }, + { + "epoch": 3.234456573327005, + "grad_norm": 0.26691915522720683, + "learning_rate": 3.3094924192485174e-05, + "loss": 0.364, + "step": 6815 + }, + { + "epoch": 3.236829615567157, + "grad_norm": 0.3034517590112466, + "learning_rate": 3.3078444297956496e-05, + "loss": 0.3582, + "step": 6820 + }, + { + "epoch": 3.2392026578073088, + "grad_norm": 0.339216191229273, + "learning_rate": 3.306196440342782e-05, + "loss": 0.3622, + "step": 6825 + }, + { + "epoch": 3.241575700047461, + "grad_norm": 0.2736348914714302, + "learning_rate": 3.3045484508899147e-05, + "loss": 0.3703, + "step": 6830 + }, + { + "epoch": 3.2439487422876128, + "grad_norm": 0.2643740929518335, + "learning_rate": 3.302900461437047e-05, + "loss": 0.3609, + "step": 6835 + }, + { + "epoch": 3.2463217845277645, + "grad_norm": 0.2817638868186219, + "learning_rate": 3.301252471984179e-05, + "loss": 0.3589, + "step": 6840 + }, + { + "epoch": 3.2486948267679163, + "grad_norm": 0.2784394816473238, + "learning_rate": 3.299604482531312e-05, + "loss": 0.3691, + "step": 6845 + }, + { + "epoch": 3.251067869008068, + "grad_norm": 0.2947705542730554, + "learning_rate": 3.297956493078445e-05, + "loss": 0.3589, + "step": 6850 + }, + { + "epoch": 3.2534409112482203, + "grad_norm": 0.25707021392545853, + "learning_rate": 3.296308503625577e-05, + "loss": 0.3632, + "step": 6855 + }, + { + "epoch": 3.255813953488372, + "grad_norm": 0.28330043795223303, + "learning_rate": 3.294660514172709e-05, + "loss": 0.3625, + "step": 6860 + }, + { + "epoch": 3.258186995728524, + "grad_norm": 0.2813424055552145, + "learning_rate": 3.293012524719842e-05, + "loss": 0.3611, + "step": 6865 + }, + { + "epoch": 3.2605600379686757, + "grad_norm": 0.27134564479394613, + "learning_rate": 3.291364535266974e-05, + "loss": 0.3522, + "step": 6870 + }, + { + "epoch": 3.262933080208828, + "grad_norm": 0.34341844435318014, + "learning_rate": 3.289716545814107e-05, + "loss": 0.3696, + "step": 6875 + }, + { + "epoch": 3.2653061224489797, + "grad_norm": 0.2821810752120773, + "learning_rate": 3.288068556361239e-05, + "loss": 0.3676, + "step": 6880 + }, + { + "epoch": 3.2676791646891314, + "grad_norm": 0.26009918701617035, + "learning_rate": 3.286420566908372e-05, + "loss": 0.3519, + "step": 6885 + }, + { + "epoch": 3.2700522069292832, + "grad_norm": 0.30186918133186735, + "learning_rate": 3.2847725774555044e-05, + "loss": 0.3574, + "step": 6890 + }, + { + "epoch": 3.2724252491694354, + "grad_norm": 0.25853989750469675, + "learning_rate": 3.2831245880026366e-05, + "loss": 0.3644, + "step": 6895 + }, + { + "epoch": 3.2747982914095872, + "grad_norm": 0.2687601203868982, + "learning_rate": 3.2814765985497695e-05, + "loss": 0.359, + "step": 6900 + }, + { + "epoch": 3.277171333649739, + "grad_norm": 0.2677934875149181, + "learning_rate": 3.2798286090969024e-05, + "loss": 0.3565, + "step": 6905 + }, + { + "epoch": 3.279544375889891, + "grad_norm": 0.27675352554409133, + "learning_rate": 3.2781806196440345e-05, + "loss": 0.3697, + "step": 6910 + }, + { + "epoch": 3.2819174181300426, + "grad_norm": 0.3097336855506349, + "learning_rate": 3.276532630191167e-05, + "loss": 0.3669, + "step": 6915 + }, + { + "epoch": 3.284290460370195, + "grad_norm": 0.2660356295299164, + "learning_rate": 3.2748846407382996e-05, + "loss": 0.3661, + "step": 6920 + }, + { + "epoch": 3.2866635026103466, + "grad_norm": 0.2842802215184955, + "learning_rate": 3.273236651285432e-05, + "loss": 0.3666, + "step": 6925 + }, + { + "epoch": 3.2890365448504983, + "grad_norm": 0.2719154671406519, + "learning_rate": 3.271588661832564e-05, + "loss": 0.369, + "step": 6930 + }, + { + "epoch": 3.29140958709065, + "grad_norm": 0.25550659280748084, + "learning_rate": 3.269940672379697e-05, + "loss": 0.3561, + "step": 6935 + }, + { + "epoch": 3.293782629330802, + "grad_norm": 0.2739894836136779, + "learning_rate": 3.26829268292683e-05, + "loss": 0.3644, + "step": 6940 + }, + { + "epoch": 3.296155671570954, + "grad_norm": 0.27142126182632165, + "learning_rate": 3.266644693473962e-05, + "loss": 0.3643, + "step": 6945 + }, + { + "epoch": 3.298528713811106, + "grad_norm": 0.27567706938037, + "learning_rate": 3.264996704021094e-05, + "loss": 0.3703, + "step": 6950 + }, + { + "epoch": 3.3009017560512577, + "grad_norm": 0.3112773946003835, + "learning_rate": 3.263348714568227e-05, + "loss": 0.3635, + "step": 6955 + }, + { + "epoch": 3.3032747982914095, + "grad_norm": 0.29861061629777985, + "learning_rate": 3.261700725115359e-05, + "loss": 0.3603, + "step": 6960 + }, + { + "epoch": 3.3056478405315612, + "grad_norm": 0.28731859327833215, + "learning_rate": 3.260052735662492e-05, + "loss": 0.3658, + "step": 6965 + }, + { + "epoch": 3.3080208827717135, + "grad_norm": 0.2845208163780069, + "learning_rate": 3.258404746209624e-05, + "loss": 0.3631, + "step": 6970 + }, + { + "epoch": 3.3103939250118652, + "grad_norm": 0.27399150893077606, + "learning_rate": 3.256756756756757e-05, + "loss": 0.3599, + "step": 6975 + }, + { + "epoch": 3.312766967252017, + "grad_norm": 0.28137909238339426, + "learning_rate": 3.2551087673038894e-05, + "loss": 0.3718, + "step": 6980 + }, + { + "epoch": 3.315140009492169, + "grad_norm": 0.2832832419657157, + "learning_rate": 3.2534607778510216e-05, + "loss": 0.3654, + "step": 6985 + }, + { + "epoch": 3.317513051732321, + "grad_norm": 0.27887777548141773, + "learning_rate": 3.2518127883981544e-05, + "loss": 0.3722, + "step": 6990 + }, + { + "epoch": 3.319886093972473, + "grad_norm": 0.29038059970916513, + "learning_rate": 3.250164798945287e-05, + "loss": 0.368, + "step": 6995 + }, + { + "epoch": 3.3222591362126246, + "grad_norm": 0.25198949509478985, + "learning_rate": 3.2485168094924195e-05, + "loss": 0.3535, + "step": 7000 + }, + { + "epoch": 3.3246321784527764, + "grad_norm": 0.25800409785835404, + "learning_rate": 3.246868820039552e-05, + "loss": 0.3639, + "step": 7005 + }, + { + "epoch": 3.327005220692928, + "grad_norm": 0.26420588112821014, + "learning_rate": 3.2452208305866846e-05, + "loss": 0.3751, + "step": 7010 + }, + { + "epoch": 3.3293782629330804, + "grad_norm": 0.27922667461450074, + "learning_rate": 3.243572841133817e-05, + "loss": 0.3732, + "step": 7015 + }, + { + "epoch": 3.331751305173232, + "grad_norm": 0.3124869318893854, + "learning_rate": 3.241924851680949e-05, + "loss": 0.3577, + "step": 7020 + }, + { + "epoch": 3.334124347413384, + "grad_norm": 0.3089921363850617, + "learning_rate": 3.240276862228082e-05, + "loss": 0.3657, + "step": 7025 + }, + { + "epoch": 3.3364973896535357, + "grad_norm": 0.30430066306477105, + "learning_rate": 3.238628872775215e-05, + "loss": 0.3578, + "step": 7030 + }, + { + "epoch": 3.338870431893688, + "grad_norm": 0.2937169527174515, + "learning_rate": 3.236980883322347e-05, + "loss": 0.3645, + "step": 7035 + }, + { + "epoch": 3.3412434741338397, + "grad_norm": 0.2776296242996785, + "learning_rate": 3.235332893869479e-05, + "loss": 0.364, + "step": 7040 + }, + { + "epoch": 3.3436165163739915, + "grad_norm": 0.299894145597139, + "learning_rate": 3.233684904416612e-05, + "loss": 0.3659, + "step": 7045 + }, + { + "epoch": 3.3459895586141433, + "grad_norm": 0.27925307342378547, + "learning_rate": 3.232036914963744e-05, + "loss": 0.3638, + "step": 7050 + }, + { + "epoch": 3.348362600854295, + "grad_norm": 0.27976380434358644, + "learning_rate": 3.230388925510877e-05, + "loss": 0.3708, + "step": 7055 + }, + { + "epoch": 3.3507356430944473, + "grad_norm": 0.26071594553390914, + "learning_rate": 3.228740936058009e-05, + "loss": 0.3645, + "step": 7060 + }, + { + "epoch": 3.353108685334599, + "grad_norm": 0.25661221118160565, + "learning_rate": 3.227092946605142e-05, + "loss": 0.362, + "step": 7065 + }, + { + "epoch": 3.355481727574751, + "grad_norm": 0.24889895751563068, + "learning_rate": 3.225444957152274e-05, + "loss": 0.3649, + "step": 7070 + }, + { + "epoch": 3.3578547698149026, + "grad_norm": 0.2898667371669415, + "learning_rate": 3.2237969676994065e-05, + "loss": 0.3706, + "step": 7075 + }, + { + "epoch": 3.3602278120550544, + "grad_norm": 0.3012944689572224, + "learning_rate": 3.2221489782465394e-05, + "loss": 0.3667, + "step": 7080 + }, + { + "epoch": 3.3626008542952066, + "grad_norm": 0.2741483488431299, + "learning_rate": 3.220500988793672e-05, + "loss": 0.3709, + "step": 7085 + }, + { + "epoch": 3.3649738965353584, + "grad_norm": 0.26249249387095375, + "learning_rate": 3.2188529993408045e-05, + "loss": 0.3563, + "step": 7090 + }, + { + "epoch": 3.36734693877551, + "grad_norm": 0.2594469810601939, + "learning_rate": 3.2172050098879367e-05, + "loss": 0.3562, + "step": 7095 + }, + { + "epoch": 3.369719981015662, + "grad_norm": 0.25635318088899306, + "learning_rate": 3.2155570204350695e-05, + "loss": 0.3667, + "step": 7100 + }, + { + "epoch": 3.3720930232558137, + "grad_norm": 0.26553253752814504, + "learning_rate": 3.213909030982202e-05, + "loss": 0.3644, + "step": 7105 + }, + { + "epoch": 3.374466065495966, + "grad_norm": 0.26046365984538555, + "learning_rate": 3.212261041529334e-05, + "loss": 0.3706, + "step": 7110 + }, + { + "epoch": 3.3768391077361177, + "grad_norm": 0.29074442030011394, + "learning_rate": 3.210613052076467e-05, + "loss": 0.3734, + "step": 7115 + }, + { + "epoch": 3.3792121499762695, + "grad_norm": 0.2611941345446546, + "learning_rate": 3.2089650626236e-05, + "loss": 0.3676, + "step": 7120 + }, + { + "epoch": 3.3815851922164213, + "grad_norm": 0.254619193032026, + "learning_rate": 3.207317073170732e-05, + "loss": 0.3596, + "step": 7125 + }, + { + "epoch": 3.3839582344565735, + "grad_norm": 0.2707634549630488, + "learning_rate": 3.205669083717864e-05, + "loss": 0.3693, + "step": 7130 + }, + { + "epoch": 3.3863312766967253, + "grad_norm": 0.28477542508457143, + "learning_rate": 3.204021094264997e-05, + "loss": 0.3623, + "step": 7135 + }, + { + "epoch": 3.388704318936877, + "grad_norm": 0.28276465979158205, + "learning_rate": 3.202373104812129e-05, + "loss": 0.3759, + "step": 7140 + }, + { + "epoch": 3.391077361177029, + "grad_norm": 0.2745797469527534, + "learning_rate": 3.200725115359262e-05, + "loss": 0.3699, + "step": 7145 + }, + { + "epoch": 3.393450403417181, + "grad_norm": 0.32208083738162263, + "learning_rate": 3.199077125906394e-05, + "loss": 0.3656, + "step": 7150 + }, + { + "epoch": 3.395823445657333, + "grad_norm": 0.2604189216489594, + "learning_rate": 3.197429136453527e-05, + "loss": 0.3674, + "step": 7155 + }, + { + "epoch": 3.3981964878974846, + "grad_norm": 0.28043466581504783, + "learning_rate": 3.195781147000659e-05, + "loss": 0.3642, + "step": 7160 + }, + { + "epoch": 3.4005695301376364, + "grad_norm": 0.2692001112067546, + "learning_rate": 3.1941331575477915e-05, + "loss": 0.3654, + "step": 7165 + }, + { + "epoch": 3.402942572377788, + "grad_norm": 0.2743395856823029, + "learning_rate": 3.1924851680949244e-05, + "loss": 0.3689, + "step": 7170 + }, + { + "epoch": 3.4053156146179404, + "grad_norm": 0.2885447310873954, + "learning_rate": 3.190837178642057e-05, + "loss": 0.3676, + "step": 7175 + }, + { + "epoch": 3.407688656858092, + "grad_norm": 0.3162408720299851, + "learning_rate": 3.1891891891891894e-05, + "loss": 0.3631, + "step": 7180 + }, + { + "epoch": 3.410061699098244, + "grad_norm": 0.32064238777346477, + "learning_rate": 3.1875411997363216e-05, + "loss": 0.3681, + "step": 7185 + }, + { + "epoch": 3.4124347413383957, + "grad_norm": 0.3017883068624383, + "learning_rate": 3.1858932102834545e-05, + "loss": 0.3778, + "step": 7190 + }, + { + "epoch": 3.4148077835785475, + "grad_norm": 0.27186036686588844, + "learning_rate": 3.184245220830587e-05, + "loss": 0.3533, + "step": 7195 + }, + { + "epoch": 3.4171808258186998, + "grad_norm": 0.2741272484135891, + "learning_rate": 3.182597231377719e-05, + "loss": 0.3521, + "step": 7200 + }, + { + "epoch": 3.4195538680588515, + "grad_norm": 0.28124966011141633, + "learning_rate": 3.1809492419248524e-05, + "loss": 0.3705, + "step": 7205 + }, + { + "epoch": 3.4219269102990033, + "grad_norm": 0.30349495100970536, + "learning_rate": 3.1793012524719846e-05, + "loss": 0.3602, + "step": 7210 + }, + { + "epoch": 3.424299952539155, + "grad_norm": 0.29307638944976006, + "learning_rate": 3.177653263019117e-05, + "loss": 0.3645, + "step": 7215 + }, + { + "epoch": 3.426672994779307, + "grad_norm": 0.28064464487224317, + "learning_rate": 3.176005273566249e-05, + "loss": 0.3603, + "step": 7220 + }, + { + "epoch": 3.429046037019459, + "grad_norm": 0.353205741619622, + "learning_rate": 3.174357284113382e-05, + "loss": 0.3684, + "step": 7225 + }, + { + "epoch": 3.431419079259611, + "grad_norm": 0.31143173365552734, + "learning_rate": 3.172709294660514e-05, + "loss": 0.3651, + "step": 7230 + }, + { + "epoch": 3.4337921214997627, + "grad_norm": 0.345054132054979, + "learning_rate": 3.171061305207646e-05, + "loss": 0.3798, + "step": 7235 + }, + { + "epoch": 3.4361651637399144, + "grad_norm": 0.28297262087828506, + "learning_rate": 3.16941331575478e-05, + "loss": 0.3638, + "step": 7240 + }, + { + "epoch": 3.438538205980066, + "grad_norm": 0.31606412611390283, + "learning_rate": 3.167765326301912e-05, + "loss": 0.3752, + "step": 7245 + }, + { + "epoch": 3.4409112482202184, + "grad_norm": 0.3032590461833387, + "learning_rate": 3.166117336849044e-05, + "loss": 0.3641, + "step": 7250 + }, + { + "epoch": 3.44328429046037, + "grad_norm": 0.26573755382775577, + "learning_rate": 3.1644693473961764e-05, + "loss": 0.3663, + "step": 7255 + }, + { + "epoch": 3.445657332700522, + "grad_norm": 0.2879771272866368, + "learning_rate": 3.162821357943309e-05, + "loss": 0.3618, + "step": 7260 + }, + { + "epoch": 3.4480303749406738, + "grad_norm": 0.2868496129678002, + "learning_rate": 3.1611733684904415e-05, + "loss": 0.3665, + "step": 7265 + }, + { + "epoch": 3.450403417180826, + "grad_norm": 0.3065280666999036, + "learning_rate": 3.1595253790375744e-05, + "loss": 0.3775, + "step": 7270 + }, + { + "epoch": 3.4527764594209778, + "grad_norm": 0.2787524784839979, + "learning_rate": 3.1578773895847066e-05, + "loss": 0.3725, + "step": 7275 + }, + { + "epoch": 3.4551495016611296, + "grad_norm": 0.27732061546411524, + "learning_rate": 3.1562294001318395e-05, + "loss": 0.3693, + "step": 7280 + }, + { + "epoch": 3.4575225439012813, + "grad_norm": 0.3168892934342522, + "learning_rate": 3.1545814106789716e-05, + "loss": 0.3724, + "step": 7285 + }, + { + "epoch": 3.4598955861414336, + "grad_norm": 0.2965276242802056, + "learning_rate": 3.152933421226104e-05, + "loss": 0.3679, + "step": 7290 + }, + { + "epoch": 3.4622686283815853, + "grad_norm": 0.279717447997946, + "learning_rate": 3.151285431773237e-05, + "loss": 0.3688, + "step": 7295 + }, + { + "epoch": 3.464641670621737, + "grad_norm": 0.2844215395590286, + "learning_rate": 3.1496374423203696e-05, + "loss": 0.36, + "step": 7300 + }, + { + "epoch": 3.467014712861889, + "grad_norm": 0.270922256600989, + "learning_rate": 3.147989452867502e-05, + "loss": 0.3673, + "step": 7305 + }, + { + "epoch": 3.4693877551020407, + "grad_norm": 0.26828002177320154, + "learning_rate": 3.146341463414634e-05, + "loss": 0.3629, + "step": 7310 + }, + { + "epoch": 3.471760797342193, + "grad_norm": 0.2721639884301625, + "learning_rate": 3.144693473961767e-05, + "loss": 0.3632, + "step": 7315 + }, + { + "epoch": 3.4741338395823447, + "grad_norm": 0.32235000377205414, + "learning_rate": 3.143045484508899e-05, + "loss": 0.3596, + "step": 7320 + }, + { + "epoch": 3.4765068818224965, + "grad_norm": 0.28392767368134575, + "learning_rate": 3.141397495056031e-05, + "loss": 0.3772, + "step": 7325 + }, + { + "epoch": 3.4788799240626482, + "grad_norm": 0.27298215040087437, + "learning_rate": 3.139749505603165e-05, + "loss": 0.3571, + "step": 7330 + }, + { + "epoch": 3.4812529663028, + "grad_norm": 0.33538729384248783, + "learning_rate": 3.138101516150297e-05, + "loss": 0.3703, + "step": 7335 + }, + { + "epoch": 3.4836260085429522, + "grad_norm": 0.3155881526599587, + "learning_rate": 3.136453526697429e-05, + "loss": 0.371, + "step": 7340 + }, + { + "epoch": 3.485999050783104, + "grad_norm": 0.31954968356775143, + "learning_rate": 3.1348055372445614e-05, + "loss": 0.3692, + "step": 7345 + }, + { + "epoch": 3.488372093023256, + "grad_norm": 0.33166642434750127, + "learning_rate": 3.133157547791694e-05, + "loss": 0.3697, + "step": 7350 + }, + { + "epoch": 3.4907451352634076, + "grad_norm": 0.2869609005517025, + "learning_rate": 3.1315095583388265e-05, + "loss": 0.3697, + "step": 7355 + }, + { + "epoch": 3.4931181775035594, + "grad_norm": 0.2888893694134651, + "learning_rate": 3.1298615688859593e-05, + "loss": 0.3685, + "step": 7360 + }, + { + "epoch": 3.4954912197437116, + "grad_norm": 0.271105161596083, + "learning_rate": 3.128213579433092e-05, + "loss": 0.3665, + "step": 7365 + }, + { + "epoch": 3.4978642619838634, + "grad_norm": 0.27386780230794944, + "learning_rate": 3.1265655899802244e-05, + "loss": 0.3621, + "step": 7370 + }, + { + "epoch": 3.500237304224015, + "grad_norm": 0.281574475957684, + "learning_rate": 3.1249176005273566e-05, + "loss": 0.373, + "step": 7375 + }, + { + "epoch": 3.502610346464167, + "grad_norm": 0.2787905133016995, + "learning_rate": 3.123269611074489e-05, + "loss": 0.3616, + "step": 7380 + }, + { + "epoch": 3.5049833887043187, + "grad_norm": 0.29749812939539116, + "learning_rate": 3.121621621621622e-05, + "loss": 0.3734, + "step": 7385 + }, + { + "epoch": 3.507356430944471, + "grad_norm": 0.3220862251203847, + "learning_rate": 3.1199736321687546e-05, + "loss": 0.3658, + "step": 7390 + }, + { + "epoch": 3.5097294731846227, + "grad_norm": 0.27351586553568175, + "learning_rate": 3.118325642715887e-05, + "loss": 0.3587, + "step": 7395 + }, + { + "epoch": 3.5121025154247745, + "grad_norm": 0.28506905739450156, + "learning_rate": 3.1166776532630196e-05, + "loss": 0.3551, + "step": 7400 + }, + { + "epoch": 3.5144755576649267, + "grad_norm": 0.26212731625494445, + "learning_rate": 3.115029663810152e-05, + "loss": 0.3663, + "step": 7405 + }, + { + "epoch": 3.5168485999050785, + "grad_norm": 0.2867286546121769, + "learning_rate": 3.113381674357284e-05, + "loss": 0.3633, + "step": 7410 + }, + { + "epoch": 3.5192216421452303, + "grad_norm": 0.2846167043746519, + "learning_rate": 3.111733684904416e-05, + "loss": 0.3671, + "step": 7415 + }, + { + "epoch": 3.521594684385382, + "grad_norm": 0.30068597648916695, + "learning_rate": 3.11008569545155e-05, + "loss": 0.365, + "step": 7420 + }, + { + "epoch": 3.523967726625534, + "grad_norm": 0.301635728153225, + "learning_rate": 3.108437705998682e-05, + "loss": 0.3676, + "step": 7425 + }, + { + "epoch": 3.526340768865686, + "grad_norm": 0.321530413650849, + "learning_rate": 3.106789716545814e-05, + "loss": 0.3624, + "step": 7430 + }, + { + "epoch": 3.528713811105838, + "grad_norm": 0.28394923352066737, + "learning_rate": 3.105141727092947e-05, + "loss": 0.369, + "step": 7435 + }, + { + "epoch": 3.5310868533459896, + "grad_norm": 0.27951516504503154, + "learning_rate": 3.103493737640079e-05, + "loss": 0.3694, + "step": 7440 + }, + { + "epoch": 3.5334598955861414, + "grad_norm": 0.31217363186214814, + "learning_rate": 3.1018457481872114e-05, + "loss": 0.3652, + "step": 7445 + }, + { + "epoch": 3.535832937826293, + "grad_norm": 0.29996014350003697, + "learning_rate": 3.100197758734344e-05, + "loss": 0.3703, + "step": 7450 + }, + { + "epoch": 3.5382059800664454, + "grad_norm": 0.26802884368673696, + "learning_rate": 3.098549769281477e-05, + "loss": 0.3709, + "step": 7455 + }, + { + "epoch": 3.540579022306597, + "grad_norm": 0.2586415336822379, + "learning_rate": 3.0969017798286094e-05, + "loss": 0.3683, + "step": 7460 + }, + { + "epoch": 3.542952064546749, + "grad_norm": 0.279521093191455, + "learning_rate": 3.0952537903757416e-05, + "loss": 0.3678, + "step": 7465 + }, + { + "epoch": 3.5453251067869007, + "grad_norm": 0.27451679466829687, + "learning_rate": 3.0936058009228744e-05, + "loss": 0.3612, + "step": 7470 + }, + { + "epoch": 3.5476981490270525, + "grad_norm": 0.29740114034583687, + "learning_rate": 3.0919578114700066e-05, + "loss": 0.3682, + "step": 7475 + }, + { + "epoch": 3.5500711912672047, + "grad_norm": 0.2829295580419803, + "learning_rate": 3.0903098220171395e-05, + "loss": 0.3652, + "step": 7480 + }, + { + "epoch": 3.5524442335073565, + "grad_norm": 0.25436197963727364, + "learning_rate": 3.088661832564272e-05, + "loss": 0.3613, + "step": 7485 + }, + { + "epoch": 3.5548172757475083, + "grad_norm": 0.3054998960616471, + "learning_rate": 3.0870138431114046e-05, + "loss": 0.3651, + "step": 7490 + }, + { + "epoch": 3.55719031798766, + "grad_norm": 0.304067581129953, + "learning_rate": 3.085365853658537e-05, + "loss": 0.3638, + "step": 7495 + }, + { + "epoch": 3.559563360227812, + "grad_norm": 0.2584859516974748, + "learning_rate": 3.083717864205669e-05, + "loss": 0.3723, + "step": 7500 + }, + { + "epoch": 3.561936402467964, + "grad_norm": 0.256654665894268, + "learning_rate": 3.082069874752802e-05, + "loss": 0.3675, + "step": 7505 + }, + { + "epoch": 3.564309444708116, + "grad_norm": 0.3003813092505424, + "learning_rate": 3.080421885299935e-05, + "loss": 0.3662, + "step": 7510 + }, + { + "epoch": 3.5666824869482676, + "grad_norm": 0.26871293358005544, + "learning_rate": 3.078773895847067e-05, + "loss": 0.374, + "step": 7515 + }, + { + "epoch": 3.5690555291884194, + "grad_norm": 0.25665635570391077, + "learning_rate": 3.077125906394199e-05, + "loss": 0.3724, + "step": 7520 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 0.26423027235989355, + "learning_rate": 3.075477916941332e-05, + "loss": 0.3622, + "step": 7525 + }, + { + "epoch": 3.5738016136687234, + "grad_norm": 0.272922398573992, + "learning_rate": 3.073829927488464e-05, + "loss": 0.361, + "step": 7530 + }, + { + "epoch": 3.576174655908875, + "grad_norm": 0.2754131815233419, + "learning_rate": 3.0721819380355964e-05, + "loss": 0.3589, + "step": 7535 + }, + { + "epoch": 3.578547698149027, + "grad_norm": 0.2835253411187121, + "learning_rate": 3.070533948582729e-05, + "loss": 0.3612, + "step": 7540 + }, + { + "epoch": 3.580920740389179, + "grad_norm": 0.27974670583560846, + "learning_rate": 3.068885959129862e-05, + "loss": 0.3662, + "step": 7545 + }, + { + "epoch": 3.583293782629331, + "grad_norm": 0.2858230788321351, + "learning_rate": 3.067237969676994e-05, + "loss": 0.3533, + "step": 7550 + }, + { + "epoch": 3.5856668248694827, + "grad_norm": 0.2917358658339336, + "learning_rate": 3.0655899802241265e-05, + "loss": 0.37, + "step": 7555 + }, + { + "epoch": 3.5880398671096345, + "grad_norm": 0.2631623383200954, + "learning_rate": 3.0639419907712594e-05, + "loss": 0.3638, + "step": 7560 + }, + { + "epoch": 3.5904129093497863, + "grad_norm": 0.2847825326849691, + "learning_rate": 3.0622940013183916e-05, + "loss": 0.3659, + "step": 7565 + }, + { + "epoch": 3.5927859515899385, + "grad_norm": 0.261305650543369, + "learning_rate": 3.0606460118655245e-05, + "loss": 0.3677, + "step": 7570 + }, + { + "epoch": 3.5951589938300903, + "grad_norm": 0.24928539047974027, + "learning_rate": 3.058998022412657e-05, + "loss": 0.3578, + "step": 7575 + }, + { + "epoch": 3.597532036070242, + "grad_norm": 0.3094000997221716, + "learning_rate": 3.0573500329597895e-05, + "loss": 0.3611, + "step": 7580 + }, + { + "epoch": 3.599905078310394, + "grad_norm": 0.27197141637679983, + "learning_rate": 3.055702043506922e-05, + "loss": 0.3559, + "step": 7585 + }, + { + "epoch": 3.6022781205505456, + "grad_norm": 0.29196585707230127, + "learning_rate": 3.054054054054054e-05, + "loss": 0.3516, + "step": 7590 + }, + { + "epoch": 3.604651162790698, + "grad_norm": 0.25096157393639584, + "learning_rate": 3.052406064601187e-05, + "loss": 0.3554, + "step": 7595 + }, + { + "epoch": 3.6070242050308496, + "grad_norm": 0.2741143205617188, + "learning_rate": 3.0507580751483193e-05, + "loss": 0.366, + "step": 7600 + }, + { + "epoch": 3.6093972472710014, + "grad_norm": 0.25757280730211113, + "learning_rate": 3.049110085695452e-05, + "loss": 0.3636, + "step": 7605 + }, + { + "epoch": 3.611770289511153, + "grad_norm": 0.29146483169156345, + "learning_rate": 3.0474620962425844e-05, + "loss": 0.3689, + "step": 7610 + }, + { + "epoch": 3.614143331751305, + "grad_norm": 0.28792392072413436, + "learning_rate": 3.0458141067897166e-05, + "loss": 0.3626, + "step": 7615 + }, + { + "epoch": 3.616516373991457, + "grad_norm": 0.30526498201256524, + "learning_rate": 3.044166117336849e-05, + "loss": 0.3767, + "step": 7620 + }, + { + "epoch": 3.618889416231609, + "grad_norm": 0.2712885967381952, + "learning_rate": 3.0425181278839813e-05, + "loss": 0.37, + "step": 7625 + }, + { + "epoch": 3.6212624584717608, + "grad_norm": 0.265576204854108, + "learning_rate": 3.0408701384311146e-05, + "loss": 0.3629, + "step": 7630 + }, + { + "epoch": 3.6236355007119125, + "grad_norm": 0.2792221713610559, + "learning_rate": 3.0392221489782468e-05, + "loss": 0.3703, + "step": 7635 + }, + { + "epoch": 3.6260085429520643, + "grad_norm": 0.2759411042935483, + "learning_rate": 3.0375741595253793e-05, + "loss": 0.3711, + "step": 7640 + }, + { + "epoch": 3.6283815851922165, + "grad_norm": 0.2885282365271703, + "learning_rate": 3.0359261700725118e-05, + "loss": 0.3674, + "step": 7645 + }, + { + "epoch": 3.6307546274323683, + "grad_norm": 0.28142235210650685, + "learning_rate": 3.034278180619644e-05, + "loss": 0.3626, + "step": 7650 + }, + { + "epoch": 3.63312766967252, + "grad_norm": 0.29645122405709234, + "learning_rate": 3.0326301911667766e-05, + "loss": 0.3634, + "step": 7655 + }, + { + "epoch": 3.6355007119126723, + "grad_norm": 0.26745995014158364, + "learning_rate": 3.0309822017139094e-05, + "loss": 0.3749, + "step": 7660 + }, + { + "epoch": 3.6378737541528237, + "grad_norm": 0.2899790641818729, + "learning_rate": 3.029334212261042e-05, + "loss": 0.3789, + "step": 7665 + }, + { + "epoch": 3.640246796392976, + "grad_norm": 0.2521267844372713, + "learning_rate": 3.027686222808174e-05, + "loss": 0.3575, + "step": 7670 + }, + { + "epoch": 3.6426198386331277, + "grad_norm": 0.28098700990947395, + "learning_rate": 3.0260382333553067e-05, + "loss": 0.3699, + "step": 7675 + }, + { + "epoch": 3.6449928808732794, + "grad_norm": 0.27519991602389304, + "learning_rate": 3.0243902439024392e-05, + "loss": 0.365, + "step": 7680 + }, + { + "epoch": 3.6473659231134317, + "grad_norm": 0.25619198806344246, + "learning_rate": 3.0227422544495714e-05, + "loss": 0.3665, + "step": 7685 + }, + { + "epoch": 3.6497389653535834, + "grad_norm": 0.28716743455259225, + "learning_rate": 3.021094264996704e-05, + "loss": 0.3653, + "step": 7690 + }, + { + "epoch": 3.652112007593735, + "grad_norm": 0.2740460937012465, + "learning_rate": 3.019446275543837e-05, + "loss": 0.3597, + "step": 7695 + }, + { + "epoch": 3.654485049833887, + "grad_norm": 0.26881116035615243, + "learning_rate": 3.0177982860909694e-05, + "loss": 0.3569, + "step": 7700 + }, + { + "epoch": 3.656858092074039, + "grad_norm": 0.26866370600286554, + "learning_rate": 3.0161502966381016e-05, + "loss": 0.3645, + "step": 7705 + }, + { + "epoch": 3.659231134314191, + "grad_norm": 0.28568530372515427, + "learning_rate": 3.014502307185234e-05, + "loss": 0.3551, + "step": 7710 + }, + { + "epoch": 3.661604176554343, + "grad_norm": 0.31781299579642475, + "learning_rate": 3.0128543177323666e-05, + "loss": 0.3693, + "step": 7715 + }, + { + "epoch": 3.6639772187944946, + "grad_norm": 0.26726658644297635, + "learning_rate": 3.011206328279499e-05, + "loss": 0.3559, + "step": 7720 + }, + { + "epoch": 3.6663502610346463, + "grad_norm": 0.2656334426444647, + "learning_rate": 3.0095583388266317e-05, + "loss": 0.3587, + "step": 7725 + }, + { + "epoch": 3.668723303274798, + "grad_norm": 0.2665567857651594, + "learning_rate": 3.0079103493737642e-05, + "loss": 0.3639, + "step": 7730 + }, + { + "epoch": 3.6710963455149503, + "grad_norm": 0.2781348092420836, + "learning_rate": 3.0062623599208968e-05, + "loss": 0.3621, + "step": 7735 + }, + { + "epoch": 3.673469387755102, + "grad_norm": 0.2608192615036669, + "learning_rate": 3.004614370468029e-05, + "loss": 0.3601, + "step": 7740 + }, + { + "epoch": 3.675842429995254, + "grad_norm": 0.28575390051445526, + "learning_rate": 3.0029663810151615e-05, + "loss": 0.3684, + "step": 7745 + }, + { + "epoch": 3.6782154722354057, + "grad_norm": 0.2752060740442112, + "learning_rate": 3.001318391562294e-05, + "loss": 0.3693, + "step": 7750 + }, + { + "epoch": 3.6805885144755575, + "grad_norm": 0.27134679799029615, + "learning_rate": 2.999670402109427e-05, + "loss": 0.3726, + "step": 7755 + }, + { + "epoch": 3.6829615567157097, + "grad_norm": 0.25723136669016605, + "learning_rate": 2.998022412656559e-05, + "loss": 0.3645, + "step": 7760 + }, + { + "epoch": 3.6853345989558615, + "grad_norm": 0.2644915896747931, + "learning_rate": 2.9963744232036917e-05, + "loss": 0.3652, + "step": 7765 + }, + { + "epoch": 3.6877076411960132, + "grad_norm": 0.2636212135642723, + "learning_rate": 2.9947264337508242e-05, + "loss": 0.3739, + "step": 7770 + }, + { + "epoch": 3.690080683436165, + "grad_norm": 0.255483260080596, + "learning_rate": 2.9930784442979564e-05, + "loss": 0.3672, + "step": 7775 + }, + { + "epoch": 3.692453725676317, + "grad_norm": 0.27046953455695194, + "learning_rate": 2.991430454845089e-05, + "loss": 0.3661, + "step": 7780 + }, + { + "epoch": 3.694826767916469, + "grad_norm": 0.2672652588000945, + "learning_rate": 2.9897824653922218e-05, + "loss": 0.3632, + "step": 7785 + }, + { + "epoch": 3.697199810156621, + "grad_norm": 0.2764342986474873, + "learning_rate": 2.9881344759393543e-05, + "loss": 0.3727, + "step": 7790 + }, + { + "epoch": 3.6995728523967726, + "grad_norm": 0.26934774633031733, + "learning_rate": 2.9864864864864865e-05, + "loss": 0.3601, + "step": 7795 + }, + { + "epoch": 3.701945894636925, + "grad_norm": 0.26142645703483486, + "learning_rate": 2.984838497033619e-05, + "loss": 0.3649, + "step": 7800 + }, + { + "epoch": 3.704318936877076, + "grad_norm": 0.27161502206359117, + "learning_rate": 2.9831905075807516e-05, + "loss": 0.3705, + "step": 7805 + }, + { + "epoch": 3.7066919791172284, + "grad_norm": 0.2726132946613323, + "learning_rate": 2.9815425181278838e-05, + "loss": 0.3563, + "step": 7810 + }, + { + "epoch": 3.70906502135738, + "grad_norm": 0.2737908688052006, + "learning_rate": 2.979894528675017e-05, + "loss": 0.3692, + "step": 7815 + }, + { + "epoch": 3.711438063597532, + "grad_norm": 0.26500873884375237, + "learning_rate": 2.9782465392221492e-05, + "loss": 0.3651, + "step": 7820 + }, + { + "epoch": 3.713811105837684, + "grad_norm": 0.26522971941391804, + "learning_rate": 2.9765985497692817e-05, + "loss": 0.3608, + "step": 7825 + }, + { + "epoch": 3.716184148077836, + "grad_norm": 0.27617834872197566, + "learning_rate": 2.974950560316414e-05, + "loss": 0.3672, + "step": 7830 + }, + { + "epoch": 3.7185571903179877, + "grad_norm": 0.2779301000182583, + "learning_rate": 2.9733025708635465e-05, + "loss": 0.3598, + "step": 7835 + }, + { + "epoch": 3.7209302325581395, + "grad_norm": 0.2728946761940509, + "learning_rate": 2.971654581410679e-05, + "loss": 0.3557, + "step": 7840 + }, + { + "epoch": 3.7233032747982913, + "grad_norm": 0.25595860742932913, + "learning_rate": 2.970006591957812e-05, + "loss": 0.3592, + "step": 7845 + }, + { + "epoch": 3.7256763170384435, + "grad_norm": 0.280984280000374, + "learning_rate": 2.968358602504944e-05, + "loss": 0.3707, + "step": 7850 + }, + { + "epoch": 3.7280493592785953, + "grad_norm": 0.3016194734150708, + "learning_rate": 2.9667106130520766e-05, + "loss": 0.3516, + "step": 7855 + }, + { + "epoch": 3.730422401518747, + "grad_norm": 0.26560020871630224, + "learning_rate": 2.965062623599209e-05, + "loss": 0.3621, + "step": 7860 + }, + { + "epoch": 3.732795443758899, + "grad_norm": 0.26804192605782, + "learning_rate": 2.9634146341463413e-05, + "loss": 0.3543, + "step": 7865 + }, + { + "epoch": 3.7351684859990506, + "grad_norm": 0.2857061526887104, + "learning_rate": 2.961766644693474e-05, + "loss": 0.3659, + "step": 7870 + }, + { + "epoch": 3.737541528239203, + "grad_norm": 0.2917983381970908, + "learning_rate": 2.9601186552406068e-05, + "loss": 0.3722, + "step": 7875 + }, + { + "epoch": 3.7399145704793546, + "grad_norm": 0.25894156706381827, + "learning_rate": 2.9584706657877393e-05, + "loss": 0.353, + "step": 7880 + }, + { + "epoch": 3.7422876127195064, + "grad_norm": 0.2565464129442248, + "learning_rate": 2.9568226763348715e-05, + "loss": 0.3656, + "step": 7885 + }, + { + "epoch": 3.744660654959658, + "grad_norm": 0.2778864692970568, + "learning_rate": 2.955174686882004e-05, + "loss": 0.3616, + "step": 7890 + }, + { + "epoch": 3.74703369719981, + "grad_norm": 0.27772941412178903, + "learning_rate": 2.9535266974291366e-05, + "loss": 0.3643, + "step": 7895 + }, + { + "epoch": 3.749406739439962, + "grad_norm": 0.3062982361934854, + "learning_rate": 2.9518787079762688e-05, + "loss": 0.3747, + "step": 7900 + }, + { + "epoch": 3.751779781680114, + "grad_norm": 0.2543738894763168, + "learning_rate": 2.950230718523402e-05, + "loss": 0.3666, + "step": 7905 + }, + { + "epoch": 3.7541528239202657, + "grad_norm": 0.2609573049389455, + "learning_rate": 2.948582729070534e-05, + "loss": 0.3685, + "step": 7910 + }, + { + "epoch": 3.7565258661604175, + "grad_norm": 0.2825632400063133, + "learning_rate": 2.9469347396176667e-05, + "loss": 0.355, + "step": 7915 + }, + { + "epoch": 3.7588989084005693, + "grad_norm": 0.2624811976098695, + "learning_rate": 2.945286750164799e-05, + "loss": 0.37, + "step": 7920 + }, + { + "epoch": 3.7612719506407215, + "grad_norm": 0.2694575343324625, + "learning_rate": 2.9436387607119314e-05, + "loss": 0.3625, + "step": 7925 + }, + { + "epoch": 3.7636449928808733, + "grad_norm": 0.27290227001459777, + "learning_rate": 2.941990771259064e-05, + "loss": 0.3663, + "step": 7930 + }, + { + "epoch": 3.766018035121025, + "grad_norm": 0.28207064369529167, + "learning_rate": 2.940342781806197e-05, + "loss": 0.3742, + "step": 7935 + }, + { + "epoch": 3.7683910773611773, + "grad_norm": 0.27492285052464766, + "learning_rate": 2.9386947923533294e-05, + "loss": 0.3627, + "step": 7940 + }, + { + "epoch": 3.770764119601329, + "grad_norm": 0.28031908388231414, + "learning_rate": 2.9370468029004616e-05, + "loss": 0.3721, + "step": 7945 + }, + { + "epoch": 3.773137161841481, + "grad_norm": 0.28407821485768575, + "learning_rate": 2.935398813447594e-05, + "loss": 0.3619, + "step": 7950 + }, + { + "epoch": 3.7755102040816326, + "grad_norm": 0.2825098819694467, + "learning_rate": 2.9337508239947263e-05, + "loss": 0.3698, + "step": 7955 + }, + { + "epoch": 3.7778832463217844, + "grad_norm": 0.2698125179367135, + "learning_rate": 2.932102834541859e-05, + "loss": 0.3747, + "step": 7960 + }, + { + "epoch": 3.7802562885619366, + "grad_norm": 0.2807823582986729, + "learning_rate": 2.9304548450889917e-05, + "loss": 0.3749, + "step": 7965 + }, + { + "epoch": 3.7826293308020884, + "grad_norm": 0.2748703541367474, + "learning_rate": 2.9288068556361242e-05, + "loss": 0.362, + "step": 7970 + }, + { + "epoch": 3.78500237304224, + "grad_norm": 0.2829440180887545, + "learning_rate": 2.9271588661832568e-05, + "loss": 0.3713, + "step": 7975 + }, + { + "epoch": 3.787375415282392, + "grad_norm": 0.27876113208110603, + "learning_rate": 2.925510876730389e-05, + "loss": 0.3613, + "step": 7980 + }, + { + "epoch": 3.7897484575225437, + "grad_norm": 0.2563386026546228, + "learning_rate": 2.9238628872775215e-05, + "loss": 0.3593, + "step": 7985 + }, + { + "epoch": 3.792121499762696, + "grad_norm": 0.25585776423414736, + "learning_rate": 2.9222148978246537e-05, + "loss": 0.3571, + "step": 7990 + }, + { + "epoch": 3.7944945420028477, + "grad_norm": 0.2770406332891074, + "learning_rate": 2.920566908371787e-05, + "loss": 0.3634, + "step": 7995 + }, + { + "epoch": 3.7968675842429995, + "grad_norm": 0.2514273653839153, + "learning_rate": 2.918918918918919e-05, + "loss": 0.371, + "step": 8000 + }, + { + "epoch": 3.7992406264831513, + "grad_norm": 0.23647038524233685, + "learning_rate": 2.9172709294660517e-05, + "loss": 0.3645, + "step": 8005 + }, + { + "epoch": 3.801613668723303, + "grad_norm": 0.29039149931921027, + "learning_rate": 2.9156229400131842e-05, + "loss": 0.3675, + "step": 8010 + }, + { + "epoch": 3.8039867109634553, + "grad_norm": 0.25914629822141655, + "learning_rate": 2.9139749505603164e-05, + "loss": 0.3715, + "step": 8015 + }, + { + "epoch": 3.806359753203607, + "grad_norm": 0.2781504986188702, + "learning_rate": 2.912326961107449e-05, + "loss": 0.3635, + "step": 8020 + }, + { + "epoch": 3.808732795443759, + "grad_norm": 0.24646642821638126, + "learning_rate": 2.9106789716545818e-05, + "loss": 0.3553, + "step": 8025 + }, + { + "epoch": 3.8111058376839106, + "grad_norm": 0.33789320517208127, + "learning_rate": 2.9090309822017143e-05, + "loss": 0.3734, + "step": 8030 + }, + { + "epoch": 3.8134788799240624, + "grad_norm": 0.28879379097682806, + "learning_rate": 2.9073829927488465e-05, + "loss": 0.3592, + "step": 8035 + }, + { + "epoch": 3.8158519221642146, + "grad_norm": 0.24686864365636457, + "learning_rate": 2.905735003295979e-05, + "loss": 0.3714, + "step": 8040 + }, + { + "epoch": 3.8182249644043664, + "grad_norm": 0.24840407625381358, + "learning_rate": 2.9040870138431116e-05, + "loss": 0.3553, + "step": 8045 + }, + { + "epoch": 3.820598006644518, + "grad_norm": 0.28652427593027097, + "learning_rate": 2.9024390243902438e-05, + "loss": 0.3747, + "step": 8050 + }, + { + "epoch": 3.82297104888467, + "grad_norm": 0.291101105488601, + "learning_rate": 2.9007910349373767e-05, + "loss": 0.3647, + "step": 8055 + }, + { + "epoch": 3.8253440911248218, + "grad_norm": 0.26706791744107006, + "learning_rate": 2.8991430454845092e-05, + "loss": 0.3637, + "step": 8060 + }, + { + "epoch": 3.827717133364974, + "grad_norm": 0.2548892124966626, + "learning_rate": 2.8974950560316417e-05, + "loss": 0.3592, + "step": 8065 + }, + { + "epoch": 3.8300901756051258, + "grad_norm": 0.26268719640706867, + "learning_rate": 2.895847066578774e-05, + "loss": 0.3613, + "step": 8070 + }, + { + "epoch": 3.8324632178452775, + "grad_norm": 0.27850775936089667, + "learning_rate": 2.8941990771259065e-05, + "loss": 0.3624, + "step": 8075 + }, + { + "epoch": 3.8348362600854298, + "grad_norm": 0.2534097900352772, + "learning_rate": 2.892551087673039e-05, + "loss": 0.3647, + "step": 8080 + }, + { + "epoch": 3.8372093023255816, + "grad_norm": 0.28895558324962367, + "learning_rate": 2.890903098220172e-05, + "loss": 0.3668, + "step": 8085 + }, + { + "epoch": 3.8395823445657333, + "grad_norm": 0.2775582560239414, + "learning_rate": 2.889255108767304e-05, + "loss": 0.3636, + "step": 8090 + }, + { + "epoch": 3.841955386805885, + "grad_norm": 0.2753108248564215, + "learning_rate": 2.8876071193144366e-05, + "loss": 0.3752, + "step": 8095 + }, + { + "epoch": 3.844328429046037, + "grad_norm": 0.25201466004978024, + "learning_rate": 2.885959129861569e-05, + "loss": 0.3652, + "step": 8100 + }, + { + "epoch": 3.846701471286189, + "grad_norm": 0.25035424507769016, + "learning_rate": 2.8843111404087013e-05, + "loss": 0.3628, + "step": 8105 + }, + { + "epoch": 3.849074513526341, + "grad_norm": 0.2621118479130281, + "learning_rate": 2.882663150955834e-05, + "loss": 0.3657, + "step": 8110 + }, + { + "epoch": 3.8514475557664927, + "grad_norm": 0.2581568717114001, + "learning_rate": 2.881015161502966e-05, + "loss": 0.3688, + "step": 8115 + }, + { + "epoch": 3.8538205980066444, + "grad_norm": 0.2569550817316492, + "learning_rate": 2.8793671720500993e-05, + "loss": 0.3693, + "step": 8120 + }, + { + "epoch": 3.8561936402467962, + "grad_norm": 0.25897813134095315, + "learning_rate": 2.8777191825972315e-05, + "loss": 0.3675, + "step": 8125 + }, + { + "epoch": 3.8585666824869485, + "grad_norm": 0.27288997663250364, + "learning_rate": 2.876071193144364e-05, + "loss": 0.3698, + "step": 8130 + }, + { + "epoch": 3.8609397247271002, + "grad_norm": 0.2674169923489197, + "learning_rate": 2.8744232036914966e-05, + "loss": 0.364, + "step": 8135 + }, + { + "epoch": 3.863312766967252, + "grad_norm": 0.28525981406735385, + "learning_rate": 2.8727752142386288e-05, + "loss": 0.3722, + "step": 8140 + }, + { + "epoch": 3.865685809207404, + "grad_norm": 0.2654180576070707, + "learning_rate": 2.8711272247857613e-05, + "loss": 0.3639, + "step": 8145 + }, + { + "epoch": 3.8680588514475556, + "grad_norm": 0.292304979809368, + "learning_rate": 2.869479235332894e-05, + "loss": 0.3651, + "step": 8150 + }, + { + "epoch": 3.870431893687708, + "grad_norm": 0.25832090825140525, + "learning_rate": 2.8678312458800267e-05, + "loss": 0.3519, + "step": 8155 + }, + { + "epoch": 3.8728049359278596, + "grad_norm": 0.30864945222530676, + "learning_rate": 2.866183256427159e-05, + "loss": 0.3705, + "step": 8160 + }, + { + "epoch": 3.8751779781680114, + "grad_norm": 0.2808535825730034, + "learning_rate": 2.8645352669742914e-05, + "loss": 0.3558, + "step": 8165 + }, + { + "epoch": 3.877551020408163, + "grad_norm": 0.3290540472624212, + "learning_rate": 2.862887277521424e-05, + "loss": 0.3625, + "step": 8170 + }, + { + "epoch": 3.879924062648315, + "grad_norm": 0.3073665668039388, + "learning_rate": 2.861239288068556e-05, + "loss": 0.3686, + "step": 8175 + }, + { + "epoch": 3.882297104888467, + "grad_norm": 0.322245381758783, + "learning_rate": 2.859591298615689e-05, + "loss": 0.3637, + "step": 8180 + }, + { + "epoch": 3.884670147128619, + "grad_norm": 0.28211670099778613, + "learning_rate": 2.8579433091628216e-05, + "loss": 0.3646, + "step": 8185 + }, + { + "epoch": 3.8870431893687707, + "grad_norm": 0.25980260938303, + "learning_rate": 2.856295319709954e-05, + "loss": 0.3618, + "step": 8190 + }, + { + "epoch": 3.889416231608923, + "grad_norm": 0.28367473204127563, + "learning_rate": 2.8546473302570863e-05, + "loss": 0.3826, + "step": 8195 + }, + { + "epoch": 3.8917892738490742, + "grad_norm": 0.2952962544550534, + "learning_rate": 2.852999340804219e-05, + "loss": 0.3649, + "step": 8200 + }, + { + "epoch": 3.8941623160892265, + "grad_norm": 0.2765431525668492, + "learning_rate": 2.8513513513513514e-05, + "loss": 0.3641, + "step": 8205 + }, + { + "epoch": 3.8965353583293783, + "grad_norm": 0.24171744797839478, + "learning_rate": 2.8497033618984843e-05, + "loss": 0.3581, + "step": 8210 + }, + { + "epoch": 3.89890840056953, + "grad_norm": 0.2637607343576281, + "learning_rate": 2.8480553724456164e-05, + "loss": 0.3633, + "step": 8215 + }, + { + "epoch": 3.9012814428096823, + "grad_norm": 0.2621407770150014, + "learning_rate": 2.846407382992749e-05, + "loss": 0.3621, + "step": 8220 + }, + { + "epoch": 3.903654485049834, + "grad_norm": 0.25681520184202633, + "learning_rate": 2.8447593935398815e-05, + "loss": 0.3668, + "step": 8225 + }, + { + "epoch": 3.906027527289986, + "grad_norm": 0.26949942960796064, + "learning_rate": 2.8431114040870137e-05, + "loss": 0.3603, + "step": 8230 + }, + { + "epoch": 3.9084005695301376, + "grad_norm": 0.2844350028522269, + "learning_rate": 2.8414634146341462e-05, + "loss": 0.359, + "step": 8235 + }, + { + "epoch": 3.9107736117702894, + "grad_norm": 0.29996342598046855, + "learning_rate": 2.839815425181279e-05, + "loss": 0.3585, + "step": 8240 + }, + { + "epoch": 3.9131466540104416, + "grad_norm": 0.2654494771384901, + "learning_rate": 2.8381674357284117e-05, + "loss": 0.3609, + "step": 8245 + }, + { + "epoch": 3.9155196962505934, + "grad_norm": 0.29595758789368, + "learning_rate": 2.836519446275544e-05, + "loss": 0.3629, + "step": 8250 + }, + { + "epoch": 3.917892738490745, + "grad_norm": 0.31630600614014365, + "learning_rate": 2.8348714568226764e-05, + "loss": 0.3639, + "step": 8255 + }, + { + "epoch": 3.920265780730897, + "grad_norm": 0.28232571488124153, + "learning_rate": 2.833223467369809e-05, + "loss": 0.3652, + "step": 8260 + }, + { + "epoch": 3.9226388229710487, + "grad_norm": 0.2681395112166534, + "learning_rate": 2.831575477916941e-05, + "loss": 0.372, + "step": 8265 + }, + { + "epoch": 3.925011865211201, + "grad_norm": 0.26206818558534956, + "learning_rate": 2.8299274884640743e-05, + "loss": 0.3589, + "step": 8270 + }, + { + "epoch": 3.9273849074513527, + "grad_norm": 0.277677118041545, + "learning_rate": 2.8282794990112065e-05, + "loss": 0.3766, + "step": 8275 + }, + { + "epoch": 3.9297579496915045, + "grad_norm": 0.27144906974218513, + "learning_rate": 2.826631509558339e-05, + "loss": 0.361, + "step": 8280 + }, + { + "epoch": 3.9321309919316563, + "grad_norm": 0.2920786087182757, + "learning_rate": 2.8249835201054713e-05, + "loss": 0.3658, + "step": 8285 + }, + { + "epoch": 3.934504034171808, + "grad_norm": 0.2694806904011237, + "learning_rate": 2.8233355306526038e-05, + "loss": 0.3615, + "step": 8290 + }, + { + "epoch": 3.9368770764119603, + "grad_norm": 0.2787457041326997, + "learning_rate": 2.8216875411997363e-05, + "loss": 0.3594, + "step": 8295 + }, + { + "epoch": 3.939250118652112, + "grad_norm": 0.2705282255249227, + "learning_rate": 2.8200395517468692e-05, + "loss": 0.3622, + "step": 8300 + }, + { + "epoch": 3.941623160892264, + "grad_norm": 0.26910682063898866, + "learning_rate": 2.8183915622940017e-05, + "loss": 0.3681, + "step": 8305 + }, + { + "epoch": 3.9439962031324156, + "grad_norm": 0.25299069889754555, + "learning_rate": 2.816743572841134e-05, + "loss": 0.3707, + "step": 8310 + }, + { + "epoch": 3.9463692453725674, + "grad_norm": 0.2629174681144326, + "learning_rate": 2.8150955833882665e-05, + "loss": 0.3716, + "step": 8315 + }, + { + "epoch": 3.9487422876127196, + "grad_norm": 0.27457960028007955, + "learning_rate": 2.8134475939353987e-05, + "loss": 0.3705, + "step": 8320 + }, + { + "epoch": 3.9511153298528714, + "grad_norm": 0.2638386947455054, + "learning_rate": 2.8117996044825312e-05, + "loss": 0.3675, + "step": 8325 + }, + { + "epoch": 3.953488372093023, + "grad_norm": 0.2698606054615588, + "learning_rate": 2.810151615029664e-05, + "loss": 0.364, + "step": 8330 + }, + { + "epoch": 3.9558614143331754, + "grad_norm": 0.27211079154671763, + "learning_rate": 2.8085036255767966e-05, + "loss": 0.3636, + "step": 8335 + }, + { + "epoch": 3.9582344565733267, + "grad_norm": 0.2778702287739619, + "learning_rate": 2.806855636123929e-05, + "loss": 0.3611, + "step": 8340 + }, + { + "epoch": 3.960607498813479, + "grad_norm": 0.27879677182526724, + "learning_rate": 2.8052076466710613e-05, + "loss": 0.3656, + "step": 8345 + }, + { + "epoch": 3.9629805410536307, + "grad_norm": 0.2855869886894939, + "learning_rate": 2.803559657218194e-05, + "loss": 0.3782, + "step": 8350 + }, + { + "epoch": 3.9653535832937825, + "grad_norm": 0.27191811218275397, + "learning_rate": 2.801911667765326e-05, + "loss": 0.358, + "step": 8355 + }, + { + "epoch": 3.9677266255339347, + "grad_norm": 0.3008041162362551, + "learning_rate": 2.8002636783124593e-05, + "loss": 0.3719, + "step": 8360 + }, + { + "epoch": 3.9700996677740865, + "grad_norm": 0.2556430373532288, + "learning_rate": 2.7986156888595915e-05, + "loss": 0.3699, + "step": 8365 + }, + { + "epoch": 3.9724727100142383, + "grad_norm": 0.2880954923353286, + "learning_rate": 2.796967699406724e-05, + "loss": 0.351, + "step": 8370 + }, + { + "epoch": 3.97484575225439, + "grad_norm": 0.2558341904083954, + "learning_rate": 2.7953197099538562e-05, + "loss": 0.3592, + "step": 8375 + }, + { + "epoch": 3.977218794494542, + "grad_norm": 0.24056112383762324, + "learning_rate": 2.7936717205009888e-05, + "loss": 0.3556, + "step": 8380 + }, + { + "epoch": 3.979591836734694, + "grad_norm": 0.24159811679910317, + "learning_rate": 2.7920237310481213e-05, + "loss": 0.3577, + "step": 8385 + }, + { + "epoch": 3.981964878974846, + "grad_norm": 0.2859303224984707, + "learning_rate": 2.790375741595254e-05, + "loss": 0.3641, + "step": 8390 + }, + { + "epoch": 3.9843379212149976, + "grad_norm": 0.269252931543208, + "learning_rate": 2.7887277521423867e-05, + "loss": 0.3599, + "step": 8395 + }, + { + "epoch": 3.9867109634551494, + "grad_norm": 0.27847139296921136, + "learning_rate": 2.787079762689519e-05, + "loss": 0.3613, + "step": 8400 + }, + { + "epoch": 3.989084005695301, + "grad_norm": 0.2717171838763759, + "learning_rate": 2.7854317732366514e-05, + "loss": 0.3728, + "step": 8405 + }, + { + "epoch": 3.9914570479354534, + "grad_norm": 0.2708255855756631, + "learning_rate": 2.7837837837837836e-05, + "loss": 0.3688, + "step": 8410 + }, + { + "epoch": 3.993830090175605, + "grad_norm": 0.3028783624208806, + "learning_rate": 2.782135794330916e-05, + "loss": 0.3659, + "step": 8415 + }, + { + "epoch": 3.996203132415757, + "grad_norm": 0.2510836193821188, + "learning_rate": 2.780487804878049e-05, + "loss": 0.3629, + "step": 8420 + }, + { + "epoch": 3.9985761746559088, + "grad_norm": 0.27344674143030195, + "learning_rate": 2.7788398154251816e-05, + "loss": 0.3675, + "step": 8425 + }, + { + "epoch": 4.0009492168960605, + "grad_norm": 0.2683859338661456, + "learning_rate": 2.777191825972314e-05, + "loss": 0.3504, + "step": 8430 + }, + { + "epoch": 4.003322259136213, + "grad_norm": 0.2924740906580246, + "learning_rate": 2.7755438365194463e-05, + "loss": 0.3405, + "step": 8435 + }, + { + "epoch": 4.005695301376364, + "grad_norm": 0.2924382922159674, + "learning_rate": 2.773895847066579e-05, + "loss": 0.335, + "step": 8440 + }, + { + "epoch": 4.008068343616516, + "grad_norm": 0.28892812709057164, + "learning_rate": 2.772247857613711e-05, + "loss": 0.3364, + "step": 8445 + }, + { + "epoch": 4.0104413858566685, + "grad_norm": 0.2543875040165368, + "learning_rate": 2.7705998681608443e-05, + "loss": 0.325, + "step": 8450 + }, + { + "epoch": 4.01281442809682, + "grad_norm": 0.2727861586005365, + "learning_rate": 2.7689518787079764e-05, + "loss": 0.3292, + "step": 8455 + }, + { + "epoch": 4.015187470336972, + "grad_norm": 0.25486905498179124, + "learning_rate": 2.767303889255109e-05, + "loss": 0.3308, + "step": 8460 + }, + { + "epoch": 4.017560512577123, + "grad_norm": 0.29814873064487524, + "learning_rate": 2.7656558998022415e-05, + "loss": 0.3285, + "step": 8465 + }, + { + "epoch": 4.019933554817276, + "grad_norm": 0.2779164025670799, + "learning_rate": 2.7640079103493737e-05, + "loss": 0.3216, + "step": 8470 + }, + { + "epoch": 4.022306597057428, + "grad_norm": 0.26938534730410274, + "learning_rate": 2.7623599208965063e-05, + "loss": 0.3289, + "step": 8475 + }, + { + "epoch": 4.024679639297579, + "grad_norm": 0.2875627595388591, + "learning_rate": 2.760711931443639e-05, + "loss": 0.3221, + "step": 8480 + }, + { + "epoch": 4.027052681537731, + "grad_norm": 0.2568098435120524, + "learning_rate": 2.7590639419907717e-05, + "loss": 0.3351, + "step": 8485 + }, + { + "epoch": 4.029425723777884, + "grad_norm": 0.27739323350838296, + "learning_rate": 2.757415952537904e-05, + "loss": 0.3335, + "step": 8490 + }, + { + "epoch": 4.031798766018035, + "grad_norm": 0.2745654523238011, + "learning_rate": 2.7557679630850364e-05, + "loss": 0.3295, + "step": 8495 + }, + { + "epoch": 4.034171808258187, + "grad_norm": 0.2721396851017843, + "learning_rate": 2.754119973632169e-05, + "loss": 0.3327, + "step": 8500 + }, + { + "epoch": 4.0365448504983386, + "grad_norm": 0.27089029452607977, + "learning_rate": 2.752471984179301e-05, + "loss": 0.3279, + "step": 8505 + }, + { + "epoch": 4.038917892738491, + "grad_norm": 0.2656816147923837, + "learning_rate": 2.750823994726434e-05, + "loss": 0.3311, + "step": 8510 + }, + { + "epoch": 4.041290934978643, + "grad_norm": 0.2617256036226156, + "learning_rate": 2.7491760052735665e-05, + "loss": 0.3231, + "step": 8515 + }, + { + "epoch": 4.043663977218794, + "grad_norm": 0.26853152034117317, + "learning_rate": 2.747528015820699e-05, + "loss": 0.3299, + "step": 8520 + }, + { + "epoch": 4.046037019458947, + "grad_norm": 0.27422303498652795, + "learning_rate": 2.7458800263678313e-05, + "loss": 0.3324, + "step": 8525 + }, + { + "epoch": 4.048410061699098, + "grad_norm": 0.2844672724864327, + "learning_rate": 2.7442320369149638e-05, + "loss": 0.3328, + "step": 8530 + }, + { + "epoch": 4.05078310393925, + "grad_norm": 0.26591039644017783, + "learning_rate": 2.7425840474620963e-05, + "loss": 0.3204, + "step": 8535 + }, + { + "epoch": 4.053156146179402, + "grad_norm": 0.29068207133410146, + "learning_rate": 2.7409360580092292e-05, + "loss": 0.3315, + "step": 8540 + }, + { + "epoch": 4.055529188419554, + "grad_norm": 0.25366685185961235, + "learning_rate": 2.7392880685563614e-05, + "loss": 0.3226, + "step": 8545 + }, + { + "epoch": 4.057902230659706, + "grad_norm": 0.2835679155115254, + "learning_rate": 2.737640079103494e-05, + "loss": 0.3404, + "step": 8550 + }, + { + "epoch": 4.060275272899857, + "grad_norm": 0.27282736643378946, + "learning_rate": 2.7359920896506265e-05, + "loss": 0.3259, + "step": 8555 + }, + { + "epoch": 4.0626483151400095, + "grad_norm": 0.2648781179857067, + "learning_rate": 2.7343441001977587e-05, + "loss": 0.3229, + "step": 8560 + }, + { + "epoch": 4.065021357380162, + "grad_norm": 0.26749387594166957, + "learning_rate": 2.7326961107448912e-05, + "loss": 0.3276, + "step": 8565 + }, + { + "epoch": 4.067394399620313, + "grad_norm": 0.27281657774630635, + "learning_rate": 2.7310481212920237e-05, + "loss": 0.3275, + "step": 8570 + }, + { + "epoch": 4.069767441860465, + "grad_norm": 0.2765122890059377, + "learning_rate": 2.7294001318391566e-05, + "loss": 0.3282, + "step": 8575 + }, + { + "epoch": 4.072140484100617, + "grad_norm": 0.25610713984112016, + "learning_rate": 2.7277521423862888e-05, + "loss": 0.3252, + "step": 8580 + }, + { + "epoch": 4.074513526340769, + "grad_norm": 0.2687700688723102, + "learning_rate": 2.7261041529334214e-05, + "loss": 0.3277, + "step": 8585 + }, + { + "epoch": 4.076886568580921, + "grad_norm": 0.2759496992761092, + "learning_rate": 2.724456163480554e-05, + "loss": 0.3331, + "step": 8590 + }, + { + "epoch": 4.079259610821072, + "grad_norm": 0.26391743323574346, + "learning_rate": 2.722808174027686e-05, + "loss": 0.3335, + "step": 8595 + }, + { + "epoch": 4.081632653061225, + "grad_norm": 0.28900915653195824, + "learning_rate": 2.7211601845748186e-05, + "loss": 0.3358, + "step": 8600 + }, + { + "epoch": 4.084005695301377, + "grad_norm": 0.2660218213577039, + "learning_rate": 2.7195121951219515e-05, + "loss": 0.3286, + "step": 8605 + }, + { + "epoch": 4.086378737541528, + "grad_norm": 0.26039499908614133, + "learning_rate": 2.717864205669084e-05, + "loss": 0.3216, + "step": 8610 + }, + { + "epoch": 4.08875177978168, + "grad_norm": 0.25769286071657227, + "learning_rate": 2.7162162162162162e-05, + "loss": 0.3318, + "step": 8615 + }, + { + "epoch": 4.091124822021832, + "grad_norm": 0.2753915733718379, + "learning_rate": 2.7145682267633488e-05, + "loss": 0.3337, + "step": 8620 + }, + { + "epoch": 4.093497864261984, + "grad_norm": 0.2650075605491423, + "learning_rate": 2.7129202373104813e-05, + "loss": 0.3345, + "step": 8625 + }, + { + "epoch": 4.095870906502136, + "grad_norm": 0.28332020003480823, + "learning_rate": 2.7112722478576135e-05, + "loss": 0.3374, + "step": 8630 + }, + { + "epoch": 4.0982439487422875, + "grad_norm": 0.26907267590752526, + "learning_rate": 2.7096242584047467e-05, + "loss": 0.3351, + "step": 8635 + }, + { + "epoch": 4.10061699098244, + "grad_norm": 0.2705134546968492, + "learning_rate": 2.707976268951879e-05, + "loss": 0.3259, + "step": 8640 + }, + { + "epoch": 4.102990033222591, + "grad_norm": 0.26875637819647596, + "learning_rate": 2.7063282794990114e-05, + "loss": 0.3337, + "step": 8645 + }, + { + "epoch": 4.105363075462743, + "grad_norm": 0.27004815208509525, + "learning_rate": 2.7046802900461436e-05, + "loss": 0.3276, + "step": 8650 + }, + { + "epoch": 4.1077361177028955, + "grad_norm": 0.26183932093236584, + "learning_rate": 2.7030323005932762e-05, + "loss": 0.3365, + "step": 8655 + }, + { + "epoch": 4.110109159943047, + "grad_norm": 0.28333436084604874, + "learning_rate": 2.7013843111404087e-05, + "loss": 0.3374, + "step": 8660 + }, + { + "epoch": 4.112482202183199, + "grad_norm": 0.26073803383642863, + "learning_rate": 2.6997363216875416e-05, + "loss": 0.3342, + "step": 8665 + }, + { + "epoch": 4.11485524442335, + "grad_norm": 0.2624231480936637, + "learning_rate": 2.6980883322346738e-05, + "loss": 0.3269, + "step": 8670 + }, + { + "epoch": 4.117228286663503, + "grad_norm": 0.271488626301447, + "learning_rate": 2.6964403427818063e-05, + "loss": 0.3315, + "step": 8675 + }, + { + "epoch": 4.119601328903655, + "grad_norm": 0.28159977355735594, + "learning_rate": 2.694792353328939e-05, + "loss": 0.3372, + "step": 8680 + }, + { + "epoch": 4.121974371143806, + "grad_norm": 0.25105762816505645, + "learning_rate": 2.693144363876071e-05, + "loss": 0.3208, + "step": 8685 + }, + { + "epoch": 4.124347413383958, + "grad_norm": 0.26370979337560996, + "learning_rate": 2.6914963744232036e-05, + "loss": 0.3188, + "step": 8690 + }, + { + "epoch": 4.12672045562411, + "grad_norm": 0.2517745963661239, + "learning_rate": 2.6898483849703365e-05, + "loss": 0.3304, + "step": 8695 + }, + { + "epoch": 4.129093497864262, + "grad_norm": 0.26293642872908884, + "learning_rate": 2.688200395517469e-05, + "loss": 0.3312, + "step": 8700 + }, + { + "epoch": 4.131466540104414, + "grad_norm": 0.2672072634456185, + "learning_rate": 2.6865524060646012e-05, + "loss": 0.3327, + "step": 8705 + }, + { + "epoch": 4.1338395823445655, + "grad_norm": 0.26856798178362995, + "learning_rate": 2.6849044166117337e-05, + "loss": 0.3292, + "step": 8710 + }, + { + "epoch": 4.136212624584718, + "grad_norm": 0.2707843830778, + "learning_rate": 2.6832564271588663e-05, + "loss": 0.3248, + "step": 8715 + }, + { + "epoch": 4.138585666824869, + "grad_norm": 0.2588663457157307, + "learning_rate": 2.6816084377059985e-05, + "loss": 0.3278, + "step": 8720 + }, + { + "epoch": 4.140958709065021, + "grad_norm": 0.2739931317846955, + "learning_rate": 2.6799604482531317e-05, + "loss": 0.3307, + "step": 8725 + }, + { + "epoch": 4.1433317513051735, + "grad_norm": 0.27740048319344557, + "learning_rate": 2.678312458800264e-05, + "loss": 0.3334, + "step": 8730 + }, + { + "epoch": 4.145704793545325, + "grad_norm": 0.25965747768561137, + "learning_rate": 2.6766644693473964e-05, + "loss": 0.3327, + "step": 8735 + }, + { + "epoch": 4.148077835785477, + "grad_norm": 0.28171675125412604, + "learning_rate": 2.6750164798945286e-05, + "loss": 0.326, + "step": 8740 + }, + { + "epoch": 4.150450878025628, + "grad_norm": 0.2695152089617331, + "learning_rate": 2.673368490441661e-05, + "loss": 0.3384, + "step": 8745 + }, + { + "epoch": 4.152823920265781, + "grad_norm": 0.3102092815501156, + "learning_rate": 2.6717205009887937e-05, + "loss": 0.3334, + "step": 8750 + }, + { + "epoch": 4.155196962505933, + "grad_norm": 0.2753320008547481, + "learning_rate": 2.6700725115359265e-05, + "loss": 0.3292, + "step": 8755 + }, + { + "epoch": 4.157570004746084, + "grad_norm": 0.2681250158277203, + "learning_rate": 2.668424522083059e-05, + "loss": 0.3379, + "step": 8760 + }, + { + "epoch": 4.159943046986236, + "grad_norm": 0.2578799758592804, + "learning_rate": 2.6667765326301913e-05, + "loss": 0.3283, + "step": 8765 + }, + { + "epoch": 4.162316089226389, + "grad_norm": 0.2616679585930697, + "learning_rate": 2.6651285431773238e-05, + "loss": 0.3327, + "step": 8770 + }, + { + "epoch": 4.16468913146654, + "grad_norm": 0.25493579385712256, + "learning_rate": 2.663480553724456e-05, + "loss": 0.3287, + "step": 8775 + }, + { + "epoch": 4.167062173706692, + "grad_norm": 0.2538954932507912, + "learning_rate": 2.6618325642715885e-05, + "loss": 0.3296, + "step": 8780 + }, + { + "epoch": 4.1694352159468435, + "grad_norm": 0.2697591446816375, + "learning_rate": 2.6601845748187214e-05, + "loss": 0.3243, + "step": 8785 + }, + { + "epoch": 4.171808258186996, + "grad_norm": 0.26726075420826456, + "learning_rate": 2.658536585365854e-05, + "loss": 0.3346, + "step": 8790 + }, + { + "epoch": 4.174181300427148, + "grad_norm": 0.30325389382896534, + "learning_rate": 2.6568885959129865e-05, + "loss": 0.3306, + "step": 8795 + }, + { + "epoch": 4.176554342667299, + "grad_norm": 0.27816120924565424, + "learning_rate": 2.6552406064601187e-05, + "loss": 0.3271, + "step": 8800 + }, + { + "epoch": 4.1789273849074515, + "grad_norm": 0.26089277356782425, + "learning_rate": 2.6535926170072512e-05, + "loss": 0.328, + "step": 8805 + }, + { + "epoch": 4.181300427147603, + "grad_norm": 0.2668048733823291, + "learning_rate": 2.6519446275543834e-05, + "loss": 0.3276, + "step": 8810 + }, + { + "epoch": 4.183673469387755, + "grad_norm": 0.2651695921004486, + "learning_rate": 2.6502966381015166e-05, + "loss": 0.3387, + "step": 8815 + }, + { + "epoch": 4.186046511627907, + "grad_norm": 0.2695608502376186, + "learning_rate": 2.6486486486486488e-05, + "loss": 0.3386, + "step": 8820 + }, + { + "epoch": 4.188419553868059, + "grad_norm": 0.2482386256801941, + "learning_rate": 2.6470006591957814e-05, + "loss": 0.3278, + "step": 8825 + }, + { + "epoch": 4.190792596108211, + "grad_norm": 0.25093941325247215, + "learning_rate": 2.645352669742914e-05, + "loss": 0.3237, + "step": 8830 + }, + { + "epoch": 4.193165638348362, + "grad_norm": 0.28001531533978136, + "learning_rate": 2.643704680290046e-05, + "loss": 0.3331, + "step": 8835 + }, + { + "epoch": 4.195538680588514, + "grad_norm": 0.2644369255378425, + "learning_rate": 2.6420566908371786e-05, + "loss": 0.3312, + "step": 8840 + }, + { + "epoch": 4.197911722828667, + "grad_norm": 0.2882841271528937, + "learning_rate": 2.6404087013843115e-05, + "loss": 0.3294, + "step": 8845 + }, + { + "epoch": 4.200284765068818, + "grad_norm": 0.2543509083281003, + "learning_rate": 2.638760711931444e-05, + "loss": 0.331, + "step": 8850 + }, + { + "epoch": 4.20265780730897, + "grad_norm": 0.2575198425077173, + "learning_rate": 2.6371127224785762e-05, + "loss": 0.3288, + "step": 8855 + }, + { + "epoch": 4.2050308495491215, + "grad_norm": 0.2749037344024074, + "learning_rate": 2.6354647330257088e-05, + "loss": 0.328, + "step": 8860 + }, + { + "epoch": 4.207403891789274, + "grad_norm": 0.2527009462056468, + "learning_rate": 2.6338167435728413e-05, + "loss": 0.3306, + "step": 8865 + }, + { + "epoch": 4.209776934029426, + "grad_norm": 0.2418961062781919, + "learning_rate": 2.6321687541199735e-05, + "loss": 0.3327, + "step": 8870 + }, + { + "epoch": 4.212149976269577, + "grad_norm": 0.2769738611009153, + "learning_rate": 2.6305207646671064e-05, + "loss": 0.3274, + "step": 8875 + }, + { + "epoch": 4.2145230185097295, + "grad_norm": 0.27774811116469267, + "learning_rate": 2.628872775214239e-05, + "loss": 0.3394, + "step": 8880 + }, + { + "epoch": 4.216896060749882, + "grad_norm": 0.2515297736112349, + "learning_rate": 2.6272247857613714e-05, + "loss": 0.3276, + "step": 8885 + }, + { + "epoch": 4.219269102990033, + "grad_norm": 0.2809835376101396, + "learning_rate": 2.6255767963085036e-05, + "loss": 0.3377, + "step": 8890 + }, + { + "epoch": 4.221642145230185, + "grad_norm": 0.25949617937221114, + "learning_rate": 2.6239288068556362e-05, + "loss": 0.3394, + "step": 8895 + }, + { + "epoch": 4.224015187470337, + "grad_norm": 0.2596225629824594, + "learning_rate": 2.6222808174027687e-05, + "loss": 0.3394, + "step": 8900 + }, + { + "epoch": 4.226388229710489, + "grad_norm": 0.2773869852160912, + "learning_rate": 2.6206328279499016e-05, + "loss": 0.3416, + "step": 8905 + }, + { + "epoch": 4.228761271950641, + "grad_norm": 0.2538512054776717, + "learning_rate": 2.6189848384970338e-05, + "loss": 0.3357, + "step": 8910 + }, + { + "epoch": 4.2311343141907924, + "grad_norm": 0.2650691654073551, + "learning_rate": 2.6173368490441663e-05, + "loss": 0.3369, + "step": 8915 + }, + { + "epoch": 4.233507356430945, + "grad_norm": 0.27610284581671857, + "learning_rate": 2.615688859591299e-05, + "loss": 0.3315, + "step": 8920 + }, + { + "epoch": 4.235880398671096, + "grad_norm": 0.2629708970128836, + "learning_rate": 2.614040870138431e-05, + "loss": 0.3275, + "step": 8925 + }, + { + "epoch": 4.238253440911248, + "grad_norm": 0.2654241322659835, + "learning_rate": 2.6123928806855636e-05, + "loss": 0.3317, + "step": 8930 + }, + { + "epoch": 4.2406264831514004, + "grad_norm": 0.2575165114301853, + "learning_rate": 2.6107448912326965e-05, + "loss": 0.3293, + "step": 8935 + }, + { + "epoch": 4.242999525391552, + "grad_norm": 0.2793671821992738, + "learning_rate": 2.609096901779829e-05, + "loss": 0.3345, + "step": 8940 + }, + { + "epoch": 4.245372567631704, + "grad_norm": 0.27829133300301345, + "learning_rate": 2.6074489123269612e-05, + "loss": 0.3365, + "step": 8945 + }, + { + "epoch": 4.247745609871855, + "grad_norm": 0.26567281543090193, + "learning_rate": 2.6058009228740937e-05, + "loss": 0.3384, + "step": 8950 + }, + { + "epoch": 4.250118652112008, + "grad_norm": 0.2879325805912618, + "learning_rate": 2.6041529334212263e-05, + "loss": 0.3269, + "step": 8955 + }, + { + "epoch": 4.25249169435216, + "grad_norm": 0.24971452886435352, + "learning_rate": 2.6025049439683585e-05, + "loss": 0.333, + "step": 8960 + }, + { + "epoch": 4.254864736592311, + "grad_norm": 0.2679776153922924, + "learning_rate": 2.6008569545154913e-05, + "loss": 0.3297, + "step": 8965 + }, + { + "epoch": 4.257237778832463, + "grad_norm": 0.27884537587424935, + "learning_rate": 2.599208965062624e-05, + "loss": 0.3393, + "step": 8970 + }, + { + "epoch": 4.259610821072615, + "grad_norm": 0.26112294068487657, + "learning_rate": 2.5975609756097564e-05, + "loss": 0.3457, + "step": 8975 + }, + { + "epoch": 4.261983863312767, + "grad_norm": 0.26879138585454665, + "learning_rate": 2.5959129861568886e-05, + "loss": 0.3332, + "step": 8980 + }, + { + "epoch": 4.264356905552919, + "grad_norm": 0.2919180793328969, + "learning_rate": 2.594264996704021e-05, + "loss": 0.3242, + "step": 8985 + }, + { + "epoch": 4.2667299477930705, + "grad_norm": 0.26939945323762726, + "learning_rate": 2.5926170072511537e-05, + "loss": 0.3335, + "step": 8990 + }, + { + "epoch": 4.269102990033223, + "grad_norm": 0.25055026881674597, + "learning_rate": 2.5909690177982865e-05, + "loss": 0.3288, + "step": 8995 + }, + { + "epoch": 4.271476032273375, + "grad_norm": 0.2837187592947859, + "learning_rate": 2.5893210283454187e-05, + "loss": 0.3282, + "step": 9000 + }, + { + "epoch": 4.273849074513526, + "grad_norm": 0.2648381529329172, + "learning_rate": 2.5876730388925513e-05, + "loss": 0.3356, + "step": 9005 + }, + { + "epoch": 4.2762221167536785, + "grad_norm": 0.2635141029394769, + "learning_rate": 2.5860250494396838e-05, + "loss": 0.3244, + "step": 9010 + }, + { + "epoch": 4.27859515899383, + "grad_norm": 0.2537345612809638, + "learning_rate": 2.584377059986816e-05, + "loss": 0.3324, + "step": 9015 + }, + { + "epoch": 4.280968201233982, + "grad_norm": 0.2628809410448406, + "learning_rate": 2.5827290705339485e-05, + "loss": 0.3282, + "step": 9020 + }, + { + "epoch": 4.283341243474134, + "grad_norm": 0.2658933404838991, + "learning_rate": 2.581081081081081e-05, + "loss": 0.3276, + "step": 9025 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.27248476593862064, + "learning_rate": 2.579433091628214e-05, + "loss": 0.3352, + "step": 9030 + }, + { + "epoch": 4.288087327954438, + "grad_norm": 0.264888450427437, + "learning_rate": 2.577785102175346e-05, + "loss": 0.3307, + "step": 9035 + }, + { + "epoch": 4.290460370194589, + "grad_norm": 0.27402136926555665, + "learning_rate": 2.5761371127224787e-05, + "loss": 0.3424, + "step": 9040 + }, + { + "epoch": 4.292833412434741, + "grad_norm": 0.264903254404561, + "learning_rate": 2.5744891232696112e-05, + "loss": 0.3233, + "step": 9045 + }, + { + "epoch": 4.295206454674894, + "grad_norm": 0.2711112186460835, + "learning_rate": 2.5728411338167434e-05, + "loss": 0.3393, + "step": 9050 + }, + { + "epoch": 4.297579496915045, + "grad_norm": 0.2672608294835171, + "learning_rate": 2.571193144363876e-05, + "loss": 0.3427, + "step": 9055 + }, + { + "epoch": 4.299952539155197, + "grad_norm": 0.29494448311011495, + "learning_rate": 2.5695451549110088e-05, + "loss": 0.33, + "step": 9060 + }, + { + "epoch": 4.3023255813953485, + "grad_norm": 0.25459696718039326, + "learning_rate": 2.5678971654581414e-05, + "loss": 0.3339, + "step": 9065 + }, + { + "epoch": 4.304698623635501, + "grad_norm": 0.27224150500011496, + "learning_rate": 2.5662491760052736e-05, + "loss": 0.3297, + "step": 9070 + }, + { + "epoch": 4.307071665875653, + "grad_norm": 0.2691797436595382, + "learning_rate": 2.564601186552406e-05, + "loss": 0.3388, + "step": 9075 + }, + { + "epoch": 4.309444708115804, + "grad_norm": 0.26945530139332396, + "learning_rate": 2.5629531970995386e-05, + "loss": 0.337, + "step": 9080 + }, + { + "epoch": 4.3118177503559565, + "grad_norm": 0.2798781122416174, + "learning_rate": 2.5613052076466708e-05, + "loss": 0.3317, + "step": 9085 + }, + { + "epoch": 4.314190792596108, + "grad_norm": 0.2699401684612275, + "learning_rate": 2.559657218193804e-05, + "loss": 0.3395, + "step": 9090 + }, + { + "epoch": 4.31656383483626, + "grad_norm": 0.2710367636119457, + "learning_rate": 2.5580092287409362e-05, + "loss": 0.3368, + "step": 9095 + }, + { + "epoch": 4.318936877076412, + "grad_norm": 0.2515104690993639, + "learning_rate": 2.5563612392880688e-05, + "loss": 0.3324, + "step": 9100 + }, + { + "epoch": 4.321309919316564, + "grad_norm": 0.2652178647767001, + "learning_rate": 2.554713249835201e-05, + "loss": 0.331, + "step": 9105 + }, + { + "epoch": 4.323682961556716, + "grad_norm": 0.2755733436344168, + "learning_rate": 2.5530652603823335e-05, + "loss": 0.3317, + "step": 9110 + }, + { + "epoch": 4.326056003796868, + "grad_norm": 0.2623589275937289, + "learning_rate": 2.551417270929466e-05, + "loss": 0.3301, + "step": 9115 + }, + { + "epoch": 4.328429046037019, + "grad_norm": 0.2881837882319693, + "learning_rate": 2.549769281476599e-05, + "loss": 0.3369, + "step": 9120 + }, + { + "epoch": 4.330802088277172, + "grad_norm": 0.28686623118826826, + "learning_rate": 2.5481212920237314e-05, + "loss": 0.3356, + "step": 9125 + }, + { + "epoch": 4.333175130517323, + "grad_norm": 0.26593545598028273, + "learning_rate": 2.5464733025708636e-05, + "loss": 0.3286, + "step": 9130 + }, + { + "epoch": 4.335548172757475, + "grad_norm": 0.26355584520726777, + "learning_rate": 2.5448253131179962e-05, + "loss": 0.3296, + "step": 9135 + }, + { + "epoch": 4.3379212149976265, + "grad_norm": 0.27060591690638003, + "learning_rate": 2.5431773236651284e-05, + "loss": 0.3458, + "step": 9140 + }, + { + "epoch": 4.340294257237779, + "grad_norm": 0.2654247753138779, + "learning_rate": 2.541529334212261e-05, + "loss": 0.3387, + "step": 9145 + }, + { + "epoch": 4.342667299477931, + "grad_norm": 0.26037541902295824, + "learning_rate": 2.5398813447593938e-05, + "loss": 0.329, + "step": 9150 + }, + { + "epoch": 4.345040341718082, + "grad_norm": 0.2615294039781512, + "learning_rate": 2.5382333553065263e-05, + "loss": 0.3329, + "step": 9155 + }, + { + "epoch": 4.3474133839582345, + "grad_norm": 0.2691924791317391, + "learning_rate": 2.536585365853659e-05, + "loss": 0.3402, + "step": 9160 + }, + { + "epoch": 4.349786426198387, + "grad_norm": 0.26671924051954254, + "learning_rate": 2.534937376400791e-05, + "loss": 0.3242, + "step": 9165 + }, + { + "epoch": 4.352159468438538, + "grad_norm": 0.249386768635692, + "learning_rate": 2.5332893869479236e-05, + "loss": 0.3247, + "step": 9170 + }, + { + "epoch": 4.35453251067869, + "grad_norm": 0.25163727764014526, + "learning_rate": 2.5316413974950558e-05, + "loss": 0.3347, + "step": 9175 + }, + { + "epoch": 4.356905552918842, + "grad_norm": 0.28046248099485693, + "learning_rate": 2.529993408042189e-05, + "loss": 0.3331, + "step": 9180 + }, + { + "epoch": 4.359278595158994, + "grad_norm": 0.2717011262899165, + "learning_rate": 2.5283454185893212e-05, + "loss": 0.3398, + "step": 9185 + }, + { + "epoch": 4.361651637399146, + "grad_norm": 0.2618051928958144, + "learning_rate": 2.5266974291364537e-05, + "loss": 0.3318, + "step": 9190 + }, + { + "epoch": 4.364024679639297, + "grad_norm": 0.2582883296419774, + "learning_rate": 2.5250494396835863e-05, + "loss": 0.3308, + "step": 9195 + }, + { + "epoch": 4.36639772187945, + "grad_norm": 0.25342953210538055, + "learning_rate": 2.5234014502307185e-05, + "loss": 0.3367, + "step": 9200 + }, + { + "epoch": 4.368770764119601, + "grad_norm": 0.2536949672497748, + "learning_rate": 2.521753460777851e-05, + "loss": 0.3242, + "step": 9205 + }, + { + "epoch": 4.371143806359753, + "grad_norm": 0.24676821606085486, + "learning_rate": 2.520105471324984e-05, + "loss": 0.3345, + "step": 9210 + }, + { + "epoch": 4.373516848599905, + "grad_norm": 0.2574364981500411, + "learning_rate": 2.5184574818721164e-05, + "loss": 0.3297, + "step": 9215 + }, + { + "epoch": 4.375889890840057, + "grad_norm": 0.26350637125506343, + "learning_rate": 2.5168094924192486e-05, + "loss": 0.332, + "step": 9220 + }, + { + "epoch": 4.378262933080209, + "grad_norm": 0.28851452062887534, + "learning_rate": 2.515161502966381e-05, + "loss": 0.3348, + "step": 9225 + }, + { + "epoch": 4.38063597532036, + "grad_norm": 0.27767700557594266, + "learning_rate": 2.5135135135135133e-05, + "loss": 0.3349, + "step": 9230 + }, + { + "epoch": 4.3830090175605125, + "grad_norm": 0.24932533498386253, + "learning_rate": 2.511865524060646e-05, + "loss": 0.3341, + "step": 9235 + }, + { + "epoch": 4.385382059800665, + "grad_norm": 0.2577612131338263, + "learning_rate": 2.5102175346077787e-05, + "loss": 0.3326, + "step": 9240 + }, + { + "epoch": 4.387755102040816, + "grad_norm": 0.2514411519592492, + "learning_rate": 2.5085695451549113e-05, + "loss": 0.3333, + "step": 9245 + }, + { + "epoch": 4.390128144280968, + "grad_norm": 0.2851505248916881, + "learning_rate": 2.5069215557020438e-05, + "loss": 0.3342, + "step": 9250 + }, + { + "epoch": 4.39250118652112, + "grad_norm": 0.2800317104435531, + "learning_rate": 2.505273566249176e-05, + "loss": 0.3425, + "step": 9255 + }, + { + "epoch": 4.394874228761272, + "grad_norm": 0.2609261798522678, + "learning_rate": 2.5036255767963085e-05, + "loss": 0.3385, + "step": 9260 + }, + { + "epoch": 4.397247271001424, + "grad_norm": 0.24935393758105245, + "learning_rate": 2.5019775873434407e-05, + "loss": 0.3285, + "step": 9265 + }, + { + "epoch": 4.399620313241575, + "grad_norm": 0.23936568306838985, + "learning_rate": 2.500329597890574e-05, + "loss": 0.3288, + "step": 9270 + }, + { + "epoch": 4.401993355481728, + "grad_norm": 0.2756100174320702, + "learning_rate": 2.498681608437706e-05, + "loss": 0.334, + "step": 9275 + }, + { + "epoch": 4.40436639772188, + "grad_norm": 0.2658516128915832, + "learning_rate": 2.4970336189848387e-05, + "loss": 0.3257, + "step": 9280 + }, + { + "epoch": 4.406739439962031, + "grad_norm": 0.26188918732908817, + "learning_rate": 2.4953856295319712e-05, + "loss": 0.3356, + "step": 9285 + }, + { + "epoch": 4.409112482202183, + "grad_norm": 0.2780070729104039, + "learning_rate": 2.4937376400791034e-05, + "loss": 0.3367, + "step": 9290 + }, + { + "epoch": 4.411485524442335, + "grad_norm": 0.26994483676529446, + "learning_rate": 2.4920896506262363e-05, + "loss": 0.3443, + "step": 9295 + }, + { + "epoch": 4.413858566682487, + "grad_norm": 0.28448925290398586, + "learning_rate": 2.4904416611733685e-05, + "loss": 0.3352, + "step": 9300 + }, + { + "epoch": 4.416231608922639, + "grad_norm": 0.27130560270239, + "learning_rate": 2.488793671720501e-05, + "loss": 0.3323, + "step": 9305 + }, + { + "epoch": 4.4186046511627906, + "grad_norm": 0.29627006223994984, + "learning_rate": 2.4871456822676336e-05, + "loss": 0.3385, + "step": 9310 + }, + { + "epoch": 4.420977693402943, + "grad_norm": 0.283283969624116, + "learning_rate": 2.485497692814766e-05, + "loss": 0.3296, + "step": 9315 + }, + { + "epoch": 4.423350735643094, + "grad_norm": 0.2697213927499753, + "learning_rate": 2.4838497033618986e-05, + "loss": 0.3329, + "step": 9320 + }, + { + "epoch": 4.425723777883246, + "grad_norm": 0.27086221279198003, + "learning_rate": 2.482201713909031e-05, + "loss": 0.332, + "step": 9325 + }, + { + "epoch": 4.428096820123399, + "grad_norm": 0.2581177962834361, + "learning_rate": 2.4805537244561637e-05, + "loss": 0.336, + "step": 9330 + }, + { + "epoch": 4.43046986236355, + "grad_norm": 0.255043078400756, + "learning_rate": 2.478905735003296e-05, + "loss": 0.3293, + "step": 9335 + }, + { + "epoch": 4.432842904603702, + "grad_norm": 0.26509106295554996, + "learning_rate": 2.4772577455504288e-05, + "loss": 0.3358, + "step": 9340 + }, + { + "epoch": 4.4352159468438535, + "grad_norm": 0.2455808735901656, + "learning_rate": 2.475609756097561e-05, + "loss": 0.3305, + "step": 9345 + }, + { + "epoch": 4.437588989084006, + "grad_norm": 0.2742491423024273, + "learning_rate": 2.4739617666446935e-05, + "loss": 0.3341, + "step": 9350 + }, + { + "epoch": 4.439962031324158, + "grad_norm": 0.27518714576562875, + "learning_rate": 2.472313777191826e-05, + "loss": 0.3434, + "step": 9355 + }, + { + "epoch": 4.442335073564309, + "grad_norm": 0.2477077761456117, + "learning_rate": 2.4706657877389586e-05, + "loss": 0.3311, + "step": 9360 + }, + { + "epoch": 4.4447081158044615, + "grad_norm": 0.26060354765392774, + "learning_rate": 2.469017798286091e-05, + "loss": 0.3447, + "step": 9365 + }, + { + "epoch": 4.447081158044613, + "grad_norm": 0.2654184972083784, + "learning_rate": 2.4673698088332236e-05, + "loss": 0.3304, + "step": 9370 + }, + { + "epoch": 4.449454200284765, + "grad_norm": 0.2725815308795511, + "learning_rate": 2.4657218193803562e-05, + "loss": 0.3389, + "step": 9375 + }, + { + "epoch": 4.451827242524917, + "grad_norm": 0.25569552573269333, + "learning_rate": 2.4640738299274884e-05, + "loss": 0.3342, + "step": 9380 + }, + { + "epoch": 4.454200284765069, + "grad_norm": 0.2590292629115144, + "learning_rate": 2.4624258404746212e-05, + "loss": 0.3412, + "step": 9385 + }, + { + "epoch": 4.456573327005221, + "grad_norm": 0.2684546480413156, + "learning_rate": 2.4607778510217534e-05, + "loss": 0.3368, + "step": 9390 + }, + { + "epoch": 4.458946369245373, + "grad_norm": 0.25448472205861533, + "learning_rate": 2.459129861568886e-05, + "loss": 0.3288, + "step": 9395 + }, + { + "epoch": 4.461319411485524, + "grad_norm": 0.2670723895509774, + "learning_rate": 2.4574818721160185e-05, + "loss": 0.3361, + "step": 9400 + }, + { + "epoch": 4.463692453725677, + "grad_norm": 0.27592870627595845, + "learning_rate": 2.455833882663151e-05, + "loss": 0.3337, + "step": 9405 + }, + { + "epoch": 4.466065495965828, + "grad_norm": 0.24260467557705545, + "learning_rate": 2.4541858932102836e-05, + "loss": 0.3233, + "step": 9410 + }, + { + "epoch": 4.46843853820598, + "grad_norm": 0.2784353782887476, + "learning_rate": 2.452537903757416e-05, + "loss": 0.3352, + "step": 9415 + }, + { + "epoch": 4.4708115804461315, + "grad_norm": 0.2884210954409849, + "learning_rate": 2.4508899143045487e-05, + "loss": 0.3295, + "step": 9420 + }, + { + "epoch": 4.473184622686284, + "grad_norm": 0.27680052848863745, + "learning_rate": 2.449241924851681e-05, + "loss": 0.3379, + "step": 9425 + }, + { + "epoch": 4.475557664926436, + "grad_norm": 0.2584472835335673, + "learning_rate": 2.4475939353988137e-05, + "loss": 0.3321, + "step": 9430 + }, + { + "epoch": 4.477930707166587, + "grad_norm": 0.24824337844033956, + "learning_rate": 2.445945945945946e-05, + "loss": 0.3313, + "step": 9435 + }, + { + "epoch": 4.4803037494067395, + "grad_norm": 0.2501767387024789, + "learning_rate": 2.4442979564930785e-05, + "loss": 0.3399, + "step": 9440 + }, + { + "epoch": 4.482676791646892, + "grad_norm": 0.26135833944817133, + "learning_rate": 2.442649967040211e-05, + "loss": 0.336, + "step": 9445 + }, + { + "epoch": 4.485049833887043, + "grad_norm": 0.27621463807795343, + "learning_rate": 2.4410019775873435e-05, + "loss": 0.3315, + "step": 9450 + }, + { + "epoch": 4.487422876127195, + "grad_norm": 0.23290014965258132, + "learning_rate": 2.439353988134476e-05, + "loss": 0.3371, + "step": 9455 + }, + { + "epoch": 4.489795918367347, + "grad_norm": 0.2524805547275652, + "learning_rate": 2.4377059986816086e-05, + "loss": 0.3438, + "step": 9460 + }, + { + "epoch": 4.492168960607499, + "grad_norm": 0.2698306201603276, + "learning_rate": 2.436058009228741e-05, + "loss": 0.3397, + "step": 9465 + }, + { + "epoch": 4.494542002847651, + "grad_norm": 0.2492885716330345, + "learning_rate": 2.4344100197758733e-05, + "loss": 0.3334, + "step": 9470 + }, + { + "epoch": 4.496915045087802, + "grad_norm": 0.25473972981035203, + "learning_rate": 2.4327620303230062e-05, + "loss": 0.3411, + "step": 9475 + }, + { + "epoch": 4.499288087327955, + "grad_norm": 0.24971325744078135, + "learning_rate": 2.4311140408701384e-05, + "loss": 0.3285, + "step": 9480 + }, + { + "epoch": 4.501661129568106, + "grad_norm": 0.25526061517546805, + "learning_rate": 2.429466051417271e-05, + "loss": 0.3365, + "step": 9485 + }, + { + "epoch": 4.504034171808258, + "grad_norm": 0.25475797888803026, + "learning_rate": 2.4278180619644038e-05, + "loss": 0.3389, + "step": 9490 + }, + { + "epoch": 4.50640721404841, + "grad_norm": 0.2835100208943059, + "learning_rate": 2.426170072511536e-05, + "loss": 0.3404, + "step": 9495 + }, + { + "epoch": 4.508780256288562, + "grad_norm": 0.26140825551039737, + "learning_rate": 2.4245220830586685e-05, + "loss": 0.329, + "step": 9500 + }, + { + "epoch": 4.511153298528714, + "grad_norm": 0.26250618124566827, + "learning_rate": 2.422874093605801e-05, + "loss": 0.3402, + "step": 9505 + }, + { + "epoch": 4.513526340768866, + "grad_norm": 0.26807359037286144, + "learning_rate": 2.4212261041529336e-05, + "loss": 0.3432, + "step": 9510 + }, + { + "epoch": 4.5158993830090175, + "grad_norm": 0.2740627391274403, + "learning_rate": 2.4195781147000658e-05, + "loss": 0.3343, + "step": 9515 + }, + { + "epoch": 4.51827242524917, + "grad_norm": 0.2607652181504779, + "learning_rate": 2.4179301252471987e-05, + "loss": 0.3361, + "step": 9520 + }, + { + "epoch": 4.520645467489321, + "grad_norm": 0.2556519722134509, + "learning_rate": 2.416282135794331e-05, + "loss": 0.3371, + "step": 9525 + }, + { + "epoch": 4.523018509729473, + "grad_norm": 0.25983423133386346, + "learning_rate": 2.4146341463414634e-05, + "loss": 0.3321, + "step": 9530 + }, + { + "epoch": 4.525391551969625, + "grad_norm": 0.2725626301080418, + "learning_rate": 2.4129861568885963e-05, + "loss": 0.3369, + "step": 9535 + }, + { + "epoch": 4.527764594209777, + "grad_norm": 0.2741700850514236, + "learning_rate": 2.4113381674357285e-05, + "loss": 0.3393, + "step": 9540 + }, + { + "epoch": 4.530137636449929, + "grad_norm": 0.27294817578493724, + "learning_rate": 2.409690177982861e-05, + "loss": 0.3386, + "step": 9545 + }, + { + "epoch": 4.53251067869008, + "grad_norm": 0.26230585420512403, + "learning_rate": 2.4080421885299936e-05, + "loss": 0.3393, + "step": 9550 + }, + { + "epoch": 4.534883720930233, + "grad_norm": 0.24912585898666542, + "learning_rate": 2.406394199077126e-05, + "loss": 0.3322, + "step": 9555 + }, + { + "epoch": 4.537256763170385, + "grad_norm": 0.25121917287780804, + "learning_rate": 2.4047462096242583e-05, + "loss": 0.3329, + "step": 9560 + }, + { + "epoch": 4.539629805410536, + "grad_norm": 0.2553991763695146, + "learning_rate": 2.403098220171391e-05, + "loss": 0.3365, + "step": 9565 + }, + { + "epoch": 4.542002847650688, + "grad_norm": 0.26373483997089414, + "learning_rate": 2.4014502307185237e-05, + "loss": 0.3324, + "step": 9570 + }, + { + "epoch": 4.54437588989084, + "grad_norm": 0.27942432562637703, + "learning_rate": 2.399802241265656e-05, + "loss": 0.3302, + "step": 9575 + }, + { + "epoch": 4.546748932130992, + "grad_norm": 0.2680710024245636, + "learning_rate": 2.3981542518127888e-05, + "loss": 0.3375, + "step": 9580 + }, + { + "epoch": 4.549121974371144, + "grad_norm": 0.2650143322494772, + "learning_rate": 2.396506262359921e-05, + "loss": 0.3447, + "step": 9585 + }, + { + "epoch": 4.5514950166112955, + "grad_norm": 0.2560506575640821, + "learning_rate": 2.3948582729070535e-05, + "loss": 0.3297, + "step": 9590 + }, + { + "epoch": 4.553868058851448, + "grad_norm": 0.27588803662967765, + "learning_rate": 2.393210283454186e-05, + "loss": 0.3328, + "step": 9595 + }, + { + "epoch": 4.556241101091599, + "grad_norm": 0.25915936180140703, + "learning_rate": 2.3915622940013186e-05, + "loss": 0.333, + "step": 9600 + }, + { + "epoch": 4.558614143331751, + "grad_norm": 0.2575988449768447, + "learning_rate": 2.389914304548451e-05, + "loss": 0.3376, + "step": 9605 + }, + { + "epoch": 4.5609871855719035, + "grad_norm": 0.24550148464395344, + "learning_rate": 2.3882663150955836e-05, + "loss": 0.3401, + "step": 9610 + }, + { + "epoch": 4.563360227812055, + "grad_norm": 0.2462532984625121, + "learning_rate": 2.3866183256427162e-05, + "loss": 0.3401, + "step": 9615 + }, + { + "epoch": 4.565733270052207, + "grad_norm": 0.2518404553132448, + "learning_rate": 2.3849703361898484e-05, + "loss": 0.3271, + "step": 9620 + }, + { + "epoch": 4.568106312292358, + "grad_norm": 0.25342523120224053, + "learning_rate": 2.3833223467369813e-05, + "loss": 0.3409, + "step": 9625 + }, + { + "epoch": 4.570479354532511, + "grad_norm": 0.27090579280506333, + "learning_rate": 2.3816743572841134e-05, + "loss": 0.3342, + "step": 9630 + }, + { + "epoch": 4.572852396772663, + "grad_norm": 0.2422639363661271, + "learning_rate": 2.380026367831246e-05, + "loss": 0.3282, + "step": 9635 + }, + { + "epoch": 4.575225439012814, + "grad_norm": 0.2597489786001757, + "learning_rate": 2.3783783783783785e-05, + "loss": 0.3367, + "step": 9640 + }, + { + "epoch": 4.577598481252966, + "grad_norm": 0.25309843848672686, + "learning_rate": 2.376730388925511e-05, + "loss": 0.3369, + "step": 9645 + }, + { + "epoch": 4.579971523493118, + "grad_norm": 0.26235685426775257, + "learning_rate": 2.3750823994726436e-05, + "loss": 0.3402, + "step": 9650 + }, + { + "epoch": 4.58234456573327, + "grad_norm": 0.2712482207002275, + "learning_rate": 2.373434410019776e-05, + "loss": 0.3258, + "step": 9655 + }, + { + "epoch": 4.584717607973422, + "grad_norm": 0.2752776314597962, + "learning_rate": 2.3717864205669087e-05, + "loss": 0.3355, + "step": 9660 + }, + { + "epoch": 4.5870906502135735, + "grad_norm": 0.2828714332644122, + "learning_rate": 2.370138431114041e-05, + "loss": 0.3332, + "step": 9665 + }, + { + "epoch": 4.589463692453726, + "grad_norm": 0.27203666860836967, + "learning_rate": 2.3684904416611737e-05, + "loss": 0.3442, + "step": 9670 + }, + { + "epoch": 4.591836734693878, + "grad_norm": 0.26014025179159506, + "learning_rate": 2.366842452208306e-05, + "loss": 0.3366, + "step": 9675 + }, + { + "epoch": 4.594209776934029, + "grad_norm": 0.2577222115747703, + "learning_rate": 2.3651944627554385e-05, + "loss": 0.3415, + "step": 9680 + }, + { + "epoch": 4.5965828191741815, + "grad_norm": 0.26483292632780897, + "learning_rate": 2.363546473302571e-05, + "loss": 0.3372, + "step": 9685 + }, + { + "epoch": 4.598955861414333, + "grad_norm": 0.25516795516133056, + "learning_rate": 2.3618984838497035e-05, + "loss": 0.3328, + "step": 9690 + }, + { + "epoch": 4.601328903654485, + "grad_norm": 0.26215337270916744, + "learning_rate": 2.360250494396836e-05, + "loss": 0.3312, + "step": 9695 + }, + { + "epoch": 4.603701945894636, + "grad_norm": 0.26063911492279934, + "learning_rate": 2.3586025049439683e-05, + "loss": 0.3313, + "step": 9700 + }, + { + "epoch": 4.606074988134789, + "grad_norm": 0.25065632498279244, + "learning_rate": 2.356954515491101e-05, + "loss": 0.3305, + "step": 9705 + }, + { + "epoch": 4.608448030374941, + "grad_norm": 0.25400383027405965, + "learning_rate": 2.3553065260382333e-05, + "loss": 0.3358, + "step": 9710 + }, + { + "epoch": 4.610821072615092, + "grad_norm": 0.2717247699836737, + "learning_rate": 2.353658536585366e-05, + "loss": 0.337, + "step": 9715 + }, + { + "epoch": 4.613194114855244, + "grad_norm": 0.2480719739843851, + "learning_rate": 2.3520105471324984e-05, + "loss": 0.3251, + "step": 9720 + }, + { + "epoch": 4.615567157095397, + "grad_norm": 0.26070282517639576, + "learning_rate": 2.350362557679631e-05, + "loss": 0.3455, + "step": 9725 + }, + { + "epoch": 4.617940199335548, + "grad_norm": 0.24838448082555206, + "learning_rate": 2.3487145682267635e-05, + "loss": 0.3349, + "step": 9730 + }, + { + "epoch": 4.6203132415757, + "grad_norm": 0.2625159679152, + "learning_rate": 2.347066578773896e-05, + "loss": 0.3345, + "step": 9735 + }, + { + "epoch": 4.622686283815852, + "grad_norm": 0.2633292254431972, + "learning_rate": 2.3454185893210285e-05, + "loss": 0.3367, + "step": 9740 + }, + { + "epoch": 4.625059326056004, + "grad_norm": 0.2721159459108398, + "learning_rate": 2.3437705998681607e-05, + "loss": 0.3382, + "step": 9745 + }, + { + "epoch": 4.627432368296156, + "grad_norm": 0.2915754951512862, + "learning_rate": 2.3421226104152936e-05, + "loss": 0.3349, + "step": 9750 + }, + { + "epoch": 4.629805410536307, + "grad_norm": 0.25442788386615284, + "learning_rate": 2.3404746209624258e-05, + "loss": 0.344, + "step": 9755 + }, + { + "epoch": 4.63217845277646, + "grad_norm": 0.2584942468757496, + "learning_rate": 2.3388266315095583e-05, + "loss": 0.3332, + "step": 9760 + }, + { + "epoch": 4.634551495016611, + "grad_norm": 0.2506832700965035, + "learning_rate": 2.337178642056691e-05, + "loss": 0.331, + "step": 9765 + }, + { + "epoch": 4.636924537256763, + "grad_norm": 0.2826281746219548, + "learning_rate": 2.3355306526038234e-05, + "loss": 0.3337, + "step": 9770 + }, + { + "epoch": 4.639297579496915, + "grad_norm": 0.2549003935602647, + "learning_rate": 2.333882663150956e-05, + "loss": 0.3385, + "step": 9775 + }, + { + "epoch": 4.641670621737067, + "grad_norm": 0.27763421183197495, + "learning_rate": 2.3322346736980885e-05, + "loss": 0.3345, + "step": 9780 + }, + { + "epoch": 4.644043663977219, + "grad_norm": 0.2646188954531851, + "learning_rate": 2.330586684245221e-05, + "loss": 0.3378, + "step": 9785 + }, + { + "epoch": 4.646416706217371, + "grad_norm": 0.25014201302864075, + "learning_rate": 2.3289386947923532e-05, + "loss": 0.3297, + "step": 9790 + }, + { + "epoch": 4.6487897484575225, + "grad_norm": 0.2568365973791299, + "learning_rate": 2.327290705339486e-05, + "loss": 0.3348, + "step": 9795 + }, + { + "epoch": 4.651162790697675, + "grad_norm": 0.25056622885418905, + "learning_rate": 2.3256427158866183e-05, + "loss": 0.3411, + "step": 9800 + }, + { + "epoch": 4.653535832937826, + "grad_norm": 0.27388699827022356, + "learning_rate": 2.3239947264337508e-05, + "loss": 0.3275, + "step": 9805 + }, + { + "epoch": 4.655908875177978, + "grad_norm": 0.27155431256515555, + "learning_rate": 2.3223467369808834e-05, + "loss": 0.3327, + "step": 9810 + }, + { + "epoch": 4.65828191741813, + "grad_norm": 0.26372336438964744, + "learning_rate": 2.320698747528016e-05, + "loss": 0.332, + "step": 9815 + }, + { + "epoch": 4.660654959658282, + "grad_norm": 0.25606471452493756, + "learning_rate": 2.3190507580751484e-05, + "loss": 0.3266, + "step": 9820 + }, + { + "epoch": 4.663028001898434, + "grad_norm": 0.2620874856855608, + "learning_rate": 2.317402768622281e-05, + "loss": 0.3282, + "step": 9825 + }, + { + "epoch": 4.665401044138585, + "grad_norm": 0.25868843920522216, + "learning_rate": 2.3157547791694135e-05, + "loss": 0.3303, + "step": 9830 + }, + { + "epoch": 4.667774086378738, + "grad_norm": 0.27218953732289525, + "learning_rate": 2.3141067897165457e-05, + "loss": 0.3316, + "step": 9835 + }, + { + "epoch": 4.67014712861889, + "grad_norm": 0.27449358086585296, + "learning_rate": 2.3124588002636786e-05, + "loss": 0.3388, + "step": 9840 + }, + { + "epoch": 4.672520170859041, + "grad_norm": 0.2509480503970254, + "learning_rate": 2.3108108108108108e-05, + "loss": 0.3384, + "step": 9845 + }, + { + "epoch": 4.674893213099193, + "grad_norm": 0.2694079552728151, + "learning_rate": 2.3091628213579433e-05, + "loss": 0.3467, + "step": 9850 + }, + { + "epoch": 4.677266255339345, + "grad_norm": 0.2511643568085062, + "learning_rate": 2.307514831905076e-05, + "loss": 0.3334, + "step": 9855 + }, + { + "epoch": 4.679639297579497, + "grad_norm": 0.25084812614751156, + "learning_rate": 2.3058668424522084e-05, + "loss": 0.3361, + "step": 9860 + }, + { + "epoch": 4.682012339819649, + "grad_norm": 0.26064341353949244, + "learning_rate": 2.304218852999341e-05, + "loss": 0.3326, + "step": 9865 + }, + { + "epoch": 4.6843853820598005, + "grad_norm": 0.27043220935645207, + "learning_rate": 2.3025708635464734e-05, + "loss": 0.34, + "step": 9870 + }, + { + "epoch": 4.686758424299953, + "grad_norm": 0.2546479732596791, + "learning_rate": 2.300922874093606e-05, + "loss": 0.3295, + "step": 9875 + }, + { + "epoch": 4.689131466540104, + "grad_norm": 0.24267181545627084, + "learning_rate": 2.2992748846407382e-05, + "loss": 0.3294, + "step": 9880 + }, + { + "epoch": 4.691504508780256, + "grad_norm": 0.2509425818204467, + "learning_rate": 2.297626895187871e-05, + "loss": 0.3369, + "step": 9885 + }, + { + "epoch": 4.6938775510204085, + "grad_norm": 0.2507716052151097, + "learning_rate": 2.2959789057350033e-05, + "loss": 0.3336, + "step": 9890 + }, + { + "epoch": 4.69625059326056, + "grad_norm": 0.2523604397282461, + "learning_rate": 2.2943309162821358e-05, + "loss": 0.3375, + "step": 9895 + }, + { + "epoch": 4.698623635500712, + "grad_norm": 0.24248695657298025, + "learning_rate": 2.2926829268292687e-05, + "loss": 0.3275, + "step": 9900 + }, + { + "epoch": 4.700996677740864, + "grad_norm": 0.2699934354208848, + "learning_rate": 2.291034937376401e-05, + "loss": 0.3356, + "step": 9905 + }, + { + "epoch": 4.703369719981016, + "grad_norm": 0.26139583745442097, + "learning_rate": 2.2893869479235334e-05, + "loss": 0.3287, + "step": 9910 + }, + { + "epoch": 4.705742762221168, + "grad_norm": 0.28207255707785867, + "learning_rate": 2.287738958470666e-05, + "loss": 0.3343, + "step": 9915 + }, + { + "epoch": 4.708115804461319, + "grad_norm": 0.27102600075957944, + "learning_rate": 2.2860909690177985e-05, + "loss": 0.3385, + "step": 9920 + }, + { + "epoch": 4.710488846701471, + "grad_norm": 0.27226710156743195, + "learning_rate": 2.2844429795649307e-05, + "loss": 0.3397, + "step": 9925 + }, + { + "epoch": 4.712861888941623, + "grad_norm": 0.2727064076849424, + "learning_rate": 2.2827949901120635e-05, + "loss": 0.326, + "step": 9930 + }, + { + "epoch": 4.715234931181775, + "grad_norm": 0.25543527026491025, + "learning_rate": 2.2811470006591957e-05, + "loss": 0.3363, + "step": 9935 + }, + { + "epoch": 4.717607973421927, + "grad_norm": 0.26754142367545936, + "learning_rate": 2.2794990112063283e-05, + "loss": 0.3265, + "step": 9940 + }, + { + "epoch": 4.7199810156620785, + "grad_norm": 0.25668852083638827, + "learning_rate": 2.277851021753461e-05, + "loss": 0.3318, + "step": 9945 + }, + { + "epoch": 4.722354057902231, + "grad_norm": 0.2618295281376006, + "learning_rate": 2.2762030323005933e-05, + "loss": 0.3359, + "step": 9950 + }, + { + "epoch": 4.724727100142383, + "grad_norm": 0.27414946056152084, + "learning_rate": 2.274555042847726e-05, + "loss": 0.3309, + "step": 9955 + }, + { + "epoch": 4.727100142382534, + "grad_norm": 0.25127815873225223, + "learning_rate": 2.2729070533948584e-05, + "loss": 0.3354, + "step": 9960 + }, + { + "epoch": 4.7294731846226865, + "grad_norm": 0.24945561927965967, + "learning_rate": 2.271259063941991e-05, + "loss": 0.3319, + "step": 9965 + }, + { + "epoch": 4.731846226862838, + "grad_norm": 0.2697651716710156, + "learning_rate": 2.269611074489123e-05, + "loss": 0.3422, + "step": 9970 + }, + { + "epoch": 4.73421926910299, + "grad_norm": 0.2586891776680827, + "learning_rate": 2.267963085036256e-05, + "loss": 0.3456, + "step": 9975 + }, + { + "epoch": 4.736592311343142, + "grad_norm": 0.2795691378046559, + "learning_rate": 2.2663150955833885e-05, + "loss": 0.3359, + "step": 9980 + }, + { + "epoch": 4.738965353583294, + "grad_norm": 0.26383750536001804, + "learning_rate": 2.2646671061305207e-05, + "loss": 0.3299, + "step": 9985 + }, + { + "epoch": 4.741338395823446, + "grad_norm": 0.2979923865321035, + "learning_rate": 2.2630191166776536e-05, + "loss": 0.3344, + "step": 9990 + }, + { + "epoch": 4.743711438063597, + "grad_norm": 0.24770461784652786, + "learning_rate": 2.2613711272247858e-05, + "loss": 0.3345, + "step": 9995 + }, + { + "epoch": 4.746084480303749, + "grad_norm": 0.28316055269799156, + "learning_rate": 2.2597231377719184e-05, + "loss": 0.3321, + "step": 10000 + }, + { + "epoch": 4.748457522543902, + "grad_norm": 0.2650721732509171, + "learning_rate": 2.258075148319051e-05, + "loss": 0.3288, + "step": 10005 + }, + { + "epoch": 4.750830564784053, + "grad_norm": 0.2611405908327354, + "learning_rate": 2.2564271588661834e-05, + "loss": 0.3388, + "step": 10010 + }, + { + "epoch": 4.753203607024205, + "grad_norm": 0.25868556121763386, + "learning_rate": 2.254779169413316e-05, + "loss": 0.3338, + "step": 10015 + }, + { + "epoch": 4.7555766492643565, + "grad_norm": 0.2616619196875038, + "learning_rate": 2.2531311799604485e-05, + "loss": 0.3332, + "step": 10020 + }, + { + "epoch": 4.757949691504509, + "grad_norm": 0.26718671878524175, + "learning_rate": 2.251483190507581e-05, + "loss": 0.3378, + "step": 10025 + }, + { + "epoch": 4.760322733744661, + "grad_norm": 0.27523581549534554, + "learning_rate": 2.2498352010547132e-05, + "loss": 0.3383, + "step": 10030 + }, + { + "epoch": 4.762695775984812, + "grad_norm": 0.2657158267168739, + "learning_rate": 2.248187211601846e-05, + "loss": 0.3276, + "step": 10035 + }, + { + "epoch": 4.7650688182249645, + "grad_norm": 0.2725839943137875, + "learning_rate": 2.2465392221489783e-05, + "loss": 0.3304, + "step": 10040 + }, + { + "epoch": 4.767441860465116, + "grad_norm": 0.2875700071483346, + "learning_rate": 2.244891232696111e-05, + "loss": 0.3427, + "step": 10045 + }, + { + "epoch": 4.769814902705268, + "grad_norm": 0.2889702501757123, + "learning_rate": 2.2432432432432434e-05, + "loss": 0.3289, + "step": 10050 + }, + { + "epoch": 4.77218794494542, + "grad_norm": 0.24530800169512187, + "learning_rate": 2.241595253790376e-05, + "loss": 0.3297, + "step": 10055 + }, + { + "epoch": 4.774560987185572, + "grad_norm": 0.24909598364577767, + "learning_rate": 2.2399472643375084e-05, + "loss": 0.3354, + "step": 10060 + }, + { + "epoch": 4.776934029425724, + "grad_norm": 0.2645715483219994, + "learning_rate": 2.238299274884641e-05, + "loss": 0.3348, + "step": 10065 + }, + { + "epoch": 4.779307071665876, + "grad_norm": 0.25640528903893695, + "learning_rate": 2.2366512854317735e-05, + "loss": 0.3343, + "step": 10070 + }, + { + "epoch": 4.781680113906027, + "grad_norm": 0.24010815176068404, + "learning_rate": 2.2350032959789057e-05, + "loss": 0.3186, + "step": 10075 + }, + { + "epoch": 4.78405315614618, + "grad_norm": 0.26132372986641106, + "learning_rate": 2.2333553065260386e-05, + "loss": 0.3403, + "step": 10080 + }, + { + "epoch": 4.786426198386331, + "grad_norm": 0.26538799157970977, + "learning_rate": 2.2317073170731708e-05, + "loss": 0.3348, + "step": 10085 + }, + { + "epoch": 4.788799240626483, + "grad_norm": 0.2629604858206972, + "learning_rate": 2.2300593276203033e-05, + "loss": 0.3457, + "step": 10090 + }, + { + "epoch": 4.7911722828666345, + "grad_norm": 0.2504573193927311, + "learning_rate": 2.228411338167436e-05, + "loss": 0.3309, + "step": 10095 + }, + { + "epoch": 4.793545325106787, + "grad_norm": 0.25917334260957575, + "learning_rate": 2.2267633487145684e-05, + "loss": 0.3357, + "step": 10100 + }, + { + "epoch": 4.795918367346939, + "grad_norm": 0.26633127802451895, + "learning_rate": 2.225115359261701e-05, + "loss": 0.3362, + "step": 10105 + }, + { + "epoch": 4.79829140958709, + "grad_norm": 0.2728945357523944, + "learning_rate": 2.2234673698088335e-05, + "loss": 0.3344, + "step": 10110 + }, + { + "epoch": 4.8006644518272426, + "grad_norm": 0.27647565535505525, + "learning_rate": 2.221819380355966e-05, + "loss": 0.3362, + "step": 10115 + }, + { + "epoch": 4.803037494067395, + "grad_norm": 0.2697072294684808, + "learning_rate": 2.2201713909030982e-05, + "loss": 0.3444, + "step": 10120 + }, + { + "epoch": 4.805410536307546, + "grad_norm": 0.284801416626149, + "learning_rate": 2.2185234014502307e-05, + "loss": 0.3309, + "step": 10125 + }, + { + "epoch": 4.807783578547698, + "grad_norm": 0.2727407594843715, + "learning_rate": 2.2168754119973633e-05, + "loss": 0.3324, + "step": 10130 + }, + { + "epoch": 4.81015662078785, + "grad_norm": 0.25208240655831693, + "learning_rate": 2.2152274225444958e-05, + "loss": 0.3294, + "step": 10135 + }, + { + "epoch": 4.812529663028002, + "grad_norm": 0.2593371638542793, + "learning_rate": 2.2135794330916283e-05, + "loss": 0.3359, + "step": 10140 + }, + { + "epoch": 4.814902705268154, + "grad_norm": 0.25040777974452766, + "learning_rate": 2.211931443638761e-05, + "loss": 0.3402, + "step": 10145 + }, + { + "epoch": 4.8172757475083055, + "grad_norm": 0.2540077480114518, + "learning_rate": 2.2102834541858934e-05, + "loss": 0.3336, + "step": 10150 + }, + { + "epoch": 4.819648789748458, + "grad_norm": 0.2506711250414973, + "learning_rate": 2.2086354647330256e-05, + "loss": 0.3378, + "step": 10155 + }, + { + "epoch": 4.822021831988609, + "grad_norm": 0.2408579461037636, + "learning_rate": 2.2069874752801585e-05, + "loss": 0.3397, + "step": 10160 + }, + { + "epoch": 4.824394874228761, + "grad_norm": 0.2546842610723499, + "learning_rate": 2.2053394858272907e-05, + "loss": 0.3335, + "step": 10165 + }, + { + "epoch": 4.8267679164689135, + "grad_norm": 0.29229294091809815, + "learning_rate": 2.2036914963744232e-05, + "loss": 0.3334, + "step": 10170 + }, + { + "epoch": 4.829140958709065, + "grad_norm": 0.2962748670655199, + "learning_rate": 2.2020435069215557e-05, + "loss": 0.3325, + "step": 10175 + }, + { + "epoch": 4.831514000949217, + "grad_norm": 0.25127048969257215, + "learning_rate": 2.2003955174686883e-05, + "loss": 0.3282, + "step": 10180 + }, + { + "epoch": 4.833887043189369, + "grad_norm": 0.25599078732957387, + "learning_rate": 2.1987475280158208e-05, + "loss": 0.3381, + "step": 10185 + }, + { + "epoch": 4.836260085429521, + "grad_norm": 0.2397418687443964, + "learning_rate": 2.1970995385629533e-05, + "loss": 0.3249, + "step": 10190 + }, + { + "epoch": 4.838633127669673, + "grad_norm": 0.24809714470287952, + "learning_rate": 2.195451549110086e-05, + "loss": 0.3287, + "step": 10195 + }, + { + "epoch": 4.841006169909824, + "grad_norm": 0.23926640259473425, + "learning_rate": 2.193803559657218e-05, + "loss": 0.3345, + "step": 10200 + }, + { + "epoch": 4.843379212149976, + "grad_norm": 0.28335736318494176, + "learning_rate": 2.192155570204351e-05, + "loss": 0.3436, + "step": 10205 + }, + { + "epoch": 4.845752254390128, + "grad_norm": 0.24591728791125697, + "learning_rate": 2.190507580751483e-05, + "loss": 0.3392, + "step": 10210 + }, + { + "epoch": 4.84812529663028, + "grad_norm": 0.264159053409353, + "learning_rate": 2.1888595912986157e-05, + "loss": 0.337, + "step": 10215 + }, + { + "epoch": 4.850498338870432, + "grad_norm": 0.25869273853719377, + "learning_rate": 2.1872116018457482e-05, + "loss": 0.333, + "step": 10220 + }, + { + "epoch": 4.8528713811105835, + "grad_norm": 0.24538510322426274, + "learning_rate": 2.1855636123928807e-05, + "loss": 0.3293, + "step": 10225 + }, + { + "epoch": 4.855244423350736, + "grad_norm": 0.2576397215596024, + "learning_rate": 2.1839156229400133e-05, + "loss": 0.3333, + "step": 10230 + }, + { + "epoch": 4.857617465590888, + "grad_norm": 0.25705215089930444, + "learning_rate": 2.1822676334871458e-05, + "loss": 0.3348, + "step": 10235 + }, + { + "epoch": 4.859990507831039, + "grad_norm": 0.2576057608685716, + "learning_rate": 2.1806196440342784e-05, + "loss": 0.3358, + "step": 10240 + }, + { + "epoch": 4.8623635500711915, + "grad_norm": 0.2553033977000119, + "learning_rate": 2.1789716545814106e-05, + "loss": 0.3401, + "step": 10245 + }, + { + "epoch": 4.864736592311343, + "grad_norm": 0.2591077429802609, + "learning_rate": 2.1773236651285434e-05, + "loss": 0.3362, + "step": 10250 + }, + { + "epoch": 4.867109634551495, + "grad_norm": 0.26685098354893244, + "learning_rate": 2.1756756756756756e-05, + "loss": 0.335, + "step": 10255 + }, + { + "epoch": 4.869482676791647, + "grad_norm": 0.2634232296251287, + "learning_rate": 2.174027686222808e-05, + "loss": 0.336, + "step": 10260 + }, + { + "epoch": 4.871855719031799, + "grad_norm": 0.2618373107749219, + "learning_rate": 2.1723796967699407e-05, + "loss": 0.3333, + "step": 10265 + }, + { + "epoch": 4.874228761271951, + "grad_norm": 0.25559591889882677, + "learning_rate": 2.1707317073170732e-05, + "loss": 0.331, + "step": 10270 + }, + { + "epoch": 4.876601803512102, + "grad_norm": 0.24632565495979433, + "learning_rate": 2.1690837178642058e-05, + "loss": 0.3303, + "step": 10275 + }, + { + "epoch": 4.878974845752254, + "grad_norm": 0.2440517325075965, + "learning_rate": 2.1674357284113383e-05, + "loss": 0.332, + "step": 10280 + }, + { + "epoch": 4.881347887992407, + "grad_norm": 0.24814371133974783, + "learning_rate": 2.165787738958471e-05, + "loss": 0.3381, + "step": 10285 + }, + { + "epoch": 4.883720930232558, + "grad_norm": 0.24134888701121146, + "learning_rate": 2.164139749505603e-05, + "loss": 0.324, + "step": 10290 + }, + { + "epoch": 4.88609397247271, + "grad_norm": 0.270593207251813, + "learning_rate": 2.162491760052736e-05, + "loss": 0.3331, + "step": 10295 + }, + { + "epoch": 4.888467014712862, + "grad_norm": 0.24555870641937255, + "learning_rate": 2.160843770599868e-05, + "loss": 0.3392, + "step": 10300 + }, + { + "epoch": 4.890840056953014, + "grad_norm": 0.2717268983591372, + "learning_rate": 2.1591957811470006e-05, + "loss": 0.3407, + "step": 10305 + }, + { + "epoch": 4.893213099193166, + "grad_norm": 0.2551573615726735, + "learning_rate": 2.1575477916941335e-05, + "loss": 0.3342, + "step": 10310 + }, + { + "epoch": 4.895586141433317, + "grad_norm": 0.2547692460424567, + "learning_rate": 2.1558998022412657e-05, + "loss": 0.3428, + "step": 10315 + }, + { + "epoch": 4.8979591836734695, + "grad_norm": 0.24800413057013332, + "learning_rate": 2.1542518127883982e-05, + "loss": 0.3375, + "step": 10320 + }, + { + "epoch": 4.900332225913621, + "grad_norm": 0.25923318893681335, + "learning_rate": 2.1526038233355308e-05, + "loss": 0.3429, + "step": 10325 + }, + { + "epoch": 4.902705268153773, + "grad_norm": 0.24469166595001868, + "learning_rate": 2.1509558338826633e-05, + "loss": 0.343, + "step": 10330 + }, + { + "epoch": 4.905078310393925, + "grad_norm": 0.26666477306765424, + "learning_rate": 2.1493078444297955e-05, + "loss": 0.3324, + "step": 10335 + }, + { + "epoch": 4.907451352634077, + "grad_norm": 0.257393303821503, + "learning_rate": 2.1476598549769284e-05, + "loss": 0.3417, + "step": 10340 + }, + { + "epoch": 4.909824394874229, + "grad_norm": 0.25378289273989413, + "learning_rate": 2.1460118655240606e-05, + "loss": 0.3324, + "step": 10345 + }, + { + "epoch": 4.912197437114381, + "grad_norm": 0.27127535510735246, + "learning_rate": 2.144363876071193e-05, + "loss": 0.3381, + "step": 10350 + }, + { + "epoch": 4.914570479354532, + "grad_norm": 0.24141611558221296, + "learning_rate": 2.142715886618326e-05, + "loss": 0.3317, + "step": 10355 + }, + { + "epoch": 4.916943521594685, + "grad_norm": 0.25086424669742763, + "learning_rate": 2.1410678971654582e-05, + "loss": 0.3337, + "step": 10360 + }, + { + "epoch": 4.919316563834836, + "grad_norm": 0.2513705691114776, + "learning_rate": 2.1394199077125907e-05, + "loss": 0.3375, + "step": 10365 + }, + { + "epoch": 4.921689606074988, + "grad_norm": 0.2677346315165033, + "learning_rate": 2.1377719182597233e-05, + "loss": 0.3374, + "step": 10370 + }, + { + "epoch": 4.9240626483151395, + "grad_norm": 0.23554064887464932, + "learning_rate": 2.1361239288068558e-05, + "loss": 0.3335, + "step": 10375 + }, + { + "epoch": 4.926435690555292, + "grad_norm": 0.25488242714554854, + "learning_rate": 2.134475939353988e-05, + "loss": 0.3388, + "step": 10380 + }, + { + "epoch": 4.928808732795444, + "grad_norm": 0.2542880233748861, + "learning_rate": 2.132827949901121e-05, + "loss": 0.3425, + "step": 10385 + }, + { + "epoch": 4.931181775035595, + "grad_norm": 0.2552861003467646, + "learning_rate": 2.1311799604482534e-05, + "loss": 0.329, + "step": 10390 + }, + { + "epoch": 4.9335548172757475, + "grad_norm": 0.27585332600871254, + "learning_rate": 2.1295319709953856e-05, + "loss": 0.3395, + "step": 10395 + }, + { + "epoch": 4.9359278595159, + "grad_norm": 0.2521655081027723, + "learning_rate": 2.1278839815425185e-05, + "loss": 0.3326, + "step": 10400 + }, + { + "epoch": 4.938300901756051, + "grad_norm": 0.2665953886900266, + "learning_rate": 2.1262359920896507e-05, + "loss": 0.3407, + "step": 10405 + }, + { + "epoch": 4.940673943996203, + "grad_norm": 0.24194554711536953, + "learning_rate": 2.1245880026367832e-05, + "loss": 0.3307, + "step": 10410 + }, + { + "epoch": 4.943046986236355, + "grad_norm": 0.25382761826814276, + "learning_rate": 2.1229400131839157e-05, + "loss": 0.3355, + "step": 10415 + }, + { + "epoch": 4.945420028476507, + "grad_norm": 0.2543763216691106, + "learning_rate": 2.1212920237310483e-05, + "loss": 0.3283, + "step": 10420 + }, + { + "epoch": 4.947793070716659, + "grad_norm": 0.2699497270915771, + "learning_rate": 2.1196440342781808e-05, + "loss": 0.3279, + "step": 10425 + }, + { + "epoch": 4.95016611295681, + "grad_norm": 0.2793576058831149, + "learning_rate": 2.1179960448253133e-05, + "loss": 0.3294, + "step": 10430 + }, + { + "epoch": 4.952539155196963, + "grad_norm": 0.2472577614735382, + "learning_rate": 2.116348055372446e-05, + "loss": 0.33, + "step": 10435 + }, + { + "epoch": 4.954912197437114, + "grad_norm": 0.24654816841607494, + "learning_rate": 2.114700065919578e-05, + "loss": 0.3367, + "step": 10440 + }, + { + "epoch": 4.957285239677266, + "grad_norm": 0.25149710065497705, + "learning_rate": 2.113052076466711e-05, + "loss": 0.334, + "step": 10445 + }, + { + "epoch": 4.959658281917418, + "grad_norm": 0.25196562994190885, + "learning_rate": 2.111404087013843e-05, + "loss": 0.3353, + "step": 10450 + }, + { + "epoch": 4.96203132415757, + "grad_norm": 0.27670681430726957, + "learning_rate": 2.1097560975609757e-05, + "loss": 0.3354, + "step": 10455 + }, + { + "epoch": 4.964404366397722, + "grad_norm": 0.2602424795218405, + "learning_rate": 2.1081081081081082e-05, + "loss": 0.3392, + "step": 10460 + }, + { + "epoch": 4.966777408637874, + "grad_norm": 0.2637654848544181, + "learning_rate": 2.1064601186552408e-05, + "loss": 0.3472, + "step": 10465 + }, + { + "epoch": 4.9691504508780255, + "grad_norm": 0.29536910678214784, + "learning_rate": 2.1048121292023733e-05, + "loss": 0.3336, + "step": 10470 + }, + { + "epoch": 4.971523493118178, + "grad_norm": 0.26722924947223864, + "learning_rate": 2.1031641397495058e-05, + "loss": 0.3302, + "step": 10475 + }, + { + "epoch": 4.973896535358329, + "grad_norm": 0.2991656790000353, + "learning_rate": 2.1015161502966384e-05, + "loss": 0.337, + "step": 10480 + }, + { + "epoch": 4.976269577598481, + "grad_norm": 0.23218870426582586, + "learning_rate": 2.0998681608437706e-05, + "loss": 0.3315, + "step": 10485 + }, + { + "epoch": 4.978642619838633, + "grad_norm": 0.25667805662499366, + "learning_rate": 2.0982201713909034e-05, + "loss": 0.3306, + "step": 10490 + }, + { + "epoch": 4.981015662078785, + "grad_norm": 0.2529672932886721, + "learning_rate": 2.0965721819380356e-05, + "loss": 0.3334, + "step": 10495 + }, + { + "epoch": 4.983388704318937, + "grad_norm": 0.2619500901549255, + "learning_rate": 2.094924192485168e-05, + "loss": 0.3373, + "step": 10500 + }, + { + "epoch": 4.985761746559088, + "grad_norm": 0.25765371532275416, + "learning_rate": 2.0932762030323007e-05, + "loss": 0.3295, + "step": 10505 + }, + { + "epoch": 4.988134788799241, + "grad_norm": 0.2510372882491605, + "learning_rate": 2.0916282135794332e-05, + "loss": 0.3338, + "step": 10510 + }, + { + "epoch": 4.990507831039393, + "grad_norm": 0.26697473228154694, + "learning_rate": 2.0899802241265658e-05, + "loss": 0.3361, + "step": 10515 + }, + { + "epoch": 4.992880873279544, + "grad_norm": 0.29489483598099797, + "learning_rate": 2.0883322346736983e-05, + "loss": 0.3371, + "step": 10520 + }, + { + "epoch": 4.995253915519696, + "grad_norm": 0.2576343551377868, + "learning_rate": 2.086684245220831e-05, + "loss": 0.3396, + "step": 10525 + }, + { + "epoch": 4.997626957759848, + "grad_norm": 0.24189420633373915, + "learning_rate": 2.085036255767963e-05, + "loss": 0.337, + "step": 10530 + }, + { + "epoch": 5.0, + "grad_norm": 0.2673693821676852, + "learning_rate": 2.083388266315096e-05, + "loss": 0.3407, + "step": 10535 + }, + { + "epoch": 5.002373042240152, + "grad_norm": 0.27698830074136277, + "learning_rate": 2.081740276862228e-05, + "loss": 0.296, + "step": 10540 + }, + { + "epoch": 5.004746084480304, + "grad_norm": 0.29554161631879866, + "learning_rate": 2.0800922874093606e-05, + "loss": 0.2985, + "step": 10545 + }, + { + "epoch": 5.007119126720456, + "grad_norm": 0.26508097294911, + "learning_rate": 2.0784442979564932e-05, + "loss": 0.2999, + "step": 10550 + }, + { + "epoch": 5.009492168960607, + "grad_norm": 0.2634758280335413, + "learning_rate": 2.0767963085036257e-05, + "loss": 0.3015, + "step": 10555 + }, + { + "epoch": 5.011865211200759, + "grad_norm": 0.2567065054114938, + "learning_rate": 2.0751483190507582e-05, + "loss": 0.2972, + "step": 10560 + }, + { + "epoch": 5.014238253440912, + "grad_norm": 0.25293486465044657, + "learning_rate": 2.0735003295978904e-05, + "loss": 0.2991, + "step": 10565 + }, + { + "epoch": 5.016611295681063, + "grad_norm": 0.2754221441236384, + "learning_rate": 2.0718523401450233e-05, + "loss": 0.302, + "step": 10570 + }, + { + "epoch": 5.018984337921215, + "grad_norm": 0.278702420724589, + "learning_rate": 2.0702043506921555e-05, + "loss": 0.2975, + "step": 10575 + }, + { + "epoch": 5.0213573801613665, + "grad_norm": 0.2613850653872013, + "learning_rate": 2.068556361239288e-05, + "loss": 0.299, + "step": 10580 + }, + { + "epoch": 5.023730422401519, + "grad_norm": 0.24154380060910235, + "learning_rate": 2.0669083717864206e-05, + "loss": 0.2935, + "step": 10585 + }, + { + "epoch": 5.026103464641671, + "grad_norm": 0.2652558090698512, + "learning_rate": 2.065260382333553e-05, + "loss": 0.3048, + "step": 10590 + }, + { + "epoch": 5.028476506881822, + "grad_norm": 0.25766435995332676, + "learning_rate": 2.0636123928806857e-05, + "loss": 0.2996, + "step": 10595 + }, + { + "epoch": 5.0308495491219745, + "grad_norm": 0.2843413738248764, + "learning_rate": 2.0619644034278182e-05, + "loss": 0.3034, + "step": 10600 + }, + { + "epoch": 5.033222591362127, + "grad_norm": 0.26068896529037333, + "learning_rate": 2.0603164139749507e-05, + "loss": 0.2949, + "step": 10605 + }, + { + "epoch": 5.035595633602278, + "grad_norm": 0.2845192067741744, + "learning_rate": 2.058668424522083e-05, + "loss": 0.3049, + "step": 10610 + }, + { + "epoch": 5.03796867584243, + "grad_norm": 0.2536209960593798, + "learning_rate": 2.0570204350692158e-05, + "loss": 0.2978, + "step": 10615 + }, + { + "epoch": 5.040341718082582, + "grad_norm": 0.2781566664305398, + "learning_rate": 2.055372445616348e-05, + "loss": 0.3, + "step": 10620 + }, + { + "epoch": 5.042714760322734, + "grad_norm": 0.2635070124383409, + "learning_rate": 2.0537244561634805e-05, + "loss": 0.3041, + "step": 10625 + }, + { + "epoch": 5.045087802562886, + "grad_norm": 0.2571227940431233, + "learning_rate": 2.052076466710613e-05, + "loss": 0.2971, + "step": 10630 + }, + { + "epoch": 5.047460844803037, + "grad_norm": 0.2741144189317944, + "learning_rate": 2.0504284772577456e-05, + "loss": 0.2966, + "step": 10635 + }, + { + "epoch": 5.04983388704319, + "grad_norm": 0.2576192349796727, + "learning_rate": 2.048780487804878e-05, + "loss": 0.2982, + "step": 10640 + }, + { + "epoch": 5.052206929283341, + "grad_norm": 0.2776494832544757, + "learning_rate": 2.0471324983520107e-05, + "loss": 0.2991, + "step": 10645 + }, + { + "epoch": 5.054579971523493, + "grad_norm": 0.2815232017488148, + "learning_rate": 2.0454845088991432e-05, + "loss": 0.3117, + "step": 10650 + }, + { + "epoch": 5.056953013763645, + "grad_norm": 0.2613073370672699, + "learning_rate": 2.0438365194462754e-05, + "loss": 0.3046, + "step": 10655 + }, + { + "epoch": 5.059326056003797, + "grad_norm": 0.26577855801867084, + "learning_rate": 2.0421885299934083e-05, + "loss": 0.2983, + "step": 10660 + }, + { + "epoch": 5.061699098243949, + "grad_norm": 0.2651491303916473, + "learning_rate": 2.0405405405405405e-05, + "loss": 0.3025, + "step": 10665 + }, + { + "epoch": 5.0640721404841, + "grad_norm": 0.2646491023446959, + "learning_rate": 2.038892551087673e-05, + "loss": 0.3071, + "step": 10670 + }, + { + "epoch": 5.0664451827242525, + "grad_norm": 0.2560919921071399, + "learning_rate": 2.0372445616348055e-05, + "loss": 0.3035, + "step": 10675 + }, + { + "epoch": 5.068818224964405, + "grad_norm": 0.29817092984441423, + "learning_rate": 2.035596572181938e-05, + "loss": 0.3031, + "step": 10680 + }, + { + "epoch": 5.071191267204556, + "grad_norm": 0.26118419028528705, + "learning_rate": 2.0339485827290706e-05, + "loss": 0.2994, + "step": 10685 + }, + { + "epoch": 5.073564309444708, + "grad_norm": 0.26319450833119673, + "learning_rate": 2.032300593276203e-05, + "loss": 0.3, + "step": 10690 + }, + { + "epoch": 5.07593735168486, + "grad_norm": 0.26735790889964967, + "learning_rate": 2.0306526038233357e-05, + "loss": 0.3017, + "step": 10695 + }, + { + "epoch": 5.078310393925012, + "grad_norm": 0.27469743834316157, + "learning_rate": 2.029004614370468e-05, + "loss": 0.2976, + "step": 10700 + }, + { + "epoch": 5.080683436165164, + "grad_norm": 0.27231996776781625, + "learning_rate": 2.0273566249176008e-05, + "loss": 0.2917, + "step": 10705 + }, + { + "epoch": 5.083056478405315, + "grad_norm": 0.2570261316344685, + "learning_rate": 2.025708635464733e-05, + "loss": 0.3015, + "step": 10710 + }, + { + "epoch": 5.085429520645468, + "grad_norm": 0.2627688205362214, + "learning_rate": 2.0240606460118655e-05, + "loss": 0.2941, + "step": 10715 + }, + { + "epoch": 5.087802562885619, + "grad_norm": 0.26451212379981115, + "learning_rate": 2.0224126565589984e-05, + "loss": 0.3033, + "step": 10720 + }, + { + "epoch": 5.090175605125771, + "grad_norm": 0.2574098038249473, + "learning_rate": 2.0207646671061306e-05, + "loss": 0.3016, + "step": 10725 + }, + { + "epoch": 5.092548647365923, + "grad_norm": 0.25961807450852664, + "learning_rate": 2.019116677653263e-05, + "loss": 0.2962, + "step": 10730 + }, + { + "epoch": 5.094921689606075, + "grad_norm": 0.26147232746854826, + "learning_rate": 2.0174686882003956e-05, + "loss": 0.2985, + "step": 10735 + }, + { + "epoch": 5.097294731846227, + "grad_norm": 0.2663392811165594, + "learning_rate": 2.015820698747528e-05, + "loss": 0.3075, + "step": 10740 + }, + { + "epoch": 5.099667774086379, + "grad_norm": 0.2574218043041934, + "learning_rate": 2.0141727092946604e-05, + "loss": 0.3033, + "step": 10745 + }, + { + "epoch": 5.1020408163265305, + "grad_norm": 0.27912585824915837, + "learning_rate": 2.0125247198417932e-05, + "loss": 0.3003, + "step": 10750 + }, + { + "epoch": 5.104413858566683, + "grad_norm": 0.2743316751292982, + "learning_rate": 2.0108767303889258e-05, + "loss": 0.3021, + "step": 10755 + }, + { + "epoch": 5.106786900806834, + "grad_norm": 0.26116898962347335, + "learning_rate": 2.009228740936058e-05, + "loss": 0.3057, + "step": 10760 + }, + { + "epoch": 5.109159943046986, + "grad_norm": 0.25900284915920463, + "learning_rate": 2.007580751483191e-05, + "loss": 0.3004, + "step": 10765 + }, + { + "epoch": 5.1115329852871385, + "grad_norm": 0.24846929502799184, + "learning_rate": 2.005932762030323e-05, + "loss": 0.3009, + "step": 10770 + }, + { + "epoch": 5.11390602752729, + "grad_norm": 0.2507071685606631, + "learning_rate": 2.0042847725774556e-05, + "loss": 0.3043, + "step": 10775 + }, + { + "epoch": 5.116279069767442, + "grad_norm": 0.26094733240470647, + "learning_rate": 2.002636783124588e-05, + "loss": 0.3055, + "step": 10780 + }, + { + "epoch": 5.118652112007593, + "grad_norm": 0.2746029856219387, + "learning_rate": 2.0009887936717206e-05, + "loss": 0.3056, + "step": 10785 + }, + { + "epoch": 5.121025154247746, + "grad_norm": 0.2766785012704057, + "learning_rate": 1.999340804218853e-05, + "loss": 0.2947, + "step": 10790 + }, + { + "epoch": 5.123398196487898, + "grad_norm": 0.25525067892505926, + "learning_rate": 1.9976928147659857e-05, + "loss": 0.2998, + "step": 10795 + }, + { + "epoch": 5.125771238728049, + "grad_norm": 0.25339824296411617, + "learning_rate": 1.9960448253131182e-05, + "loss": 0.2934, + "step": 10800 + }, + { + "epoch": 5.128144280968201, + "grad_norm": 0.27692928603671213, + "learning_rate": 1.9943968358602504e-05, + "loss": 0.3066, + "step": 10805 + }, + { + "epoch": 5.130517323208353, + "grad_norm": 0.2592237258821526, + "learning_rate": 1.9927488464073833e-05, + "loss": 0.3062, + "step": 10810 + }, + { + "epoch": 5.132890365448505, + "grad_norm": 0.2647374474573449, + "learning_rate": 1.9911008569545155e-05, + "loss": 0.2965, + "step": 10815 + }, + { + "epoch": 5.135263407688657, + "grad_norm": 0.2523886330450484, + "learning_rate": 1.989452867501648e-05, + "loss": 0.3033, + "step": 10820 + }, + { + "epoch": 5.1376364499288085, + "grad_norm": 0.2548841099893687, + "learning_rate": 1.9878048780487806e-05, + "loss": 0.2951, + "step": 10825 + }, + { + "epoch": 5.140009492168961, + "grad_norm": 0.27258756853945887, + "learning_rate": 1.986156888595913e-05, + "loss": 0.3075, + "step": 10830 + }, + { + "epoch": 5.142382534409112, + "grad_norm": 0.2622835602034809, + "learning_rate": 1.9845088991430457e-05, + "loss": 0.2989, + "step": 10835 + }, + { + "epoch": 5.144755576649264, + "grad_norm": 0.26573557473511195, + "learning_rate": 1.9828609096901782e-05, + "loss": 0.3021, + "step": 10840 + }, + { + "epoch": 5.1471286188894165, + "grad_norm": 0.26448800720588467, + "learning_rate": 1.9812129202373107e-05, + "loss": 0.311, + "step": 10845 + }, + { + "epoch": 5.149501661129568, + "grad_norm": 0.26399258012009624, + "learning_rate": 1.979564930784443e-05, + "loss": 0.3078, + "step": 10850 + }, + { + "epoch": 5.15187470336972, + "grad_norm": 0.2688689800536709, + "learning_rate": 1.9779169413315758e-05, + "loss": 0.2991, + "step": 10855 + }, + { + "epoch": 5.154247745609872, + "grad_norm": 0.26768044866419854, + "learning_rate": 1.976268951878708e-05, + "loss": 0.3105, + "step": 10860 + }, + { + "epoch": 5.156620787850024, + "grad_norm": 0.26458890864631407, + "learning_rate": 1.9746209624258405e-05, + "loss": 0.3002, + "step": 10865 + }, + { + "epoch": 5.158993830090176, + "grad_norm": 0.2578354439261781, + "learning_rate": 1.972972972972973e-05, + "loss": 0.3, + "step": 10870 + }, + { + "epoch": 5.161366872330327, + "grad_norm": 0.25548141748829273, + "learning_rate": 1.9713249835201056e-05, + "loss": 0.3059, + "step": 10875 + }, + { + "epoch": 5.163739914570479, + "grad_norm": 0.27355151239621645, + "learning_rate": 1.969676994067238e-05, + "loss": 0.3011, + "step": 10880 + }, + { + "epoch": 5.166112956810632, + "grad_norm": 0.2650756011390044, + "learning_rate": 1.9680290046143707e-05, + "loss": 0.3046, + "step": 10885 + }, + { + "epoch": 5.168485999050783, + "grad_norm": 0.27748056410101474, + "learning_rate": 1.9663810151615032e-05, + "loss": 0.3013, + "step": 10890 + }, + { + "epoch": 5.170859041290935, + "grad_norm": 0.26393678946615634, + "learning_rate": 1.9647330257086354e-05, + "loss": 0.2983, + "step": 10895 + }, + { + "epoch": 5.1732320835310865, + "grad_norm": 0.27166905513264095, + "learning_rate": 1.9630850362557683e-05, + "loss": 0.3054, + "step": 10900 + }, + { + "epoch": 5.175605125771239, + "grad_norm": 0.26299136899865827, + "learning_rate": 1.9614370468029005e-05, + "loss": 0.2988, + "step": 10905 + }, + { + "epoch": 5.177978168011391, + "grad_norm": 0.26358614432201105, + "learning_rate": 1.959789057350033e-05, + "loss": 0.2998, + "step": 10910 + }, + { + "epoch": 5.180351210251542, + "grad_norm": 0.2589810747306742, + "learning_rate": 1.9581410678971655e-05, + "loss": 0.3057, + "step": 10915 + }, + { + "epoch": 5.1827242524916945, + "grad_norm": 0.2665907648360242, + "learning_rate": 1.956493078444298e-05, + "loss": 0.2976, + "step": 10920 + }, + { + "epoch": 5.185097294731846, + "grad_norm": 0.2508299871970303, + "learning_rate": 1.9548450889914306e-05, + "loss": 0.3088, + "step": 10925 + }, + { + "epoch": 5.187470336971998, + "grad_norm": 0.262988341143833, + "learning_rate": 1.953197099538563e-05, + "loss": 0.3011, + "step": 10930 + }, + { + "epoch": 5.18984337921215, + "grad_norm": 0.25565849989483685, + "learning_rate": 1.9515491100856957e-05, + "loss": 0.3015, + "step": 10935 + }, + { + "epoch": 5.192216421452302, + "grad_norm": 0.25497946055985027, + "learning_rate": 1.949901120632828e-05, + "loss": 0.3074, + "step": 10940 + }, + { + "epoch": 5.194589463692454, + "grad_norm": 0.25607371661426037, + "learning_rate": 1.9482531311799608e-05, + "loss": 0.303, + "step": 10945 + }, + { + "epoch": 5.196962505932605, + "grad_norm": 0.29046394383713803, + "learning_rate": 1.946605141727093e-05, + "loss": 0.3153, + "step": 10950 + }, + { + "epoch": 5.1993355481727574, + "grad_norm": 0.2642796702096747, + "learning_rate": 1.9449571522742255e-05, + "loss": 0.3053, + "step": 10955 + }, + { + "epoch": 5.20170859041291, + "grad_norm": 0.2624606674530189, + "learning_rate": 1.943309162821358e-05, + "loss": 0.305, + "step": 10960 + }, + { + "epoch": 5.204081632653061, + "grad_norm": 0.24853794453104228, + "learning_rate": 1.9416611733684906e-05, + "loss": 0.3034, + "step": 10965 + }, + { + "epoch": 5.206454674893213, + "grad_norm": 0.25871189022636537, + "learning_rate": 1.940013183915623e-05, + "loss": 0.2972, + "step": 10970 + }, + { + "epoch": 5.208827717133365, + "grad_norm": 0.2780266823085841, + "learning_rate": 1.9383651944627556e-05, + "loss": 0.3111, + "step": 10975 + }, + { + "epoch": 5.211200759373517, + "grad_norm": 0.2832357247931452, + "learning_rate": 1.936717205009888e-05, + "loss": 0.3046, + "step": 10980 + }, + { + "epoch": 5.213573801613669, + "grad_norm": 0.30763535550211263, + "learning_rate": 1.9350692155570204e-05, + "loss": 0.299, + "step": 10985 + }, + { + "epoch": 5.21594684385382, + "grad_norm": 0.27774066316939516, + "learning_rate": 1.9334212261041532e-05, + "loss": 0.2992, + "step": 10990 + }, + { + "epoch": 5.218319886093973, + "grad_norm": 0.26048496287703143, + "learning_rate": 1.9317732366512854e-05, + "loss": 0.3032, + "step": 10995 + }, + { + "epoch": 5.220692928334124, + "grad_norm": 0.27773461487189416, + "learning_rate": 1.930125247198418e-05, + "loss": 0.3082, + "step": 11000 + }, + { + "epoch": 5.223065970574276, + "grad_norm": 0.24400758488322177, + "learning_rate": 1.9284772577455505e-05, + "loss": 0.2974, + "step": 11005 + }, + { + "epoch": 5.225439012814428, + "grad_norm": 0.2643966455222428, + "learning_rate": 1.926829268292683e-05, + "loss": 0.3105, + "step": 11010 + }, + { + "epoch": 5.22781205505458, + "grad_norm": 0.24890031474159766, + "learning_rate": 1.9251812788398156e-05, + "loss": 0.3023, + "step": 11015 + }, + { + "epoch": 5.230185097294732, + "grad_norm": 0.2579104338389493, + "learning_rate": 1.9235332893869478e-05, + "loss": 0.3082, + "step": 11020 + }, + { + "epoch": 5.232558139534884, + "grad_norm": 0.27456691207372685, + "learning_rate": 1.9218852999340806e-05, + "loss": 0.3107, + "step": 11025 + }, + { + "epoch": 5.2349311817750355, + "grad_norm": 0.2669711627392017, + "learning_rate": 1.920237310481213e-05, + "loss": 0.296, + "step": 11030 + }, + { + "epoch": 5.237304224015188, + "grad_norm": 0.2642094167064019, + "learning_rate": 1.9185893210283454e-05, + "loss": 0.302, + "step": 11035 + }, + { + "epoch": 5.239677266255339, + "grad_norm": 0.2641826486321845, + "learning_rate": 1.916941331575478e-05, + "loss": 0.305, + "step": 11040 + }, + { + "epoch": 5.242050308495491, + "grad_norm": 0.26738775948161453, + "learning_rate": 1.9152933421226104e-05, + "loss": 0.3071, + "step": 11045 + }, + { + "epoch": 5.2444233507356435, + "grad_norm": 0.25072511855287505, + "learning_rate": 1.913645352669743e-05, + "loss": 0.2995, + "step": 11050 + }, + { + "epoch": 5.246796392975795, + "grad_norm": 0.244732120923582, + "learning_rate": 1.9119973632168755e-05, + "loss": 0.3081, + "step": 11055 + }, + { + "epoch": 5.249169435215947, + "grad_norm": 0.25208085028285027, + "learning_rate": 1.910349373764008e-05, + "loss": 0.3038, + "step": 11060 + }, + { + "epoch": 5.251542477456098, + "grad_norm": 0.26300709721684556, + "learning_rate": 1.9087013843111403e-05, + "loss": 0.3092, + "step": 11065 + }, + { + "epoch": 5.253915519696251, + "grad_norm": 0.2586763291416558, + "learning_rate": 1.907053394858273e-05, + "loss": 0.3036, + "step": 11070 + }, + { + "epoch": 5.256288561936403, + "grad_norm": 0.2577777126652171, + "learning_rate": 1.9054054054054053e-05, + "loss": 0.3023, + "step": 11075 + }, + { + "epoch": 5.258661604176554, + "grad_norm": 0.2544417642886884, + "learning_rate": 1.903757415952538e-05, + "loss": 0.305, + "step": 11080 + }, + { + "epoch": 5.261034646416706, + "grad_norm": 0.254514107524232, + "learning_rate": 1.9021094264996704e-05, + "loss": 0.2997, + "step": 11085 + }, + { + "epoch": 5.263407688656858, + "grad_norm": 0.2555135484535798, + "learning_rate": 1.900461437046803e-05, + "loss": 0.3039, + "step": 11090 + }, + { + "epoch": 5.26578073089701, + "grad_norm": 0.26120954408440117, + "learning_rate": 1.8988134475939355e-05, + "loss": 0.3046, + "step": 11095 + }, + { + "epoch": 5.268153773137162, + "grad_norm": 0.2564835324757219, + "learning_rate": 1.897165458141068e-05, + "loss": 0.3019, + "step": 11100 + }, + { + "epoch": 5.2705268153773135, + "grad_norm": 0.2612385845153938, + "learning_rate": 1.8955174686882005e-05, + "loss": 0.3183, + "step": 11105 + }, + { + "epoch": 5.272899857617466, + "grad_norm": 0.27154088799642667, + "learning_rate": 1.8938694792353327e-05, + "loss": 0.3164, + "step": 11110 + }, + { + "epoch": 5.275272899857617, + "grad_norm": 0.24950295662557462, + "learning_rate": 1.8922214897824656e-05, + "loss": 0.3055, + "step": 11115 + }, + { + "epoch": 5.277645942097769, + "grad_norm": 0.2646935665949947, + "learning_rate": 1.8905735003295978e-05, + "loss": 0.3116, + "step": 11120 + }, + { + "epoch": 5.2800189843379215, + "grad_norm": 0.27785819613284185, + "learning_rate": 1.8889255108767303e-05, + "loss": 0.2993, + "step": 11125 + }, + { + "epoch": 5.282392026578073, + "grad_norm": 0.2685009338142166, + "learning_rate": 1.8872775214238632e-05, + "loss": 0.3043, + "step": 11130 + }, + { + "epoch": 5.284765068818225, + "grad_norm": 0.2563187875485394, + "learning_rate": 1.8856295319709954e-05, + "loss": 0.3127, + "step": 11135 + }, + { + "epoch": 5.287138111058377, + "grad_norm": 0.26562374250152126, + "learning_rate": 1.883981542518128e-05, + "loss": 0.3019, + "step": 11140 + }, + { + "epoch": 5.289511153298529, + "grad_norm": 0.28325000494903724, + "learning_rate": 1.8823335530652605e-05, + "loss": 0.3057, + "step": 11145 + }, + { + "epoch": 5.291884195538681, + "grad_norm": 0.2527383634590018, + "learning_rate": 1.880685563612393e-05, + "loss": 0.3056, + "step": 11150 + }, + { + "epoch": 5.294257237778832, + "grad_norm": 0.25910089617794274, + "learning_rate": 1.8790375741595252e-05, + "loss": 0.3141, + "step": 11155 + }, + { + "epoch": 5.296630280018984, + "grad_norm": 0.2596384209340674, + "learning_rate": 1.877389584706658e-05, + "loss": 0.3032, + "step": 11160 + }, + { + "epoch": 5.299003322259137, + "grad_norm": 0.2528605265822949, + "learning_rate": 1.8757415952537906e-05, + "loss": 0.3031, + "step": 11165 + }, + { + "epoch": 5.301376364499288, + "grad_norm": 0.2557619199077699, + "learning_rate": 1.8740936058009228e-05, + "loss": 0.3029, + "step": 11170 + }, + { + "epoch": 5.30374940673944, + "grad_norm": 0.2530852543902046, + "learning_rate": 1.8724456163480557e-05, + "loss": 0.2989, + "step": 11175 + }, + { + "epoch": 5.3061224489795915, + "grad_norm": 0.2813245126809098, + "learning_rate": 1.870797626895188e-05, + "loss": 0.3043, + "step": 11180 + }, + { + "epoch": 5.308495491219744, + "grad_norm": 0.26291922573912285, + "learning_rate": 1.8691496374423204e-05, + "loss": 0.3039, + "step": 11185 + }, + { + "epoch": 5.310868533459896, + "grad_norm": 0.27062906394043307, + "learning_rate": 1.867501647989453e-05, + "loss": 0.3104, + "step": 11190 + }, + { + "epoch": 5.313241575700047, + "grad_norm": 0.2614356581734274, + "learning_rate": 1.8658536585365855e-05, + "loss": 0.3044, + "step": 11195 + }, + { + "epoch": 5.3156146179401995, + "grad_norm": 0.2805398563525798, + "learning_rate": 1.8642056690837177e-05, + "loss": 0.3113, + "step": 11200 + }, + { + "epoch": 5.317987660180351, + "grad_norm": 0.2625883448912034, + "learning_rate": 1.8625576796308506e-05, + "loss": 0.3038, + "step": 11205 + }, + { + "epoch": 5.320360702420503, + "grad_norm": 0.2807880087743471, + "learning_rate": 1.860909690177983e-05, + "loss": 0.3035, + "step": 11210 + }, + { + "epoch": 5.322733744660655, + "grad_norm": 0.27807394129927976, + "learning_rate": 1.8592617007251153e-05, + "loss": 0.31, + "step": 11215 + }, + { + "epoch": 5.325106786900807, + "grad_norm": 0.28141216300170185, + "learning_rate": 1.857613711272248e-05, + "loss": 0.3041, + "step": 11220 + }, + { + "epoch": 5.327479829140959, + "grad_norm": 0.2591046557632163, + "learning_rate": 1.8559657218193804e-05, + "loss": 0.3032, + "step": 11225 + }, + { + "epoch": 5.32985287138111, + "grad_norm": 0.2677647391868419, + "learning_rate": 1.854317732366513e-05, + "loss": 0.3061, + "step": 11230 + }, + { + "epoch": 5.332225913621262, + "grad_norm": 0.26792666657466285, + "learning_rate": 1.8526697429136454e-05, + "loss": 0.3068, + "step": 11235 + }, + { + "epoch": 5.334598955861415, + "grad_norm": 0.2525491425924011, + "learning_rate": 1.851021753460778e-05, + "loss": 0.3058, + "step": 11240 + }, + { + "epoch": 5.336971998101566, + "grad_norm": 0.27191583082008985, + "learning_rate": 1.8493737640079105e-05, + "loss": 0.3004, + "step": 11245 + }, + { + "epoch": 5.339345040341718, + "grad_norm": 0.25764874380025976, + "learning_rate": 1.847725774555043e-05, + "loss": 0.312, + "step": 11250 + }, + { + "epoch": 5.34171808258187, + "grad_norm": 0.2658530122190542, + "learning_rate": 1.8460777851021756e-05, + "loss": 0.3029, + "step": 11255 + }, + { + "epoch": 5.344091124822022, + "grad_norm": 0.24784736304650298, + "learning_rate": 1.8444297956493078e-05, + "loss": 0.3006, + "step": 11260 + }, + { + "epoch": 5.346464167062174, + "grad_norm": 0.2648431882355444, + "learning_rate": 1.8427818061964406e-05, + "loss": 0.3133, + "step": 11265 + }, + { + "epoch": 5.348837209302325, + "grad_norm": 0.25688559728329996, + "learning_rate": 1.841133816743573e-05, + "loss": 0.3086, + "step": 11270 + }, + { + "epoch": 5.3512102515424775, + "grad_norm": 0.28071032693990194, + "learning_rate": 1.8394858272907054e-05, + "loss": 0.3053, + "step": 11275 + }, + { + "epoch": 5.353583293782629, + "grad_norm": 0.2727828147813839, + "learning_rate": 1.837837837837838e-05, + "loss": 0.2915, + "step": 11280 + }, + { + "epoch": 5.355956336022781, + "grad_norm": 0.26873927520896895, + "learning_rate": 1.8361898483849705e-05, + "loss": 0.2975, + "step": 11285 + }, + { + "epoch": 5.358329378262933, + "grad_norm": 0.26189024989901477, + "learning_rate": 1.834541858932103e-05, + "loss": 0.3, + "step": 11290 + }, + { + "epoch": 5.360702420503085, + "grad_norm": 0.2563813165717877, + "learning_rate": 1.8328938694792355e-05, + "loss": 0.3043, + "step": 11295 + }, + { + "epoch": 5.363075462743237, + "grad_norm": 0.2568640374848667, + "learning_rate": 1.831245880026368e-05, + "loss": 0.3082, + "step": 11300 + }, + { + "epoch": 5.365448504983389, + "grad_norm": 0.25871033193674353, + "learning_rate": 1.8295978905735003e-05, + "loss": 0.307, + "step": 11305 + }, + { + "epoch": 5.36782154722354, + "grad_norm": 0.25197949558397775, + "learning_rate": 1.827949901120633e-05, + "loss": 0.305, + "step": 11310 + }, + { + "epoch": 5.370194589463693, + "grad_norm": 0.2817168745275611, + "learning_rate": 1.8263019116677653e-05, + "loss": 0.3037, + "step": 11315 + }, + { + "epoch": 5.372567631703844, + "grad_norm": 0.2614313755053819, + "learning_rate": 1.824653922214898e-05, + "loss": 0.3022, + "step": 11320 + }, + { + "epoch": 5.374940673943996, + "grad_norm": 0.25203028692766083, + "learning_rate": 1.8230059327620304e-05, + "loss": 0.2972, + "step": 11325 + }, + { + "epoch": 5.377313716184148, + "grad_norm": 0.2499072399763356, + "learning_rate": 1.821357943309163e-05, + "loss": 0.3051, + "step": 11330 + }, + { + "epoch": 5.3796867584243, + "grad_norm": 0.2600860846456132, + "learning_rate": 1.8197099538562955e-05, + "loss": 0.3071, + "step": 11335 + }, + { + "epoch": 5.382059800664452, + "grad_norm": 0.2580980322045276, + "learning_rate": 1.818061964403428e-05, + "loss": 0.2996, + "step": 11340 + }, + { + "epoch": 5.384432842904603, + "grad_norm": 0.25812677097818054, + "learning_rate": 1.8164139749505605e-05, + "loss": 0.2984, + "step": 11345 + }, + { + "epoch": 5.386805885144756, + "grad_norm": 0.24660739923480207, + "learning_rate": 1.8147659854976927e-05, + "loss": 0.3014, + "step": 11350 + }, + { + "epoch": 5.389178927384908, + "grad_norm": 0.25750286588751725, + "learning_rate": 1.8131179960448256e-05, + "loss": 0.303, + "step": 11355 + }, + { + "epoch": 5.391551969625059, + "grad_norm": 0.27009720421198585, + "learning_rate": 1.8114700065919578e-05, + "loss": 0.3098, + "step": 11360 + }, + { + "epoch": 5.393925011865211, + "grad_norm": 0.2539008006592837, + "learning_rate": 1.8098220171390903e-05, + "loss": 0.3093, + "step": 11365 + }, + { + "epoch": 5.396298054105363, + "grad_norm": 0.2528612861756888, + "learning_rate": 1.808174027686223e-05, + "loss": 0.3011, + "step": 11370 + }, + { + "epoch": 5.398671096345515, + "grad_norm": 0.25268788427919975, + "learning_rate": 1.8065260382333554e-05, + "loss": 0.3035, + "step": 11375 + }, + { + "epoch": 5.401044138585667, + "grad_norm": 0.2507887402914372, + "learning_rate": 1.804878048780488e-05, + "loss": 0.3019, + "step": 11380 + }, + { + "epoch": 5.4034171808258185, + "grad_norm": 0.24944584054665161, + "learning_rate": 1.8032300593276205e-05, + "loss": 0.3055, + "step": 11385 + }, + { + "epoch": 5.405790223065971, + "grad_norm": 0.25121330093232586, + "learning_rate": 1.801582069874753e-05, + "loss": 0.3002, + "step": 11390 + }, + { + "epoch": 5.408163265306122, + "grad_norm": 0.26476484513170556, + "learning_rate": 1.7999340804218852e-05, + "loss": 0.3017, + "step": 11395 + }, + { + "epoch": 5.410536307546274, + "grad_norm": 0.259906032447066, + "learning_rate": 1.798286090969018e-05, + "loss": 0.3077, + "step": 11400 + }, + { + "epoch": 5.4129093497864265, + "grad_norm": 0.25989656596886657, + "learning_rate": 1.7966381015161503e-05, + "loss": 0.3084, + "step": 11405 + }, + { + "epoch": 5.415282392026578, + "grad_norm": 0.24552700901814525, + "learning_rate": 1.7949901120632828e-05, + "loss": 0.299, + "step": 11410 + }, + { + "epoch": 5.41765543426673, + "grad_norm": 0.2530955329951673, + "learning_rate": 1.7933421226104154e-05, + "loss": 0.2996, + "step": 11415 + }, + { + "epoch": 5.420028476506882, + "grad_norm": 0.24862408363871205, + "learning_rate": 1.791694133157548e-05, + "loss": 0.2973, + "step": 11420 + }, + { + "epoch": 5.422401518747034, + "grad_norm": 0.2605101782755961, + "learning_rate": 1.7900461437046804e-05, + "loss": 0.2987, + "step": 11425 + }, + { + "epoch": 5.424774560987186, + "grad_norm": 0.2434582878361068, + "learning_rate": 1.788398154251813e-05, + "loss": 0.3052, + "step": 11430 + }, + { + "epoch": 5.427147603227337, + "grad_norm": 0.25878596080458943, + "learning_rate": 1.7867501647989455e-05, + "loss": 0.2975, + "step": 11435 + }, + { + "epoch": 5.429520645467489, + "grad_norm": 0.25167129340127325, + "learning_rate": 1.7851021753460777e-05, + "loss": 0.3053, + "step": 11440 + }, + { + "epoch": 5.431893687707642, + "grad_norm": 0.2907667164986663, + "learning_rate": 1.7834541858932106e-05, + "loss": 0.3025, + "step": 11445 + }, + { + "epoch": 5.434266729947793, + "grad_norm": 0.25909420273328143, + "learning_rate": 1.7818061964403428e-05, + "loss": 0.2914, + "step": 11450 + }, + { + "epoch": 5.436639772187945, + "grad_norm": 0.24501614262281868, + "learning_rate": 1.7801582069874753e-05, + "loss": 0.2934, + "step": 11455 + }, + { + "epoch": 5.4390128144280965, + "grad_norm": 0.2541353618545066, + "learning_rate": 1.778510217534608e-05, + "loss": 0.3091, + "step": 11460 + }, + { + "epoch": 5.441385856668249, + "grad_norm": 0.2543717656966207, + "learning_rate": 1.7768622280817404e-05, + "loss": 0.3107, + "step": 11465 + }, + { + "epoch": 5.443758898908401, + "grad_norm": 0.2728200853683922, + "learning_rate": 1.775214238628873e-05, + "loss": 0.3061, + "step": 11470 + }, + { + "epoch": 5.446131941148552, + "grad_norm": 0.2476915171404467, + "learning_rate": 1.773566249176005e-05, + "loss": 0.2983, + "step": 11475 + }, + { + "epoch": 5.4485049833887045, + "grad_norm": 0.2594463815005302, + "learning_rate": 1.771918259723138e-05, + "loss": 0.3135, + "step": 11480 + }, + { + "epoch": 5.450878025628856, + "grad_norm": 0.2499237900025648, + "learning_rate": 1.7702702702702702e-05, + "loss": 0.3004, + "step": 11485 + }, + { + "epoch": 5.453251067869008, + "grad_norm": 0.25096295626865134, + "learning_rate": 1.7686222808174027e-05, + "loss": 0.3019, + "step": 11490 + }, + { + "epoch": 5.45562411010916, + "grad_norm": 0.24615497871157002, + "learning_rate": 1.7669742913645352e-05, + "loss": 0.3019, + "step": 11495 + }, + { + "epoch": 5.457997152349312, + "grad_norm": 0.2664306462739454, + "learning_rate": 1.7653263019116678e-05, + "loss": 0.3018, + "step": 11500 + }, + { + "epoch": 5.460370194589464, + "grad_norm": 0.2598697839670765, + "learning_rate": 1.7636783124588003e-05, + "loss": 0.3028, + "step": 11505 + }, + { + "epoch": 5.462743236829615, + "grad_norm": 0.2541409911216477, + "learning_rate": 1.762030323005933e-05, + "loss": 0.3047, + "step": 11510 + }, + { + "epoch": 5.465116279069767, + "grad_norm": 0.2459761068418685, + "learning_rate": 1.7603823335530654e-05, + "loss": 0.3005, + "step": 11515 + }, + { + "epoch": 5.46748932130992, + "grad_norm": 0.24823756022888355, + "learning_rate": 1.7587343441001976e-05, + "loss": 0.2987, + "step": 11520 + }, + { + "epoch": 5.469862363550071, + "grad_norm": 0.2623046674143223, + "learning_rate": 1.7570863546473305e-05, + "loss": 0.3059, + "step": 11525 + }, + { + "epoch": 5.472235405790223, + "grad_norm": 0.2643012772164894, + "learning_rate": 1.7554383651944626e-05, + "loss": 0.3013, + "step": 11530 + }, + { + "epoch": 5.474608448030375, + "grad_norm": 0.25747775626580166, + "learning_rate": 1.7537903757415952e-05, + "loss": 0.3017, + "step": 11535 + }, + { + "epoch": 5.476981490270527, + "grad_norm": 0.2488493925558744, + "learning_rate": 1.752142386288728e-05, + "loss": 0.3119, + "step": 11540 + }, + { + "epoch": 5.479354532510679, + "grad_norm": 0.2609085801021111, + "learning_rate": 1.7504943968358603e-05, + "loss": 0.3073, + "step": 11545 + }, + { + "epoch": 5.48172757475083, + "grad_norm": 0.2582818316001489, + "learning_rate": 1.7488464073829928e-05, + "loss": 0.2988, + "step": 11550 + }, + { + "epoch": 5.4841006169909825, + "grad_norm": 0.24897768457266276, + "learning_rate": 1.7471984179301253e-05, + "loss": 0.2975, + "step": 11555 + }, + { + "epoch": 5.486473659231135, + "grad_norm": 0.25421741651933455, + "learning_rate": 1.745550428477258e-05, + "loss": 0.3069, + "step": 11560 + }, + { + "epoch": 5.488846701471286, + "grad_norm": 0.2512386563890855, + "learning_rate": 1.74390243902439e-05, + "loss": 0.3057, + "step": 11565 + }, + { + "epoch": 5.491219743711438, + "grad_norm": 0.24935391333305518, + "learning_rate": 1.742254449571523e-05, + "loss": 0.306, + "step": 11570 + }, + { + "epoch": 5.49359278595159, + "grad_norm": 0.2590838148653512, + "learning_rate": 1.7406064601186555e-05, + "loss": 0.3026, + "step": 11575 + }, + { + "epoch": 5.495965828191742, + "grad_norm": 0.2645897247784908, + "learning_rate": 1.7389584706657877e-05, + "loss": 0.299, + "step": 11580 + }, + { + "epoch": 5.498338870431894, + "grad_norm": 0.2724450775472251, + "learning_rate": 1.7373104812129205e-05, + "loss": 0.3084, + "step": 11585 + }, + { + "epoch": 5.500711912672045, + "grad_norm": 0.26418828295628494, + "learning_rate": 1.7356624917600527e-05, + "loss": 0.3067, + "step": 11590 + }, + { + "epoch": 5.503084954912198, + "grad_norm": 0.2481773422593168, + "learning_rate": 1.7340145023071853e-05, + "loss": 0.3023, + "step": 11595 + }, + { + "epoch": 5.505457997152349, + "grad_norm": 0.2417926951063445, + "learning_rate": 1.7323665128543178e-05, + "loss": 0.3062, + "step": 11600 + }, + { + "epoch": 5.507831039392501, + "grad_norm": 0.26406438169312924, + "learning_rate": 1.7307185234014503e-05, + "loss": 0.307, + "step": 11605 + }, + { + "epoch": 5.510204081632653, + "grad_norm": 0.24991074887765174, + "learning_rate": 1.7290705339485825e-05, + "loss": 0.3089, + "step": 11610 + }, + { + "epoch": 5.512577123872805, + "grad_norm": 0.2534442665273704, + "learning_rate": 1.7274225444957154e-05, + "loss": 0.2997, + "step": 11615 + }, + { + "epoch": 5.514950166112957, + "grad_norm": 0.25604262507741843, + "learning_rate": 1.725774555042848e-05, + "loss": 0.3005, + "step": 11620 + }, + { + "epoch": 5.517323208353108, + "grad_norm": 0.25407494520642165, + "learning_rate": 1.72412656558998e-05, + "loss": 0.3087, + "step": 11625 + }, + { + "epoch": 5.5196962505932605, + "grad_norm": 0.25554641716293475, + "learning_rate": 1.722478576137113e-05, + "loss": 0.2968, + "step": 11630 + }, + { + "epoch": 5.522069292833413, + "grad_norm": 0.24896663292887303, + "learning_rate": 1.7208305866842452e-05, + "loss": 0.3056, + "step": 11635 + }, + { + "epoch": 5.524442335073564, + "grad_norm": 0.25651057960126733, + "learning_rate": 1.7191825972313777e-05, + "loss": 0.3014, + "step": 11640 + }, + { + "epoch": 5.526815377313716, + "grad_norm": 0.25553338746100596, + "learning_rate": 1.7175346077785103e-05, + "loss": 0.3014, + "step": 11645 + }, + { + "epoch": 5.5291884195538685, + "grad_norm": 0.2609329579749095, + "learning_rate": 1.7158866183256428e-05, + "loss": 0.3122, + "step": 11650 + }, + { + "epoch": 5.53156146179402, + "grad_norm": 0.2546318484452625, + "learning_rate": 1.7142386288727754e-05, + "loss": 0.3024, + "step": 11655 + }, + { + "epoch": 5.533934504034172, + "grad_norm": 0.2685915443936585, + "learning_rate": 1.712590639419908e-05, + "loss": 0.311, + "step": 11660 + }, + { + "epoch": 5.536307546274323, + "grad_norm": 0.26182631610812584, + "learning_rate": 1.7109426499670404e-05, + "loss": 0.3025, + "step": 11665 + }, + { + "epoch": 5.538680588514476, + "grad_norm": 0.2488469561542779, + "learning_rate": 1.7092946605141726e-05, + "loss": 0.3102, + "step": 11670 + }, + { + "epoch": 5.541053630754627, + "grad_norm": 0.25604800128169497, + "learning_rate": 1.7076466710613055e-05, + "loss": 0.3042, + "step": 11675 + }, + { + "epoch": 5.543426672994779, + "grad_norm": 0.25045375782331286, + "learning_rate": 1.7059986816084377e-05, + "loss": 0.309, + "step": 11680 + }, + { + "epoch": 5.545799715234931, + "grad_norm": 0.2655755349113368, + "learning_rate": 1.7043506921555702e-05, + "loss": 0.2986, + "step": 11685 + }, + { + "epoch": 5.548172757475083, + "grad_norm": 0.2639303368400448, + "learning_rate": 1.7027027027027028e-05, + "loss": 0.3091, + "step": 11690 + }, + { + "epoch": 5.550545799715235, + "grad_norm": 0.2542977954014637, + "learning_rate": 1.7010547132498353e-05, + "loss": 0.3148, + "step": 11695 + }, + { + "epoch": 5.552918841955387, + "grad_norm": 0.2684022900455932, + "learning_rate": 1.699406723796968e-05, + "loss": 0.3044, + "step": 11700 + }, + { + "epoch": 5.5552918841955385, + "grad_norm": 0.260309072030552, + "learning_rate": 1.6977587343441004e-05, + "loss": 0.3006, + "step": 11705 + }, + { + "epoch": 5.557664926435691, + "grad_norm": 0.26937667511763164, + "learning_rate": 1.696110744891233e-05, + "loss": 0.3052, + "step": 11710 + }, + { + "epoch": 5.560037968675842, + "grad_norm": 0.26620180280362243, + "learning_rate": 1.694462755438365e-05, + "loss": 0.3111, + "step": 11715 + }, + { + "epoch": 5.562411010915994, + "grad_norm": 0.2654697850563229, + "learning_rate": 1.692814765985498e-05, + "loss": 0.3054, + "step": 11720 + }, + { + "epoch": 5.5647840531561465, + "grad_norm": 0.26483667040936854, + "learning_rate": 1.6911667765326302e-05, + "loss": 0.3076, + "step": 11725 + }, + { + "epoch": 5.567157095396298, + "grad_norm": 0.2700922909868019, + "learning_rate": 1.6895187870797627e-05, + "loss": 0.309, + "step": 11730 + }, + { + "epoch": 5.56953013763645, + "grad_norm": 0.2690345810811881, + "learning_rate": 1.6878707976268952e-05, + "loss": 0.3139, + "step": 11735 + }, + { + "epoch": 5.571903179876601, + "grad_norm": 0.26293372214464206, + "learning_rate": 1.6862228081740278e-05, + "loss": 0.3081, + "step": 11740 + }, + { + "epoch": 5.574276222116754, + "grad_norm": 0.2501298369805767, + "learning_rate": 1.6845748187211603e-05, + "loss": 0.3049, + "step": 11745 + }, + { + "epoch": 5.576649264356906, + "grad_norm": 0.25512578902406785, + "learning_rate": 1.682926829268293e-05, + "loss": 0.3073, + "step": 11750 + }, + { + "epoch": 5.579022306597057, + "grad_norm": 0.2645707781355013, + "learning_rate": 1.6812788398154254e-05, + "loss": 0.3039, + "step": 11755 + }, + { + "epoch": 5.5813953488372094, + "grad_norm": 0.2506741014960164, + "learning_rate": 1.6796308503625576e-05, + "loss": 0.2998, + "step": 11760 + }, + { + "epoch": 5.583768391077362, + "grad_norm": 0.25935670200235084, + "learning_rate": 1.6779828609096905e-05, + "loss": 0.3064, + "step": 11765 + }, + { + "epoch": 5.586141433317513, + "grad_norm": 0.26828988297054585, + "learning_rate": 1.6763348714568227e-05, + "loss": 0.3066, + "step": 11770 + }, + { + "epoch": 5.588514475557665, + "grad_norm": 0.27755970722750695, + "learning_rate": 1.6746868820039552e-05, + "loss": 0.3, + "step": 11775 + }, + { + "epoch": 5.590887517797817, + "grad_norm": 0.2704867278379422, + "learning_rate": 1.6730388925510877e-05, + "loss": 0.3044, + "step": 11780 + }, + { + "epoch": 5.593260560037969, + "grad_norm": 0.24909271730812213, + "learning_rate": 1.6713909030982203e-05, + "loss": 0.3102, + "step": 11785 + }, + { + "epoch": 5.59563360227812, + "grad_norm": 0.24420282727496678, + "learning_rate": 1.6697429136453528e-05, + "loss": 0.3042, + "step": 11790 + }, + { + "epoch": 5.598006644518272, + "grad_norm": 0.2516717621205489, + "learning_rate": 1.6680949241924853e-05, + "loss": 0.3032, + "step": 11795 + }, + { + "epoch": 5.600379686758425, + "grad_norm": 0.27081828018181625, + "learning_rate": 1.666446934739618e-05, + "loss": 0.3045, + "step": 11800 + }, + { + "epoch": 5.602752728998576, + "grad_norm": 0.2570274841558095, + "learning_rate": 1.66479894528675e-05, + "loss": 0.3077, + "step": 11805 + }, + { + "epoch": 5.605125771238728, + "grad_norm": 0.274188302593744, + "learning_rate": 1.663150955833883e-05, + "loss": 0.3076, + "step": 11810 + }, + { + "epoch": 5.60749881347888, + "grad_norm": 0.25510243393073, + "learning_rate": 1.661502966381015e-05, + "loss": 0.3086, + "step": 11815 + }, + { + "epoch": 5.609871855719032, + "grad_norm": 0.25474157387146, + "learning_rate": 1.6598549769281477e-05, + "loss": 0.307, + "step": 11820 + }, + { + "epoch": 5.612244897959184, + "grad_norm": 0.2530347418627011, + "learning_rate": 1.6582069874752802e-05, + "loss": 0.3118, + "step": 11825 + }, + { + "epoch": 5.614617940199335, + "grad_norm": 0.26217442323499446, + "learning_rate": 1.6565589980224127e-05, + "loss": 0.3079, + "step": 11830 + }, + { + "epoch": 5.6169909824394875, + "grad_norm": 0.2565149025742845, + "learning_rate": 1.6549110085695453e-05, + "loss": 0.3091, + "step": 11835 + }, + { + "epoch": 5.619364024679639, + "grad_norm": 0.24883471370613977, + "learning_rate": 1.6532630191166778e-05, + "loss": 0.3141, + "step": 11840 + }, + { + "epoch": 5.621737066919791, + "grad_norm": 0.254838841974273, + "learning_rate": 1.6516150296638103e-05, + "loss": 0.3079, + "step": 11845 + }, + { + "epoch": 5.624110109159943, + "grad_norm": 0.2526840057694656, + "learning_rate": 1.6499670402109425e-05, + "loss": 0.3049, + "step": 11850 + }, + { + "epoch": 5.626483151400095, + "grad_norm": 0.2627488700941784, + "learning_rate": 1.6483190507580754e-05, + "loss": 0.3004, + "step": 11855 + }, + { + "epoch": 5.628856193640247, + "grad_norm": 0.2536143506670211, + "learning_rate": 1.6466710613052076e-05, + "loss": 0.3065, + "step": 11860 + }, + { + "epoch": 5.631229235880399, + "grad_norm": 0.25008039144408656, + "learning_rate": 1.64502307185234e-05, + "loss": 0.3049, + "step": 11865 + }, + { + "epoch": 5.63360227812055, + "grad_norm": 0.2548103380883669, + "learning_rate": 1.643375082399473e-05, + "loss": 0.2979, + "step": 11870 + }, + { + "epoch": 5.635975320360703, + "grad_norm": 0.2588188520978245, + "learning_rate": 1.6417270929466052e-05, + "loss": 0.3095, + "step": 11875 + }, + { + "epoch": 5.638348362600854, + "grad_norm": 0.24911549260388896, + "learning_rate": 1.6400791034937378e-05, + "loss": 0.3056, + "step": 11880 + }, + { + "epoch": 5.640721404841006, + "grad_norm": 0.25915596774784433, + "learning_rate": 1.6384311140408703e-05, + "loss": 0.3052, + "step": 11885 + }, + { + "epoch": 5.643094447081158, + "grad_norm": 0.2640801284319886, + "learning_rate": 1.6367831245880028e-05, + "loss": 0.3062, + "step": 11890 + }, + { + "epoch": 5.64546748932131, + "grad_norm": 0.25236685984982216, + "learning_rate": 1.635135135135135e-05, + "loss": 0.3012, + "step": 11895 + }, + { + "epoch": 5.647840531561462, + "grad_norm": 0.26034280604548116, + "learning_rate": 1.633487145682268e-05, + "loss": 0.3101, + "step": 11900 + }, + { + "epoch": 5.650213573801613, + "grad_norm": 0.27455176466092707, + "learning_rate": 1.6318391562294e-05, + "loss": 0.3082, + "step": 11905 + }, + { + "epoch": 5.6525866160417655, + "grad_norm": 0.24026534735882862, + "learning_rate": 1.6301911667765326e-05, + "loss": 0.301, + "step": 11910 + }, + { + "epoch": 5.654959658281918, + "grad_norm": 0.26177136668632955, + "learning_rate": 1.628543177323665e-05, + "loss": 0.31, + "step": 11915 + }, + { + "epoch": 5.657332700522069, + "grad_norm": 0.24635786039068272, + "learning_rate": 1.6268951878707977e-05, + "loss": 0.3073, + "step": 11920 + }, + { + "epoch": 5.659705742762221, + "grad_norm": 0.2490550071011112, + "learning_rate": 1.6252471984179302e-05, + "loss": 0.3066, + "step": 11925 + }, + { + "epoch": 5.6620787850023735, + "grad_norm": 0.24832497711219984, + "learning_rate": 1.6235992089650624e-05, + "loss": 0.3103, + "step": 11930 + }, + { + "epoch": 5.664451827242525, + "grad_norm": 0.260050345128445, + "learning_rate": 1.6219512195121953e-05, + "loss": 0.3023, + "step": 11935 + }, + { + "epoch": 5.666824869482677, + "grad_norm": 0.2447377266971387, + "learning_rate": 1.6203032300593275e-05, + "loss": 0.3009, + "step": 11940 + }, + { + "epoch": 5.669197911722828, + "grad_norm": 0.2600129117982203, + "learning_rate": 1.61865524060646e-05, + "loss": 0.3072, + "step": 11945 + }, + { + "epoch": 5.671570953962981, + "grad_norm": 0.2508241601666596, + "learning_rate": 1.617007251153593e-05, + "loss": 0.3087, + "step": 11950 + }, + { + "epoch": 5.673943996203132, + "grad_norm": 0.25524561976143806, + "learning_rate": 1.615359261700725e-05, + "loss": 0.3023, + "step": 11955 + }, + { + "epoch": 5.676317038443284, + "grad_norm": 0.264597850447162, + "learning_rate": 1.6137112722478576e-05, + "loss": 0.3091, + "step": 11960 + }, + { + "epoch": 5.678690080683436, + "grad_norm": 0.24898585106161739, + "learning_rate": 1.6120632827949902e-05, + "loss": 0.31, + "step": 11965 + }, + { + "epoch": 5.681063122923588, + "grad_norm": 0.2666131750376913, + "learning_rate": 1.6104152933421227e-05, + "loss": 0.3082, + "step": 11970 + }, + { + "epoch": 5.68343616516374, + "grad_norm": 0.2589856430048078, + "learning_rate": 1.608767303889255e-05, + "loss": 0.3094, + "step": 11975 + }, + { + "epoch": 5.685809207403892, + "grad_norm": 0.26535322275136275, + "learning_rate": 1.6071193144363878e-05, + "loss": 0.3056, + "step": 11980 + }, + { + "epoch": 5.6881822496440435, + "grad_norm": 0.2735342613862442, + "learning_rate": 1.6054713249835203e-05, + "loss": 0.3072, + "step": 11985 + }, + { + "epoch": 5.690555291884196, + "grad_norm": 0.2533918397153636, + "learning_rate": 1.6038233355306525e-05, + "loss": 0.3083, + "step": 11990 + }, + { + "epoch": 5.692928334124347, + "grad_norm": 0.25211595111011087, + "learning_rate": 1.6021753460777854e-05, + "loss": 0.3007, + "step": 11995 + }, + { + "epoch": 5.695301376364499, + "grad_norm": 0.2484875373228203, + "learning_rate": 1.6005273566249176e-05, + "loss": 0.3055, + "step": 12000 + }, + { + "epoch": 5.6976744186046515, + "grad_norm": 0.25658587577166314, + "learning_rate": 1.59887936717205e-05, + "loss": 0.3086, + "step": 12005 + }, + { + "epoch": 5.700047460844803, + "grad_norm": 0.24251064967402555, + "learning_rate": 1.5972313777191827e-05, + "loss": 0.3049, + "step": 12010 + }, + { + "epoch": 5.702420503084955, + "grad_norm": 0.2524926339309301, + "learning_rate": 1.5955833882663152e-05, + "loss": 0.3094, + "step": 12015 + }, + { + "epoch": 5.704793545325106, + "grad_norm": 0.2495624457263016, + "learning_rate": 1.5939353988134474e-05, + "loss": 0.3082, + "step": 12020 + }, + { + "epoch": 5.707166587565259, + "grad_norm": 0.2503455531900979, + "learning_rate": 1.5922874093605803e-05, + "loss": 0.3022, + "step": 12025 + }, + { + "epoch": 5.709539629805411, + "grad_norm": 0.2701418373074492, + "learning_rate": 1.5906394199077128e-05, + "loss": 0.3099, + "step": 12030 + }, + { + "epoch": 5.711912672045562, + "grad_norm": 0.2565111226650112, + "learning_rate": 1.588991430454845e-05, + "loss": 0.3025, + "step": 12035 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.25947715654767434, + "learning_rate": 1.587343441001978e-05, + "loss": 0.3082, + "step": 12040 + }, + { + "epoch": 5.716658756525867, + "grad_norm": 0.26298742383946094, + "learning_rate": 1.58569545154911e-05, + "loss": 0.3082, + "step": 12045 + }, + { + "epoch": 5.719031798766018, + "grad_norm": 0.2583825220619852, + "learning_rate": 1.5840474620962426e-05, + "loss": 0.3042, + "step": 12050 + }, + { + "epoch": 5.72140484100617, + "grad_norm": 0.2606585073244619, + "learning_rate": 1.582399472643375e-05, + "loss": 0.3072, + "step": 12055 + }, + { + "epoch": 5.7237778832463215, + "grad_norm": 0.26398996081306325, + "learning_rate": 1.5807514831905077e-05, + "loss": 0.3036, + "step": 12060 + }, + { + "epoch": 5.726150925486474, + "grad_norm": 0.2512260947104305, + "learning_rate": 1.5791034937376402e-05, + "loss": 0.3051, + "step": 12065 + }, + { + "epoch": 5.728523967726625, + "grad_norm": 0.25179380907803806, + "learning_rate": 1.5774555042847727e-05, + "loss": 0.3011, + "step": 12070 + }, + { + "epoch": 5.730897009966777, + "grad_norm": 0.2571406873380526, + "learning_rate": 1.5758075148319053e-05, + "loss": 0.3121, + "step": 12075 + }, + { + "epoch": 5.7332700522069295, + "grad_norm": 0.2733321564421925, + "learning_rate": 1.5741595253790375e-05, + "loss": 0.3107, + "step": 12080 + }, + { + "epoch": 5.735643094447081, + "grad_norm": 0.2544311097996241, + "learning_rate": 1.5725115359261703e-05, + "loss": 0.3115, + "step": 12085 + }, + { + "epoch": 5.738016136687233, + "grad_norm": 0.2606866745131736, + "learning_rate": 1.5708635464733025e-05, + "loss": 0.3123, + "step": 12090 + }, + { + "epoch": 5.740389178927385, + "grad_norm": 0.25467266683361794, + "learning_rate": 1.569215557020435e-05, + "loss": 0.308, + "step": 12095 + }, + { + "epoch": 5.742762221167537, + "grad_norm": 0.256350394691136, + "learning_rate": 1.5675675675675676e-05, + "loss": 0.2988, + "step": 12100 + }, + { + "epoch": 5.745135263407689, + "grad_norm": 0.23962815718863548, + "learning_rate": 1.5659195781147e-05, + "loss": 0.3002, + "step": 12105 + }, + { + "epoch": 5.74750830564784, + "grad_norm": 0.2730414217185656, + "learning_rate": 1.5642715886618327e-05, + "loss": 0.3129, + "step": 12110 + }, + { + "epoch": 5.749881347887992, + "grad_norm": 0.2542590475145818, + "learning_rate": 1.5626235992089652e-05, + "loss": 0.308, + "step": 12115 + }, + { + "epoch": 5.752254390128145, + "grad_norm": 0.2707588435030679, + "learning_rate": 1.5609756097560978e-05, + "loss": 0.2987, + "step": 12120 + }, + { + "epoch": 5.754627432368296, + "grad_norm": 0.24835111740264126, + "learning_rate": 1.55932762030323e-05, + "loss": 0.2998, + "step": 12125 + }, + { + "epoch": 5.757000474608448, + "grad_norm": 0.25172118517403275, + "learning_rate": 1.5576796308503628e-05, + "loss": 0.3043, + "step": 12130 + }, + { + "epoch": 5.7593735168485996, + "grad_norm": 0.26374788047755965, + "learning_rate": 1.556031641397495e-05, + "loss": 0.3039, + "step": 12135 + }, + { + "epoch": 5.761746559088752, + "grad_norm": 0.24567906613855464, + "learning_rate": 1.5543836519446276e-05, + "loss": 0.3125, + "step": 12140 + }, + { + "epoch": 5.764119601328904, + "grad_norm": 0.2798418171099834, + "learning_rate": 1.55273566249176e-05, + "loss": 0.3087, + "step": 12145 + }, + { + "epoch": 5.766492643569055, + "grad_norm": 0.2493412277565117, + "learning_rate": 1.5510876730388926e-05, + "loss": 0.3007, + "step": 12150 + }, + { + "epoch": 5.7688656858092076, + "grad_norm": 0.2558561994328443, + "learning_rate": 1.549439683586025e-05, + "loss": 0.3161, + "step": 12155 + }, + { + "epoch": 5.771238728049359, + "grad_norm": 0.25925288999716384, + "learning_rate": 1.5477916941331577e-05, + "loss": 0.3124, + "step": 12160 + }, + { + "epoch": 5.773611770289511, + "grad_norm": 0.24520695177512689, + "learning_rate": 1.5461437046802902e-05, + "loss": 0.3072, + "step": 12165 + }, + { + "epoch": 5.775984812529663, + "grad_norm": 0.240477191163084, + "learning_rate": 1.5444957152274224e-05, + "loss": 0.2951, + "step": 12170 + }, + { + "epoch": 5.778357854769815, + "grad_norm": 0.25677871212645065, + "learning_rate": 1.5428477257745553e-05, + "loss": 0.3104, + "step": 12175 + }, + { + "epoch": 5.780730897009967, + "grad_norm": 0.2476794817606906, + "learning_rate": 1.5411997363216875e-05, + "loss": 0.3062, + "step": 12180 + }, + { + "epoch": 5.783103939250118, + "grad_norm": 0.26287013352557015, + "learning_rate": 1.53955174686882e-05, + "loss": 0.3103, + "step": 12185 + }, + { + "epoch": 5.7854769814902705, + "grad_norm": 0.25431157906979157, + "learning_rate": 1.5379037574159526e-05, + "loss": 0.3089, + "step": 12190 + }, + { + "epoch": 5.787850023730423, + "grad_norm": 0.25762667194126543, + "learning_rate": 1.536255767963085e-05, + "loss": 0.3058, + "step": 12195 + }, + { + "epoch": 5.790223065970574, + "grad_norm": 0.26403225708125705, + "learning_rate": 1.5346077785102176e-05, + "loss": 0.3031, + "step": 12200 + }, + { + "epoch": 5.792596108210726, + "grad_norm": 0.26571649808634484, + "learning_rate": 1.5329597890573502e-05, + "loss": 0.3007, + "step": 12205 + }, + { + "epoch": 5.7949691504508785, + "grad_norm": 0.2456044748373653, + "learning_rate": 1.5313117996044827e-05, + "loss": 0.3131, + "step": 12210 + }, + { + "epoch": 5.79734219269103, + "grad_norm": 0.2548559292748471, + "learning_rate": 1.529663810151615e-05, + "loss": 0.302, + "step": 12215 + }, + { + "epoch": 5.799715234931182, + "grad_norm": 0.25599645724303927, + "learning_rate": 1.5280158206987478e-05, + "loss": 0.3079, + "step": 12220 + }, + { + "epoch": 5.802088277171333, + "grad_norm": 0.2545821733148261, + "learning_rate": 1.52636783124588e-05, + "loss": 0.3062, + "step": 12225 + }, + { + "epoch": 5.804461319411486, + "grad_norm": 0.25958530649892975, + "learning_rate": 1.5247198417930125e-05, + "loss": 0.3076, + "step": 12230 + }, + { + "epoch": 5.806834361651637, + "grad_norm": 0.2559189339169387, + "learning_rate": 1.5230718523401452e-05, + "loss": 0.2996, + "step": 12235 + }, + { + "epoch": 5.809207403891789, + "grad_norm": 0.2514676563825686, + "learning_rate": 1.5214238628872776e-05, + "loss": 0.3146, + "step": 12240 + }, + { + "epoch": 5.811580446131941, + "grad_norm": 0.2552295734028474, + "learning_rate": 1.51977587343441e-05, + "loss": 0.3019, + "step": 12245 + }, + { + "epoch": 5.813953488372093, + "grad_norm": 0.2574818932355742, + "learning_rate": 1.5181278839815427e-05, + "loss": 0.299, + "step": 12250 + }, + { + "epoch": 5.816326530612245, + "grad_norm": 0.24958668274604764, + "learning_rate": 1.5164798945286752e-05, + "loss": 0.3098, + "step": 12255 + }, + { + "epoch": 5.818699572852397, + "grad_norm": 0.2590984378077953, + "learning_rate": 1.5148319050758076e-05, + "loss": 0.3089, + "step": 12260 + }, + { + "epoch": 5.8210726150925485, + "grad_norm": 0.2586341585077135, + "learning_rate": 1.5131839156229403e-05, + "loss": 0.3079, + "step": 12265 + }, + { + "epoch": 5.823445657332701, + "grad_norm": 0.2934350297060551, + "learning_rate": 1.5115359261700726e-05, + "loss": 0.3089, + "step": 12270 + }, + { + "epoch": 5.825818699572852, + "grad_norm": 0.2558870777544069, + "learning_rate": 1.509887936717205e-05, + "loss": 0.3005, + "step": 12275 + }, + { + "epoch": 5.828191741813004, + "grad_norm": 0.25866257691296307, + "learning_rate": 1.5082399472643377e-05, + "loss": 0.3047, + "step": 12280 + }, + { + "epoch": 5.8305647840531565, + "grad_norm": 0.2538013363111582, + "learning_rate": 1.50659195781147e-05, + "loss": 0.3039, + "step": 12285 + }, + { + "epoch": 5.832937826293308, + "grad_norm": 0.2696220550631189, + "learning_rate": 1.5049439683586026e-05, + "loss": 0.3081, + "step": 12290 + }, + { + "epoch": 5.83531086853346, + "grad_norm": 0.2609439698715651, + "learning_rate": 1.5032959789057351e-05, + "loss": 0.3088, + "step": 12295 + }, + { + "epoch": 5.837683910773611, + "grad_norm": 0.258105336606041, + "learning_rate": 1.5016479894528677e-05, + "loss": 0.3084, + "step": 12300 + }, + { + "epoch": 5.840056953013764, + "grad_norm": 0.2441209823120443, + "learning_rate": 1.5e-05, + "loss": 0.3015, + "step": 12305 + }, + { + "epoch": 5.842429995253916, + "grad_norm": 0.2660473878388901, + "learning_rate": 1.4983520105471327e-05, + "loss": 0.3116, + "step": 12310 + }, + { + "epoch": 5.844803037494067, + "grad_norm": 0.26784852562543493, + "learning_rate": 1.4967040210942651e-05, + "loss": 0.305, + "step": 12315 + }, + { + "epoch": 5.847176079734219, + "grad_norm": 0.2467564452256862, + "learning_rate": 1.4950560316413975e-05, + "loss": 0.3009, + "step": 12320 + }, + { + "epoch": 5.849549121974372, + "grad_norm": 0.25345135883261827, + "learning_rate": 1.4934080421885302e-05, + "loss": 0.305, + "step": 12325 + }, + { + "epoch": 5.851922164214523, + "grad_norm": 0.25689346920030004, + "learning_rate": 1.4917600527356625e-05, + "loss": 0.305, + "step": 12330 + }, + { + "epoch": 5.854295206454675, + "grad_norm": 0.2433383351060513, + "learning_rate": 1.490112063282795e-05, + "loss": 0.3029, + "step": 12335 + }, + { + "epoch": 5.8566682486948265, + "grad_norm": 0.2618773066741805, + "learning_rate": 1.4884640738299276e-05, + "loss": 0.2997, + "step": 12340 + }, + { + "epoch": 5.859041290934979, + "grad_norm": 0.24538172827285448, + "learning_rate": 1.4868160843770602e-05, + "loss": 0.3067, + "step": 12345 + }, + { + "epoch": 5.86141433317513, + "grad_norm": 0.2638716032492813, + "learning_rate": 1.4851680949241925e-05, + "loss": 0.3015, + "step": 12350 + }, + { + "epoch": 5.863787375415282, + "grad_norm": 0.27065968155174813, + "learning_rate": 1.4835201054713249e-05, + "loss": 0.3087, + "step": 12355 + }, + { + "epoch": 5.8661604176554345, + "grad_norm": 0.26993576671765457, + "learning_rate": 1.4818721160184576e-05, + "loss": 0.3135, + "step": 12360 + }, + { + "epoch": 5.868533459895586, + "grad_norm": 0.2590931685925382, + "learning_rate": 1.48022412656559e-05, + "loss": 0.3056, + "step": 12365 + }, + { + "epoch": 5.870906502135738, + "grad_norm": 0.24435643150919129, + "learning_rate": 1.4785761371127225e-05, + "loss": 0.3059, + "step": 12370 + }, + { + "epoch": 5.87327954437589, + "grad_norm": 0.25555678455010755, + "learning_rate": 1.476928147659855e-05, + "loss": 0.3108, + "step": 12375 + }, + { + "epoch": 5.875652586616042, + "grad_norm": 0.25737064997897907, + "learning_rate": 1.4752801582069876e-05, + "loss": 0.3031, + "step": 12380 + }, + { + "epoch": 5.878025628856194, + "grad_norm": 0.2505310301298543, + "learning_rate": 1.47363216875412e-05, + "loss": 0.3013, + "step": 12385 + }, + { + "epoch": 5.880398671096345, + "grad_norm": 0.2558195957013835, + "learning_rate": 1.4719841793012526e-05, + "loss": 0.3089, + "step": 12390 + }, + { + "epoch": 5.882771713336497, + "grad_norm": 0.2594352333156111, + "learning_rate": 1.470336189848385e-05, + "loss": 0.3113, + "step": 12395 + }, + { + "epoch": 5.88514475557665, + "grad_norm": 0.24829495088283457, + "learning_rate": 1.4686882003955174e-05, + "loss": 0.3051, + "step": 12400 + }, + { + "epoch": 5.887517797816801, + "grad_norm": 0.2637944952418496, + "learning_rate": 1.46704021094265e-05, + "loss": 0.3051, + "step": 12405 + }, + { + "epoch": 5.889890840056953, + "grad_norm": 0.2608348175925827, + "learning_rate": 1.4653922214897824e-05, + "loss": 0.3076, + "step": 12410 + }, + { + "epoch": 5.8922638822971045, + "grad_norm": 0.2606034890624875, + "learning_rate": 1.463744232036915e-05, + "loss": 0.3039, + "step": 12415 + }, + { + "epoch": 5.894636924537257, + "grad_norm": 0.26512555437270247, + "learning_rate": 1.4620962425840477e-05, + "loss": 0.307, + "step": 12420 + }, + { + "epoch": 5.897009966777409, + "grad_norm": 0.25866533227039046, + "learning_rate": 1.46044825313118e-05, + "loss": 0.312, + "step": 12425 + }, + { + "epoch": 5.89938300901756, + "grad_norm": 0.24588590855059983, + "learning_rate": 1.4588002636783124e-05, + "loss": 0.3053, + "step": 12430 + }, + { + "epoch": 5.9017560512577125, + "grad_norm": 0.24706114680125976, + "learning_rate": 1.4571522742254451e-05, + "loss": 0.3068, + "step": 12435 + }, + { + "epoch": 5.904129093497865, + "grad_norm": 0.2444245969531811, + "learning_rate": 1.4555042847725775e-05, + "loss": 0.3032, + "step": 12440 + }, + { + "epoch": 5.906502135738016, + "grad_norm": 0.23934116792612697, + "learning_rate": 1.4538562953197098e-05, + "loss": 0.3146, + "step": 12445 + }, + { + "epoch": 5.908875177978168, + "grad_norm": 0.26085500655526056, + "learning_rate": 1.4522083058668425e-05, + "loss": 0.3058, + "step": 12450 + }, + { + "epoch": 5.91124822021832, + "grad_norm": 0.28106764651965316, + "learning_rate": 1.4505603164139749e-05, + "loss": 0.3194, + "step": 12455 + }, + { + "epoch": 5.913621262458472, + "grad_norm": 0.23683531108564215, + "learning_rate": 1.4489123269611074e-05, + "loss": 0.3058, + "step": 12460 + }, + { + "epoch": 5.915994304698623, + "grad_norm": 0.2555170022929604, + "learning_rate": 1.4472643375082402e-05, + "loss": 0.3009, + "step": 12465 + }, + { + "epoch": 5.918367346938775, + "grad_norm": 0.26578266944579937, + "learning_rate": 1.4456163480553725e-05, + "loss": 0.3149, + "step": 12470 + }, + { + "epoch": 5.920740389178928, + "grad_norm": 0.24241681411057397, + "learning_rate": 1.4439683586025049e-05, + "loss": 0.3058, + "step": 12475 + }, + { + "epoch": 5.923113431419079, + "grad_norm": 0.23591610686080539, + "learning_rate": 1.4423203691496376e-05, + "loss": 0.306, + "step": 12480 + }, + { + "epoch": 5.925486473659231, + "grad_norm": 0.24157097852011966, + "learning_rate": 1.44067237969677e-05, + "loss": 0.3026, + "step": 12485 + }, + { + "epoch": 5.927859515899383, + "grad_norm": 0.2454484220769938, + "learning_rate": 1.4390243902439023e-05, + "loss": 0.3066, + "step": 12490 + }, + { + "epoch": 5.930232558139535, + "grad_norm": 0.2565273668854793, + "learning_rate": 1.437376400791035e-05, + "loss": 0.304, + "step": 12495 + }, + { + "epoch": 5.932605600379687, + "grad_norm": 0.2512844469759193, + "learning_rate": 1.4357284113381676e-05, + "loss": 0.3027, + "step": 12500 + }, + { + "epoch": 5.934978642619838, + "grad_norm": 0.24938607385446562, + "learning_rate": 1.4340804218853e-05, + "loss": 0.3066, + "step": 12505 + }, + { + "epoch": 5.9373516848599905, + "grad_norm": 0.2600027369833189, + "learning_rate": 1.4324324324324326e-05, + "loss": 0.3166, + "step": 12510 + }, + { + "epoch": 5.939724727100143, + "grad_norm": 0.2572473315980239, + "learning_rate": 1.430784442979565e-05, + "loss": 0.3095, + "step": 12515 + }, + { + "epoch": 5.942097769340294, + "grad_norm": 0.24987423874021325, + "learning_rate": 1.4291364535266974e-05, + "loss": 0.3084, + "step": 12520 + }, + { + "epoch": 5.944470811580446, + "grad_norm": 0.24973582335095865, + "learning_rate": 1.42748846407383e-05, + "loss": 0.3013, + "step": 12525 + }, + { + "epoch": 5.946843853820598, + "grad_norm": 0.2452925684386864, + "learning_rate": 1.4258404746209624e-05, + "loss": 0.3038, + "step": 12530 + }, + { + "epoch": 5.94921689606075, + "grad_norm": 0.2559021467334993, + "learning_rate": 1.424192485168095e-05, + "loss": 0.3107, + "step": 12535 + }, + { + "epoch": 5.951589938300902, + "grad_norm": 0.25804250982514326, + "learning_rate": 1.4225444957152275e-05, + "loss": 0.3067, + "step": 12540 + }, + { + "epoch": 5.953962980541053, + "grad_norm": 0.258215258494656, + "learning_rate": 1.42089650626236e-05, + "loss": 0.2995, + "step": 12545 + }, + { + "epoch": 5.956336022781206, + "grad_norm": 0.25537059125439665, + "learning_rate": 1.4192485168094924e-05, + "loss": 0.3084, + "step": 12550 + }, + { + "epoch": 5.958709065021357, + "grad_norm": 0.238147628742964, + "learning_rate": 1.4176005273566251e-05, + "loss": 0.2967, + "step": 12555 + }, + { + "epoch": 5.961082107261509, + "grad_norm": 0.24892107667380908, + "learning_rate": 1.4159525379037575e-05, + "loss": 0.3031, + "step": 12560 + }, + { + "epoch": 5.9634551495016614, + "grad_norm": 0.26977680778461377, + "learning_rate": 1.4143045484508898e-05, + "loss": 0.3059, + "step": 12565 + }, + { + "epoch": 5.965828191741813, + "grad_norm": 0.24661348831166724, + "learning_rate": 1.4126565589980225e-05, + "loss": 0.3081, + "step": 12570 + }, + { + "epoch": 5.968201233981965, + "grad_norm": 0.25498800748720973, + "learning_rate": 1.4110085695451549e-05, + "loss": 0.3087, + "step": 12575 + }, + { + "epoch": 5.970574276222116, + "grad_norm": 0.24824024037171108, + "learning_rate": 1.4093605800922874e-05, + "loss": 0.3075, + "step": 12580 + }, + { + "epoch": 5.972947318462269, + "grad_norm": 0.2500776158235274, + "learning_rate": 1.4077125906394202e-05, + "loss": 0.3133, + "step": 12585 + }, + { + "epoch": 5.975320360702421, + "grad_norm": 0.2526010220502338, + "learning_rate": 1.4060646011865525e-05, + "loss": 0.3048, + "step": 12590 + }, + { + "epoch": 5.977693402942572, + "grad_norm": 0.25153661176646713, + "learning_rate": 1.4044166117336849e-05, + "loss": 0.314, + "step": 12595 + }, + { + "epoch": 5.980066445182724, + "grad_norm": 0.2520662610744708, + "learning_rate": 1.4027686222808176e-05, + "loss": 0.306, + "step": 12600 + }, + { + "epoch": 5.982439487422877, + "grad_norm": 0.2653517729954087, + "learning_rate": 1.40112063282795e-05, + "loss": 0.3114, + "step": 12605 + }, + { + "epoch": 5.984812529663028, + "grad_norm": 0.2497136933591676, + "learning_rate": 1.3994726433750823e-05, + "loss": 0.3053, + "step": 12610 + }, + { + "epoch": 5.98718557190318, + "grad_norm": 0.24601771658941474, + "learning_rate": 1.397824653922215e-05, + "loss": 0.3028, + "step": 12615 + }, + { + "epoch": 5.9895586141433315, + "grad_norm": 0.2578822771473128, + "learning_rate": 1.3961766644693474e-05, + "loss": 0.306, + "step": 12620 + }, + { + "epoch": 5.991931656383484, + "grad_norm": 0.2619784676100039, + "learning_rate": 1.39452867501648e-05, + "loss": 0.3069, + "step": 12625 + }, + { + "epoch": 5.994304698623635, + "grad_norm": 0.27444856506185805, + "learning_rate": 1.3928806855636126e-05, + "loss": 0.3101, + "step": 12630 + }, + { + "epoch": 5.996677740863787, + "grad_norm": 0.26580243328991926, + "learning_rate": 1.391232696110745e-05, + "loss": 0.3063, + "step": 12635 + }, + { + "epoch": 5.9990507831039395, + "grad_norm": 0.26058758687012834, + "learning_rate": 1.3895847066578774e-05, + "loss": 0.3056, + "step": 12640 + }, + { + "epoch": 6.001423825344091, + "grad_norm": 0.2619261021188609, + "learning_rate": 1.38793671720501e-05, + "loss": 0.291, + "step": 12645 + }, + { + "epoch": 6.003796867584243, + "grad_norm": 0.29202601144424534, + "learning_rate": 1.3862887277521424e-05, + "loss": 0.2784, + "step": 12650 + }, + { + "epoch": 6.006169909824395, + "grad_norm": 0.2699355427682732, + "learning_rate": 1.3846407382992748e-05, + "loss": 0.2772, + "step": 12655 + }, + { + "epoch": 6.008542952064547, + "grad_norm": 0.27214978543531104, + "learning_rate": 1.3829927488464075e-05, + "loss": 0.2712, + "step": 12660 + }, + { + "epoch": 6.010915994304699, + "grad_norm": 0.2641418864555498, + "learning_rate": 1.38134475939354e-05, + "loss": 0.2729, + "step": 12665 + }, + { + "epoch": 6.01328903654485, + "grad_norm": 0.2574844298253142, + "learning_rate": 1.3796967699406724e-05, + "loss": 0.2671, + "step": 12670 + }, + { + "epoch": 6.015662078785002, + "grad_norm": 0.27537432852028076, + "learning_rate": 1.3780487804878051e-05, + "loss": 0.2826, + "step": 12675 + }, + { + "epoch": 6.018035121025155, + "grad_norm": 0.26730405157911136, + "learning_rate": 1.3764007910349375e-05, + "loss": 0.2717, + "step": 12680 + }, + { + "epoch": 6.020408163265306, + "grad_norm": 0.2662424024458208, + "learning_rate": 1.3747528015820698e-05, + "loss": 0.276, + "step": 12685 + }, + { + "epoch": 6.022781205505458, + "grad_norm": 0.26953978029678594, + "learning_rate": 1.3731048121292025e-05, + "loss": 0.2791, + "step": 12690 + }, + { + "epoch": 6.0251542477456095, + "grad_norm": 0.2601055942858109, + "learning_rate": 1.3714568226763349e-05, + "loss": 0.2754, + "step": 12695 + }, + { + "epoch": 6.027527289985762, + "grad_norm": 0.2635247636310206, + "learning_rate": 1.3698088332234675e-05, + "loss": 0.2716, + "step": 12700 + }, + { + "epoch": 6.029900332225914, + "grad_norm": 0.2532569505302218, + "learning_rate": 1.3681608437706e-05, + "loss": 0.2715, + "step": 12705 + }, + { + "epoch": 6.032273374466065, + "grad_norm": 0.26280946573789615, + "learning_rate": 1.3665128543177325e-05, + "loss": 0.2731, + "step": 12710 + }, + { + "epoch": 6.0346464167062175, + "grad_norm": 0.24793366510750411, + "learning_rate": 1.3648648648648649e-05, + "loss": 0.2701, + "step": 12715 + }, + { + "epoch": 6.037019458946369, + "grad_norm": 0.2717386399933173, + "learning_rate": 1.3632168754119976e-05, + "loss": 0.278, + "step": 12720 + }, + { + "epoch": 6.039392501186521, + "grad_norm": 0.2530734802317166, + "learning_rate": 1.36156888595913e-05, + "loss": 0.2699, + "step": 12725 + }, + { + "epoch": 6.041765543426673, + "grad_norm": 0.27056563789069926, + "learning_rate": 1.3599208965062623e-05, + "loss": 0.2775, + "step": 12730 + }, + { + "epoch": 6.044138585666825, + "grad_norm": 0.2643961640544192, + "learning_rate": 1.358272907053395e-05, + "loss": 0.2737, + "step": 12735 + }, + { + "epoch": 6.046511627906977, + "grad_norm": 0.28000276579128436, + "learning_rate": 1.3566249176005274e-05, + "loss": 0.2759, + "step": 12740 + }, + { + "epoch": 6.048884670147129, + "grad_norm": 0.2594467338783032, + "learning_rate": 1.35497692814766e-05, + "loss": 0.2798, + "step": 12745 + }, + { + "epoch": 6.05125771238728, + "grad_norm": 0.2685266544879852, + "learning_rate": 1.3533289386947925e-05, + "loss": 0.2737, + "step": 12750 + }, + { + "epoch": 6.053630754627433, + "grad_norm": 0.2695130089620387, + "learning_rate": 1.351680949241925e-05, + "loss": 0.2697, + "step": 12755 + }, + { + "epoch": 6.056003796867584, + "grad_norm": 0.25844499732588333, + "learning_rate": 1.3500329597890574e-05, + "loss": 0.2695, + "step": 12760 + }, + { + "epoch": 6.058376839107736, + "grad_norm": 0.26549202802050403, + "learning_rate": 1.34838497033619e-05, + "loss": 0.2793, + "step": 12765 + }, + { + "epoch": 6.060749881347888, + "grad_norm": 0.24630842195187846, + "learning_rate": 1.3467369808833224e-05, + "loss": 0.2665, + "step": 12770 + }, + { + "epoch": 6.06312292358804, + "grad_norm": 0.2601031025143973, + "learning_rate": 1.3450889914304548e-05, + "loss": 0.2848, + "step": 12775 + }, + { + "epoch": 6.065495965828192, + "grad_norm": 0.2558062422533462, + "learning_rate": 1.3434410019775875e-05, + "loss": 0.2744, + "step": 12780 + }, + { + "epoch": 6.067869008068343, + "grad_norm": 0.2539924453601287, + "learning_rate": 1.3417930125247199e-05, + "loss": 0.2693, + "step": 12785 + }, + { + "epoch": 6.0702420503084955, + "grad_norm": 0.27335788458565025, + "learning_rate": 1.3401450230718524e-05, + "loss": 0.2751, + "step": 12790 + }, + { + "epoch": 6.072615092548648, + "grad_norm": 0.2675782186566956, + "learning_rate": 1.3384970336189851e-05, + "loss": 0.2777, + "step": 12795 + }, + { + "epoch": 6.074988134788799, + "grad_norm": 0.26563639974569153, + "learning_rate": 1.3368490441661175e-05, + "loss": 0.2828, + "step": 12800 + }, + { + "epoch": 6.077361177028951, + "grad_norm": 0.25052753726992816, + "learning_rate": 1.3352010547132498e-05, + "loss": 0.2723, + "step": 12805 + }, + { + "epoch": 6.079734219269103, + "grad_norm": 0.2700291544508638, + "learning_rate": 1.3335530652603822e-05, + "loss": 0.2722, + "step": 12810 + }, + { + "epoch": 6.082107261509255, + "grad_norm": 0.27464743514237205, + "learning_rate": 1.331905075807515e-05, + "loss": 0.2693, + "step": 12815 + }, + { + "epoch": 6.084480303749407, + "grad_norm": 0.27130942568833855, + "learning_rate": 1.3302570863546473e-05, + "loss": 0.2761, + "step": 12820 + }, + { + "epoch": 6.086853345989558, + "grad_norm": 0.2549133599975755, + "learning_rate": 1.3286090969017798e-05, + "loss": 0.2811, + "step": 12825 + }, + { + "epoch": 6.089226388229711, + "grad_norm": 0.2707825318712284, + "learning_rate": 1.3269611074489125e-05, + "loss": 0.2821, + "step": 12830 + }, + { + "epoch": 6.091599430469862, + "grad_norm": 0.25793749867976307, + "learning_rate": 1.3253131179960449e-05, + "loss": 0.2739, + "step": 12835 + }, + { + "epoch": 6.093972472710014, + "grad_norm": 0.2579916331101544, + "learning_rate": 1.3236651285431773e-05, + "loss": 0.2707, + "step": 12840 + }, + { + "epoch": 6.096345514950166, + "grad_norm": 0.25215448649959465, + "learning_rate": 1.32201713909031e-05, + "loss": 0.2662, + "step": 12845 + }, + { + "epoch": 6.098718557190318, + "grad_norm": 0.2525396951712229, + "learning_rate": 1.3203691496374423e-05, + "loss": 0.273, + "step": 12850 + }, + { + "epoch": 6.10109159943047, + "grad_norm": 0.26206745714570173, + "learning_rate": 1.3187211601845747e-05, + "loss": 0.2682, + "step": 12855 + }, + { + "epoch": 6.103464641670621, + "grad_norm": 0.25778707264097706, + "learning_rate": 1.3170731707317074e-05, + "loss": 0.2825, + "step": 12860 + }, + { + "epoch": 6.1058376839107735, + "grad_norm": 0.26758110897452564, + "learning_rate": 1.31542518127884e-05, + "loss": 0.2851, + "step": 12865 + }, + { + "epoch": 6.108210726150926, + "grad_norm": 0.2530984103593092, + "learning_rate": 1.3137771918259723e-05, + "loss": 0.273, + "step": 12870 + }, + { + "epoch": 6.110583768391077, + "grad_norm": 0.25299634210777183, + "learning_rate": 1.312129202373105e-05, + "loss": 0.2758, + "step": 12875 + }, + { + "epoch": 6.112956810631229, + "grad_norm": 0.2609344559503069, + "learning_rate": 1.3104812129202374e-05, + "loss": 0.2701, + "step": 12880 + }, + { + "epoch": 6.1153298528713815, + "grad_norm": 0.25528708209924905, + "learning_rate": 1.3088332234673697e-05, + "loss": 0.2779, + "step": 12885 + }, + { + "epoch": 6.117702895111533, + "grad_norm": 0.26177967887156406, + "learning_rate": 1.3071852340145024e-05, + "loss": 0.277, + "step": 12890 + }, + { + "epoch": 6.120075937351685, + "grad_norm": 0.27512520635743437, + "learning_rate": 1.3055372445616348e-05, + "loss": 0.2766, + "step": 12895 + }, + { + "epoch": 6.122448979591836, + "grad_norm": 0.2868447706908931, + "learning_rate": 1.3038892551087672e-05, + "loss": 0.2733, + "step": 12900 + }, + { + "epoch": 6.124822021831989, + "grad_norm": 0.25256196212296933, + "learning_rate": 1.3022412656558999e-05, + "loss": 0.2733, + "step": 12905 + }, + { + "epoch": 6.127195064072141, + "grad_norm": 0.2672799396280616, + "learning_rate": 1.3005932762030324e-05, + "loss": 0.2708, + "step": 12910 + }, + { + "epoch": 6.129568106312292, + "grad_norm": 0.25617344498635275, + "learning_rate": 1.2989452867501648e-05, + "loss": 0.2714, + "step": 12915 + }, + { + "epoch": 6.131941148552444, + "grad_norm": 0.26207321709396286, + "learning_rate": 1.2972972972972975e-05, + "loss": 0.2708, + "step": 12920 + }, + { + "epoch": 6.134314190792596, + "grad_norm": 0.2697775863398816, + "learning_rate": 1.2956493078444298e-05, + "loss": 0.2727, + "step": 12925 + }, + { + "epoch": 6.136687233032748, + "grad_norm": 0.2570679714656289, + "learning_rate": 1.2940013183915622e-05, + "loss": 0.271, + "step": 12930 + }, + { + "epoch": 6.1390602752729, + "grad_norm": 0.2710884723603348, + "learning_rate": 1.292353328938695e-05, + "loss": 0.2733, + "step": 12935 + }, + { + "epoch": 6.1414333175130515, + "grad_norm": 0.26425865250930786, + "learning_rate": 1.2907053394858273e-05, + "loss": 0.2732, + "step": 12940 + }, + { + "epoch": 6.143806359753204, + "grad_norm": 0.2541288687743152, + "learning_rate": 1.2890573500329598e-05, + "loss": 0.2748, + "step": 12945 + }, + { + "epoch": 6.146179401993355, + "grad_norm": 0.2653843601389536, + "learning_rate": 1.2874093605800924e-05, + "loss": 0.2837, + "step": 12950 + }, + { + "epoch": 6.148552444233507, + "grad_norm": 0.25636630326029136, + "learning_rate": 1.2857613711272249e-05, + "loss": 0.2779, + "step": 12955 + }, + { + "epoch": 6.1509254864736596, + "grad_norm": 0.2660385873522577, + "learning_rate": 1.2841133816743573e-05, + "loss": 0.2783, + "step": 12960 + }, + { + "epoch": 6.153298528713811, + "grad_norm": 0.2638794121739543, + "learning_rate": 1.28246539222149e-05, + "loss": 0.2699, + "step": 12965 + }, + { + "epoch": 6.155671570953963, + "grad_norm": 0.26200756668035047, + "learning_rate": 1.2808174027686223e-05, + "loss": 0.2713, + "step": 12970 + }, + { + "epoch": 6.1580446131941144, + "grad_norm": 0.25832187347456054, + "learning_rate": 1.2791694133157547e-05, + "loss": 0.2726, + "step": 12975 + }, + { + "epoch": 6.160417655434267, + "grad_norm": 0.27186963158558985, + "learning_rate": 1.2775214238628874e-05, + "loss": 0.2817, + "step": 12980 + }, + { + "epoch": 6.162790697674419, + "grad_norm": 0.2654694543804701, + "learning_rate": 1.2758734344100198e-05, + "loss": 0.2748, + "step": 12985 + }, + { + "epoch": 6.16516373991457, + "grad_norm": 0.253139199869618, + "learning_rate": 1.2742254449571523e-05, + "loss": 0.2753, + "step": 12990 + }, + { + "epoch": 6.1675367821547225, + "grad_norm": 0.2623751531783195, + "learning_rate": 1.272577455504285e-05, + "loss": 0.2672, + "step": 12995 + }, + { + "epoch": 6.169909824394875, + "grad_norm": 0.2662250392972506, + "learning_rate": 1.2709294660514174e-05, + "loss": 0.2835, + "step": 13000 + }, + { + "epoch": 6.172282866635026, + "grad_norm": 0.26966873959265975, + "learning_rate": 1.2692814765985497e-05, + "loss": 0.2713, + "step": 13005 + }, + { + "epoch": 6.174655908875178, + "grad_norm": 0.2583046779626439, + "learning_rate": 1.2676334871456824e-05, + "loss": 0.2703, + "step": 13010 + }, + { + "epoch": 6.17702895111533, + "grad_norm": 0.26010165190634715, + "learning_rate": 1.2659854976928148e-05, + "loss": 0.2737, + "step": 13015 + }, + { + "epoch": 6.179401993355482, + "grad_norm": 0.2597362409854531, + "learning_rate": 1.2643375082399472e-05, + "loss": 0.2742, + "step": 13020 + }, + { + "epoch": 6.181775035595634, + "grad_norm": 0.25853037957550945, + "learning_rate": 1.2626895187870799e-05, + "loss": 0.2802, + "step": 13025 + }, + { + "epoch": 6.184148077835785, + "grad_norm": 0.25818403140785057, + "learning_rate": 1.2610415293342122e-05, + "loss": 0.2754, + "step": 13030 + }, + { + "epoch": 6.186521120075938, + "grad_norm": 0.25304495703209345, + "learning_rate": 1.2593935398813448e-05, + "loss": 0.2665, + "step": 13035 + }, + { + "epoch": 6.188894162316089, + "grad_norm": 0.265973216122964, + "learning_rate": 1.2577455504284775e-05, + "loss": 0.2715, + "step": 13040 + }, + { + "epoch": 6.191267204556241, + "grad_norm": 0.2663039854024612, + "learning_rate": 1.2560975609756098e-05, + "loss": 0.2677, + "step": 13045 + }, + { + "epoch": 6.193640246796393, + "grad_norm": 0.26933956307574475, + "learning_rate": 1.2544495715227422e-05, + "loss": 0.2706, + "step": 13050 + }, + { + "epoch": 6.196013289036545, + "grad_norm": 0.25909946246288257, + "learning_rate": 1.252801582069875e-05, + "loss": 0.2786, + "step": 13055 + }, + { + "epoch": 6.198386331276697, + "grad_norm": 0.26214197027997815, + "learning_rate": 1.2511535926170073e-05, + "loss": 0.2692, + "step": 13060 + }, + { + "epoch": 6.200759373516848, + "grad_norm": 0.26264006847890425, + "learning_rate": 1.2495056031641398e-05, + "loss": 0.2788, + "step": 13065 + }, + { + "epoch": 6.2031324157570005, + "grad_norm": 0.2683023398105367, + "learning_rate": 1.2478576137112724e-05, + "loss": 0.2775, + "step": 13070 + }, + { + "epoch": 6.205505457997153, + "grad_norm": 0.25596208058484665, + "learning_rate": 1.2462096242584049e-05, + "loss": 0.2704, + "step": 13075 + }, + { + "epoch": 6.207878500237304, + "grad_norm": 0.2742285402420324, + "learning_rate": 1.2445616348055374e-05, + "loss": 0.2852, + "step": 13080 + }, + { + "epoch": 6.210251542477456, + "grad_norm": 0.26148439663123674, + "learning_rate": 1.2429136453526698e-05, + "loss": 0.2795, + "step": 13085 + }, + { + "epoch": 6.212624584717608, + "grad_norm": 0.2587288825404962, + "learning_rate": 1.2412656558998023e-05, + "loss": 0.2791, + "step": 13090 + }, + { + "epoch": 6.21499762695776, + "grad_norm": 0.2583934075723974, + "learning_rate": 1.2396176664469349e-05, + "loss": 0.2766, + "step": 13095 + }, + { + "epoch": 6.217370669197912, + "grad_norm": 0.2714674197540486, + "learning_rate": 1.2379696769940672e-05, + "loss": 0.2742, + "step": 13100 + }, + { + "epoch": 6.219743711438063, + "grad_norm": 0.2646822849937861, + "learning_rate": 1.2363216875411998e-05, + "loss": 0.2795, + "step": 13105 + }, + { + "epoch": 6.222116753678216, + "grad_norm": 0.2714996043727222, + "learning_rate": 1.2346736980883323e-05, + "loss": 0.2794, + "step": 13110 + }, + { + "epoch": 6.224489795918367, + "grad_norm": 0.2639531196838772, + "learning_rate": 1.2330257086354648e-05, + "loss": 0.274, + "step": 13115 + }, + { + "epoch": 6.226862838158519, + "grad_norm": 0.26575579124131166, + "learning_rate": 1.2313777191825974e-05, + "loss": 0.2799, + "step": 13120 + }, + { + "epoch": 6.229235880398671, + "grad_norm": 0.2631616468233482, + "learning_rate": 1.2297297297297299e-05, + "loss": 0.2739, + "step": 13125 + }, + { + "epoch": 6.231608922638823, + "grad_norm": 0.2528934841882479, + "learning_rate": 1.2280817402768623e-05, + "loss": 0.284, + "step": 13130 + }, + { + "epoch": 6.233981964878975, + "grad_norm": 0.2581987714281916, + "learning_rate": 1.2264337508239948e-05, + "loss": 0.2814, + "step": 13135 + }, + { + "epoch": 6.236355007119126, + "grad_norm": 0.2671345709658211, + "learning_rate": 1.2247857613711272e-05, + "loss": 0.272, + "step": 13140 + }, + { + "epoch": 6.2387280493592785, + "grad_norm": 0.2581634279217954, + "learning_rate": 1.2231377719182597e-05, + "loss": 0.2825, + "step": 13145 + }, + { + "epoch": 6.241101091599431, + "grad_norm": 0.25761069427177574, + "learning_rate": 1.2214897824653922e-05, + "loss": 0.2747, + "step": 13150 + }, + { + "epoch": 6.243474133839582, + "grad_norm": 0.26092308546992377, + "learning_rate": 1.2198417930125248e-05, + "loss": 0.2768, + "step": 13155 + }, + { + "epoch": 6.245847176079734, + "grad_norm": 0.2625464727556574, + "learning_rate": 1.2181938035596573e-05, + "loss": 0.2787, + "step": 13160 + }, + { + "epoch": 6.2482202183198865, + "grad_norm": 0.2623598797111995, + "learning_rate": 1.2165458141067898e-05, + "loss": 0.2797, + "step": 13165 + }, + { + "epoch": 6.250593260560038, + "grad_norm": 0.2648116652408699, + "learning_rate": 1.2148978246539222e-05, + "loss": 0.2797, + "step": 13170 + }, + { + "epoch": 6.25296630280019, + "grad_norm": 0.2684283251051956, + "learning_rate": 1.2132498352010548e-05, + "loss": 0.2719, + "step": 13175 + }, + { + "epoch": 6.255339345040341, + "grad_norm": 0.2735297314474532, + "learning_rate": 1.2116018457481873e-05, + "loss": 0.2788, + "step": 13180 + }, + { + "epoch": 6.257712387280494, + "grad_norm": 0.26202837410166163, + "learning_rate": 1.2099538562953197e-05, + "loss": 0.2784, + "step": 13185 + }, + { + "epoch": 6.260085429520646, + "grad_norm": 0.2591246763533983, + "learning_rate": 1.2083058668424522e-05, + "loss": 0.2754, + "step": 13190 + }, + { + "epoch": 6.262458471760797, + "grad_norm": 0.262461282643583, + "learning_rate": 1.2066578773895847e-05, + "loss": 0.2853, + "step": 13195 + }, + { + "epoch": 6.264831514000949, + "grad_norm": 0.2626519190674011, + "learning_rate": 1.2050098879367173e-05, + "loss": 0.2823, + "step": 13200 + }, + { + "epoch": 6.267204556241101, + "grad_norm": 0.25800521589689585, + "learning_rate": 1.2033618984838498e-05, + "loss": 0.2776, + "step": 13205 + }, + { + "epoch": 6.269577598481253, + "grad_norm": 0.2615063389709898, + "learning_rate": 1.2017139090309823e-05, + "loss": 0.2815, + "step": 13210 + }, + { + "epoch": 6.271950640721405, + "grad_norm": 0.267569448069995, + "learning_rate": 1.2000659195781147e-05, + "loss": 0.2777, + "step": 13215 + }, + { + "epoch": 6.2743236829615565, + "grad_norm": 0.26241224907373456, + "learning_rate": 1.1984179301252472e-05, + "loss": 0.2761, + "step": 13220 + }, + { + "epoch": 6.276696725201709, + "grad_norm": 0.2652472637199463, + "learning_rate": 1.1967699406723798e-05, + "loss": 0.2845, + "step": 13225 + }, + { + "epoch": 6.27906976744186, + "grad_norm": 0.2575384881197694, + "learning_rate": 1.1951219512195121e-05, + "loss": 0.2783, + "step": 13230 + }, + { + "epoch": 6.281442809682012, + "grad_norm": 0.26784978964261386, + "learning_rate": 1.1934739617666447e-05, + "loss": 0.279, + "step": 13235 + }, + { + "epoch": 6.2838158519221645, + "grad_norm": 0.25909181537951337, + "learning_rate": 1.1918259723137774e-05, + "loss": 0.2699, + "step": 13240 + }, + { + "epoch": 6.286188894162316, + "grad_norm": 0.26042165501566145, + "learning_rate": 1.1901779828609097e-05, + "loss": 0.2748, + "step": 13245 + }, + { + "epoch": 6.288561936402468, + "grad_norm": 0.25998052338033023, + "learning_rate": 1.1885299934080423e-05, + "loss": 0.282, + "step": 13250 + }, + { + "epoch": 6.290934978642619, + "grad_norm": 0.2622058918584371, + "learning_rate": 1.1868820039551748e-05, + "loss": 0.2801, + "step": 13255 + }, + { + "epoch": 6.293308020882772, + "grad_norm": 0.24888069642931737, + "learning_rate": 1.1852340145023072e-05, + "loss": 0.2733, + "step": 13260 + }, + { + "epoch": 6.295681063122924, + "grad_norm": 0.25724284326426855, + "learning_rate": 1.1835860250494397e-05, + "loss": 0.2777, + "step": 13265 + }, + { + "epoch": 6.298054105363075, + "grad_norm": 0.2745465926732081, + "learning_rate": 1.1819380355965722e-05, + "loss": 0.2805, + "step": 13270 + }, + { + "epoch": 6.300427147603227, + "grad_norm": 0.26114535318125487, + "learning_rate": 1.1802900461437048e-05, + "loss": 0.2782, + "step": 13275 + }, + { + "epoch": 6.30280018984338, + "grad_norm": 0.27500392605227125, + "learning_rate": 1.1786420566908373e-05, + "loss": 0.2756, + "step": 13280 + }, + { + "epoch": 6.305173232083531, + "grad_norm": 0.2607507538779071, + "learning_rate": 1.1769940672379699e-05, + "loss": 0.2764, + "step": 13285 + }, + { + "epoch": 6.307546274323683, + "grad_norm": 0.28402166738267903, + "learning_rate": 1.1753460777851022e-05, + "loss": 0.2751, + "step": 13290 + }, + { + "epoch": 6.3099193165638345, + "grad_norm": 0.2709456296236999, + "learning_rate": 1.1736980883322348e-05, + "loss": 0.2736, + "step": 13295 + }, + { + "epoch": 6.312292358803987, + "grad_norm": 0.25490482016201355, + "learning_rate": 1.1720500988793673e-05, + "loss": 0.2682, + "step": 13300 + }, + { + "epoch": 6.314665401044139, + "grad_norm": 0.2529195839387415, + "learning_rate": 1.1704021094264997e-05, + "loss": 0.28, + "step": 13305 + }, + { + "epoch": 6.31703844328429, + "grad_norm": 0.25349077371712014, + "learning_rate": 1.1687541199736322e-05, + "loss": 0.2698, + "step": 13310 + }, + { + "epoch": 6.3194114855244425, + "grad_norm": 0.2767701149799624, + "learning_rate": 1.1671061305207647e-05, + "loss": 0.2787, + "step": 13315 + }, + { + "epoch": 6.321784527764594, + "grad_norm": 0.26432650743813174, + "learning_rate": 1.1654581410678973e-05, + "loss": 0.2695, + "step": 13320 + }, + { + "epoch": 6.324157570004746, + "grad_norm": 0.26063614314694006, + "learning_rate": 1.1638101516150298e-05, + "loss": 0.2742, + "step": 13325 + }, + { + "epoch": 6.326530612244898, + "grad_norm": 0.2658867409794243, + "learning_rate": 1.1621621621621623e-05, + "loss": 0.273, + "step": 13330 + }, + { + "epoch": 6.32890365448505, + "grad_norm": 0.2546913555066743, + "learning_rate": 1.1605141727092947e-05, + "loss": 0.2731, + "step": 13335 + }, + { + "epoch": 6.331276696725202, + "grad_norm": 0.2544054621644839, + "learning_rate": 1.1588661832564272e-05, + "loss": 0.2764, + "step": 13340 + }, + { + "epoch": 6.333649738965353, + "grad_norm": 0.27485886447915975, + "learning_rate": 1.1572181938035598e-05, + "loss": 0.2775, + "step": 13345 + }, + { + "epoch": 6.336022781205505, + "grad_norm": 0.26723797927552223, + "learning_rate": 1.1555702043506921e-05, + "loss": 0.2735, + "step": 13350 + }, + { + "epoch": 6.338395823445658, + "grad_norm": 0.2489988654169891, + "learning_rate": 1.1539222148978247e-05, + "loss": 0.2792, + "step": 13355 + }, + { + "epoch": 6.340768865685809, + "grad_norm": 0.2558377696188189, + "learning_rate": 1.1522742254449572e-05, + "loss": 0.2722, + "step": 13360 + }, + { + "epoch": 6.343141907925961, + "grad_norm": 0.2759020643250481, + "learning_rate": 1.1506262359920897e-05, + "loss": 0.2805, + "step": 13365 + }, + { + "epoch": 6.3455149501661126, + "grad_norm": 0.26647906440570523, + "learning_rate": 1.1489782465392223e-05, + "loss": 0.2746, + "step": 13370 + }, + { + "epoch": 6.347887992406265, + "grad_norm": 0.2611531515033332, + "learning_rate": 1.1473302570863546e-05, + "loss": 0.2877, + "step": 13375 + }, + { + "epoch": 6.350261034646417, + "grad_norm": 0.263400618450495, + "learning_rate": 1.1456822676334872e-05, + "loss": 0.2803, + "step": 13380 + }, + { + "epoch": 6.352634076886568, + "grad_norm": 0.27116323540405274, + "learning_rate": 1.1440342781806197e-05, + "loss": 0.2751, + "step": 13385 + }, + { + "epoch": 6.355007119126721, + "grad_norm": 0.2637000143075081, + "learning_rate": 1.142386288727752e-05, + "loss": 0.2825, + "step": 13390 + }, + { + "epoch": 6.357380161366873, + "grad_norm": 0.2549539655577838, + "learning_rate": 1.1407382992748846e-05, + "loss": 0.2725, + "step": 13395 + }, + { + "epoch": 6.359753203607024, + "grad_norm": 0.25607319483425506, + "learning_rate": 1.1390903098220171e-05, + "loss": 0.2803, + "step": 13400 + }, + { + "epoch": 6.362126245847176, + "grad_norm": 0.24761533811763634, + "learning_rate": 1.1374423203691497e-05, + "loss": 0.2798, + "step": 13405 + }, + { + "epoch": 6.364499288087328, + "grad_norm": 0.25105297690799994, + "learning_rate": 1.1357943309162822e-05, + "loss": 0.2728, + "step": 13410 + }, + { + "epoch": 6.36687233032748, + "grad_norm": 0.2582096154258011, + "learning_rate": 1.1341463414634148e-05, + "loss": 0.2775, + "step": 13415 + }, + { + "epoch": 6.369245372567631, + "grad_norm": 0.25599093339059975, + "learning_rate": 1.1324983520105471e-05, + "loss": 0.2798, + "step": 13420 + }, + { + "epoch": 6.3716184148077835, + "grad_norm": 0.2531642681259146, + "learning_rate": 1.1308503625576797e-05, + "loss": 0.2786, + "step": 13425 + }, + { + "epoch": 6.373991457047936, + "grad_norm": 0.25385081111138474, + "learning_rate": 1.1292023731048122e-05, + "loss": 0.2853, + "step": 13430 + }, + { + "epoch": 6.376364499288087, + "grad_norm": 0.2515843515263871, + "learning_rate": 1.1275543836519446e-05, + "loss": 0.2677, + "step": 13435 + }, + { + "epoch": 6.378737541528239, + "grad_norm": 0.26391923629776487, + "learning_rate": 1.1259063941990771e-05, + "loss": 0.2857, + "step": 13440 + }, + { + "epoch": 6.3811105837683915, + "grad_norm": 0.27857230664196014, + "learning_rate": 1.1242584047462098e-05, + "loss": 0.2832, + "step": 13445 + }, + { + "epoch": 6.383483626008543, + "grad_norm": 0.2733602038238236, + "learning_rate": 1.1226104152933422e-05, + "loss": 0.2791, + "step": 13450 + }, + { + "epoch": 6.385856668248695, + "grad_norm": 0.265102084820993, + "learning_rate": 1.1209624258404747e-05, + "loss": 0.278, + "step": 13455 + }, + { + "epoch": 6.388229710488846, + "grad_norm": 0.2666772971847975, + "learning_rate": 1.1193144363876072e-05, + "loss": 0.2733, + "step": 13460 + }, + { + "epoch": 6.390602752728999, + "grad_norm": 0.26447501415829144, + "learning_rate": 1.1176664469347396e-05, + "loss": 0.2727, + "step": 13465 + }, + { + "epoch": 6.392975794969151, + "grad_norm": 0.2504568721179992, + "learning_rate": 1.1160184574818721e-05, + "loss": 0.2781, + "step": 13470 + }, + { + "epoch": 6.395348837209302, + "grad_norm": 0.2632911131141635, + "learning_rate": 1.1143704680290047e-05, + "loss": 0.2813, + "step": 13475 + }, + { + "epoch": 6.397721879449454, + "grad_norm": 0.2582364733052907, + "learning_rate": 1.1127224785761372e-05, + "loss": 0.288, + "step": 13480 + }, + { + "epoch": 6.400094921689606, + "grad_norm": 0.2636079827885384, + "learning_rate": 1.1110744891232697e-05, + "loss": 0.2743, + "step": 13485 + }, + { + "epoch": 6.402467963929758, + "grad_norm": 0.25896221241858347, + "learning_rate": 1.1094264996704023e-05, + "loss": 0.2721, + "step": 13490 + }, + { + "epoch": 6.40484100616991, + "grad_norm": 0.2631230300956152, + "learning_rate": 1.1077785102175346e-05, + "loss": 0.2805, + "step": 13495 + }, + { + "epoch": 6.4072140484100615, + "grad_norm": 0.2519499315262575, + "learning_rate": 1.1061305207646672e-05, + "loss": 0.2726, + "step": 13500 + }, + { + "epoch": 6.409587090650214, + "grad_norm": 0.2700969381919214, + "learning_rate": 1.1044825313117997e-05, + "loss": 0.2785, + "step": 13505 + }, + { + "epoch": 6.411960132890365, + "grad_norm": 0.26877792244969806, + "learning_rate": 1.102834541858932e-05, + "loss": 0.2793, + "step": 13510 + }, + { + "epoch": 6.414333175130517, + "grad_norm": 0.2586405368237331, + "learning_rate": 1.1011865524060646e-05, + "loss": 0.2824, + "step": 13515 + }, + { + "epoch": 6.4167062173706695, + "grad_norm": 0.24558980365407138, + "learning_rate": 1.0995385629531971e-05, + "loss": 0.2772, + "step": 13520 + }, + { + "epoch": 6.419079259610821, + "grad_norm": 0.26031436811964603, + "learning_rate": 1.0978905735003297e-05, + "loss": 0.2813, + "step": 13525 + }, + { + "epoch": 6.421452301850973, + "grad_norm": 0.26043830468436435, + "learning_rate": 1.0962425840474622e-05, + "loss": 0.273, + "step": 13530 + }, + { + "epoch": 6.423825344091124, + "grad_norm": 0.27277986239798085, + "learning_rate": 1.0945945945945948e-05, + "loss": 0.2797, + "step": 13535 + }, + { + "epoch": 6.426198386331277, + "grad_norm": 0.2621585522416006, + "learning_rate": 1.0929466051417271e-05, + "loss": 0.2693, + "step": 13540 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 0.2589633454310037, + "learning_rate": 1.0912986156888597e-05, + "loss": 0.2769, + "step": 13545 + }, + { + "epoch": 6.43094447081158, + "grad_norm": 0.2579611240048613, + "learning_rate": 1.0896506262359922e-05, + "loss": 0.2737, + "step": 13550 + }, + { + "epoch": 6.433317513051732, + "grad_norm": 0.2542810821955046, + "learning_rate": 1.0880026367831246e-05, + "loss": 0.2763, + "step": 13555 + }, + { + "epoch": 6.435690555291885, + "grad_norm": 0.27057510458431816, + "learning_rate": 1.0863546473302571e-05, + "loss": 0.278, + "step": 13560 + }, + { + "epoch": 6.438063597532036, + "grad_norm": 0.2569080253561844, + "learning_rate": 1.0847066578773896e-05, + "loss": 0.2736, + "step": 13565 + }, + { + "epoch": 6.440436639772188, + "grad_norm": 0.25903483778090763, + "learning_rate": 1.0830586684245222e-05, + "loss": 0.2715, + "step": 13570 + }, + { + "epoch": 6.4428096820123395, + "grad_norm": 0.26922974872437855, + "learning_rate": 1.0814106789716547e-05, + "loss": 0.2779, + "step": 13575 + }, + { + "epoch": 6.445182724252492, + "grad_norm": 0.2732587869541603, + "learning_rate": 1.079762689518787e-05, + "loss": 0.2855, + "step": 13580 + }, + { + "epoch": 6.447555766492644, + "grad_norm": 0.2472760460436152, + "learning_rate": 1.0781147000659196e-05, + "loss": 0.2733, + "step": 13585 + }, + { + "epoch": 6.449928808732795, + "grad_norm": 0.264075387443221, + "learning_rate": 1.0764667106130521e-05, + "loss": 0.2764, + "step": 13590 + }, + { + "epoch": 6.4523018509729475, + "grad_norm": 0.2754878786193583, + "learning_rate": 1.0748187211601845e-05, + "loss": 0.2854, + "step": 13595 + }, + { + "epoch": 6.454674893213099, + "grad_norm": 0.2546592845575765, + "learning_rate": 1.073170731707317e-05, + "loss": 0.2849, + "step": 13600 + }, + { + "epoch": 6.457047935453251, + "grad_norm": 0.24829313446810714, + "learning_rate": 1.0715227422544496e-05, + "loss": 0.2747, + "step": 13605 + }, + { + "epoch": 6.459420977693403, + "grad_norm": 0.2550198419508802, + "learning_rate": 1.0698747528015821e-05, + "loss": 0.2761, + "step": 13610 + }, + { + "epoch": 6.461794019933555, + "grad_norm": 0.25816143795406876, + "learning_rate": 1.0682267633487146e-05, + "loss": 0.2803, + "step": 13615 + }, + { + "epoch": 6.464167062173707, + "grad_norm": 0.24649114745632797, + "learning_rate": 1.0665787738958472e-05, + "loss": 0.2778, + "step": 13620 + }, + { + "epoch": 6.466540104413858, + "grad_norm": 0.2685413269353122, + "learning_rate": 1.0649307844429795e-05, + "loss": 0.2843, + "step": 13625 + }, + { + "epoch": 6.46891314665401, + "grad_norm": 0.2770979799640407, + "learning_rate": 1.063282794990112e-05, + "loss": 0.2716, + "step": 13630 + }, + { + "epoch": 6.471286188894163, + "grad_norm": 0.2691875735964228, + "learning_rate": 1.0616348055372446e-05, + "loss": 0.2852, + "step": 13635 + }, + { + "epoch": 6.473659231134314, + "grad_norm": 0.26569083204177263, + "learning_rate": 1.059986816084377e-05, + "loss": 0.2781, + "step": 13640 + }, + { + "epoch": 6.476032273374466, + "grad_norm": 0.26663197554449697, + "learning_rate": 1.0583388266315095e-05, + "loss": 0.283, + "step": 13645 + }, + { + "epoch": 6.4784053156146175, + "grad_norm": 0.2618085223699059, + "learning_rate": 1.0566908371786422e-05, + "loss": 0.2757, + "step": 13650 + }, + { + "epoch": 6.48077835785477, + "grad_norm": 0.25183896012470885, + "learning_rate": 1.0550428477257746e-05, + "loss": 0.2796, + "step": 13655 + }, + { + "epoch": 6.483151400094922, + "grad_norm": 0.25186792691935206, + "learning_rate": 1.0533948582729071e-05, + "loss": 0.2741, + "step": 13660 + }, + { + "epoch": 6.485524442335073, + "grad_norm": 0.2629694081247523, + "learning_rate": 1.0517468688200397e-05, + "loss": 0.2774, + "step": 13665 + }, + { + "epoch": 6.4878974845752255, + "grad_norm": 0.2756724007545849, + "learning_rate": 1.050098879367172e-05, + "loss": 0.2811, + "step": 13670 + }, + { + "epoch": 6.490270526815378, + "grad_norm": 0.2730095330478218, + "learning_rate": 1.0484508899143046e-05, + "loss": 0.2755, + "step": 13675 + }, + { + "epoch": 6.492643569055529, + "grad_norm": 0.25740760158731935, + "learning_rate": 1.0468029004614371e-05, + "loss": 0.2787, + "step": 13680 + }, + { + "epoch": 6.495016611295681, + "grad_norm": 0.25246620582969337, + "learning_rate": 1.0451549110085696e-05, + "loss": 0.2805, + "step": 13685 + }, + { + "epoch": 6.497389653535833, + "grad_norm": 0.257627655606574, + "learning_rate": 1.0435069215557022e-05, + "loss": 0.2736, + "step": 13690 + }, + { + "epoch": 6.499762695775985, + "grad_norm": 0.2583282760358387, + "learning_rate": 1.0418589321028347e-05, + "loss": 0.2817, + "step": 13695 + }, + { + "epoch": 6.502135738016136, + "grad_norm": 0.25453882803403727, + "learning_rate": 1.040210942649967e-05, + "loss": 0.2817, + "step": 13700 + }, + { + "epoch": 6.504508780256288, + "grad_norm": 0.25592538170222523, + "learning_rate": 1.0385629531970996e-05, + "loss": 0.2833, + "step": 13705 + }, + { + "epoch": 6.506881822496441, + "grad_norm": 0.2528467788582194, + "learning_rate": 1.0369149637442321e-05, + "loss": 0.2789, + "step": 13710 + }, + { + "epoch": 6.509254864736592, + "grad_norm": 0.2609058995493899, + "learning_rate": 1.0352669742913645e-05, + "loss": 0.2739, + "step": 13715 + }, + { + "epoch": 6.511627906976744, + "grad_norm": 0.25223469199714116, + "learning_rate": 1.033618984838497e-05, + "loss": 0.2777, + "step": 13720 + }, + { + "epoch": 6.514000949216896, + "grad_norm": 0.26829936236995394, + "learning_rate": 1.0319709953856296e-05, + "loss": 0.2748, + "step": 13725 + }, + { + "epoch": 6.516373991457048, + "grad_norm": 0.24706636230464635, + "learning_rate": 1.0303230059327621e-05, + "loss": 0.2726, + "step": 13730 + }, + { + "epoch": 6.5187470336972, + "grad_norm": 0.2580864466884645, + "learning_rate": 1.0286750164798946e-05, + "loss": 0.2809, + "step": 13735 + }, + { + "epoch": 6.521120075937351, + "grad_norm": 0.2655520382820176, + "learning_rate": 1.0270270270270272e-05, + "loss": 0.2777, + "step": 13740 + }, + { + "epoch": 6.5234931181775035, + "grad_norm": 0.2567684725975119, + "learning_rate": 1.0253790375741595e-05, + "loss": 0.28, + "step": 13745 + }, + { + "epoch": 6.525866160417656, + "grad_norm": 0.26018178039025774, + "learning_rate": 1.023731048121292e-05, + "loss": 0.2822, + "step": 13750 + }, + { + "epoch": 6.528239202657807, + "grad_norm": 0.26322581421891406, + "learning_rate": 1.0220830586684246e-05, + "loss": 0.2767, + "step": 13755 + }, + { + "epoch": 6.530612244897959, + "grad_norm": 0.25904964519723495, + "learning_rate": 1.020435069215557e-05, + "loss": 0.2833, + "step": 13760 + }, + { + "epoch": 6.532985287138111, + "grad_norm": 0.27033416965633345, + "learning_rate": 1.0187870797626895e-05, + "loss": 0.2784, + "step": 13765 + }, + { + "epoch": 6.535358329378263, + "grad_norm": 0.2689142860740617, + "learning_rate": 1.017139090309822e-05, + "loss": 0.28, + "step": 13770 + }, + { + "epoch": 6.537731371618415, + "grad_norm": 0.2474414717723568, + "learning_rate": 1.0154911008569546e-05, + "loss": 0.2733, + "step": 13775 + }, + { + "epoch": 6.5401044138585664, + "grad_norm": 0.2700157584372864, + "learning_rate": 1.0138431114040871e-05, + "loss": 0.2746, + "step": 13780 + }, + { + "epoch": 6.542477456098719, + "grad_norm": 0.2777902252800762, + "learning_rate": 1.0121951219512197e-05, + "loss": 0.2794, + "step": 13785 + }, + { + "epoch": 6.544850498338871, + "grad_norm": 0.2545544535170523, + "learning_rate": 1.010547132498352e-05, + "loss": 0.272, + "step": 13790 + }, + { + "epoch": 6.547223540579022, + "grad_norm": 0.2564873715812807, + "learning_rate": 1.0088991430454846e-05, + "loss": 0.276, + "step": 13795 + }, + { + "epoch": 6.5495965828191745, + "grad_norm": 0.27006107075533875, + "learning_rate": 1.007251153592617e-05, + "loss": 0.2767, + "step": 13800 + }, + { + "epoch": 6.551969625059326, + "grad_norm": 0.2621828176473246, + "learning_rate": 1.0056031641397495e-05, + "loss": 0.2816, + "step": 13805 + }, + { + "epoch": 6.554342667299478, + "grad_norm": 0.25892788828235164, + "learning_rate": 1.003955174686882e-05, + "loss": 0.2822, + "step": 13810 + }, + { + "epoch": 6.556715709539629, + "grad_norm": 0.2586916824320289, + "learning_rate": 1.0023071852340145e-05, + "loss": 0.2798, + "step": 13815 + }, + { + "epoch": 6.559088751779782, + "grad_norm": 0.25543125067176997, + "learning_rate": 1.000659195781147e-05, + "loss": 0.2798, + "step": 13820 + }, + { + "epoch": 6.561461794019934, + "grad_norm": 0.27605868715900267, + "learning_rate": 9.990112063282796e-06, + "loss": 0.2779, + "step": 13825 + }, + { + "epoch": 6.563834836260085, + "grad_norm": 0.2599460196835545, + "learning_rate": 9.97363216875412e-06, + "loss": 0.2834, + "step": 13830 + }, + { + "epoch": 6.566207878500237, + "grad_norm": 0.2556113091552626, + "learning_rate": 9.957152274225445e-06, + "loss": 0.2725, + "step": 13835 + }, + { + "epoch": 6.56858092074039, + "grad_norm": 0.2596724085332703, + "learning_rate": 9.94067237969677e-06, + "loss": 0.2704, + "step": 13840 + }, + { + "epoch": 6.570953962980541, + "grad_norm": 0.24263277136964292, + "learning_rate": 9.924192485168094e-06, + "loss": 0.2814, + "step": 13845 + }, + { + "epoch": 6.573327005220693, + "grad_norm": 0.25881264074444393, + "learning_rate": 9.90771259063942e-06, + "loss": 0.285, + "step": 13850 + }, + { + "epoch": 6.5757000474608445, + "grad_norm": 0.262112000343257, + "learning_rate": 9.891232696110746e-06, + "loss": 0.2825, + "step": 13855 + }, + { + "epoch": 6.578073089700997, + "grad_norm": 0.26576775343772735, + "learning_rate": 9.87475280158207e-06, + "loss": 0.2834, + "step": 13860 + }, + { + "epoch": 6.580446131941149, + "grad_norm": 0.2511470356241129, + "learning_rate": 9.858272907053395e-06, + "loss": 0.2767, + "step": 13865 + }, + { + "epoch": 6.5828191741813, + "grad_norm": 0.2608221587676017, + "learning_rate": 9.84179301252472e-06, + "loss": 0.2781, + "step": 13870 + }, + { + "epoch": 6.5851922164214525, + "grad_norm": 0.2612534426333143, + "learning_rate": 9.825313117996044e-06, + "loss": 0.2748, + "step": 13875 + }, + { + "epoch": 6.587565258661604, + "grad_norm": 0.25740711764744284, + "learning_rate": 9.80883322346737e-06, + "loss": 0.2766, + "step": 13880 + }, + { + "epoch": 6.589938300901756, + "grad_norm": 0.2653629439379795, + "learning_rate": 9.792353328938695e-06, + "loss": 0.2875, + "step": 13885 + }, + { + "epoch": 6.592311343141908, + "grad_norm": 0.25803605404941443, + "learning_rate": 9.77587343441002e-06, + "loss": 0.2799, + "step": 13890 + }, + { + "epoch": 6.59468438538206, + "grad_norm": 0.24576363868052434, + "learning_rate": 9.759393539881346e-06, + "loss": 0.2762, + "step": 13895 + }, + { + "epoch": 6.597057427622212, + "grad_norm": 0.26516062219714803, + "learning_rate": 9.742913645352671e-06, + "loss": 0.2837, + "step": 13900 + }, + { + "epoch": 6.599430469862364, + "grad_norm": 0.25566414856972985, + "learning_rate": 9.726433750823995e-06, + "loss": 0.2802, + "step": 13905 + }, + { + "epoch": 6.601803512102515, + "grad_norm": 0.26159686866624987, + "learning_rate": 9.70995385629532e-06, + "loss": 0.281, + "step": 13910 + }, + { + "epoch": 6.604176554342668, + "grad_norm": 0.24991274272506397, + "learning_rate": 9.693473961766646e-06, + "loss": 0.2807, + "step": 13915 + }, + { + "epoch": 6.606549596582819, + "grad_norm": 0.2565580463632502, + "learning_rate": 9.67699406723797e-06, + "loss": 0.2787, + "step": 13920 + }, + { + "epoch": 6.608922638822971, + "grad_norm": 0.25560999032972986, + "learning_rate": 9.660514172709295e-06, + "loss": 0.2743, + "step": 13925 + }, + { + "epoch": 6.6112956810631225, + "grad_norm": 0.2584902402565247, + "learning_rate": 9.64403427818062e-06, + "loss": 0.2756, + "step": 13930 + }, + { + "epoch": 6.613668723303275, + "grad_norm": 0.26158373970719595, + "learning_rate": 9.627554383651945e-06, + "loss": 0.2781, + "step": 13935 + }, + { + "epoch": 6.616041765543427, + "grad_norm": 0.25637853001593947, + "learning_rate": 9.61107448912327e-06, + "loss": 0.2808, + "step": 13940 + }, + { + "epoch": 6.618414807783578, + "grad_norm": 0.253919809063059, + "learning_rate": 9.594594594594596e-06, + "loss": 0.2747, + "step": 13945 + }, + { + "epoch": 6.6207878500237305, + "grad_norm": 0.24499973702854425, + "learning_rate": 9.57811470006592e-06, + "loss": 0.2767, + "step": 13950 + }, + { + "epoch": 6.623160892263883, + "grad_norm": 0.2551937215347071, + "learning_rate": 9.561634805537245e-06, + "loss": 0.2715, + "step": 13955 + }, + { + "epoch": 6.625533934504034, + "grad_norm": 0.2512948702664065, + "learning_rate": 9.54515491100857e-06, + "loss": 0.2774, + "step": 13960 + }, + { + "epoch": 6.627906976744186, + "grad_norm": 0.2503868221603941, + "learning_rate": 9.528675016479894e-06, + "loss": 0.2802, + "step": 13965 + }, + { + "epoch": 6.630280018984338, + "grad_norm": 0.2651470964139224, + "learning_rate": 9.51219512195122e-06, + "loss": 0.2777, + "step": 13970 + }, + { + "epoch": 6.63265306122449, + "grad_norm": 0.24872544188742945, + "learning_rate": 9.495715227422545e-06, + "loss": 0.2815, + "step": 13975 + }, + { + "epoch": 6.635026103464642, + "grad_norm": 0.24104526582900956, + "learning_rate": 9.47923533289387e-06, + "loss": 0.2757, + "step": 13980 + }, + { + "epoch": 6.637399145704793, + "grad_norm": 0.2601562991409658, + "learning_rate": 9.462755438365195e-06, + "loss": 0.2738, + "step": 13985 + }, + { + "epoch": 6.639772187944946, + "grad_norm": 0.25124641032751927, + "learning_rate": 9.44627554383652e-06, + "loss": 0.2701, + "step": 13990 + }, + { + "epoch": 6.642145230185097, + "grad_norm": 0.252036041320483, + "learning_rate": 9.429795649307844e-06, + "loss": 0.2744, + "step": 13995 + }, + { + "epoch": 6.644518272425249, + "grad_norm": 0.25812038039186735, + "learning_rate": 9.41331575477917e-06, + "loss": 0.2819, + "step": 14000 + }, + { + "epoch": 6.646891314665401, + "grad_norm": 0.25111976739314956, + "learning_rate": 9.396835860250495e-06, + "loss": 0.2772, + "step": 14005 + }, + { + "epoch": 6.649264356905553, + "grad_norm": 0.25731495294776885, + "learning_rate": 9.380355965721819e-06, + "loss": 0.2794, + "step": 14010 + }, + { + "epoch": 6.651637399145705, + "grad_norm": 0.25265840788979715, + "learning_rate": 9.363876071193144e-06, + "loss": 0.2717, + "step": 14015 + }, + { + "epoch": 6.654010441385856, + "grad_norm": 0.25996415398707645, + "learning_rate": 9.347396176664471e-06, + "loss": 0.2803, + "step": 14020 + }, + { + "epoch": 6.6563834836260085, + "grad_norm": 0.2498785831715891, + "learning_rate": 9.330916282135795e-06, + "loss": 0.2712, + "step": 14025 + }, + { + "epoch": 6.658756525866161, + "grad_norm": 0.26008702059622796, + "learning_rate": 9.31443638760712e-06, + "loss": 0.2791, + "step": 14030 + }, + { + "epoch": 6.661129568106312, + "grad_norm": 0.25278799476116404, + "learning_rate": 9.297956493078444e-06, + "loss": 0.2837, + "step": 14035 + }, + { + "epoch": 6.663502610346464, + "grad_norm": 0.26008181433623156, + "learning_rate": 9.28147659854977e-06, + "loss": 0.2787, + "step": 14040 + }, + { + "epoch": 6.665875652586616, + "grad_norm": 0.25249690008179776, + "learning_rate": 9.264996704021095e-06, + "loss": 0.2733, + "step": 14045 + }, + { + "epoch": 6.668248694826768, + "grad_norm": 0.26713356656924553, + "learning_rate": 9.248516809492418e-06, + "loss": 0.2829, + "step": 14050 + }, + { + "epoch": 6.67062173706692, + "grad_norm": 0.25576931564287136, + "learning_rate": 9.232036914963745e-06, + "loss": 0.2667, + "step": 14055 + }, + { + "epoch": 6.672994779307071, + "grad_norm": 0.2528207542323119, + "learning_rate": 9.21555702043507e-06, + "loss": 0.2747, + "step": 14060 + }, + { + "epoch": 6.675367821547224, + "grad_norm": 0.26160703543125774, + "learning_rate": 9.199077125906394e-06, + "loss": 0.2791, + "step": 14065 + }, + { + "epoch": 6.677740863787376, + "grad_norm": 0.26578360742917573, + "learning_rate": 9.18259723137772e-06, + "loss": 0.2883, + "step": 14070 + }, + { + "epoch": 6.680113906027527, + "grad_norm": 0.2610082096936672, + "learning_rate": 9.166117336849045e-06, + "loss": 0.2772, + "step": 14075 + }, + { + "epoch": 6.682486948267679, + "grad_norm": 0.2672172472373951, + "learning_rate": 9.149637442320369e-06, + "loss": 0.2782, + "step": 14080 + }, + { + "epoch": 6.684859990507831, + "grad_norm": 0.26172812014339797, + "learning_rate": 9.133157547791694e-06, + "loss": 0.2778, + "step": 14085 + }, + { + "epoch": 6.687233032747983, + "grad_norm": 0.2536782307911938, + "learning_rate": 9.11667765326302e-06, + "loss": 0.2802, + "step": 14090 + }, + { + "epoch": 6.689606074988134, + "grad_norm": 0.25490134939174974, + "learning_rate": 9.100197758734345e-06, + "loss": 0.2766, + "step": 14095 + }, + { + "epoch": 6.6919791172282865, + "grad_norm": 0.26764890988020823, + "learning_rate": 9.08371786420567e-06, + "loss": 0.282, + "step": 14100 + }, + { + "epoch": 6.694352159468439, + "grad_norm": 0.2566137657455081, + "learning_rate": 9.067237969676995e-06, + "loss": 0.2783, + "step": 14105 + }, + { + "epoch": 6.69672520170859, + "grad_norm": 0.2576187816282694, + "learning_rate": 9.05075807514832e-06, + "loss": 0.2846, + "step": 14110 + }, + { + "epoch": 6.699098243948742, + "grad_norm": 0.2476111487739297, + "learning_rate": 9.034278180619645e-06, + "loss": 0.2714, + "step": 14115 + }, + { + "epoch": 6.7014712861888945, + "grad_norm": 0.2641584051238434, + "learning_rate": 9.01779828609097e-06, + "loss": 0.2821, + "step": 14120 + }, + { + "epoch": 6.703844328429046, + "grad_norm": 0.26804620449113237, + "learning_rate": 9.001318391562294e-06, + "loss": 0.2791, + "step": 14125 + }, + { + "epoch": 6.706217370669198, + "grad_norm": 0.2593831274873948, + "learning_rate": 8.984838497033619e-06, + "loss": 0.2784, + "step": 14130 + }, + { + "epoch": 6.708590412909349, + "grad_norm": 0.2637249319681991, + "learning_rate": 8.968358602504944e-06, + "loss": 0.2733, + "step": 14135 + }, + { + "epoch": 6.710963455149502, + "grad_norm": 0.2630361927278315, + "learning_rate": 8.95187870797627e-06, + "loss": 0.2864, + "step": 14140 + }, + { + "epoch": 6.713336497389654, + "grad_norm": 0.25857176498820217, + "learning_rate": 8.935398813447595e-06, + "loss": 0.2827, + "step": 14145 + }, + { + "epoch": 6.715709539629805, + "grad_norm": 0.2518727658524054, + "learning_rate": 8.91891891891892e-06, + "loss": 0.2768, + "step": 14150 + }, + { + "epoch": 6.718082581869957, + "grad_norm": 0.2504228037100468, + "learning_rate": 8.902439024390244e-06, + "loss": 0.284, + "step": 14155 + }, + { + "epoch": 6.720455624110109, + "grad_norm": 0.25119268053257565, + "learning_rate": 8.88595912986157e-06, + "loss": 0.2778, + "step": 14160 + }, + { + "epoch": 6.722828666350261, + "grad_norm": 0.2621475490161544, + "learning_rate": 8.869479235332895e-06, + "loss": 0.2738, + "step": 14165 + }, + { + "epoch": 6.725201708590413, + "grad_norm": 0.24449340473980163, + "learning_rate": 8.852999340804218e-06, + "loss": 0.2799, + "step": 14170 + }, + { + "epoch": 6.7275747508305646, + "grad_norm": 0.24919631356790461, + "learning_rate": 8.836519446275544e-06, + "loss": 0.2738, + "step": 14175 + }, + { + "epoch": 6.729947793070717, + "grad_norm": 0.25371952456444025, + "learning_rate": 8.820039551746869e-06, + "loss": 0.2794, + "step": 14180 + }, + { + "epoch": 6.732320835310869, + "grad_norm": 0.26150316638358706, + "learning_rate": 8.803559657218194e-06, + "loss": 0.2825, + "step": 14185 + }, + { + "epoch": 6.73469387755102, + "grad_norm": 0.25953568595628257, + "learning_rate": 8.78707976268952e-06, + "loss": 0.2878, + "step": 14190 + }, + { + "epoch": 6.737066919791173, + "grad_norm": 0.2553126471172036, + "learning_rate": 8.770599868160845e-06, + "loss": 0.282, + "step": 14195 + }, + { + "epoch": 6.739439962031324, + "grad_norm": 0.25419180996606516, + "learning_rate": 8.754119973632169e-06, + "loss": 0.2776, + "step": 14200 + }, + { + "epoch": 6.741813004271476, + "grad_norm": 0.2648590093926957, + "learning_rate": 8.737640079103494e-06, + "loss": 0.2783, + "step": 14205 + }, + { + "epoch": 6.7441860465116275, + "grad_norm": 0.2659472937992694, + "learning_rate": 8.72116018457482e-06, + "loss": 0.2829, + "step": 14210 + }, + { + "epoch": 6.74655908875178, + "grad_norm": 0.2568749624273519, + "learning_rate": 8.704680290046143e-06, + "loss": 0.2852, + "step": 14215 + }, + { + "epoch": 6.748932130991932, + "grad_norm": 0.25612993905053677, + "learning_rate": 8.688200395517468e-06, + "loss": 0.2716, + "step": 14220 + }, + { + "epoch": 6.751305173232083, + "grad_norm": 0.25465608697601183, + "learning_rate": 8.671720500988796e-06, + "loss": 0.2776, + "step": 14225 + }, + { + "epoch": 6.7536782154722355, + "grad_norm": 0.2547217417211409, + "learning_rate": 8.65524060646012e-06, + "loss": 0.2802, + "step": 14230 + }, + { + "epoch": 6.756051257712388, + "grad_norm": 0.2686425244495052, + "learning_rate": 8.638760711931445e-06, + "loss": 0.2831, + "step": 14235 + }, + { + "epoch": 6.758424299952539, + "grad_norm": 0.2631592182933949, + "learning_rate": 8.62228081740277e-06, + "loss": 0.2883, + "step": 14240 + }, + { + "epoch": 6.760797342192691, + "grad_norm": 0.2457700131736631, + "learning_rate": 8.605800922874094e-06, + "loss": 0.2807, + "step": 14245 + }, + { + "epoch": 6.763170384432843, + "grad_norm": 0.25930725948379113, + "learning_rate": 8.589321028345419e-06, + "loss": 0.2776, + "step": 14250 + }, + { + "epoch": 6.765543426672995, + "grad_norm": 0.2624264353068585, + "learning_rate": 8.572841133816743e-06, + "loss": 0.2765, + "step": 14255 + }, + { + "epoch": 6.767916468913147, + "grad_norm": 0.25698002890519145, + "learning_rate": 8.55636123928807e-06, + "loss": 0.2832, + "step": 14260 + }, + { + "epoch": 6.770289511153298, + "grad_norm": 0.2540719774844052, + "learning_rate": 8.539881344759395e-06, + "loss": 0.2748, + "step": 14265 + }, + { + "epoch": 6.772662553393451, + "grad_norm": 0.24959345798302404, + "learning_rate": 8.523401450230719e-06, + "loss": 0.2785, + "step": 14270 + }, + { + "epoch": 6.775035595633602, + "grad_norm": 0.25919420356376094, + "learning_rate": 8.506921555702044e-06, + "loss": 0.2778, + "step": 14275 + }, + { + "epoch": 6.777408637873754, + "grad_norm": 0.25387489102293437, + "learning_rate": 8.49044166117337e-06, + "loss": 0.275, + "step": 14280 + }, + { + "epoch": 6.779781680113906, + "grad_norm": 0.2572947138141563, + "learning_rate": 8.473961766644693e-06, + "loss": 0.28, + "step": 14285 + }, + { + "epoch": 6.782154722354058, + "grad_norm": 0.25038413469521414, + "learning_rate": 8.457481872116018e-06, + "loss": 0.272, + "step": 14290 + }, + { + "epoch": 6.78452776459421, + "grad_norm": 0.2567880832893814, + "learning_rate": 8.441001977587344e-06, + "loss": 0.2795, + "step": 14295 + }, + { + "epoch": 6.786900806834362, + "grad_norm": 0.26262729415299924, + "learning_rate": 8.424522083058669e-06, + "loss": 0.2842, + "step": 14300 + }, + { + "epoch": 6.7892738490745135, + "grad_norm": 0.2494902203999297, + "learning_rate": 8.408042188529994e-06, + "loss": 0.28, + "step": 14305 + }, + { + "epoch": 6.791646891314666, + "grad_norm": 0.2552050342302813, + "learning_rate": 8.39156229400132e-06, + "loss": 0.2794, + "step": 14310 + }, + { + "epoch": 6.794019933554817, + "grad_norm": 0.2552898345099969, + "learning_rate": 8.375082399472643e-06, + "loss": 0.2745, + "step": 14315 + }, + { + "epoch": 6.796392975794969, + "grad_norm": 0.2549174452042125, + "learning_rate": 8.358602504943969e-06, + "loss": 0.2837, + "step": 14320 + }, + { + "epoch": 6.798766018035121, + "grad_norm": 0.25734476640523796, + "learning_rate": 8.342122610415294e-06, + "loss": 0.2783, + "step": 14325 + }, + { + "epoch": 6.801139060275273, + "grad_norm": 0.25660905204661927, + "learning_rate": 8.325642715886618e-06, + "loss": 0.2775, + "step": 14330 + }, + { + "epoch": 6.803512102515425, + "grad_norm": 0.26306914497286465, + "learning_rate": 8.309162821357943e-06, + "loss": 0.2857, + "step": 14335 + }, + { + "epoch": 6.805885144755576, + "grad_norm": 0.2668745684460143, + "learning_rate": 8.292682926829268e-06, + "loss": 0.2755, + "step": 14340 + }, + { + "epoch": 6.808258186995729, + "grad_norm": 0.24664924068909397, + "learning_rate": 8.276203032300594e-06, + "loss": 0.272, + "step": 14345 + }, + { + "epoch": 6.810631229235881, + "grad_norm": 0.2587137247140685, + "learning_rate": 8.25972313777192e-06, + "loss": 0.2762, + "step": 14350 + }, + { + "epoch": 6.813004271476032, + "grad_norm": 0.26193579704218883, + "learning_rate": 8.243243243243245e-06, + "loss": 0.28, + "step": 14355 + }, + { + "epoch": 6.815377313716184, + "grad_norm": 0.2678455504692746, + "learning_rate": 8.226763348714568e-06, + "loss": 0.2831, + "step": 14360 + }, + { + "epoch": 6.817750355956336, + "grad_norm": 0.25652461842386554, + "learning_rate": 8.210283454185894e-06, + "loss": 0.275, + "step": 14365 + }, + { + "epoch": 6.820123398196488, + "grad_norm": 0.26286721267032215, + "learning_rate": 8.193803559657219e-06, + "loss": 0.2827, + "step": 14370 + }, + { + "epoch": 6.822496440436639, + "grad_norm": 0.256674041710041, + "learning_rate": 8.177323665128543e-06, + "loss": 0.2732, + "step": 14375 + }, + { + "epoch": 6.8248694826767915, + "grad_norm": 0.25503112472809725, + "learning_rate": 8.160843770599868e-06, + "loss": 0.2826, + "step": 14380 + }, + { + "epoch": 6.827242524916944, + "grad_norm": 0.2625386376892865, + "learning_rate": 8.144363876071193e-06, + "loss": 0.2875, + "step": 14385 + }, + { + "epoch": 6.829615567157095, + "grad_norm": 0.2522340957879465, + "learning_rate": 8.127883981542519e-06, + "loss": 0.2785, + "step": 14390 + }, + { + "epoch": 6.831988609397247, + "grad_norm": 0.2593145801468581, + "learning_rate": 8.111404087013844e-06, + "loss": 0.2835, + "step": 14395 + }, + { + "epoch": 6.8343616516373995, + "grad_norm": 0.23958778611951298, + "learning_rate": 8.09492419248517e-06, + "loss": 0.2774, + "step": 14400 + }, + { + "epoch": 6.836734693877551, + "grad_norm": 0.2619403736217451, + "learning_rate": 8.078444297956493e-06, + "loss": 0.2763, + "step": 14405 + }, + { + "epoch": 6.839107736117703, + "grad_norm": 0.2449422897201711, + "learning_rate": 8.061964403427818e-06, + "loss": 0.271, + "step": 14410 + }, + { + "epoch": 6.841480778357854, + "grad_norm": 0.25209657744694125, + "learning_rate": 8.045484508899144e-06, + "loss": 0.2759, + "step": 14415 + }, + { + "epoch": 6.843853820598007, + "grad_norm": 0.26440437411898143, + "learning_rate": 8.029004614370467e-06, + "loss": 0.2818, + "step": 14420 + }, + { + "epoch": 6.846226862838159, + "grad_norm": 0.25708802290289345, + "learning_rate": 8.012524719841793e-06, + "loss": 0.2786, + "step": 14425 + }, + { + "epoch": 6.84859990507831, + "grad_norm": 0.24382342825184147, + "learning_rate": 7.99604482531312e-06, + "loss": 0.2786, + "step": 14430 + }, + { + "epoch": 6.850972947318462, + "grad_norm": 0.26336069259953987, + "learning_rate": 7.979564930784443e-06, + "loss": 0.2822, + "step": 14435 + }, + { + "epoch": 6.853345989558614, + "grad_norm": 0.26213804170451, + "learning_rate": 7.963085036255769e-06, + "loss": 0.2815, + "step": 14440 + }, + { + "epoch": 6.855719031798766, + "grad_norm": 0.25755739941095573, + "learning_rate": 7.946605141727094e-06, + "loss": 0.2755, + "step": 14445 + }, + { + "epoch": 6.858092074038918, + "grad_norm": 0.2660910144028923, + "learning_rate": 7.930125247198418e-06, + "loss": 0.2836, + "step": 14450 + }, + { + "epoch": 6.8604651162790695, + "grad_norm": 0.2564294946776538, + "learning_rate": 7.913645352669743e-06, + "loss": 0.2813, + "step": 14455 + }, + { + "epoch": 6.862838158519222, + "grad_norm": 0.2571120809888376, + "learning_rate": 7.897165458141068e-06, + "loss": 0.2807, + "step": 14460 + }, + { + "epoch": 6.865211200759374, + "grad_norm": 0.25576816395661517, + "learning_rate": 7.880685563612394e-06, + "loss": 0.2764, + "step": 14465 + }, + { + "epoch": 6.867584242999525, + "grad_norm": 0.25857875896574467, + "learning_rate": 7.86420566908372e-06, + "loss": 0.2762, + "step": 14470 + }, + { + "epoch": 6.8699572852396775, + "grad_norm": 0.26522090024926503, + "learning_rate": 7.847725774555043e-06, + "loss": 0.2816, + "step": 14475 + }, + { + "epoch": 6.872330327479829, + "grad_norm": 0.2718966090137553, + "learning_rate": 7.831245880026368e-06, + "loss": 0.2843, + "step": 14480 + }, + { + "epoch": 6.874703369719981, + "grad_norm": 0.2633275389483919, + "learning_rate": 7.814765985497694e-06, + "loss": 0.2826, + "step": 14485 + }, + { + "epoch": 6.877076411960132, + "grad_norm": 0.2613959928265376, + "learning_rate": 7.798286090969017e-06, + "loss": 0.2807, + "step": 14490 + }, + { + "epoch": 6.879449454200285, + "grad_norm": 0.24734992539059777, + "learning_rate": 7.781806196440343e-06, + "loss": 0.2794, + "step": 14495 + }, + { + "epoch": 6.881822496440437, + "grad_norm": 0.2595417812906496, + "learning_rate": 7.765326301911668e-06, + "loss": 0.2736, + "step": 14500 + }, + { + "epoch": 6.884195538680588, + "grad_norm": 0.26244277865494525, + "learning_rate": 7.748846407382993e-06, + "loss": 0.2732, + "step": 14505 + }, + { + "epoch": 6.88656858092074, + "grad_norm": 0.2654314929959097, + "learning_rate": 7.732366512854319e-06, + "loss": 0.2838, + "step": 14510 + }, + { + "epoch": 6.888941623160893, + "grad_norm": 0.26578190254217826, + "learning_rate": 7.715886618325644e-06, + "loss": 0.2674, + "step": 14515 + }, + { + "epoch": 6.891314665401044, + "grad_norm": 0.24406728617106324, + "learning_rate": 7.699406723796968e-06, + "loss": 0.2806, + "step": 14520 + }, + { + "epoch": 6.893687707641196, + "grad_norm": 0.24732196616142924, + "learning_rate": 7.682926829268293e-06, + "loss": 0.2793, + "step": 14525 + }, + { + "epoch": 6.8960607498813475, + "grad_norm": 0.2597184564824095, + "learning_rate": 7.666446934739618e-06, + "loss": 0.2837, + "step": 14530 + }, + { + "epoch": 6.8984337921215, + "grad_norm": 0.2601981033325236, + "learning_rate": 7.649967040210942e-06, + "loss": 0.2776, + "step": 14535 + }, + { + "epoch": 6.900806834361652, + "grad_norm": 0.2633533133139523, + "learning_rate": 7.633487145682267e-06, + "loss": 0.284, + "step": 14540 + }, + { + "epoch": 6.903179876601803, + "grad_norm": 0.24941094302675282, + "learning_rate": 7.6170072511535936e-06, + "loss": 0.2739, + "step": 14545 + }, + { + "epoch": 6.9055529188419555, + "grad_norm": 0.2604956213010711, + "learning_rate": 7.600527356624917e-06, + "loss": 0.2749, + "step": 14550 + }, + { + "epoch": 6.907925961082107, + "grad_norm": 0.23620491747471264, + "learning_rate": 7.584047462096243e-06, + "loss": 0.275, + "step": 14555 + }, + { + "epoch": 6.910299003322259, + "grad_norm": 0.2701667467100631, + "learning_rate": 7.567567567567568e-06, + "loss": 0.2873, + "step": 14560 + }, + { + "epoch": 6.912672045562411, + "grad_norm": 0.24762180762773342, + "learning_rate": 7.5510876730388924e-06, + "loss": 0.2759, + "step": 14565 + }, + { + "epoch": 6.915045087802563, + "grad_norm": 0.25450617279528803, + "learning_rate": 7.534607778510218e-06, + "loss": 0.2832, + "step": 14570 + }, + { + "epoch": 6.917418130042715, + "grad_norm": 0.254921975456922, + "learning_rate": 7.518127883981543e-06, + "loss": 0.2769, + "step": 14575 + }, + { + "epoch": 6.919791172282867, + "grad_norm": 0.25346730595731964, + "learning_rate": 7.501647989452868e-06, + "loss": 0.2856, + "step": 14580 + }, + { + "epoch": 6.922164214523018, + "grad_norm": 0.24633661311732993, + "learning_rate": 7.485168094924193e-06, + "loss": 0.276, + "step": 14585 + }, + { + "epoch": 6.924537256763171, + "grad_norm": 0.2573456339274401, + "learning_rate": 7.468688200395518e-06, + "loss": 0.278, + "step": 14590 + }, + { + "epoch": 6.926910299003322, + "grad_norm": 0.2585547358862035, + "learning_rate": 7.452208305866842e-06, + "loss": 0.2688, + "step": 14595 + }, + { + "epoch": 6.929283341243474, + "grad_norm": 0.25357697078030994, + "learning_rate": 7.435728411338168e-06, + "loss": 0.283, + "step": 14600 + }, + { + "epoch": 6.931656383483626, + "grad_norm": 0.2598603091757251, + "learning_rate": 7.4192485168094936e-06, + "loss": 0.2818, + "step": 14605 + }, + { + "epoch": 6.934029425723778, + "grad_norm": 0.25467384741756866, + "learning_rate": 7.402768622280817e-06, + "loss": 0.2799, + "step": 14610 + }, + { + "epoch": 6.93640246796393, + "grad_norm": 0.2605342246422391, + "learning_rate": 7.386288727752143e-06, + "loss": 0.2753, + "step": 14615 + }, + { + "epoch": 6.938775510204081, + "grad_norm": 0.260494127768276, + "learning_rate": 7.369808833223468e-06, + "loss": 0.2801, + "step": 14620 + }, + { + "epoch": 6.941148552444234, + "grad_norm": 0.2530704231380907, + "learning_rate": 7.3533289386947924e-06, + "loss": 0.2805, + "step": 14625 + }, + { + "epoch": 6.943521594684386, + "grad_norm": 0.2506915415294471, + "learning_rate": 7.336849044166118e-06, + "loss": 0.2767, + "step": 14630 + }, + { + "epoch": 6.945894636924537, + "grad_norm": 0.27055792632164094, + "learning_rate": 7.320369149637443e-06, + "loss": 0.2809, + "step": 14635 + }, + { + "epoch": 6.948267679164689, + "grad_norm": 0.24799813128514167, + "learning_rate": 7.303889255108768e-06, + "loss": 0.2761, + "step": 14640 + }, + { + "epoch": 6.950640721404841, + "grad_norm": 0.2491662222557702, + "learning_rate": 7.287409360580093e-06, + "loss": 0.2768, + "step": 14645 + }, + { + "epoch": 6.953013763644993, + "grad_norm": 0.25471648112549494, + "learning_rate": 7.270929466051418e-06, + "loss": 0.2811, + "step": 14650 + }, + { + "epoch": 6.955386805885145, + "grad_norm": 0.26523230097805744, + "learning_rate": 7.254449571522742e-06, + "loss": 0.281, + "step": 14655 + }, + { + "epoch": 6.9577598481252965, + "grad_norm": 0.2543575461625987, + "learning_rate": 7.237969676994067e-06, + "loss": 0.2805, + "step": 14660 + }, + { + "epoch": 6.960132890365449, + "grad_norm": 0.2621768837989001, + "learning_rate": 7.221489782465394e-06, + "loss": 0.2745, + "step": 14665 + }, + { + "epoch": 6.9625059326056, + "grad_norm": 0.25615384799907326, + "learning_rate": 7.205009887936717e-06, + "loss": 0.2791, + "step": 14670 + }, + { + "epoch": 6.964878974845752, + "grad_norm": 0.26146856395800444, + "learning_rate": 7.188529993408043e-06, + "loss": 0.283, + "step": 14675 + }, + { + "epoch": 6.9672520170859045, + "grad_norm": 0.26336796031300963, + "learning_rate": 7.172050098879368e-06, + "loss": 0.2783, + "step": 14680 + }, + { + "epoch": 6.969625059326056, + "grad_norm": 0.25764112716652493, + "learning_rate": 7.1555702043506925e-06, + "loss": 0.2754, + "step": 14685 + }, + { + "epoch": 6.971998101566208, + "grad_norm": 0.25975366501452546, + "learning_rate": 7.139090309822018e-06, + "loss": 0.2803, + "step": 14690 + }, + { + "epoch": 6.97437114380636, + "grad_norm": 0.2598962518446714, + "learning_rate": 7.1226104152933415e-06, + "loss": 0.2818, + "step": 14695 + }, + { + "epoch": 6.976744186046512, + "grad_norm": 0.24963762894441568, + "learning_rate": 7.106130520764667e-06, + "loss": 0.2802, + "step": 14700 + }, + { + "epoch": 6.979117228286664, + "grad_norm": 0.25473458096342305, + "learning_rate": 7.089650626235993e-06, + "loss": 0.2832, + "step": 14705 + }, + { + "epoch": 6.981490270526815, + "grad_norm": 0.2520309300662844, + "learning_rate": 7.073170731707317e-06, + "loss": 0.278, + "step": 14710 + }, + { + "epoch": 6.983863312766967, + "grad_norm": 0.25751261525941566, + "learning_rate": 7.056690837178642e-06, + "loss": 0.2784, + "step": 14715 + }, + { + "epoch": 6.986236355007119, + "grad_norm": 0.27101200939391146, + "learning_rate": 7.040210942649967e-06, + "loss": 0.2768, + "step": 14720 + }, + { + "epoch": 6.988609397247271, + "grad_norm": 0.2536791809886415, + "learning_rate": 7.023731048121292e-06, + "loss": 0.284, + "step": 14725 + }, + { + "epoch": 6.990982439487423, + "grad_norm": 0.25392307855878843, + "learning_rate": 7.007251153592617e-06, + "loss": 0.281, + "step": 14730 + }, + { + "epoch": 6.9933554817275745, + "grad_norm": 0.2629787431407045, + "learning_rate": 6.990771259063943e-06, + "loss": 0.2816, + "step": 14735 + }, + { + "epoch": 6.995728523967727, + "grad_norm": 0.2614021428716325, + "learning_rate": 6.974291364535267e-06, + "loss": 0.2756, + "step": 14740 + }, + { + "epoch": 6.998101566207879, + "grad_norm": 0.2515086817846864, + "learning_rate": 6.9578114700065925e-06, + "loss": 0.2777, + "step": 14745 + }, + { + "epoch": 7.00047460844803, + "grad_norm": 0.29063399016922215, + "learning_rate": 6.941331575477918e-06, + "loss": 0.276, + "step": 14750 + }, + { + "epoch": 7.0028476506881825, + "grad_norm": 0.25230278605003315, + "learning_rate": 6.9248516809492415e-06, + "loss": 0.2514, + "step": 14755 + }, + { + "epoch": 7.005220692928334, + "grad_norm": 0.2734820146738501, + "learning_rate": 6.908371786420567e-06, + "loss": 0.2497, + "step": 14760 + }, + { + "epoch": 7.007593735168486, + "grad_norm": 0.2657214668620252, + "learning_rate": 6.891891891891893e-06, + "loss": 0.2541, + "step": 14765 + }, + { + "epoch": 7.009966777408638, + "grad_norm": 0.2533229595720318, + "learning_rate": 6.875411997363217e-06, + "loss": 0.2565, + "step": 14770 + }, + { + "epoch": 7.01233981964879, + "grad_norm": 0.26079050206682713, + "learning_rate": 6.858932102834542e-06, + "loss": 0.2504, + "step": 14775 + }, + { + "epoch": 7.014712861888942, + "grad_norm": 0.250452570650111, + "learning_rate": 6.842452208305867e-06, + "loss": 0.2593, + "step": 14780 + }, + { + "epoch": 7.017085904129093, + "grad_norm": 0.2636192878480706, + "learning_rate": 6.825972313777192e-06, + "loss": 0.2525, + "step": 14785 + }, + { + "epoch": 7.019458946369245, + "grad_norm": 0.2639914877710837, + "learning_rate": 6.809492419248517e-06, + "loss": 0.2553, + "step": 14790 + }, + { + "epoch": 7.021831988609398, + "grad_norm": 0.2562630539882429, + "learning_rate": 6.793012524719843e-06, + "loss": 0.2599, + "step": 14795 + }, + { + "epoch": 7.024205030849549, + "grad_norm": 0.2553756423141542, + "learning_rate": 6.776532630191166e-06, + "loss": 0.2561, + "step": 14800 + }, + { + "epoch": 7.026578073089701, + "grad_norm": 0.2689934860230134, + "learning_rate": 6.7600527356624925e-06, + "loss": 0.261, + "step": 14805 + }, + { + "epoch": 7.0289511153298525, + "grad_norm": 0.26719148802829357, + "learning_rate": 6.743572841133818e-06, + "loss": 0.251, + "step": 14810 + }, + { + "epoch": 7.031324157570005, + "grad_norm": 0.2713510835043433, + "learning_rate": 6.7270929466051415e-06, + "loss": 0.2503, + "step": 14815 + }, + { + "epoch": 7.033697199810157, + "grad_norm": 0.25444131143187754, + "learning_rate": 6.710613052076467e-06, + "loss": 0.2524, + "step": 14820 + }, + { + "epoch": 7.036070242050308, + "grad_norm": 0.25112516401168206, + "learning_rate": 6.694133157547792e-06, + "loss": 0.2561, + "step": 14825 + }, + { + "epoch": 7.0384432842904605, + "grad_norm": 0.27613512052483596, + "learning_rate": 6.677653263019117e-06, + "loss": 0.2536, + "step": 14830 + }, + { + "epoch": 7.040816326530612, + "grad_norm": 0.2552849153045379, + "learning_rate": 6.661173368490442e-06, + "loss": 0.2482, + "step": 14835 + }, + { + "epoch": 7.043189368770764, + "grad_norm": 0.2628188597564533, + "learning_rate": 6.644693473961767e-06, + "loss": 0.2543, + "step": 14840 + }, + { + "epoch": 7.045562411010916, + "grad_norm": 0.2550039354790892, + "learning_rate": 6.628213579433092e-06, + "loss": 0.2535, + "step": 14845 + }, + { + "epoch": 7.047935453251068, + "grad_norm": 0.26518036112898397, + "learning_rate": 6.611733684904417e-06, + "loss": 0.2543, + "step": 14850 + }, + { + "epoch": 7.05030849549122, + "grad_norm": 0.26550058212702804, + "learning_rate": 6.595253790375743e-06, + "loss": 0.2554, + "step": 14855 + }, + { + "epoch": 7.052681537731371, + "grad_norm": 0.27107395978835364, + "learning_rate": 6.578773895847066e-06, + "loss": 0.2481, + "step": 14860 + }, + { + "epoch": 7.055054579971523, + "grad_norm": 0.25020320065590335, + "learning_rate": 6.562294001318392e-06, + "loss": 0.2513, + "step": 14865 + }, + { + "epoch": 7.057427622211676, + "grad_norm": 0.2598861649204568, + "learning_rate": 6.545814106789718e-06, + "loss": 0.2505, + "step": 14870 + }, + { + "epoch": 7.059800664451827, + "grad_norm": 0.2578059799735483, + "learning_rate": 6.5293342122610415e-06, + "loss": 0.2551, + "step": 14875 + }, + { + "epoch": 7.062173706691979, + "grad_norm": 0.26086172479704794, + "learning_rate": 6.512854317732367e-06, + "loss": 0.2484, + "step": 14880 + }, + { + "epoch": 7.064546748932131, + "grad_norm": 0.26172265929017285, + "learning_rate": 6.496374423203692e-06, + "loss": 0.2525, + "step": 14885 + }, + { + "epoch": 7.066919791172283, + "grad_norm": 0.2433653548301032, + "learning_rate": 6.479894528675017e-06, + "loss": 0.2514, + "step": 14890 + }, + { + "epoch": 7.069292833412435, + "grad_norm": 0.2564291394507638, + "learning_rate": 6.463414634146342e-06, + "loss": 0.2474, + "step": 14895 + }, + { + "epoch": 7.071665875652586, + "grad_norm": 0.26846558316892244, + "learning_rate": 6.446934739617667e-06, + "loss": 0.2568, + "step": 14900 + }, + { + "epoch": 7.0740389178927385, + "grad_norm": 0.2563198156053234, + "learning_rate": 6.430454845088991e-06, + "loss": 0.2559, + "step": 14905 + }, + { + "epoch": 7.076411960132891, + "grad_norm": 0.25069370951019493, + "learning_rate": 6.413974950560317e-06, + "loss": 0.2469, + "step": 14910 + }, + { + "epoch": 7.078785002373042, + "grad_norm": 0.2615241594143978, + "learning_rate": 6.397495056031643e-06, + "loss": 0.2527, + "step": 14915 + }, + { + "epoch": 7.081158044613194, + "grad_norm": 0.254560071999046, + "learning_rate": 6.381015161502966e-06, + "loss": 0.252, + "step": 14920 + }, + { + "epoch": 7.083531086853346, + "grad_norm": 0.2584294072056998, + "learning_rate": 6.364535266974292e-06, + "loss": 0.2565, + "step": 14925 + }, + { + "epoch": 7.085904129093498, + "grad_norm": 0.27541628981834404, + "learning_rate": 6.348055372445616e-06, + "loss": 0.2511, + "step": 14930 + }, + { + "epoch": 7.08827717133365, + "grad_norm": 0.25451478842173114, + "learning_rate": 6.3315754779169415e-06, + "loss": 0.2537, + "step": 14935 + }, + { + "epoch": 7.090650213573801, + "grad_norm": 0.2583581448035062, + "learning_rate": 6.315095583388267e-06, + "loss": 0.2534, + "step": 14940 + }, + { + "epoch": 7.093023255813954, + "grad_norm": 0.25709298466444164, + "learning_rate": 6.298615688859591e-06, + "loss": 0.2466, + "step": 14945 + }, + { + "epoch": 7.095396298054105, + "grad_norm": 0.2718655548800056, + "learning_rate": 6.282135794330917e-06, + "loss": 0.2533, + "step": 14950 + }, + { + "epoch": 7.097769340294257, + "grad_norm": 0.256758741104744, + "learning_rate": 6.265655899802242e-06, + "loss": 0.2549, + "step": 14955 + }, + { + "epoch": 7.100142382534409, + "grad_norm": 0.2816702709002742, + "learning_rate": 6.2491760052735666e-06, + "loss": 0.2533, + "step": 14960 + }, + { + "epoch": 7.102515424774561, + "grad_norm": 0.24997378200295456, + "learning_rate": 6.232696110744891e-06, + "loss": 0.2504, + "step": 14965 + }, + { + "epoch": 7.104888467014713, + "grad_norm": 0.25703738554141137, + "learning_rate": 6.216216216216217e-06, + "loss": 0.2553, + "step": 14970 + }, + { + "epoch": 7.107261509254864, + "grad_norm": 0.2641361089037012, + "learning_rate": 6.199736321687542e-06, + "loss": 0.2554, + "step": 14975 + }, + { + "epoch": 7.1096345514950166, + "grad_norm": 0.2591135719280114, + "learning_rate": 6.183256427158866e-06, + "loss": 0.2566, + "step": 14980 + }, + { + "epoch": 7.112007593735169, + "grad_norm": 0.2571868095491642, + "learning_rate": 6.166776532630191e-06, + "loss": 0.2545, + "step": 14985 + }, + { + "epoch": 7.11438063597532, + "grad_norm": 0.25821043489271067, + "learning_rate": 6.150296638101517e-06, + "loss": 0.2497, + "step": 14990 + }, + { + "epoch": 7.116753678215472, + "grad_norm": 0.26607024687862585, + "learning_rate": 6.1338167435728415e-06, + "loss": 0.2582, + "step": 14995 + }, + { + "epoch": 7.119126720455624, + "grad_norm": 0.2605225170348882, + "learning_rate": 6.117336849044166e-06, + "loss": 0.2507, + "step": 15000 + }, + { + "epoch": 7.121499762695776, + "grad_norm": 0.26798930036341806, + "learning_rate": 6.100856954515491e-06, + "loss": 0.2482, + "step": 15005 + }, + { + "epoch": 7.123872804935928, + "grad_norm": 0.2705314321746041, + "learning_rate": 6.084377059986817e-06, + "loss": 0.2539, + "step": 15010 + }, + { + "epoch": 7.1262458471760795, + "grad_norm": 0.26026705590658733, + "learning_rate": 6.067897165458141e-06, + "loss": 0.2589, + "step": 15015 + }, + { + "epoch": 7.128618889416232, + "grad_norm": 0.26126060092206904, + "learning_rate": 6.0514172709294666e-06, + "loss": 0.2569, + "step": 15020 + }, + { + "epoch": 7.130991931656384, + "grad_norm": 0.2784266062595696, + "learning_rate": 6.034937376400791e-06, + "loss": 0.2565, + "step": 15025 + }, + { + "epoch": 7.133364973896535, + "grad_norm": 0.2543805931934106, + "learning_rate": 6.0184574818721164e-06, + "loss": 0.2503, + "step": 15030 + }, + { + "epoch": 7.1357380161366875, + "grad_norm": 0.25646645307663635, + "learning_rate": 6.001977587343442e-06, + "loss": 0.2527, + "step": 15035 + }, + { + "epoch": 7.138111058376839, + "grad_norm": 0.2535948085980532, + "learning_rate": 5.985497692814766e-06, + "loss": 0.2457, + "step": 15040 + }, + { + "epoch": 7.140484100616991, + "grad_norm": 0.2523595706316366, + "learning_rate": 5.969017798286091e-06, + "loss": 0.2576, + "step": 15045 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.26764935544784046, + "learning_rate": 5.952537903757416e-06, + "loss": 0.259, + "step": 15050 + }, + { + "epoch": 7.145230185097295, + "grad_norm": 0.25347531924244304, + "learning_rate": 5.9360580092287415e-06, + "loss": 0.2543, + "step": 15055 + }, + { + "epoch": 7.147603227337447, + "grad_norm": 0.2593156529419254, + "learning_rate": 5.919578114700066e-06, + "loss": 0.252, + "step": 15060 + }, + { + "epoch": 7.149976269577598, + "grad_norm": 0.26206918520603917, + "learning_rate": 5.903098220171391e-06, + "loss": 0.2529, + "step": 15065 + }, + { + "epoch": 7.15234931181775, + "grad_norm": 0.26408642271078925, + "learning_rate": 5.886618325642716e-06, + "loss": 0.255, + "step": 15070 + }, + { + "epoch": 7.154722354057903, + "grad_norm": 0.2633271865983951, + "learning_rate": 5.870138431114041e-06, + "loss": 0.2481, + "step": 15075 + }, + { + "epoch": 7.157095396298054, + "grad_norm": 0.2567658018846935, + "learning_rate": 5.853658536585367e-06, + "loss": 0.2498, + "step": 15080 + }, + { + "epoch": 7.159468438538206, + "grad_norm": 0.26197669935252593, + "learning_rate": 5.837178642056691e-06, + "loss": 0.2576, + "step": 15085 + }, + { + "epoch": 7.1618414807783575, + "grad_norm": 0.25685279134026634, + "learning_rate": 5.820698747528016e-06, + "loss": 0.255, + "step": 15090 + }, + { + "epoch": 7.16421452301851, + "grad_norm": 0.2504042947643301, + "learning_rate": 5.804218852999341e-06, + "loss": 0.2554, + "step": 15095 + }, + { + "epoch": 7.166587565258662, + "grad_norm": 0.26955940627119584, + "learning_rate": 5.787738958470666e-06, + "loss": 0.2562, + "step": 15100 + }, + { + "epoch": 7.168960607498813, + "grad_norm": 0.2658568431840734, + "learning_rate": 5.771259063941991e-06, + "loss": 0.254, + "step": 15105 + }, + { + "epoch": 7.1713336497389655, + "grad_norm": 0.2681417014500662, + "learning_rate": 5.754779169413316e-06, + "loss": 0.2619, + "step": 15110 + }, + { + "epoch": 7.173706691979117, + "grad_norm": 0.255077479168531, + "learning_rate": 5.7382992748846415e-06, + "loss": 0.2498, + "step": 15115 + }, + { + "epoch": 7.176079734219269, + "grad_norm": 0.2678896854868307, + "learning_rate": 5.721819380355966e-06, + "loss": 0.2541, + "step": 15120 + }, + { + "epoch": 7.178452776459421, + "grad_norm": 0.26524501314440757, + "learning_rate": 5.7053394858272905e-06, + "loss": 0.2507, + "step": 15125 + }, + { + "epoch": 7.180825818699573, + "grad_norm": 0.2654690040396773, + "learning_rate": 5.688859591298616e-06, + "loss": 0.2551, + "step": 15130 + }, + { + "epoch": 7.183198860939725, + "grad_norm": 0.26733572204825023, + "learning_rate": 5.672379696769941e-06, + "loss": 0.2554, + "step": 15135 + }, + { + "epoch": 7.185571903179877, + "grad_norm": 0.25701840113608754, + "learning_rate": 5.655899802241266e-06, + "loss": 0.253, + "step": 15140 + }, + { + "epoch": 7.187944945420028, + "grad_norm": 0.2646861601582718, + "learning_rate": 5.639419907712591e-06, + "loss": 0.2588, + "step": 15145 + }, + { + "epoch": 7.190317987660181, + "grad_norm": 0.265912901299105, + "learning_rate": 5.622940013183916e-06, + "loss": 0.2589, + "step": 15150 + }, + { + "epoch": 7.192691029900332, + "grad_norm": 0.2624806269208391, + "learning_rate": 5.606460118655241e-06, + "loss": 0.2581, + "step": 15155 + }, + { + "epoch": 7.195064072140484, + "grad_norm": 0.25436770731387676, + "learning_rate": 5.589980224126566e-06, + "loss": 0.2585, + "step": 15160 + }, + { + "epoch": 7.197437114380636, + "grad_norm": 0.25966440611987457, + "learning_rate": 5.573500329597891e-06, + "loss": 0.2568, + "step": 15165 + }, + { + "epoch": 7.199810156620788, + "grad_norm": 0.28099837602573513, + "learning_rate": 5.557020435069215e-06, + "loss": 0.2555, + "step": 15170 + }, + { + "epoch": 7.20218319886094, + "grad_norm": 0.2504587302111784, + "learning_rate": 5.5405405405405415e-06, + "loss": 0.252, + "step": 15175 + }, + { + "epoch": 7.204556241101091, + "grad_norm": 0.25605495636200293, + "learning_rate": 5.524060646011866e-06, + "loss": 0.255, + "step": 15180 + }, + { + "epoch": 7.2069292833412435, + "grad_norm": 0.27365696462847783, + "learning_rate": 5.5075807514831905e-06, + "loss": 0.2502, + "step": 15185 + }, + { + "epoch": 7.209302325581396, + "grad_norm": 0.25724556119049263, + "learning_rate": 5.491100856954516e-06, + "loss": 0.2507, + "step": 15190 + }, + { + "epoch": 7.211675367821547, + "grad_norm": 0.2675759226630015, + "learning_rate": 5.474620962425841e-06, + "loss": 0.2578, + "step": 15195 + }, + { + "epoch": 7.214048410061699, + "grad_norm": 0.259402719480126, + "learning_rate": 5.458141067897166e-06, + "loss": 0.2519, + "step": 15200 + }, + { + "epoch": 7.216421452301851, + "grad_norm": 0.2625486495309784, + "learning_rate": 5.44166117336849e-06, + "loss": 0.2548, + "step": 15205 + }, + { + "epoch": 7.218794494542003, + "grad_norm": 0.256111186631601, + "learning_rate": 5.425181278839816e-06, + "loss": 0.2523, + "step": 15210 + }, + { + "epoch": 7.221167536782155, + "grad_norm": 0.2554395379213268, + "learning_rate": 5.408701384311141e-06, + "loss": 0.2517, + "step": 15215 + }, + { + "epoch": 7.223540579022306, + "grad_norm": 0.2555749604960513, + "learning_rate": 5.3922214897824655e-06, + "loss": 0.2506, + "step": 15220 + }, + { + "epoch": 7.225913621262459, + "grad_norm": 0.26393754925045954, + "learning_rate": 5.375741595253791e-06, + "loss": 0.2565, + "step": 15225 + }, + { + "epoch": 7.22828666350261, + "grad_norm": 0.25733421644717774, + "learning_rate": 5.359261700725115e-06, + "loss": 0.2521, + "step": 15230 + }, + { + "epoch": 7.230659705742762, + "grad_norm": 0.2559224120924341, + "learning_rate": 5.342781806196441e-06, + "loss": 0.2548, + "step": 15235 + }, + { + "epoch": 7.233032747982914, + "grad_norm": 0.25994432516599775, + "learning_rate": 5.326301911667766e-06, + "loss": 0.2537, + "step": 15240 + }, + { + "epoch": 7.235405790223066, + "grad_norm": 0.27216363164709845, + "learning_rate": 5.3098220171390905e-06, + "loss": 0.2617, + "step": 15245 + }, + { + "epoch": 7.237778832463218, + "grad_norm": 0.26469497524629026, + "learning_rate": 5.293342122610415e-06, + "loss": 0.26, + "step": 15250 + }, + { + "epoch": 7.240151874703369, + "grad_norm": 0.2666743539873964, + "learning_rate": 5.27686222808174e-06, + "loss": 0.2547, + "step": 15255 + }, + { + "epoch": 7.2425249169435215, + "grad_norm": 0.2647155063948723, + "learning_rate": 5.260382333553066e-06, + "loss": 0.2516, + "step": 15260 + }, + { + "epoch": 7.244897959183674, + "grad_norm": 0.25934220520844087, + "learning_rate": 5.24390243902439e-06, + "loss": 0.2589, + "step": 15265 + }, + { + "epoch": 7.247271001423825, + "grad_norm": 0.26240265892409786, + "learning_rate": 5.227422544495716e-06, + "loss": 0.2539, + "step": 15270 + }, + { + "epoch": 7.249644043663977, + "grad_norm": 0.25990265845719107, + "learning_rate": 5.21094264996704e-06, + "loss": 0.2562, + "step": 15275 + }, + { + "epoch": 7.252017085904129, + "grad_norm": 0.27366320611315925, + "learning_rate": 5.1944627554383655e-06, + "loss": 0.2585, + "step": 15280 + }, + { + "epoch": 7.254390128144281, + "grad_norm": 0.25015788605294703, + "learning_rate": 5.177982860909691e-06, + "loss": 0.2517, + "step": 15285 + }, + { + "epoch": 7.256763170384433, + "grad_norm": 0.2671456799638223, + "learning_rate": 5.161502966381015e-06, + "loss": 0.2525, + "step": 15290 + }, + { + "epoch": 7.259136212624584, + "grad_norm": 0.2539994760640382, + "learning_rate": 5.14502307185234e-06, + "loss": 0.2516, + "step": 15295 + }, + { + "epoch": 7.261509254864737, + "grad_norm": 0.2631140722202374, + "learning_rate": 5.128543177323666e-06, + "loss": 0.2526, + "step": 15300 + }, + { + "epoch": 7.263882297104889, + "grad_norm": 0.2567337105860702, + "learning_rate": 5.1120632827949905e-06, + "loss": 0.2468, + "step": 15305 + }, + { + "epoch": 7.26625533934504, + "grad_norm": 0.2617518886525317, + "learning_rate": 5.095583388266315e-06, + "loss": 0.2497, + "step": 15310 + }, + { + "epoch": 7.268628381585192, + "grad_norm": 0.25768667309567933, + "learning_rate": 5.07910349373764e-06, + "loss": 0.2524, + "step": 15315 + }, + { + "epoch": 7.271001423825344, + "grad_norm": 0.25893320746907017, + "learning_rate": 5.062623599208966e-06, + "loss": 0.2505, + "step": 15320 + }, + { + "epoch": 7.273374466065496, + "grad_norm": 0.26199672689514064, + "learning_rate": 5.04614370468029e-06, + "loss": 0.2609, + "step": 15325 + }, + { + "epoch": 7.275747508305648, + "grad_norm": 0.2623707514405612, + "learning_rate": 5.029663810151615e-06, + "loss": 0.2529, + "step": 15330 + }, + { + "epoch": 7.2781205505457995, + "grad_norm": 0.2624328465427988, + "learning_rate": 5.01318391562294e-06, + "loss": 0.2524, + "step": 15335 + }, + { + "epoch": 7.280493592785952, + "grad_norm": 0.2653282570070666, + "learning_rate": 4.9967040210942655e-06, + "loss": 0.2551, + "step": 15340 + }, + { + "epoch": 7.282866635026103, + "grad_norm": 0.2785521775732438, + "learning_rate": 4.98022412656559e-06, + "loss": 0.252, + "step": 15345 + }, + { + "epoch": 7.285239677266255, + "grad_norm": 0.26310863575445953, + "learning_rate": 4.963744232036915e-06, + "loss": 0.2564, + "step": 15350 + }, + { + "epoch": 7.2876127195064075, + "grad_norm": 0.2608094547142962, + "learning_rate": 4.94726433750824e-06, + "loss": 0.2573, + "step": 15355 + }, + { + "epoch": 7.289985761746559, + "grad_norm": 0.25579313536164705, + "learning_rate": 4.930784442979565e-06, + "loss": 0.251, + "step": 15360 + }, + { + "epoch": 7.292358803986711, + "grad_norm": 0.26431453382722436, + "learning_rate": 4.9143045484508906e-06, + "loss": 0.2536, + "step": 15365 + }, + { + "epoch": 7.294731846226862, + "grad_norm": 0.25877627006541104, + "learning_rate": 4.897824653922215e-06, + "loss": 0.2502, + "step": 15370 + }, + { + "epoch": 7.297104888467015, + "grad_norm": 0.25737705974895997, + "learning_rate": 4.8813447593935396e-06, + "loss": 0.2584, + "step": 15375 + }, + { + "epoch": 7.299477930707167, + "grad_norm": 0.27253088704237566, + "learning_rate": 4.864864864864866e-06, + "loss": 0.2547, + "step": 15380 + }, + { + "epoch": 7.301850972947318, + "grad_norm": 0.2551893635716467, + "learning_rate": 4.84838497033619e-06, + "loss": 0.2492, + "step": 15385 + }, + { + "epoch": 7.30422401518747, + "grad_norm": 0.2646582486707214, + "learning_rate": 4.831905075807515e-06, + "loss": 0.2554, + "step": 15390 + }, + { + "epoch": 7.306597057427622, + "grad_norm": 0.262155734159255, + "learning_rate": 4.81542518127884e-06, + "loss": 0.2498, + "step": 15395 + }, + { + "epoch": 7.308970099667774, + "grad_norm": 0.2586713818324005, + "learning_rate": 4.7989452867501655e-06, + "loss": 0.2542, + "step": 15400 + }, + { + "epoch": 7.311343141907926, + "grad_norm": 0.26191652908561486, + "learning_rate": 4.78246539222149e-06, + "loss": 0.2562, + "step": 15405 + }, + { + "epoch": 7.313716184148078, + "grad_norm": 0.2463907334171442, + "learning_rate": 4.765985497692815e-06, + "loss": 0.2439, + "step": 15410 + }, + { + "epoch": 7.31608922638823, + "grad_norm": 0.25967024432123414, + "learning_rate": 4.74950560316414e-06, + "loss": 0.2544, + "step": 15415 + }, + { + "epoch": 7.318462268628382, + "grad_norm": 0.26436320393820933, + "learning_rate": 4.733025708635465e-06, + "loss": 0.2575, + "step": 15420 + }, + { + "epoch": 7.320835310868533, + "grad_norm": 0.2658274476820887, + "learning_rate": 4.71654581410679e-06, + "loss": 0.2611, + "step": 15425 + }, + { + "epoch": 7.323208353108686, + "grad_norm": 0.26380516725123004, + "learning_rate": 4.700065919578115e-06, + "loss": 0.2521, + "step": 15430 + }, + { + "epoch": 7.325581395348837, + "grad_norm": 0.26479728133985075, + "learning_rate": 4.6835860250494396e-06, + "loss": 0.2556, + "step": 15435 + }, + { + "epoch": 7.327954437588989, + "grad_norm": 0.27209776200793434, + "learning_rate": 4.667106130520765e-06, + "loss": 0.2586, + "step": 15440 + }, + { + "epoch": 7.330327479829141, + "grad_norm": 0.2593180946120326, + "learning_rate": 4.65062623599209e-06, + "loss": 0.2476, + "step": 15445 + }, + { + "epoch": 7.332700522069293, + "grad_norm": 0.2598251499471737, + "learning_rate": 4.634146341463415e-06, + "loss": 0.2487, + "step": 15450 + }, + { + "epoch": 7.335073564309445, + "grad_norm": 0.2580699703533772, + "learning_rate": 4.617666446934739e-06, + "loss": 0.2541, + "step": 15455 + }, + { + "epoch": 7.337446606549596, + "grad_norm": 0.25190308316241583, + "learning_rate": 4.601186552406065e-06, + "loss": 0.2535, + "step": 15460 + }, + { + "epoch": 7.3398196487897485, + "grad_norm": 0.26221560779704767, + "learning_rate": 4.58470665787739e-06, + "loss": 0.2525, + "step": 15465 + }, + { + "epoch": 7.342192691029901, + "grad_norm": 0.33646924461586775, + "learning_rate": 4.5682267633487145e-06, + "loss": 0.2523, + "step": 15470 + }, + { + "epoch": 7.344565733270052, + "grad_norm": 0.2626840393773975, + "learning_rate": 4.55174686882004e-06, + "loss": 0.263, + "step": 15475 + }, + { + "epoch": 7.346938775510204, + "grad_norm": 0.2695705929919797, + "learning_rate": 4.535266974291364e-06, + "loss": 0.2497, + "step": 15480 + }, + { + "epoch": 7.349311817750356, + "grad_norm": 0.2535387163441089, + "learning_rate": 4.51878707976269e-06, + "loss": 0.2531, + "step": 15485 + }, + { + "epoch": 7.351684859990508, + "grad_norm": 0.2629156671078447, + "learning_rate": 4.502307185234015e-06, + "loss": 0.2533, + "step": 15490 + }, + { + "epoch": 7.35405790223066, + "grad_norm": 0.25310456025064765, + "learning_rate": 4.48582729070534e-06, + "loss": 0.2585, + "step": 15495 + }, + { + "epoch": 7.356430944470811, + "grad_norm": 0.257953252768614, + "learning_rate": 4.469347396176664e-06, + "loss": 0.2537, + "step": 15500 + }, + { + "epoch": 7.358803986710964, + "grad_norm": 0.2620888699848982, + "learning_rate": 4.45286750164799e-06, + "loss": 0.2522, + "step": 15505 + }, + { + "epoch": 7.361177028951115, + "grad_norm": 0.2642633900574979, + "learning_rate": 4.436387607119315e-06, + "loss": 0.252, + "step": 15510 + }, + { + "epoch": 7.363550071191267, + "grad_norm": 0.26110270686681797, + "learning_rate": 4.419907712590639e-06, + "loss": 0.2538, + "step": 15515 + }, + { + "epoch": 7.365923113431419, + "grad_norm": 0.2607306310255514, + "learning_rate": 4.403427818061965e-06, + "loss": 0.2519, + "step": 15520 + }, + { + "epoch": 7.368296155671571, + "grad_norm": 0.2672591023490623, + "learning_rate": 4.38694792353329e-06, + "loss": 0.2553, + "step": 15525 + }, + { + "epoch": 7.370669197911723, + "grad_norm": 0.26446975383553883, + "learning_rate": 4.3704680290046145e-06, + "loss": 0.2495, + "step": 15530 + }, + { + "epoch": 7.373042240151875, + "grad_norm": 0.26462984003671125, + "learning_rate": 4.353988134475939e-06, + "loss": 0.2561, + "step": 15535 + }, + { + "epoch": 7.3754152823920265, + "grad_norm": 0.2620561972214506, + "learning_rate": 4.337508239947264e-06, + "loss": 0.2511, + "step": 15540 + }, + { + "epoch": 7.377788324632179, + "grad_norm": 0.25643530282162696, + "learning_rate": 4.32102834541859e-06, + "loss": 0.2586, + "step": 15545 + }, + { + "epoch": 7.38016136687233, + "grad_norm": 0.2562113930192993, + "learning_rate": 4.304548450889914e-06, + "loss": 0.2517, + "step": 15550 + }, + { + "epoch": 7.382534409112482, + "grad_norm": 0.25239609816529635, + "learning_rate": 4.28806855636124e-06, + "loss": 0.2584, + "step": 15555 + }, + { + "epoch": 7.3849074513526345, + "grad_norm": 0.2682577335854426, + "learning_rate": 4.271588661832564e-06, + "loss": 0.2562, + "step": 15560 + }, + { + "epoch": 7.387280493592786, + "grad_norm": 0.26543011116989745, + "learning_rate": 4.2551087673038894e-06, + "loss": 0.2522, + "step": 15565 + }, + { + "epoch": 7.389653535832938, + "grad_norm": 0.25511525928291634, + "learning_rate": 4.238628872775215e-06, + "loss": 0.2522, + "step": 15570 + }, + { + "epoch": 7.392026578073089, + "grad_norm": 0.25328127708327314, + "learning_rate": 4.222148978246539e-06, + "loss": 0.2484, + "step": 15575 + }, + { + "epoch": 7.394399620313242, + "grad_norm": 0.2652015907320623, + "learning_rate": 4.205669083717864e-06, + "loss": 0.2594, + "step": 15580 + }, + { + "epoch": 7.396772662553394, + "grad_norm": 0.2740364920719193, + "learning_rate": 4.18918918918919e-06, + "loss": 0.2574, + "step": 15585 + }, + { + "epoch": 7.399145704793545, + "grad_norm": 0.25605528005891603, + "learning_rate": 4.1727092946605145e-06, + "loss": 0.2528, + "step": 15590 + }, + { + "epoch": 7.401518747033697, + "grad_norm": 0.25316763120408164, + "learning_rate": 4.156229400131839e-06, + "loss": 0.2537, + "step": 15595 + }, + { + "epoch": 7.403891789273849, + "grad_norm": 0.25715878692578703, + "learning_rate": 4.139749505603164e-06, + "loss": 0.2511, + "step": 15600 + }, + { + "epoch": 7.406264831514001, + "grad_norm": 0.2697652521569522, + "learning_rate": 4.12326961107449e-06, + "loss": 0.2575, + "step": 15605 + }, + { + "epoch": 7.408637873754153, + "grad_norm": 0.25385955919658093, + "learning_rate": 4.106789716545814e-06, + "loss": 0.2525, + "step": 15610 + }, + { + "epoch": 7.4110109159943045, + "grad_norm": 0.2606791405305184, + "learning_rate": 4.09030982201714e-06, + "loss": 0.254, + "step": 15615 + }, + { + "epoch": 7.413383958234457, + "grad_norm": 0.2637165086268816, + "learning_rate": 4.073829927488464e-06, + "loss": 0.2511, + "step": 15620 + }, + { + "epoch": 7.415757000474608, + "grad_norm": 0.2656044804156631, + "learning_rate": 4.0573500329597895e-06, + "loss": 0.2571, + "step": 15625 + }, + { + "epoch": 7.41813004271476, + "grad_norm": 0.25720252336447363, + "learning_rate": 4.040870138431115e-06, + "loss": 0.2499, + "step": 15630 + }, + { + "epoch": 7.4205030849549125, + "grad_norm": 0.2674771035568139, + "learning_rate": 4.024390243902439e-06, + "loss": 0.2546, + "step": 15635 + }, + { + "epoch": 7.422876127195064, + "grad_norm": 0.2554625967435073, + "learning_rate": 4.007910349373764e-06, + "loss": 0.2544, + "step": 15640 + }, + { + "epoch": 7.425249169435216, + "grad_norm": 0.2548482625474544, + "learning_rate": 3.991430454845089e-06, + "loss": 0.2513, + "step": 15645 + }, + { + "epoch": 7.427622211675368, + "grad_norm": 0.26120012176952734, + "learning_rate": 3.9749505603164145e-06, + "loss": 0.248, + "step": 15650 + }, + { + "epoch": 7.42999525391552, + "grad_norm": 0.262074377730023, + "learning_rate": 3.958470665787739e-06, + "loss": 0.2574, + "step": 15655 + }, + { + "epoch": 7.432368296155672, + "grad_norm": 0.2670964223148279, + "learning_rate": 3.9419907712590635e-06, + "loss": 0.2524, + "step": 15660 + }, + { + "epoch": 7.434741338395823, + "grad_norm": 0.25017578677632024, + "learning_rate": 3.925510876730389e-06, + "loss": 0.2494, + "step": 15665 + }, + { + "epoch": 7.437114380635975, + "grad_norm": 0.2724745290733518, + "learning_rate": 3.909030982201714e-06, + "loss": 0.2559, + "step": 15670 + }, + { + "epoch": 7.439487422876127, + "grad_norm": 0.2538828938798995, + "learning_rate": 3.892551087673039e-06, + "loss": 0.2583, + "step": 15675 + }, + { + "epoch": 7.441860465116279, + "grad_norm": 0.26921815183285797, + "learning_rate": 3.876071193144364e-06, + "loss": 0.2582, + "step": 15680 + }, + { + "epoch": 7.444233507356431, + "grad_norm": 0.2555066383145356, + "learning_rate": 3.859591298615689e-06, + "loss": 0.247, + "step": 15685 + }, + { + "epoch": 7.4466065495965825, + "grad_norm": 0.2533210485832028, + "learning_rate": 3.843111404087014e-06, + "loss": 0.2528, + "step": 15690 + }, + { + "epoch": 7.448979591836735, + "grad_norm": 0.26272950402866496, + "learning_rate": 3.826631509558339e-06, + "loss": 0.2591, + "step": 15695 + }, + { + "epoch": 7.451352634076887, + "grad_norm": 0.2669788415343474, + "learning_rate": 3.810151615029664e-06, + "loss": 0.2564, + "step": 15700 + }, + { + "epoch": 7.453725676317038, + "grad_norm": 0.26474783353670667, + "learning_rate": 3.7936717205009888e-06, + "loss": 0.2572, + "step": 15705 + }, + { + "epoch": 7.4560987185571905, + "grad_norm": 0.2677467649936379, + "learning_rate": 3.777191825972314e-06, + "loss": 0.248, + "step": 15710 + }, + { + "epoch": 7.458471760797342, + "grad_norm": 0.2606022008464192, + "learning_rate": 3.760711931443639e-06, + "loss": 0.2576, + "step": 15715 + }, + { + "epoch": 7.460844803037494, + "grad_norm": 0.2551117603308692, + "learning_rate": 3.7442320369149635e-06, + "loss": 0.2493, + "step": 15720 + }, + { + "epoch": 7.463217845277646, + "grad_norm": 0.2621081941450753, + "learning_rate": 3.7277521423862893e-06, + "loss": 0.2578, + "step": 15725 + }, + { + "epoch": 7.465590887517798, + "grad_norm": 0.2694579681890892, + "learning_rate": 3.711272247857614e-06, + "loss": 0.2571, + "step": 15730 + }, + { + "epoch": 7.46796392975795, + "grad_norm": 0.258696393597769, + "learning_rate": 3.6947923533289388e-06, + "loss": 0.2538, + "step": 15735 + }, + { + "epoch": 7.470336971998101, + "grad_norm": 0.2558307343498113, + "learning_rate": 3.678312458800264e-06, + "loss": 0.2518, + "step": 15740 + }, + { + "epoch": 7.472710014238253, + "grad_norm": 0.2594743440475766, + "learning_rate": 3.661832564271589e-06, + "loss": 0.2565, + "step": 15745 + }, + { + "epoch": 7.475083056478406, + "grad_norm": 0.25863698070270286, + "learning_rate": 3.6453526697429135e-06, + "loss": 0.2522, + "step": 15750 + }, + { + "epoch": 7.477456098718557, + "grad_norm": 0.26265878582357965, + "learning_rate": 3.6288727752142385e-06, + "loss": 0.255, + "step": 15755 + }, + { + "epoch": 7.479829140958709, + "grad_norm": 0.2674682200992112, + "learning_rate": 3.612392880685564e-06, + "loss": 0.2507, + "step": 15760 + }, + { + "epoch": 7.4822021831988605, + "grad_norm": 0.2618563681380444, + "learning_rate": 3.5959129861568888e-06, + "loss": 0.2541, + "step": 15765 + }, + { + "epoch": 7.484575225439013, + "grad_norm": 0.27288651892272936, + "learning_rate": 3.5794330916282133e-06, + "loss": 0.2527, + "step": 15770 + }, + { + "epoch": 7.486948267679165, + "grad_norm": 0.2517275289345003, + "learning_rate": 3.562953197099539e-06, + "loss": 0.2459, + "step": 15775 + }, + { + "epoch": 7.489321309919316, + "grad_norm": 0.26665180830803764, + "learning_rate": 3.5464733025708636e-06, + "loss": 0.2546, + "step": 15780 + }, + { + "epoch": 7.4916943521594686, + "grad_norm": 0.2561302644843663, + "learning_rate": 3.5299934080421885e-06, + "loss": 0.2562, + "step": 15785 + }, + { + "epoch": 7.49406739439962, + "grad_norm": 0.26485751552589976, + "learning_rate": 3.513513513513514e-06, + "loss": 0.2502, + "step": 15790 + }, + { + "epoch": 7.496440436639772, + "grad_norm": 0.2619736116122152, + "learning_rate": 3.4970336189848388e-06, + "loss": 0.2614, + "step": 15795 + }, + { + "epoch": 7.498813478879924, + "grad_norm": 0.27164236562651634, + "learning_rate": 3.4805537244561633e-06, + "loss": 0.2539, + "step": 15800 + }, + { + "epoch": 7.501186521120076, + "grad_norm": 0.2500305359614084, + "learning_rate": 3.464073829927489e-06, + "loss": 0.2553, + "step": 15805 + }, + { + "epoch": 7.503559563360228, + "grad_norm": 0.2622672794801486, + "learning_rate": 3.4475939353988136e-06, + "loss": 0.2578, + "step": 15810 + }, + { + "epoch": 7.50593260560038, + "grad_norm": 0.27650418069065363, + "learning_rate": 3.4311140408701385e-06, + "loss": 0.2635, + "step": 15815 + }, + { + "epoch": 7.5083056478405314, + "grad_norm": 0.2476446469415284, + "learning_rate": 3.414634146341464e-06, + "loss": 0.2511, + "step": 15820 + }, + { + "epoch": 7.510678690080684, + "grad_norm": 0.25855378265424916, + "learning_rate": 3.3981542518127888e-06, + "loss": 0.256, + "step": 15825 + }, + { + "epoch": 7.513051732320835, + "grad_norm": 0.25879497656192924, + "learning_rate": 3.3816743572841133e-06, + "loss": 0.2584, + "step": 15830 + }, + { + "epoch": 7.515424774560987, + "grad_norm": 0.2556178824117976, + "learning_rate": 3.3651944627554386e-06, + "loss": 0.2525, + "step": 15835 + }, + { + "epoch": 7.517797816801139, + "grad_norm": 0.2601086375645065, + "learning_rate": 3.3487145682267636e-06, + "loss": 0.2525, + "step": 15840 + }, + { + "epoch": 7.520170859041291, + "grad_norm": 0.25929176368953966, + "learning_rate": 3.3322346736980885e-06, + "loss": 0.2566, + "step": 15845 + }, + { + "epoch": 7.522543901281443, + "grad_norm": 0.25469060853051534, + "learning_rate": 3.315754779169414e-06, + "loss": 0.2536, + "step": 15850 + }, + { + "epoch": 7.524916943521594, + "grad_norm": 0.24918518571637396, + "learning_rate": 3.2992748846407383e-06, + "loss": 0.2465, + "step": 15855 + }, + { + "epoch": 7.527289985761747, + "grad_norm": 0.2645578142642616, + "learning_rate": 3.2827949901120633e-06, + "loss": 0.249, + "step": 15860 + }, + { + "epoch": 7.529663028001899, + "grad_norm": 0.2478631838662725, + "learning_rate": 3.266315095583388e-06, + "loss": 0.2628, + "step": 15865 + }, + { + "epoch": 7.53203607024205, + "grad_norm": 0.2694309134601939, + "learning_rate": 3.2498352010547136e-06, + "loss": 0.2616, + "step": 15870 + }, + { + "epoch": 7.534409112482202, + "grad_norm": 0.27101300551842106, + "learning_rate": 3.2333553065260385e-06, + "loss": 0.2608, + "step": 15875 + }, + { + "epoch": 7.536782154722354, + "grad_norm": 0.26030303340371314, + "learning_rate": 3.216875411997363e-06, + "loss": 0.2603, + "step": 15880 + }, + { + "epoch": 7.539155196962506, + "grad_norm": 0.2633140291471364, + "learning_rate": 3.2003955174686884e-06, + "loss": 0.2529, + "step": 15885 + }, + { + "epoch": 7.541528239202658, + "grad_norm": 0.24784179272069615, + "learning_rate": 3.1839156229400133e-06, + "loss": 0.253, + "step": 15890 + }, + { + "epoch": 7.5439012814428095, + "grad_norm": 0.26004723076997066, + "learning_rate": 3.167435728411338e-06, + "loss": 0.261, + "step": 15895 + }, + { + "epoch": 7.546274323682962, + "grad_norm": 0.2557485304288884, + "learning_rate": 3.1509558338826636e-06, + "loss": 0.2549, + "step": 15900 + }, + { + "epoch": 7.548647365923113, + "grad_norm": 0.2517840772913834, + "learning_rate": 3.134475939353988e-06, + "loss": 0.255, + "step": 15905 + }, + { + "epoch": 7.551020408163265, + "grad_norm": 0.26537014969866374, + "learning_rate": 3.1179960448253134e-06, + "loss": 0.2562, + "step": 15910 + }, + { + "epoch": 7.5533934504034175, + "grad_norm": 0.25773492344771415, + "learning_rate": 3.101516150296638e-06, + "loss": 0.2559, + "step": 15915 + }, + { + "epoch": 7.555766492643569, + "grad_norm": 0.2529426258506149, + "learning_rate": 3.0850362557679633e-06, + "loss": 0.2534, + "step": 15920 + }, + { + "epoch": 7.558139534883721, + "grad_norm": 0.25426044350332433, + "learning_rate": 3.0685563612392882e-06, + "loss": 0.2514, + "step": 15925 + }, + { + "epoch": 7.560512577123873, + "grad_norm": 0.2561315975197009, + "learning_rate": 3.052076466710613e-06, + "loss": 0.2564, + "step": 15930 + }, + { + "epoch": 7.562885619364025, + "grad_norm": 0.2525388721832474, + "learning_rate": 3.035596572181938e-06, + "loss": 0.2522, + "step": 15935 + }, + { + "epoch": 7.565258661604177, + "grad_norm": 0.2594085849854225, + "learning_rate": 3.0191166776532634e-06, + "loss": 0.2584, + "step": 15940 + }, + { + "epoch": 7.567631703844328, + "grad_norm": 0.25602079580177695, + "learning_rate": 3.002636783124588e-06, + "loss": 0.2543, + "step": 15945 + }, + { + "epoch": 7.57000474608448, + "grad_norm": 0.25963366648089137, + "learning_rate": 2.9861568885959133e-06, + "loss": 0.2524, + "step": 15950 + }, + { + "epoch": 7.572377788324632, + "grad_norm": 0.25619345130066706, + "learning_rate": 2.969676994067238e-06, + "loss": 0.2528, + "step": 15955 + }, + { + "epoch": 7.574750830564784, + "grad_norm": 0.2654081145056753, + "learning_rate": 2.953197099538563e-06, + "loss": 0.2553, + "step": 15960 + }, + { + "epoch": 7.577123872804936, + "grad_norm": 0.2621273807492404, + "learning_rate": 2.936717205009888e-06, + "loss": 0.2529, + "step": 15965 + }, + { + "epoch": 7.5794969150450875, + "grad_norm": 0.25677237875043607, + "learning_rate": 2.920237310481213e-06, + "loss": 0.2599, + "step": 15970 + }, + { + "epoch": 7.58186995728524, + "grad_norm": 0.2577325442513576, + "learning_rate": 2.903757415952538e-06, + "loss": 0.2539, + "step": 15975 + }, + { + "epoch": 7.584242999525392, + "grad_norm": 0.2754188944666232, + "learning_rate": 2.8872775214238633e-06, + "loss": 0.2556, + "step": 15980 + }, + { + "epoch": 7.586616041765543, + "grad_norm": 0.25348795929595785, + "learning_rate": 2.870797626895188e-06, + "loss": 0.2474, + "step": 15985 + }, + { + "epoch": 7.5889890840056955, + "grad_norm": 0.2643387087313936, + "learning_rate": 2.854317732366513e-06, + "loss": 0.258, + "step": 15990 + }, + { + "epoch": 7.591362126245847, + "grad_norm": 0.2549923141822983, + "learning_rate": 2.837837837837838e-06, + "loss": 0.2516, + "step": 15995 + }, + { + "epoch": 7.593735168485999, + "grad_norm": 0.2522074946831055, + "learning_rate": 2.821357943309163e-06, + "loss": 0.2526, + "step": 16000 + }, + { + "epoch": 7.596108210726151, + "grad_norm": 0.27198062902837783, + "learning_rate": 2.804878048780488e-06, + "loss": 0.2574, + "step": 16005 + }, + { + "epoch": 7.598481252966303, + "grad_norm": 0.2683870674622184, + "learning_rate": 2.788398154251813e-06, + "loss": 0.2586, + "step": 16010 + }, + { + "epoch": 7.600854295206455, + "grad_norm": 0.25319139206373326, + "learning_rate": 2.771918259723138e-06, + "loss": 0.2508, + "step": 16015 + }, + { + "epoch": 7.603227337446606, + "grad_norm": 0.2557867387397477, + "learning_rate": 2.755438365194463e-06, + "loss": 0.2522, + "step": 16020 + }, + { + "epoch": 7.605600379686758, + "grad_norm": 0.2551688495513455, + "learning_rate": 2.7389584706657877e-06, + "loss": 0.2552, + "step": 16025 + }, + { + "epoch": 7.607973421926911, + "grad_norm": 0.25751432243203726, + "learning_rate": 2.722478576137113e-06, + "loss": 0.2566, + "step": 16030 + }, + { + "epoch": 7.610346464167062, + "grad_norm": 0.2511286892857767, + "learning_rate": 2.705998681608438e-06, + "loss": 0.2512, + "step": 16035 + }, + { + "epoch": 7.612719506407214, + "grad_norm": 0.2553326343172424, + "learning_rate": 2.689518787079763e-06, + "loss": 0.2513, + "step": 16040 + }, + { + "epoch": 7.615092548647366, + "grad_norm": 0.25829942998905414, + "learning_rate": 2.673038892551088e-06, + "loss": 0.257, + "step": 16045 + }, + { + "epoch": 7.617465590887518, + "grad_norm": 0.26566921543618505, + "learning_rate": 2.6565589980224127e-06, + "loss": 0.2538, + "step": 16050 + }, + { + "epoch": 7.61983863312767, + "grad_norm": 0.2709536022941069, + "learning_rate": 2.6400791034937377e-06, + "loss": 0.2568, + "step": 16055 + }, + { + "epoch": 7.622211675367821, + "grad_norm": 0.2618603188348254, + "learning_rate": 2.623599208965063e-06, + "loss": 0.2596, + "step": 16060 + }, + { + "epoch": 7.6245847176079735, + "grad_norm": 0.256257093618839, + "learning_rate": 2.6071193144363875e-06, + "loss": 0.2551, + "step": 16065 + }, + { + "epoch": 7.626957759848125, + "grad_norm": 0.25044289990173957, + "learning_rate": 2.590639419907713e-06, + "loss": 0.2552, + "step": 16070 + }, + { + "epoch": 7.629330802088277, + "grad_norm": 0.2513713564106514, + "learning_rate": 2.574159525379038e-06, + "loss": 0.2537, + "step": 16075 + }, + { + "epoch": 7.631703844328429, + "grad_norm": 0.26169642002095383, + "learning_rate": 2.5576796308503627e-06, + "loss": 0.2539, + "step": 16080 + }, + { + "epoch": 7.634076886568581, + "grad_norm": 0.2598801779152166, + "learning_rate": 2.5411997363216877e-06, + "loss": 0.2543, + "step": 16085 + }, + { + "epoch": 7.636449928808733, + "grad_norm": 0.263787184599246, + "learning_rate": 2.5247198417930126e-06, + "loss": 0.2625, + "step": 16090 + }, + { + "epoch": 7.638822971048885, + "grad_norm": 0.26579210890035504, + "learning_rate": 2.5082399472643375e-06, + "loss": 0.2527, + "step": 16095 + }, + { + "epoch": 7.641196013289036, + "grad_norm": 0.2649012660648144, + "learning_rate": 2.4917600527356625e-06, + "loss": 0.2603, + "step": 16100 + }, + { + "epoch": 7.643569055529189, + "grad_norm": 0.26091418374731395, + "learning_rate": 2.475280158206988e-06, + "loss": 0.2543, + "step": 16105 + }, + { + "epoch": 7.64594209776934, + "grad_norm": 0.25885962050806305, + "learning_rate": 2.4588002636783123e-06, + "loss": 0.2514, + "step": 16110 + }, + { + "epoch": 7.648315140009492, + "grad_norm": 0.2635469114376577, + "learning_rate": 2.4423203691496377e-06, + "loss": 0.2534, + "step": 16115 + }, + { + "epoch": 7.650688182249644, + "grad_norm": 0.2652295724915455, + "learning_rate": 2.425840474620962e-06, + "loss": 0.254, + "step": 16120 + }, + { + "epoch": 7.653061224489796, + "grad_norm": 0.25519383848813526, + "learning_rate": 2.4093605800922875e-06, + "loss": 0.2556, + "step": 16125 + }, + { + "epoch": 7.655434266729948, + "grad_norm": 0.26199761517339515, + "learning_rate": 2.3928806855636125e-06, + "loss": 0.2621, + "step": 16130 + }, + { + "epoch": 7.657807308970099, + "grad_norm": 0.25227358456286464, + "learning_rate": 2.3764007910349374e-06, + "loss": 0.2523, + "step": 16135 + }, + { + "epoch": 7.6601803512102515, + "grad_norm": 0.26488179342859114, + "learning_rate": 2.3599208965062623e-06, + "loss": 0.2563, + "step": 16140 + }, + { + "epoch": 7.662553393450404, + "grad_norm": 0.2573040352549576, + "learning_rate": 2.3434410019775877e-06, + "loss": 0.2541, + "step": 16145 + }, + { + "epoch": 7.664926435690555, + "grad_norm": 0.2562465400390901, + "learning_rate": 2.326961107448912e-06, + "loss": 0.2536, + "step": 16150 + }, + { + "epoch": 7.667299477930707, + "grad_norm": 0.2609522998119844, + "learning_rate": 2.3104812129202375e-06, + "loss": 0.2538, + "step": 16155 + }, + { + "epoch": 7.669672520170859, + "grad_norm": 0.2622492565458484, + "learning_rate": 2.2940013183915625e-06, + "loss": 0.2534, + "step": 16160 + }, + { + "epoch": 7.672045562411011, + "grad_norm": 0.2559583952610754, + "learning_rate": 2.2775214238628874e-06, + "loss": 0.2553, + "step": 16165 + }, + { + "epoch": 7.674418604651163, + "grad_norm": 0.2517838183310195, + "learning_rate": 2.2610415293342123e-06, + "loss": 0.2523, + "step": 16170 + }, + { + "epoch": 7.676791646891314, + "grad_norm": 0.2576399618596823, + "learning_rate": 2.2445616348055373e-06, + "loss": 0.2563, + "step": 16175 + }, + { + "epoch": 7.679164689131467, + "grad_norm": 0.2561769106735147, + "learning_rate": 2.228081740276862e-06, + "loss": 0.2577, + "step": 16180 + }, + { + "epoch": 7.681537731371618, + "grad_norm": 0.25335326257865254, + "learning_rate": 2.2116018457481875e-06, + "loss": 0.2534, + "step": 16185 + }, + { + "epoch": 7.68391077361177, + "grad_norm": 0.25333319552465994, + "learning_rate": 2.195121951219512e-06, + "loss": 0.2532, + "step": 16190 + }, + { + "epoch": 7.686283815851922, + "grad_norm": 0.26762791434448724, + "learning_rate": 2.1786420566908374e-06, + "loss": 0.259, + "step": 16195 + }, + { + "epoch": 7.688656858092074, + "grad_norm": 0.24571880488867057, + "learning_rate": 2.1621621621621623e-06, + "loss": 0.2516, + "step": 16200 + }, + { + "epoch": 7.691029900332226, + "grad_norm": 0.25326966429865233, + "learning_rate": 2.1456822676334873e-06, + "loss": 0.2561, + "step": 16205 + }, + { + "epoch": 7.693402942572378, + "grad_norm": 0.2595056362638502, + "learning_rate": 2.129202373104812e-06, + "loss": 0.252, + "step": 16210 + }, + { + "epoch": 7.69577598481253, + "grad_norm": 0.291687883403115, + "learning_rate": 2.1127224785761375e-06, + "loss": 0.2571, + "step": 16215 + }, + { + "epoch": 7.698149027052682, + "grad_norm": 0.25708682325005167, + "learning_rate": 2.096242584047462e-06, + "loss": 0.2589, + "step": 16220 + }, + { + "epoch": 7.700522069292833, + "grad_norm": 0.25859412711349294, + "learning_rate": 2.0797626895187874e-06, + "loss": 0.2518, + "step": 16225 + }, + { + "epoch": 7.702895111532985, + "grad_norm": 0.2570949182002276, + "learning_rate": 2.063282794990112e-06, + "loss": 0.253, + "step": 16230 + }, + { + "epoch": 7.705268153773137, + "grad_norm": 0.2596010559244857, + "learning_rate": 2.0468029004614373e-06, + "loss": 0.2539, + "step": 16235 + }, + { + "epoch": 7.707641196013289, + "grad_norm": 0.24519318951537708, + "learning_rate": 2.030323005932762e-06, + "loss": 0.2572, + "step": 16240 + }, + { + "epoch": 7.710014238253441, + "grad_norm": 0.25140598930912156, + "learning_rate": 2.013843111404087e-06, + "loss": 0.2546, + "step": 16245 + }, + { + "epoch": 7.7123872804935925, + "grad_norm": 0.27129463930818265, + "learning_rate": 1.997363216875412e-06, + "loss": 0.2567, + "step": 16250 + }, + { + "epoch": 7.714760322733745, + "grad_norm": 0.2632583048514345, + "learning_rate": 1.9808833223467374e-06, + "loss": 0.2528, + "step": 16255 + }, + { + "epoch": 7.717133364973897, + "grad_norm": 0.25802479977572684, + "learning_rate": 1.964403427818062e-06, + "loss": 0.2536, + "step": 16260 + }, + { + "epoch": 7.719506407214048, + "grad_norm": 0.2562815889064407, + "learning_rate": 1.9479235332893873e-06, + "loss": 0.2491, + "step": 16265 + }, + { + "epoch": 7.7218794494542005, + "grad_norm": 0.2616809658935663, + "learning_rate": 1.931443638760712e-06, + "loss": 0.2512, + "step": 16270 + }, + { + "epoch": 7.724252491694352, + "grad_norm": 0.25954210483286133, + "learning_rate": 1.914963744232037e-06, + "loss": 0.2585, + "step": 16275 + }, + { + "epoch": 7.726625533934504, + "grad_norm": 0.2547926385041475, + "learning_rate": 1.898483849703362e-06, + "loss": 0.2531, + "step": 16280 + }, + { + "epoch": 7.728998576174656, + "grad_norm": 0.25870870557231695, + "learning_rate": 1.8820039551746868e-06, + "loss": 0.2596, + "step": 16285 + }, + { + "epoch": 7.731371618414808, + "grad_norm": 0.26098898664546955, + "learning_rate": 1.865524060646012e-06, + "loss": 0.2563, + "step": 16290 + }, + { + "epoch": 7.73374466065496, + "grad_norm": 0.25151813346873025, + "learning_rate": 1.849044166117337e-06, + "loss": 0.2549, + "step": 16295 + }, + { + "epoch": 7.736117702895111, + "grad_norm": 0.26200788073414016, + "learning_rate": 1.8325642715886618e-06, + "loss": 0.2497, + "step": 16300 + }, + { + "epoch": 7.738490745135263, + "grad_norm": 0.2625480732157138, + "learning_rate": 1.816084377059987e-06, + "loss": 0.2526, + "step": 16305 + }, + { + "epoch": 7.740863787375416, + "grad_norm": 0.25263651199343645, + "learning_rate": 1.799604482531312e-06, + "loss": 0.2575, + "step": 16310 + }, + { + "epoch": 7.743236829615567, + "grad_norm": 0.25679865144868563, + "learning_rate": 1.7831245880026368e-06, + "loss": 0.2577, + "step": 16315 + }, + { + "epoch": 7.745609871855719, + "grad_norm": 0.2509700572450945, + "learning_rate": 1.766644693473962e-06, + "loss": 0.2546, + "step": 16320 + }, + { + "epoch": 7.747982914095871, + "grad_norm": 0.262167955186637, + "learning_rate": 1.750164798945287e-06, + "loss": 0.2512, + "step": 16325 + }, + { + "epoch": 7.750355956336023, + "grad_norm": 0.26583809661519614, + "learning_rate": 1.7336849044166118e-06, + "loss": 0.2549, + "step": 16330 + }, + { + "epoch": 7.752728998576175, + "grad_norm": 0.25055794979896123, + "learning_rate": 1.717205009887937e-06, + "loss": 0.2545, + "step": 16335 + }, + { + "epoch": 7.755102040816326, + "grad_norm": 0.2595369349537786, + "learning_rate": 1.7007251153592616e-06, + "loss": 0.2548, + "step": 16340 + }, + { + "epoch": 7.7574750830564785, + "grad_norm": 0.2548616191663813, + "learning_rate": 1.6842452208305868e-06, + "loss": 0.2578, + "step": 16345 + }, + { + "epoch": 7.75984812529663, + "grad_norm": 0.25676674345648637, + "learning_rate": 1.667765326301912e-06, + "loss": 0.2526, + "step": 16350 + }, + { + "epoch": 7.762221167536782, + "grad_norm": 0.25727523149168124, + "learning_rate": 1.6512854317732366e-06, + "loss": 0.2496, + "step": 16355 + }, + { + "epoch": 7.764594209776934, + "grad_norm": 0.2676304415839853, + "learning_rate": 1.6348055372445618e-06, + "loss": 0.2568, + "step": 16360 + }, + { + "epoch": 7.766967252017086, + "grad_norm": 0.2586541118950198, + "learning_rate": 1.6183256427158867e-06, + "loss": 0.2527, + "step": 16365 + }, + { + "epoch": 7.769340294257238, + "grad_norm": 0.2580425115691165, + "learning_rate": 1.6018457481872116e-06, + "loss": 0.2551, + "step": 16370 + }, + { + "epoch": 7.77171333649739, + "grad_norm": 0.25680227602055283, + "learning_rate": 1.5853658536585368e-06, + "loss": 0.2554, + "step": 16375 + }, + { + "epoch": 7.774086378737541, + "grad_norm": 0.25802318027476706, + "learning_rate": 1.5688859591298617e-06, + "loss": 0.2526, + "step": 16380 + }, + { + "epoch": 7.776459420977694, + "grad_norm": 0.2488080672948303, + "learning_rate": 1.5524060646011866e-06, + "loss": 0.2485, + "step": 16385 + }, + { + "epoch": 7.778832463217845, + "grad_norm": 0.26165517135654676, + "learning_rate": 1.5359261700725116e-06, + "loss": 0.2564, + "step": 16390 + }, + { + "epoch": 7.781205505457997, + "grad_norm": 0.26304545067797724, + "learning_rate": 1.5194462755438365e-06, + "loss": 0.2555, + "step": 16395 + }, + { + "epoch": 7.783578547698149, + "grad_norm": 0.25073579056014517, + "learning_rate": 1.5029663810151614e-06, + "loss": 0.2511, + "step": 16400 + }, + { + "epoch": 7.785951589938301, + "grad_norm": 0.24870138544230083, + "learning_rate": 1.4864864864864866e-06, + "loss": 0.2553, + "step": 16405 + }, + { + "epoch": 7.788324632178453, + "grad_norm": 0.2879714758039322, + "learning_rate": 1.4700065919578115e-06, + "loss": 0.2583, + "step": 16410 + }, + { + "epoch": 7.790697674418604, + "grad_norm": 0.26697107093134365, + "learning_rate": 1.4535266974291364e-06, + "loss": 0.2612, + "step": 16415 + }, + { + "epoch": 7.7930707166587565, + "grad_norm": 0.256031200894526, + "learning_rate": 1.4370468029004614e-06, + "loss": 0.2555, + "step": 16420 + }, + { + "epoch": 7.795443758898909, + "grad_norm": 0.2519982587460135, + "learning_rate": 1.4205669083717865e-06, + "loss": 0.2588, + "step": 16425 + }, + { + "epoch": 7.79781680113906, + "grad_norm": 0.25767527514790334, + "learning_rate": 1.4040870138431114e-06, + "loss": 0.2554, + "step": 16430 + }, + { + "epoch": 7.800189843379212, + "grad_norm": 0.2537559550804601, + "learning_rate": 1.3876071193144364e-06, + "loss": 0.2569, + "step": 16435 + }, + { + "epoch": 7.8025628856193645, + "grad_norm": 0.2537561097305329, + "learning_rate": 1.3711272247857613e-06, + "loss": 0.259, + "step": 16440 + }, + { + "epoch": 7.804935927859516, + "grad_norm": 0.25929819482859273, + "learning_rate": 1.3546473302570864e-06, + "loss": 0.2622, + "step": 16445 + }, + { + "epoch": 7.807308970099668, + "grad_norm": 0.2569333678854389, + "learning_rate": 1.3381674357284114e-06, + "loss": 0.2522, + "step": 16450 + }, + { + "epoch": 7.809682012339819, + "grad_norm": 0.2510961766660175, + "learning_rate": 1.3216875411997363e-06, + "loss": 0.2512, + "step": 16455 + }, + { + "epoch": 7.812055054579972, + "grad_norm": 0.2632633889815527, + "learning_rate": 1.3052076466710614e-06, + "loss": 0.2596, + "step": 16460 + }, + { + "epoch": 7.814428096820123, + "grad_norm": 0.25951706228347166, + "learning_rate": 1.2887277521423864e-06, + "loss": 0.2435, + "step": 16465 + }, + { + "epoch": 7.816801139060275, + "grad_norm": 0.2572229467973195, + "learning_rate": 1.2722478576137113e-06, + "loss": 0.2524, + "step": 16470 + }, + { + "epoch": 7.819174181300427, + "grad_norm": 0.2510723133604922, + "learning_rate": 1.2557679630850362e-06, + "loss": 0.2511, + "step": 16475 + }, + { + "epoch": 7.821547223540579, + "grad_norm": 0.25858669622824443, + "learning_rate": 1.2392880685563614e-06, + "loss": 0.2568, + "step": 16480 + }, + { + "epoch": 7.823920265780731, + "grad_norm": 0.2651135773396345, + "learning_rate": 1.2228081740276863e-06, + "loss": 0.2538, + "step": 16485 + }, + { + "epoch": 7.826293308020883, + "grad_norm": 0.2539404927967126, + "learning_rate": 1.2063282794990112e-06, + "loss": 0.251, + "step": 16490 + }, + { + "epoch": 7.8286663502610345, + "grad_norm": 0.263563678480886, + "learning_rate": 1.1898483849703362e-06, + "loss": 0.2523, + "step": 16495 + }, + { + "epoch": 7.831039392501187, + "grad_norm": 0.2579769811363694, + "learning_rate": 1.1733684904416613e-06, + "loss": 0.2542, + "step": 16500 + }, + { + "epoch": 7.833412434741338, + "grad_norm": 0.2523432158467801, + "learning_rate": 1.1568885959129862e-06, + "loss": 0.2535, + "step": 16505 + }, + { + "epoch": 7.83578547698149, + "grad_norm": 0.25153780417377936, + "learning_rate": 1.1404087013843112e-06, + "loss": 0.2529, + "step": 16510 + }, + { + "epoch": 7.8381585192216425, + "grad_norm": 0.2518083365568771, + "learning_rate": 1.1239288068556363e-06, + "loss": 0.2543, + "step": 16515 + }, + { + "epoch": 7.840531561461794, + "grad_norm": 0.25026503073948453, + "learning_rate": 1.1074489123269612e-06, + "loss": 0.2577, + "step": 16520 + }, + { + "epoch": 7.842904603701946, + "grad_norm": 0.2592242737266426, + "learning_rate": 1.0909690177982862e-06, + "loss": 0.2555, + "step": 16525 + }, + { + "epoch": 7.845277645942097, + "grad_norm": 0.2714334123430139, + "learning_rate": 1.074489123269611e-06, + "loss": 0.2577, + "step": 16530 + }, + { + "epoch": 7.84765068818225, + "grad_norm": 0.2643620873588199, + "learning_rate": 1.0580092287409362e-06, + "loss": 0.2552, + "step": 16535 + }, + { + "epoch": 7.850023730422402, + "grad_norm": 0.25914710123681317, + "learning_rate": 1.0415293342122612e-06, + "loss": 0.2588, + "step": 16540 + }, + { + "epoch": 7.852396772662553, + "grad_norm": 0.25309616703256466, + "learning_rate": 1.025049439683586e-06, + "loss": 0.2636, + "step": 16545 + }, + { + "epoch": 7.854769814902705, + "grad_norm": 0.25434142581866653, + "learning_rate": 1.008569545154911e-06, + "loss": 0.2645, + "step": 16550 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 0.2564701269612241, + "learning_rate": 9.920896506262362e-07, + "loss": 0.2528, + "step": 16555 + }, + { + "epoch": 7.859515899383009, + "grad_norm": 0.265841337925829, + "learning_rate": 9.75609756097561e-07, + "loss": 0.2602, + "step": 16560 + }, + { + "epoch": 7.861888941623161, + "grad_norm": 0.2520170775433296, + "learning_rate": 9.59129861568886e-07, + "loss": 0.2536, + "step": 16565 + }, + { + "epoch": 7.8642619838633125, + "grad_norm": 0.2620266433613965, + "learning_rate": 9.426499670402111e-07, + "loss": 0.2619, + "step": 16570 + }, + { + "epoch": 7.866635026103465, + "grad_norm": 0.25449222004003413, + "learning_rate": 9.26170072511536e-07, + "loss": 0.2621, + "step": 16575 + }, + { + "epoch": 7.869008068343616, + "grad_norm": 0.25390580868181944, + "learning_rate": 9.096901779828609e-07, + "loss": 0.2529, + "step": 16580 + }, + { + "epoch": 7.871381110583768, + "grad_norm": 0.24510717033825144, + "learning_rate": 8.932102834541859e-07, + "loss": 0.2531, + "step": 16585 + }, + { + "epoch": 7.8737541528239205, + "grad_norm": 0.2561145261757757, + "learning_rate": 8.76730388925511e-07, + "loss": 0.2566, + "step": 16590 + }, + { + "epoch": 7.876127195064072, + "grad_norm": 0.25615955996556866, + "learning_rate": 8.602504943968359e-07, + "loss": 0.2508, + "step": 16595 + }, + { + "epoch": 7.878500237304224, + "grad_norm": 0.2611305361644675, + "learning_rate": 8.437705998681609e-07, + "loss": 0.2576, + "step": 16600 + }, + { + "epoch": 7.880873279544376, + "grad_norm": 0.25394509006681254, + "learning_rate": 8.272907053394858e-07, + "loss": 0.2522, + "step": 16605 + }, + { + "epoch": 7.883246321784528, + "grad_norm": 0.2582487037631995, + "learning_rate": 8.108108108108109e-07, + "loss": 0.259, + "step": 16610 + }, + { + "epoch": 7.88561936402468, + "grad_norm": 0.26070385427744575, + "learning_rate": 7.943309162821359e-07, + "loss": 0.2577, + "step": 16615 + }, + { + "epoch": 7.887992406264831, + "grad_norm": 0.26430969543381966, + "learning_rate": 7.778510217534609e-07, + "loss": 0.2603, + "step": 16620 + }, + { + "epoch": 7.8903654485049834, + "grad_norm": 0.25477795704946715, + "learning_rate": 7.613711272247858e-07, + "loss": 0.256, + "step": 16625 + }, + { + "epoch": 7.892738490745135, + "grad_norm": 0.24952779069822345, + "learning_rate": 7.448912326961109e-07, + "loss": 0.2492, + "step": 16630 + }, + { + "epoch": 7.895111532985287, + "grad_norm": 0.24343406082830027, + "learning_rate": 7.284113381674358e-07, + "loss": 0.2595, + "step": 16635 + }, + { + "epoch": 7.897484575225439, + "grad_norm": 0.25201514858534807, + "learning_rate": 7.119314436387607e-07, + "loss": 0.2571, + "step": 16640 + }, + { + "epoch": 7.899857617465591, + "grad_norm": 0.25110247989119494, + "learning_rate": 6.954515491100858e-07, + "loss": 0.2565, + "step": 16645 + }, + { + "epoch": 7.902230659705743, + "grad_norm": 0.25324179003860786, + "learning_rate": 6.789716545814107e-07, + "loss": 0.2537, + "step": 16650 + }, + { + "epoch": 7.904603701945895, + "grad_norm": 0.25616681111570594, + "learning_rate": 6.624917600527356e-07, + "loss": 0.253, + "step": 16655 + }, + { + "epoch": 7.906976744186046, + "grad_norm": 0.2552468687393918, + "learning_rate": 6.460118655240606e-07, + "loss": 0.2507, + "step": 16660 + }, + { + "epoch": 7.909349786426199, + "grad_norm": 0.28264481256746615, + "learning_rate": 6.295319709953856e-07, + "loss": 0.2597, + "step": 16665 + }, + { + "epoch": 7.91172282866635, + "grad_norm": 0.25295516898666764, + "learning_rate": 6.130520764667106e-07, + "loss": 0.2535, + "step": 16670 + }, + { + "epoch": 7.914095870906502, + "grad_norm": 0.25293458824406956, + "learning_rate": 5.965721819380357e-07, + "loss": 0.2572, + "step": 16675 + }, + { + "epoch": 7.916468913146654, + "grad_norm": 0.2562273972497567, + "learning_rate": 5.800922874093606e-07, + "loss": 0.2531, + "step": 16680 + }, + { + "epoch": 7.918841955386806, + "grad_norm": 0.255915477817985, + "learning_rate": 5.636123928806856e-07, + "loss": 0.2565, + "step": 16685 + }, + { + "epoch": 7.921214997626958, + "grad_norm": 0.2563884172358397, + "learning_rate": 5.471324983520105e-07, + "loss": 0.2558, + "step": 16690 + }, + { + "epoch": 7.923588039867109, + "grad_norm": 0.24950671328810176, + "learning_rate": 5.306526038233356e-07, + "loss": 0.2534, + "step": 16695 + }, + { + "epoch": 7.9259610821072615, + "grad_norm": 0.26370973194132885, + "learning_rate": 5.141727092946605e-07, + "loss": 0.2568, + "step": 16700 + }, + { + "epoch": 7.928334124347414, + "grad_norm": 0.2415502838568532, + "learning_rate": 4.976928147659855e-07, + "loss": 0.2469, + "step": 16705 + }, + { + "epoch": 7.930707166587565, + "grad_norm": 0.26683031591813083, + "learning_rate": 4.812129202373105e-07, + "loss": 0.2564, + "step": 16710 + }, + { + "epoch": 7.933080208827717, + "grad_norm": 0.25782925095435816, + "learning_rate": 4.647330257086355e-07, + "loss": 0.2508, + "step": 16715 + }, + { + "epoch": 7.9354532510678695, + "grad_norm": 0.2619022920075476, + "learning_rate": 4.482531311799605e-07, + "loss": 0.2541, + "step": 16720 + }, + { + "epoch": 7.937826293308021, + "grad_norm": 0.25162858791723475, + "learning_rate": 4.3177323665128543e-07, + "loss": 0.2578, + "step": 16725 + }, + { + "epoch": 7.940199335548173, + "grad_norm": 0.27208949559992607, + "learning_rate": 4.1529334212261046e-07, + "loss": 0.2613, + "step": 16730 + }, + { + "epoch": 7.942572377788324, + "grad_norm": 0.2506860303180732, + "learning_rate": 3.988134475939354e-07, + "loss": 0.2456, + "step": 16735 + }, + { + "epoch": 7.944945420028477, + "grad_norm": 0.2509492075416272, + "learning_rate": 3.823335530652604e-07, + "loss": 0.2593, + "step": 16740 + }, + { + "epoch": 7.947318462268628, + "grad_norm": 0.25260997753086795, + "learning_rate": 3.658536585365854e-07, + "loss": 0.2508, + "step": 16745 + }, + { + "epoch": 7.94969150450878, + "grad_norm": 0.2523613203500048, + "learning_rate": 3.493737640079104e-07, + "loss": 0.2561, + "step": 16750 + }, + { + "epoch": 7.952064546748932, + "grad_norm": 0.2614557403038053, + "learning_rate": 3.328938694792354e-07, + "loss": 0.2621, + "step": 16755 + }, + { + "epoch": 7.954437588989084, + "grad_norm": 0.2514388916946806, + "learning_rate": 3.1641397495056036e-07, + "loss": 0.2501, + "step": 16760 + }, + { + "epoch": 7.956810631229236, + "grad_norm": 0.2491293422945726, + "learning_rate": 2.9993408042188534e-07, + "loss": 0.2473, + "step": 16765 + }, + { + "epoch": 7.959183673469388, + "grad_norm": 0.26158738397236736, + "learning_rate": 2.8345418589321027e-07, + "loss": 0.2523, + "step": 16770 + }, + { + "epoch": 7.9615567157095395, + "grad_norm": 0.2512423036859617, + "learning_rate": 2.6697429136453526e-07, + "loss": 0.2483, + "step": 16775 + }, + { + "epoch": 7.963929757949692, + "grad_norm": 0.24744970759470158, + "learning_rate": 2.5049439683586024e-07, + "loss": 0.2534, + "step": 16780 + }, + { + "epoch": 7.966302800189843, + "grad_norm": 0.2618256689698497, + "learning_rate": 2.3401450230718522e-07, + "loss": 0.2525, + "step": 16785 + }, + { + "epoch": 7.968675842429995, + "grad_norm": 0.2662539459407098, + "learning_rate": 2.175346077785102e-07, + "loss": 0.2511, + "step": 16790 + }, + { + "epoch": 7.9710488846701475, + "grad_norm": 0.25487014676673914, + "learning_rate": 2.010547132498352e-07, + "loss": 0.2572, + "step": 16795 + }, + { + "epoch": 7.973421926910299, + "grad_norm": 0.25189643150724506, + "learning_rate": 1.845748187211602e-07, + "loss": 0.2474, + "step": 16800 + }, + { + "epoch": 7.975794969150451, + "grad_norm": 0.2516892915476456, + "learning_rate": 1.6809492419248518e-07, + "loss": 0.2537, + "step": 16805 + }, + { + "epoch": 7.978168011390602, + "grad_norm": 0.26097046330011203, + "learning_rate": 1.5161502966381014e-07, + "loss": 0.2577, + "step": 16810 + }, + { + "epoch": 7.980541053630755, + "grad_norm": 0.25080500567096686, + "learning_rate": 1.3513513513513515e-07, + "loss": 0.2506, + "step": 16815 + }, + { + "epoch": 7.982914095870907, + "grad_norm": 0.2634512258831302, + "learning_rate": 1.1865524060646013e-07, + "loss": 0.253, + "step": 16820 + }, + { + "epoch": 7.985287138111058, + "grad_norm": 0.23887463399275868, + "learning_rate": 1.0217534607778511e-07, + "loss": 0.2483, + "step": 16825 + }, + { + "epoch": 7.98766018035121, + "grad_norm": 0.2546098079701367, + "learning_rate": 8.569545154911008e-08, + "loss": 0.2558, + "step": 16830 + }, + { + "epoch": 7.990033222591363, + "grad_norm": 0.2491687366721281, + "learning_rate": 6.921555702043508e-08, + "loss": 0.2559, + "step": 16835 + }, + { + "epoch": 7.992406264831514, + "grad_norm": 0.24584715578516653, + "learning_rate": 5.2735662491760053e-08, + "loss": 0.2597, + "step": 16840 + }, + { + "epoch": 7.994779307071666, + "grad_norm": 0.24989432024563088, + "learning_rate": 3.6255767963085036e-08, + "loss": 0.2572, + "step": 16845 + }, + { + "epoch": 7.9971523493118175, + "grad_norm": 0.2548770658191364, + "learning_rate": 1.9775873434410022e-08, + "loss": 0.2565, + "step": 16850 + }, + { + "epoch": 7.99952539155197, + "grad_norm": 0.2596859478709254, + "learning_rate": 3.2959789057350033e-09, + "loss": 0.2502, + "step": 16855 + }, + { + "epoch": 8.0, + "step": 16856, + "total_flos": 7655792892706816.0, + "train_loss": 0.36366783275026304, + "train_runtime": 184023.5131, + "train_samples_per_second": 1.466, + "train_steps_per_second": 0.092 + } + ], + "logging_steps": 5, + "max_steps": 16856, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7655792892706816.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}