{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 5959,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0033562678301728476,
      "grad_norm": 3.027224540710449,
      "learning_rate": 4.984896794764222e-05,
      "loss": 2.5909,
      "step": 20
    },
    {
      "epoch": 0.006712535660345695,
      "grad_norm": 2.6114368438720703,
      "learning_rate": 4.968115455613358e-05,
      "loss": 2.367,
      "step": 40
    },
    {
      "epoch": 0.010068803490518544,
      "grad_norm": 18.757535934448242,
      "learning_rate": 4.951334116462494e-05,
      "loss": 2.286,
      "step": 60
    },
    {
      "epoch": 0.01342507132069139,
      "grad_norm": 2.032654047012329,
      "learning_rate": 4.9345527773116296e-05,
      "loss": 2.2961,
      "step": 80
    },
    {
      "epoch": 0.01678133915086424,
      "grad_norm": 1.8827602863311768,
      "learning_rate": 4.9177714381607655e-05,
      "loss": 2.2622,
      "step": 100
    },
    {
      "epoch": 0.020137606981037087,
      "grad_norm": 1.8459181785583496,
      "learning_rate": 4.9009900990099014e-05,
      "loss": 2.2569,
      "step": 120
    },
    {
      "epoch": 0.023493874811209934,
      "grad_norm": 1.9432493448257446,
      "learning_rate": 4.8842087598590366e-05,
      "loss": 2.2069,
      "step": 140
    },
    {
      "epoch": 0.02685014264138278,
      "grad_norm": 1.9693349599838257,
      "learning_rate": 4.8674274207081725e-05,
      "loss": 2.2437,
      "step": 160
    },
    {
      "epoch": 0.03020641047155563,
      "grad_norm": 1.8286640644073486,
      "learning_rate": 4.8506460815573084e-05,
      "loss": 2.1793,
      "step": 180
    },
    {
      "epoch": 0.03356267830172848,
      "grad_norm": 1.883164644241333,
      "learning_rate": 4.833864742406444e-05,
      "loss": 2.2013,
      "step": 200
    },
    {
      "epoch": 0.036918946131901324,
      "grad_norm": 10.042224884033203,
      "learning_rate": 4.817922470213123e-05,
      "loss": 2.2325,
      "step": 220
    },
    {
      "epoch": 0.040275213962074174,
      "grad_norm": 1.909852147102356,
      "learning_rate": 4.801141131062259e-05,
      "loss": 2.2372,
      "step": 240
    },
    {
      "epoch": 0.043631481792247025,
      "grad_norm": 1.8228334188461304,
      "learning_rate": 4.784359791911395e-05,
      "loss": 2.236,
      "step": 260
    },
    {
      "epoch": 0.04698774962241987,
      "grad_norm": 1.8686975240707397,
      "learning_rate": 4.767578452760531e-05,
      "loss": 2.1993,
      "step": 280
    },
    {
      "epoch": 0.05034401745259272,
      "grad_norm": 1.6455596685409546,
      "learning_rate": 4.750797113609666e-05,
      "loss": 2.2244,
      "step": 300
    },
    {
      "epoch": 0.05370028528276556,
      "grad_norm": 1.5980720520019531,
      "learning_rate": 4.734015774458802e-05,
      "loss": 2.1412,
      "step": 320
    },
    {
      "epoch": 0.05705655311293841,
      "grad_norm": 1.7958396673202515,
      "learning_rate": 4.717234435307938e-05,
      "loss": 2.1516,
      "step": 340
    },
    {
      "epoch": 0.06041282094311126,
      "grad_norm": 1.6902369260787964,
      "learning_rate": 4.700453096157074e-05,
      "loss": 2.1842,
      "step": 360
    },
    {
      "epoch": 0.0637690887732841,
      "grad_norm": 1.7754089832305908,
      "learning_rate": 4.6836717570062096e-05,
      "loss": 2.1591,
      "step": 380
    },
    {
      "epoch": 0.06712535660345696,
      "grad_norm": 1.718556523323059,
      "learning_rate": 4.6668904178553455e-05,
      "loss": 2.155,
      "step": 400
    },
    {
      "epoch": 0.0704816244336298,
      "grad_norm": 1.6255844831466675,
      "learning_rate": 4.650109078704481e-05,
      "loss": 2.2235,
      "step": 420
    },
    {
      "epoch": 0.07383789226380265,
      "grad_norm": 1.6990654468536377,
      "learning_rate": 4.6333277395536166e-05,
      "loss": 2.1942,
      "step": 440
    },
    {
      "epoch": 0.0771941600939755,
      "grad_norm": 1.7159547805786133,
      "learning_rate": 4.6165464004027525e-05,
      "loss": 2.2133,
      "step": 460
    },
    {
      "epoch": 0.08055042792414835,
      "grad_norm": 1.8792229890823364,
      "learning_rate": 4.5997650612518884e-05,
      "loss": 2.1534,
      "step": 480
    },
    {
      "epoch": 0.08390669575432119,
      "grad_norm": 1.4530068635940552,
      "learning_rate": 4.582983722101024e-05,
      "loss": 2.1365,
      "step": 500
    },
    {
      "epoch": 0.08726296358449405,
      "grad_norm": 1.5878769159317017,
      "learning_rate": 4.56620238295016e-05,
      "loss": 2.1513,
      "step": 520
    },
    {
      "epoch": 0.09061923141466689,
      "grad_norm": 1.6161599159240723,
      "learning_rate": 4.5494210437992954e-05,
      "loss": 2.1434,
      "step": 540
    },
    {
      "epoch": 0.09397549924483974,
      "grad_norm": 1.5275565385818481,
      "learning_rate": 4.5326397046484306e-05,
      "loss": 2.1995,
      "step": 560
    },
    {
      "epoch": 0.09733176707501258,
      "grad_norm": 1.3771929740905762,
      "learning_rate": 4.5158583654975665e-05,
      "loss": 2.1546,
      "step": 580
    },
    {
      "epoch": 0.10068803490518544,
      "grad_norm": 1.630591630935669,
      "learning_rate": 4.4990770263467024e-05,
      "loss": 2.1317,
      "step": 600
    },
    {
      "epoch": 0.10404430273535828,
      "grad_norm": 1.5199378728866577,
      "learning_rate": 4.482295687195838e-05,
      "loss": 2.116,
      "step": 620
    },
    {
      "epoch": 0.10740057056553112,
      "grad_norm": 1.7431048154830933,
      "learning_rate": 4.465514348044974e-05,
      "loss": 2.1379,
      "step": 640
    },
    {
      "epoch": 0.11075683839570398,
      "grad_norm": 1.6243093013763428,
      "learning_rate": 4.44873300889411e-05,
      "loss": 2.1478,
      "step": 660
    },
    {
      "epoch": 0.11411310622587682,
      "grad_norm": 1.4836941957473755,
      "learning_rate": 4.431951669743245e-05,
      "loss": 2.1522,
      "step": 680
    },
    {
      "epoch": 0.11746937405604967,
      "grad_norm": 1.5157350301742554,
      "learning_rate": 4.415170330592381e-05,
      "loss": 2.104,
      "step": 700
    },
    {
      "epoch": 0.12082564188622252,
      "grad_norm": 1.3875478506088257,
      "learning_rate": 4.398388991441517e-05,
      "loss": 2.1495,
      "step": 720
    },
    {
      "epoch": 0.12418190971639537,
      "grad_norm": 1.4270449876785278,
      "learning_rate": 4.381607652290653e-05,
      "loss": 2.1275,
      "step": 740
    },
    {
      "epoch": 0.1275381775465682,
      "grad_norm": 1.522594690322876,
      "learning_rate": 4.364826313139789e-05,
      "loss": 2.1439,
      "step": 760
    },
    {
      "epoch": 0.13089444537674105,
      "grad_norm": 1.3996539115905762,
      "learning_rate": 4.348044973988925e-05,
      "loss": 2.1384,
      "step": 780
    },
    {
      "epoch": 0.13425071320691392,
      "grad_norm": 1.5052591562271118,
      "learning_rate": 4.33126363483806e-05,
      "loss": 2.1005,
      "step": 800
    },
    {
      "epoch": 0.13760698103708677,
      "grad_norm": 1.4939976930618286,
      "learning_rate": 4.314482295687196e-05,
      "loss": 2.078,
      "step": 820
    },
    {
      "epoch": 0.1409632488672596,
      "grad_norm": 1.5155856609344482,
      "learning_rate": 4.297700956536332e-05,
      "loss": 2.0766,
      "step": 840
    },
    {
      "epoch": 0.14431951669743245,
      "grad_norm": 1.5032541751861572,
      "learning_rate": 4.280919617385468e-05,
      "loss": 2.1383,
      "step": 860
    },
    {
      "epoch": 0.1476757845276053,
      "grad_norm": 1.5815625190734863,
      "learning_rate": 4.2641382782346036e-05,
      "loss": 2.138,
      "step": 880
    },
    {
      "epoch": 0.15103205235777814,
      "grad_norm": 1.5529719591140747,
      "learning_rate": 4.2473569390837395e-05,
      "loss": 2.1252,
      "step": 900
    },
    {
      "epoch": 0.154388320187951,
      "grad_norm": 1.5002542734146118,
      "learning_rate": 4.230575599932875e-05,
      "loss": 2.1158,
      "step": 920
    },
    {
      "epoch": 0.15774458801812385,
      "grad_norm": 1.4248254299163818,
      "learning_rate": 4.2137942607820106e-05,
      "loss": 2.1441,
      "step": 940
    },
    {
      "epoch": 0.1611008558482967,
      "grad_norm": 1.3779915571212769,
      "learning_rate": 4.1970129216311465e-05,
      "loss": 2.1105,
      "step": 960
    },
    {
      "epoch": 0.16445712367846954,
      "grad_norm": 1.439987301826477,
      "learning_rate": 4.1802315824802824e-05,
      "loss": 2.0988,
      "step": 980
    },
    {
      "epoch": 0.16781339150864238,
      "grad_norm": 1.3819688558578491,
      "learning_rate": 4.163450243329418e-05,
      "loss": 2.0594,
      "step": 1000
    },
    {
      "epoch": 0.17116965933881523,
      "grad_norm": 1.4762790203094482,
      "learning_rate": 4.146668904178554e-05,
      "loss": 2.1448,
      "step": 1020
    },
    {
      "epoch": 0.1745259271689881,
      "grad_norm": 1.3910346031188965,
      "learning_rate": 4.1298875650276894e-05,
      "loss": 2.1189,
      "step": 1040
    },
    {
      "epoch": 0.17788219499916094,
      "grad_norm": 1.3728772401809692,
      "learning_rate": 4.1131062258768246e-05,
      "loss": 2.1025,
      "step": 1060
    },
    {
      "epoch": 0.18123846282933379,
      "grad_norm": 1.466235637664795,
      "learning_rate": 4.0963248867259605e-05,
      "loss": 2.1163,
      "step": 1080
    },
    {
      "epoch": 0.18459473065950663,
      "grad_norm": 1.418556809425354,
      "learning_rate": 4.0795435475750964e-05,
      "loss": 2.11,
      "step": 1100
    },
    {
      "epoch": 0.18795099848967947,
      "grad_norm": 1.3588992357254028,
      "learning_rate": 4.062762208424232e-05,
      "loss": 2.1174,
      "step": 1120
    },
    {
      "epoch": 0.19130726631985231,
      "grad_norm": 1.490565538406372,
      "learning_rate": 4.045980869273368e-05,
      "loss": 2.1176,
      "step": 1140
    },
    {
      "epoch": 0.19466353415002516,
      "grad_norm": 1.4399744272232056,
      "learning_rate": 4.029199530122504e-05,
      "loss": 2.1222,
      "step": 1160
    },
    {
      "epoch": 0.19801980198019803,
      "grad_norm": 1.379418134689331,
      "learning_rate": 4.012418190971639e-05,
      "loss": 2.1003,
      "step": 1180
    },
    {
      "epoch": 0.20137606981037087,
      "grad_norm": 1.4052506685256958,
      "learning_rate": 3.995636851820775e-05,
      "loss": 2.0951,
      "step": 1200
    },
    {
      "epoch": 0.20473233764054372,
      "grad_norm": 1.3870118856430054,
      "learning_rate": 3.978855512669911e-05,
      "loss": 2.1201,
      "step": 1220
    },
    {
      "epoch": 0.20808860547071656,
      "grad_norm": 1.334796667098999,
      "learning_rate": 3.962074173519047e-05,
      "loss": 2.1252,
      "step": 1240
    },
    {
      "epoch": 0.2114448733008894,
      "grad_norm": 1.2433503866195679,
      "learning_rate": 3.945292834368183e-05,
      "loss": 2.1048,
      "step": 1260
    },
    {
      "epoch": 0.21480114113106225,
      "grad_norm": 1.422275185585022,
      "learning_rate": 3.928511495217319e-05,
      "loss": 2.0829,
      "step": 1280
    },
    {
      "epoch": 0.21815740896123512,
      "grad_norm": 1.429117202758789,
      "learning_rate": 3.911730156066454e-05,
      "loss": 2.0545,
      "step": 1300
    },
    {
      "epoch": 0.22151367679140796,
      "grad_norm": 1.40293550491333,
      "learning_rate": 3.89494881691559e-05,
      "loss": 2.0176,
      "step": 1320
    },
    {
      "epoch": 0.2248699446215808,
      "grad_norm": 1.2488166093826294,
      "learning_rate": 3.878167477764726e-05,
      "loss": 2.0846,
      "step": 1340
    },
    {
      "epoch": 0.22822621245175365,
      "grad_norm": 1.2881702184677124,
      "learning_rate": 3.861386138613862e-05,
      "loss": 2.069,
      "step": 1360
    },
    {
      "epoch": 0.2315824802819265,
      "grad_norm": 1.3793479204177856,
      "learning_rate": 3.8446047994629976e-05,
      "loss": 2.0808,
      "step": 1380
    },
    {
      "epoch": 0.23493874811209933,
      "grad_norm": 1.3845285177230835,
      "learning_rate": 3.8278234603121335e-05,
      "loss": 2.1008,
      "step": 1400
    },
    {
      "epoch": 0.2382950159422722,
      "grad_norm": 1.4609571695327759,
      "learning_rate": 3.811042121161269e-05,
      "loss": 2.085,
      "step": 1420
    },
    {
      "epoch": 0.24165128377244505,
      "grad_norm": 1.3259775638580322,
      "learning_rate": 3.7942607820104046e-05,
      "loss": 2.0631,
      "step": 1440
    },
    {
      "epoch": 0.2450075516026179,
      "grad_norm": 1.248612880706787,
      "learning_rate": 3.7774794428595405e-05,
      "loss": 2.0908,
      "step": 1460
    },
    {
      "epoch": 0.24836381943279073,
      "grad_norm": 1.4042353630065918,
      "learning_rate": 3.7606981037086764e-05,
      "loss": 2.0705,
      "step": 1480
    },
    {
      "epoch": 0.2517200872629636,
      "grad_norm": 1.2640959024429321,
      "learning_rate": 3.743916764557812e-05,
      "loss": 2.0626,
      "step": 1500
    },
    {
      "epoch": 0.2550763550931364,
      "grad_norm": 1.355383038520813,
      "learning_rate": 3.7271354254069475e-05,
      "loss": 2.1025,
      "step": 1520
    },
    {
      "epoch": 0.25843262292330926,
      "grad_norm": 1.2184921503067017,
      "learning_rate": 3.7103540862560834e-05,
      "loss": 2.0187,
      "step": 1540
    },
    {
      "epoch": 0.2617888907534821,
      "grad_norm": 1.2813303470611572,
      "learning_rate": 3.6935727471052186e-05,
      "loss": 2.087,
      "step": 1560
    },
    {
      "epoch": 0.26514515858365495,
      "grad_norm": 1.3968662023544312,
      "learning_rate": 3.6767914079543545e-05,
      "loss": 2.0643,
      "step": 1580
    },
    {
      "epoch": 0.26850142641382785,
      "grad_norm": 1.337203025817871,
      "learning_rate": 3.6600100688034904e-05,
      "loss": 2.0377,
      "step": 1600
    },
    {
      "epoch": 0.2718576942440007,
      "grad_norm": 1.3844518661499023,
      "learning_rate": 3.643228729652626e-05,
      "loss": 2.0807,
      "step": 1620
    },
    {
      "epoch": 0.27521396207417353,
      "grad_norm": 1.787477731704712,
      "learning_rate": 3.626447390501762e-05,
      "loss": 2.0775,
      "step": 1640
    },
    {
      "epoch": 0.2785702299043464,
      "grad_norm": 1.3247798681259155,
      "learning_rate": 3.609666051350898e-05,
      "loss": 2.0996,
      "step": 1660
    },
    {
      "epoch": 0.2819264977345192,
      "grad_norm": 1.2748069763183594,
      "learning_rate": 3.592884712200033e-05,
      "loss": 2.0737,
      "step": 1680
    },
    {
      "epoch": 0.28528276556469206,
      "grad_norm": 1.346238374710083,
      "learning_rate": 3.576103373049169e-05,
      "loss": 2.1076,
      "step": 1700
    },
    {
      "epoch": 0.2886390333948649,
      "grad_norm": 1.3545295000076294,
      "learning_rate": 3.559322033898305e-05,
      "loss": 2.0663,
      "step": 1720
    },
    {
      "epoch": 0.29199530122503775,
      "grad_norm": 1.3100897073745728,
      "learning_rate": 3.542540694747441e-05,
      "loss": 2.0872,
      "step": 1740
    },
    {
      "epoch": 0.2953515690552106,
      "grad_norm": 1.3519947528839111,
      "learning_rate": 3.525759355596577e-05,
      "loss": 2.0269,
      "step": 1760
    },
    {
      "epoch": 0.29870783688538344,
      "grad_norm": 1.2966337203979492,
      "learning_rate": 3.508978016445713e-05,
      "loss": 2.0784,
      "step": 1780
    },
    {
      "epoch": 0.3020641047155563,
      "grad_norm": 1.3702917098999023,
      "learning_rate": 3.492196677294848e-05,
      "loss": 2.0906,
      "step": 1800
    },
    {
      "epoch": 0.3054203725457291,
      "grad_norm": 1.338409185409546,
      "learning_rate": 3.475415338143984e-05,
      "loss": 2.0899,
      "step": 1820
    },
    {
      "epoch": 0.308776640375902,
      "grad_norm": 1.3070242404937744,
      "learning_rate": 3.45863399899312e-05,
      "loss": 2.0514,
      "step": 1840
    },
    {
      "epoch": 0.31213290820607487,
      "grad_norm": 1.2753440141677856,
      "learning_rate": 3.441852659842256e-05,
      "loss": 2.0809,
      "step": 1860
    },
    {
      "epoch": 0.3154891760362477,
      "grad_norm": 1.3125340938568115,
      "learning_rate": 3.4250713206913916e-05,
      "loss": 2.0372,
      "step": 1880
    },
    {
      "epoch": 0.31884544386642055,
      "grad_norm": 1.3050023317337036,
      "learning_rate": 3.4082899815405275e-05,
      "loss": 2.072,
      "step": 1900
    },
    {
      "epoch": 0.3222017116965934,
      "grad_norm": 1.4283207654953003,
      "learning_rate": 3.391508642389663e-05,
      "loss": 2.0489,
      "step": 1920
    },
    {
      "epoch": 0.32555797952676624,
      "grad_norm": 1.4320636987686157,
      "learning_rate": 3.3747273032387986e-05,
      "loss": 2.1004,
      "step": 1940
    },
    {
      "epoch": 0.3289142473569391,
      "grad_norm": 1.3432669639587402,
      "learning_rate": 3.3579459640879345e-05,
      "loss": 2.0837,
      "step": 1960
    },
    {
      "epoch": 0.3322705151871119,
      "grad_norm": 1.22812819480896,
      "learning_rate": 3.3411646249370704e-05,
      "loss": 2.0651,
      "step": 1980
    },
    {
      "epoch": 0.33562678301728477,
      "grad_norm": 1.3818988800048828,
      "learning_rate": 3.324383285786206e-05,
      "loss": 2.0446,
      "step": 2000
    },
    {
      "epoch": 0.3389830508474576,
      "grad_norm": 1.2890549898147583,
      "learning_rate": 3.3076019466353415e-05,
      "loss": 2.0326,
      "step": 2020
    },
    {
      "epoch": 0.34233931867763046,
      "grad_norm": 1.341260552406311,
      "learning_rate": 3.2908206074844774e-05,
      "loss": 2.0733,
      "step": 2040
    },
    {
      "epoch": 0.3456955865078033,
      "grad_norm": 1.2970625162124634,
      "learning_rate": 3.274039268333613e-05,
      "loss": 2.0749,
      "step": 2060
    },
    {
      "epoch": 0.3490518543379762,
      "grad_norm": 1.3300013542175293,
      "learning_rate": 3.2572579291827485e-05,
      "loss": 2.0283,
      "step": 2080
    },
    {
      "epoch": 0.35240812216814904,
      "grad_norm": 1.372938871383667,
      "learning_rate": 3.2404765900318844e-05,
      "loss": 2.0703,
      "step": 2100
    },
    {
      "epoch": 0.3557643899983219,
      "grad_norm": 1.355000615119934,
      "learning_rate": 3.22369525088102e-05,
      "loss": 2.0641,
      "step": 2120
    },
    {
      "epoch": 0.3591206578284947,
      "grad_norm": 1.3288871049880981,
      "learning_rate": 3.206913911730156e-05,
      "loss": 2.0169,
      "step": 2140
    },
    {
      "epoch": 0.36247692565866757,
      "grad_norm": 1.2701308727264404,
      "learning_rate": 3.190132572579292e-05,
      "loss": 2.1025,
      "step": 2160
    },
    {
      "epoch": 0.3658331934888404,
      "grad_norm": 1.3620569705963135,
      "learning_rate": 3.173351233428428e-05,
      "loss": 2.0621,
      "step": 2180
    },
    {
      "epoch": 0.36918946131901326,
      "grad_norm": 1.1960408687591553,
      "learning_rate": 3.156569894277563e-05,
      "loss": 2.0452,
      "step": 2200
    },
    {
      "epoch": 0.3725457291491861,
      "grad_norm": 1.3412033319473267,
      "learning_rate": 3.139788555126699e-05,
      "loss": 2.0655,
      "step": 2220
    },
    {
      "epoch": 0.37590199697935894,
      "grad_norm": 1.3558080196380615,
      "learning_rate": 3.123007215975835e-05,
      "loss": 2.0277,
      "step": 2240
    },
    {
      "epoch": 0.3792582648095318,
      "grad_norm": 1.2877964973449707,
      "learning_rate": 3.106225876824971e-05,
      "loss": 2.0384,
      "step": 2260
    },
    {
      "epoch": 0.38261453263970463,
      "grad_norm": 1.3294323682785034,
      "learning_rate": 3.089444537674107e-05,
      "loss": 1.9997,
      "step": 2280
    },
    {
      "epoch": 0.3859708004698775,
      "grad_norm": 1.2727768421173096,
      "learning_rate": 3.072663198523243e-05,
      "loss": 1.9849,
      "step": 2300
    },
    {
      "epoch": 0.3893270683000503,
      "grad_norm": 1.2134991884231567,
      "learning_rate": 3.055881859372378e-05,
      "loss": 2.0725,
      "step": 2320
    },
    {
      "epoch": 0.3926833361302232,
      "grad_norm": 1.2667819261550903,
      "learning_rate": 3.0391005202215138e-05,
      "loss": 2.0429,
      "step": 2340
    },
    {
      "epoch": 0.39603960396039606,
      "grad_norm": 1.3160877227783203,
      "learning_rate": 3.0223191810706497e-05,
      "loss": 2.0154,
      "step": 2360
    },
    {
      "epoch": 0.3993958717905689,
      "grad_norm": 1.2551796436309814,
      "learning_rate": 3.0055378419197856e-05,
      "loss": 2.0383,
      "step": 2380
    },
    {
      "epoch": 0.40275213962074174,
      "grad_norm": 1.3122833967208862,
      "learning_rate": 2.988756502768921e-05,
      "loss": 2.0263,
      "step": 2400
    },
    {
      "epoch": 0.4061084074509146,
      "grad_norm": 1.269384503364563,
      "learning_rate": 2.971975163618057e-05,
      "loss": 2.1241,
      "step": 2420
    },
    {
      "epoch": 0.40946467528108743,
      "grad_norm": 1.3302291631698608,
      "learning_rate": 2.955193824467193e-05,
      "loss": 2.0616,
      "step": 2440
    },
    {
      "epoch": 0.4128209431112603,
      "grad_norm": 1.2562564611434937,
      "learning_rate": 2.9384124853163285e-05,
      "loss": 2.0613,
      "step": 2460
    },
    {
      "epoch": 0.4161772109414331,
      "grad_norm": 1.2373179197311401,
      "learning_rate": 2.9216311461654644e-05,
      "loss": 2.0202,
      "step": 2480
    },
    {
      "epoch": 0.41953347877160596,
      "grad_norm": 1.3266092538833618,
      "learning_rate": 2.9048498070145996e-05,
      "loss": 2.0174,
      "step": 2500
    },
    {
      "epoch": 0.4228897466017788,
      "grad_norm": 1.27751624584198,
      "learning_rate": 2.8880684678637355e-05,
      "loss": 2.0549,
      "step": 2520
    },
    {
      "epoch": 0.42624601443195165,
      "grad_norm": 1.2350589036941528,
      "learning_rate": 2.871287128712871e-05,
      "loss": 2.0126,
      "step": 2540
    },
    {
      "epoch": 0.4296022822621245,
      "grad_norm": 1.3619853258132935,
      "learning_rate": 2.854505789562007e-05,
      "loss": 1.9725,
      "step": 2560
    },
    {
      "epoch": 0.4329585500922974,
      "grad_norm": 1.354687213897705,
      "learning_rate": 2.837724450411143e-05,
      "loss": 2.0492,
      "step": 2580
    },
    {
      "epoch": 0.43631481792247023,
      "grad_norm": 1.3418916463851929,
      "learning_rate": 2.8209431112602784e-05,
      "loss": 1.976,
      "step": 2600
    },
    {
      "epoch": 0.4396710857526431,
      "grad_norm": 1.232704520225525,
      "learning_rate": 2.8041617721094143e-05,
      "loss": 2.0035,
      "step": 2620
    },
    {
      "epoch": 0.4430273535828159,
      "grad_norm": 1.3459244966506958,
      "learning_rate": 2.7873804329585502e-05,
      "loss": 2.0612,
      "step": 2640
    },
    {
      "epoch": 0.44638362141298876,
      "grad_norm": 1.2673430442810059,
      "learning_rate": 2.7705990938076858e-05,
      "loss": 2.0134,
      "step": 2660
    },
    {
      "epoch": 0.4497398892431616,
      "grad_norm": 1.229880928993225,
      "learning_rate": 2.7538177546568216e-05,
      "loss": 2.0324,
      "step": 2680
    },
    {
      "epoch": 0.45309615707333445,
      "grad_norm": 1.3053526878356934,
      "learning_rate": 2.7370364155059575e-05,
      "loss": 2.0031,
      "step": 2700
    },
    {
      "epoch": 0.4564524249035073,
      "grad_norm": 1.3416264057159424,
      "learning_rate": 2.720255076355093e-05,
      "loss": 2.0513,
      "step": 2720
    },
    {
      "epoch": 0.45980869273368014,
      "grad_norm": 1.3494229316711426,
      "learning_rate": 2.703473737204229e-05,
      "loss": 2.0527,
      "step": 2740
    },
    {
      "epoch": 0.463164960563853,
      "grad_norm": 1.2861367464065552,
      "learning_rate": 2.686692398053365e-05,
      "loss": 2.0327,
      "step": 2760
    },
    {
      "epoch": 0.4665212283940258,
      "grad_norm": 1.260968565940857,
      "learning_rate": 2.6699110589025004e-05,
      "loss": 2.0399,
      "step": 2780
    },
    {
      "epoch": 0.46987749622419867,
      "grad_norm": 1.4496228694915771,
      "learning_rate": 2.6531297197516363e-05,
      "loss": 2.0532,
      "step": 2800
    },
    {
      "epoch": 0.47323376405437156,
      "grad_norm": 1.2266364097595215,
      "learning_rate": 2.6363483806007722e-05,
      "loss": 2.0196,
      "step": 2820
    },
    {
      "epoch": 0.4765900318845444,
      "grad_norm": 1.4289458990097046,
      "learning_rate": 2.6195670414499078e-05,
      "loss": 2.0379,
      "step": 2840
    },
    {
      "epoch": 0.47994629971471725,
      "grad_norm": 1.3267526626586914,
      "learning_rate": 2.6027857022990437e-05,
      "loss": 2.0068,
      "step": 2860
    },
    {
      "epoch": 0.4833025675448901,
      "grad_norm": 1.308477520942688,
      "learning_rate": 2.5860043631481796e-05,
      "loss": 2.0204,
      "step": 2880
    },
    {
      "epoch": 0.48665883537506294,
      "grad_norm": 1.2613425254821777,
      "learning_rate": 2.569223023997315e-05,
      "loss": 2.0319,
      "step": 2900
    },
    {
      "epoch": 0.4900151032052358,
      "grad_norm": 1.2851049900054932,
      "learning_rate": 2.552441684846451e-05,
      "loss": 2.044,
      "step": 2920
    },
    {
      "epoch": 0.4933713710354086,
      "grad_norm": 1.2507027387619019,
      "learning_rate": 2.535660345695587e-05,
      "loss": 1.9964,
      "step": 2940
    },
    {
      "epoch": 0.49672763886558147,
      "grad_norm": 1.3747198581695557,
      "learning_rate": 2.5188790065447225e-05,
      "loss": 1.9995,
      "step": 2960
    },
    {
      "epoch": 0.5000839066957543,
      "grad_norm": 1.2839736938476562,
      "learning_rate": 2.5020976673938584e-05,
      "loss": 2.0137,
      "step": 2980
    },
    {
      "epoch": 0.5034401745259272,
      "grad_norm": 1.428585410118103,
      "learning_rate": 2.485316328242994e-05,
      "loss": 2.0149,
      "step": 3000
    },
    {
      "epoch": 0.5067964423561,
      "grad_norm": 1.3017884492874146,
      "learning_rate": 2.46853498909213e-05,
      "loss": 2.0023,
      "step": 3020
    },
    {
      "epoch": 0.5101527101862728,
      "grad_norm": 1.2209047079086304,
      "learning_rate": 2.4517536499412654e-05,
      "loss": 2.0092,
      "step": 3040
    },
    {
      "epoch": 0.5135089780164457,
      "grad_norm": 1.2542091608047485,
      "learning_rate": 2.4349723107904013e-05,
      "loss": 2.0137,
      "step": 3060
    },
    {
      "epoch": 0.5168652458466185,
      "grad_norm": 1.1981834173202515,
      "learning_rate": 2.4181909716395372e-05,
      "loss": 2.0013,
      "step": 3080
    },
    {
      "epoch": 0.5202215136767914,
      "grad_norm": 1.3641618490219116,
      "learning_rate": 2.4014096324886727e-05,
      "loss": 2.0412,
      "step": 3100
    },
    {
      "epoch": 0.5235777815069642,
      "grad_norm": 1.2675738334655762,
      "learning_rate": 2.3846282933378083e-05,
      "loss": 2.0088,
      "step": 3120
    },
    {
      "epoch": 0.5269340493371371,
      "grad_norm": 1.2499597072601318,
      "learning_rate": 2.3678469541869442e-05,
      "loss": 1.9926,
      "step": 3140
    },
    {
      "epoch": 0.5302903171673099,
      "grad_norm": 1.260705828666687,
      "learning_rate": 2.3510656150360798e-05,
      "loss": 1.9775,
      "step": 3160
    },
    {
      "epoch": 0.5336465849974829,
      "grad_norm": 1.2815592288970947,
      "learning_rate": 2.3342842758852156e-05,
      "loss": 1.9964,
      "step": 3180
    },
    {
      "epoch": 0.5370028528276557,
      "grad_norm": 1.191841721534729,
      "learning_rate": 2.3175029367343515e-05,
      "loss": 1.9858,
      "step": 3200
    },
    {
      "epoch": 0.5403591206578285,
      "grad_norm": 1.2903523445129395,
      "learning_rate": 2.300721597583487e-05,
      "loss": 2.0101,
      "step": 3220
    },
    {
      "epoch": 0.5437153884880014,
      "grad_norm": 1.257596731185913,
      "learning_rate": 2.283940258432623e-05,
      "loss": 1.9916,
      "step": 3240
    },
    {
      "epoch": 0.5470716563181742,
      "grad_norm": 1.2740247249603271,
      "learning_rate": 2.267158919281759e-05,
      "loss": 1.9724,
      "step": 3260
    },
    {
      "epoch": 0.5504279241483471,
      "grad_norm": 1.2834815979003906,
      "learning_rate": 2.2503775801308944e-05,
      "loss": 2.0144,
      "step": 3280
    },
    {
      "epoch": 0.5537841919785199,
      "grad_norm": 1.2792032957077026,
      "learning_rate": 2.2335962409800303e-05,
      "loss": 2.0241,
      "step": 3300
    },
    {
      "epoch": 0.5571404598086928,
      "grad_norm": 1.3966797590255737,
      "learning_rate": 2.2168149018291662e-05,
      "loss": 2.0202,
      "step": 3320
    },
    {
      "epoch": 0.5604967276388656,
      "grad_norm": 1.2389239072799683,
      "learning_rate": 2.2000335626783018e-05,
      "loss": 2.0221,
      "step": 3340
    },
    {
      "epoch": 0.5638529954690384,
      "grad_norm": 1.2616690397262573,
      "learning_rate": 2.1832522235274374e-05,
      "loss": 2.0266,
      "step": 3360
    },
    {
      "epoch": 0.5672092632992113,
      "grad_norm": 1.2490557432174683,
      "learning_rate": 2.1664708843765732e-05,
      "loss": 2.011,
      "step": 3380
    },
    {
      "epoch": 0.5705655311293841,
      "grad_norm": 1.2576720714569092,
      "learning_rate": 2.149689545225709e-05,
      "loss": 2.0368,
      "step": 3400
    },
    {
      "epoch": 0.573921798959557,
      "grad_norm": 1.2379933595657349,
      "learning_rate": 2.1329082060748447e-05,
      "loss": 1.9964,
      "step": 3420
    },
    {
      "epoch": 0.5772780667897298,
      "grad_norm": 1.259509801864624,
      "learning_rate": 2.1161268669239806e-05,
      "loss": 1.9827,
      "step": 3440
    },
    {
      "epoch": 0.5806343346199027,
      "grad_norm": 1.2831110954284668,
      "learning_rate": 2.0993455277731165e-05,
      "loss": 2.0438,
      "step": 3460
    },
    {
      "epoch": 0.5839906024500755,
      "grad_norm": 1.331018090248108,
      "learning_rate": 2.082564188622252e-05,
      "loss": 2.0088,
      "step": 3480
    },
    {
      "epoch": 0.5873468702802483,
      "grad_norm": 1.2574256658554077,
      "learning_rate": 2.065782849471388e-05,
      "loss": 2.0474,
      "step": 3500
    },
    {
      "epoch": 0.5907031381104212,
      "grad_norm": 1.283894658088684,
      "learning_rate": 2.049001510320524e-05,
      "loss": 2.0543,
      "step": 3520
    },
    {
      "epoch": 0.594059405940594,
      "grad_norm": 1.2675976753234863,
      "learning_rate": 2.0322201711696594e-05,
      "loss": 2.0182,
      "step": 3540
    },
    {
      "epoch": 0.5974156737707669,
      "grad_norm": 1.2579065561294556,
      "learning_rate": 2.0154388320187953e-05,
      "loss": 1.9765,
      "step": 3560
    },
    {
      "epoch": 0.6007719416009397,
      "grad_norm": 1.2290009260177612,
      "learning_rate": 1.9986574928679312e-05,
      "loss": 1.9856,
      "step": 3580
    },
    {
      "epoch": 0.6041282094311126,
      "grad_norm": 1.3220741748809814,
      "learning_rate": 1.9818761537170667e-05,
      "loss": 1.9789,
      "step": 3600
    },
    {
      "epoch": 0.6074844772612854,
      "grad_norm": 1.3620175123214722,
      "learning_rate": 1.9650948145662023e-05,
      "loss": 1.973,
      "step": 3620
    },
    {
      "epoch": 0.6108407450914582,
      "grad_norm": 1.3105921745300293,
      "learning_rate": 1.9483134754153382e-05,
      "loss": 2.0177,
      "step": 3640
    },
    {
      "epoch": 0.6141970129216311,
      "grad_norm": 1.303818941116333,
      "learning_rate": 1.931532136264474e-05,
      "loss": 2.0125,
      "step": 3660
    },
    {
      "epoch": 0.617553280751804,
      "grad_norm": 1.2042673826217651,
      "learning_rate": 1.9147507971136096e-05,
      "loss": 2.0166,
      "step": 3680
    },
    {
      "epoch": 0.6209095485819769,
      "grad_norm": 1.2778286933898926,
      "learning_rate": 1.8979694579627455e-05,
      "loss": 1.9743,
      "step": 3700
    },
    {
      "epoch": 0.6242658164121497,
      "grad_norm": 1.3580365180969238,
      "learning_rate": 1.8811881188118814e-05,
      "loss": 2.0165,
      "step": 3720
    },
    {
      "epoch": 0.6276220842423226,
      "grad_norm": 1.2921738624572754,
      "learning_rate": 1.864406779661017e-05,
      "loss": 1.96,
      "step": 3740
    },
    {
      "epoch": 0.6309783520724954,
      "grad_norm": 1.347383737564087,
      "learning_rate": 1.847625440510153e-05,
      "loss": 1.9895,
      "step": 3760
    },
    {
      "epoch": 0.6343346199026683,
      "grad_norm": 1.2058087587356567,
      "learning_rate": 1.8308441013592888e-05,
      "loss": 1.9735,
      "step": 3780
    },
    {
      "epoch": 0.6376908877328411,
      "grad_norm": 1.3013418912887573,
      "learning_rate": 1.8140627622084243e-05,
      "loss": 1.96,
      "step": 3800
    },
    {
      "epoch": 0.641047155563014,
      "grad_norm": 1.2196553945541382,
      "learning_rate": 1.7972814230575602e-05,
      "loss": 1.9808,
      "step": 3820
    },
    {
      "epoch": 0.6444034233931868,
      "grad_norm": 1.2661124467849731,
      "learning_rate": 1.7805000839066958e-05,
      "loss": 2.0022,
      "step": 3840
    },
    {
      "epoch": 0.6477596912233596,
      "grad_norm": 1.3183765411376953,
      "learning_rate": 1.7637187447558313e-05,
      "loss": 1.9696,
      "step": 3860
    },
    {
      "epoch": 0.6511159590535325,
      "grad_norm": 1.2164355516433716,
      "learning_rate": 1.7469374056049672e-05,
      "loss": 1.9387,
      "step": 3880
    },
    {
      "epoch": 0.6544722268837053,
      "grad_norm": 1.2561441659927368,
      "learning_rate": 1.730156066454103e-05,
      "loss": 2.0252,
      "step": 3900
    },
    {
      "epoch": 0.6578284947138782,
      "grad_norm": 1.240861177444458,
      "learning_rate": 1.7133747273032387e-05,
      "loss": 1.9737,
      "step": 3920
    },
    {
      "epoch": 0.661184762544051,
      "grad_norm": 1.2271203994750977,
      "learning_rate": 1.6965933881523746e-05,
      "loss": 1.991,
      "step": 3940
    },
    {
      "epoch": 0.6645410303742239,
      "grad_norm": 1.224165678024292,
      "learning_rate": 1.6798120490015105e-05,
      "loss": 1.9771,
      "step": 3960
    },
    {
      "epoch": 0.6678972982043967,
      "grad_norm": 1.222956657409668,
      "learning_rate": 1.663030709850646e-05,
      "loss": 1.9659,
      "step": 3980
    },
    {
      "epoch": 0.6712535660345695,
      "grad_norm": 1.2539464235305786,
      "learning_rate": 1.646249370699782e-05,
      "loss": 1.9811,
      "step": 4000
    },
    {
      "epoch": 0.6746098338647424,
      "grad_norm": 1.272801160812378,
      "learning_rate": 1.629468031548918e-05,
      "loss": 2.0171,
      "step": 4020
    },
    {
      "epoch": 0.6779661016949152,
      "grad_norm": 1.2539173364639282,
      "learning_rate": 1.6126866923980534e-05,
      "loss": 1.953,
      "step": 4040
    },
    {
      "epoch": 0.6813223695250881,
      "grad_norm": 1.2918198108673096,
      "learning_rate": 1.5959053532471893e-05,
      "loss": 1.9742,
      "step": 4060
    },
    {
      "epoch": 0.6846786373552609,
      "grad_norm": 1.3085472583770752,
      "learning_rate": 1.5791240140963252e-05,
      "loss": 1.9693,
      "step": 4080
    },
    {
      "epoch": 0.6880349051854338,
      "grad_norm": 1.2754298448562622,
      "learning_rate": 1.5623426749454607e-05,
      "loss": 1.9819,
      "step": 4100
    },
    {
      "epoch": 0.6913911730156066,
      "grad_norm": 1.3267842531204224,
      "learning_rate": 1.5455613357945963e-05,
      "loss": 1.9549,
      "step": 4120
    },
    {
      "epoch": 0.6947474408457794,
      "grad_norm": 1.1842241287231445,
      "learning_rate": 1.5287799966437322e-05,
      "loss": 1.9816,
      "step": 4140
    },
    {
      "epoch": 0.6981037086759524,
      "grad_norm": 1.2266135215759277,
      "learning_rate": 1.511998657492868e-05,
      "loss": 1.956,
      "step": 4160
    },
    {
      "epoch": 0.7014599765061252,
      "grad_norm": 1.2384594678878784,
      "learning_rate": 1.4952173183420038e-05,
      "loss": 1.9761,
      "step": 4180
    },
    {
      "epoch": 0.7048162443362981,
      "grad_norm": 1.2703733444213867,
      "learning_rate": 1.4784359791911395e-05,
      "loss": 1.9602,
      "step": 4200
    },
    {
      "epoch": 0.7081725121664709,
      "grad_norm": 1.3347383737564087,
      "learning_rate": 1.4616546400402753e-05,
      "loss": 1.9863,
      "step": 4220
    },
    {
      "epoch": 0.7115287799966438,
      "grad_norm": 1.2556087970733643,
      "learning_rate": 1.4448733008894112e-05,
      "loss": 1.9937,
      "step": 4240
    },
    {
      "epoch": 0.7148850478268166,
      "grad_norm": 1.2546402215957642,
      "learning_rate": 1.4280919617385469e-05,
      "loss": 1.9428,
      "step": 4260
    },
    {
      "epoch": 0.7182413156569895,
      "grad_norm": 1.20867121219635,
      "learning_rate": 1.4113106225876826e-05,
      "loss": 1.9702,
      "step": 4280
    },
    {
      "epoch": 0.7215975834871623,
      "grad_norm": 1.2617672681808472,
      "learning_rate": 1.3945292834368185e-05,
      "loss": 1.9688,
      "step": 4300
    },
    {
      "epoch": 0.7249538513173351,
      "grad_norm": 1.306674599647522,
      "learning_rate": 1.3777479442859542e-05,
      "loss": 1.9497,
      "step": 4320
    },
    {
      "epoch": 0.728310119147508,
      "grad_norm": 1.3713186979293823,
      "learning_rate": 1.3609666051350898e-05,
      "loss": 2.0179,
      "step": 4340
    },
    {
      "epoch": 0.7316663869776808,
      "grad_norm": 1.1903387308120728,
      "learning_rate": 1.3441852659842255e-05,
      "loss": 1.9861,
      "step": 4360
    },
    {
      "epoch": 0.7350226548078537,
      "grad_norm": 1.2360427379608154,
      "learning_rate": 1.3274039268333612e-05,
      "loss": 1.9742,
      "step": 4380
    },
    {
      "epoch": 0.7383789226380265,
      "grad_norm": 1.1744493246078491,
      "learning_rate": 1.3106225876824971e-05,
      "loss": 2.0021,
      "step": 4400
    },
    {
      "epoch": 0.7417351904681994,
      "grad_norm": 1.25652015209198,
      "learning_rate": 1.2938412485316329e-05,
      "loss": 2.0034,
      "step": 4420
    },
    {
      "epoch": 0.7450914582983722,
      "grad_norm": 1.2795733213424683,
      "learning_rate": 1.2770599093807686e-05,
      "loss": 2.0015,
      "step": 4440
    },
    {
      "epoch": 0.748447726128545,
      "grad_norm": 1.2552342414855957,
      "learning_rate": 1.2602785702299045e-05,
      "loss": 1.9703,
      "step": 4460
    },
    {
      "epoch": 0.7518039939587179,
      "grad_norm": 1.196215271949768,
      "learning_rate": 1.2434972310790402e-05,
      "loss": 1.9434,
      "step": 4480
    },
    {
      "epoch": 0.7551602617888907,
      "grad_norm": 1.2752282619476318,
      "learning_rate": 1.226715891928176e-05,
      "loss": 2.0011,
      "step": 4500
    },
    {
      "epoch": 0.7585165296190636,
      "grad_norm": 1.361426591873169,
      "learning_rate": 1.2099345527773117e-05,
      "loss": 2.001,
      "step": 4520
    },
    {
      "epoch": 0.7618727974492364,
      "grad_norm": 1.295401930809021,
      "learning_rate": 1.1931532136264474e-05,
      "loss": 1.9785,
      "step": 4540
    },
    {
      "epoch": 0.7652290652794093,
      "grad_norm": 1.1889725923538208,
      "learning_rate": 1.1763718744755831e-05,
      "loss": 2.0203,
      "step": 4560
    },
    {
      "epoch": 0.7685853331095821,
      "grad_norm": 1.2194198369979858,
      "learning_rate": 1.159590535324719e-05,
      "loss": 1.954,
      "step": 4580
    },
    {
      "epoch": 0.771941600939755,
      "grad_norm": 1.1998612880706787,
      "learning_rate": 1.1428091961738547e-05,
      "loss": 1.9129,
      "step": 4600
    },
    {
      "epoch": 0.7752978687699278,
      "grad_norm": 1.1526966094970703,
      "learning_rate": 1.1260278570229905e-05,
      "loss": 1.941,
      "step": 4620
    },
    {
      "epoch": 0.7786541366001006,
      "grad_norm": 1.2833491563796997,
      "learning_rate": 1.1092465178721262e-05,
      "loss": 1.9606,
      "step": 4640
    },
    {
      "epoch": 0.7820104044302736,
      "grad_norm": 1.3405060768127441,
      "learning_rate": 1.0924651787212619e-05,
      "loss": 1.9828,
      "step": 4660
    },
    {
      "epoch": 0.7853666722604464,
      "grad_norm": 1.2940865755081177,
      "learning_rate": 1.0756838395703978e-05,
      "loss": 1.9309,
      "step": 4680
    },
    {
      "epoch": 0.7887229400906193,
      "grad_norm": 1.3541054725646973,
      "learning_rate": 1.0589025004195335e-05,
      "loss": 1.9213,
      "step": 4700
    },
    {
      "epoch": 0.7920792079207921,
      "grad_norm": 1.2736974954605103,
      "learning_rate": 1.0421211612686693e-05,
      "loss": 1.9526,
      "step": 4720
    },
    {
      "epoch": 0.795435475750965,
      "grad_norm": 1.2431901693344116,
      "learning_rate": 1.0253398221178052e-05,
      "loss": 1.9348,
      "step": 4740
    },
    {
      "epoch": 0.7987917435811378,
      "grad_norm": 1.245354175567627,
      "learning_rate": 1.0085584829669407e-05,
      "loss": 1.9294,
      "step": 4760
    },
    {
      "epoch": 0.8021480114113106,
      "grad_norm": 1.2285710573196411,
      "learning_rate": 9.917771438160766e-06,
      "loss": 1.9766,
      "step": 4780
    },
    {
      "epoch": 0.8055042792414835,
      "grad_norm": 1.1929337978363037,
      "learning_rate": 9.749958046652123e-06,
      "loss": 1.9265,
      "step": 4800
    },
    {
      "epoch": 0.8088605470716563,
      "grad_norm": 1.3306503295898438,
      "learning_rate": 9.58214465514348e-06,
      "loss": 1.966,
      "step": 4820
    },
    {
      "epoch": 0.8122168149018292,
      "grad_norm": 1.203856110572815,
      "learning_rate": 9.41433126363484e-06,
      "loss": 1.9775,
      "step": 4840
    },
    {
      "epoch": 0.815573082732002,
      "grad_norm": 1.2907806634902954,
      "learning_rate": 9.246517872126197e-06,
      "loss": 1.9544,
      "step": 4860
    },
    {
      "epoch": 0.8189293505621749,
      "grad_norm": 1.36453378200531,
      "learning_rate": 9.078704480617552e-06,
      "loss": 1.935,
      "step": 4880
    },
    {
      "epoch": 0.8222856183923477,
      "grad_norm": 1.228378415107727,
      "learning_rate": 8.910891089108911e-06,
      "loss": 1.9942,
      "step": 4900
    },
    {
      "epoch": 0.8256418862225205,
      "grad_norm": 1.2804518938064575,
      "learning_rate": 8.743077697600269e-06,
      "loss": 1.9865,
      "step": 4920
    },
    {
      "epoch": 0.8289981540526934,
      "grad_norm": 1.2092355489730835,
      "learning_rate": 8.575264306091626e-06,
      "loss": 1.9931,
      "step": 4940
    },
    {
      "epoch": 0.8323544218828662,
      "grad_norm": 1.248852252960205,
      "learning_rate": 8.407450914582985e-06,
      "loss": 1.9831,
      "step": 4960
    },
    {
      "epoch": 0.8357106897130391,
      "grad_norm": 1.3584181070327759,
      "learning_rate": 8.239637523074342e-06,
      "loss": 1.9706,
      "step": 4980
    },
    {
      "epoch": 0.8390669575432119,
      "grad_norm": 1.250568151473999,
      "learning_rate": 8.0718241315657e-06,
      "loss": 1.9449,
      "step": 5000
    },
    {
      "epoch": 0.8424232253733848,
      "grad_norm": 1.3448213338851929,
      "learning_rate": 7.904010740057057e-06,
      "loss": 1.9221,
      "step": 5020
    },
    {
      "epoch": 0.8457794932035576,
      "grad_norm": 1.2519199848175049,
      "learning_rate": 7.736197348548414e-06,
      "loss": 1.9444,
      "step": 5040
    },
    {
      "epoch": 0.8491357610337305,
      "grad_norm": 1.2377424240112305,
      "learning_rate": 7.568383957039772e-06,
      "loss": 1.9438,
      "step": 5060
    },
    {
      "epoch": 0.8524920288639033,
      "grad_norm": 1.3130027055740356,
      "learning_rate": 7.40057056553113e-06,
      "loss": 1.9791,
      "step": 5080
    },
    {
      "epoch": 0.8558482966940761,
      "grad_norm": 1.2841044664382935,
      "learning_rate": 7.232757174022488e-06,
      "loss": 1.9539,
      "step": 5100
    },
    {
      "epoch": 0.859204564524249,
      "grad_norm": 1.2105425596237183,
      "learning_rate": 7.064943782513845e-06,
      "loss": 1.9738,
      "step": 5120
    },
    {
      "epoch": 0.8625608323544219,
      "grad_norm": 1.2560657262802124,
      "learning_rate": 6.897130391005202e-06,
      "loss": 1.9486,
      "step": 5140
    },
    {
      "epoch": 0.8659171001845948,
      "grad_norm": 1.2118816375732422,
      "learning_rate": 6.72931699949656e-06,
      "loss": 1.9107,
      "step": 5160
    },
    {
      "epoch": 0.8692733680147676,
      "grad_norm": 1.2164198160171509,
      "learning_rate": 6.561503607987918e-06,
      "loss": 2.0168,
      "step": 5180
    },
    {
      "epoch": 0.8726296358449405,
      "grad_norm": 1.358729362487793,
      "learning_rate": 6.393690216479275e-06,
      "loss": 1.9401,
      "step": 5200
    },
    {
      "epoch": 0.8759859036751133,
      "grad_norm": 1.2905333042144775,
      "learning_rate": 6.225876824970633e-06,
      "loss": 1.9235,
      "step": 5220
    },
    {
      "epoch": 0.8793421715052862,
      "grad_norm": 1.2965201139450073,
      "learning_rate": 6.058063433461991e-06,
      "loss": 1.9166,
      "step": 5240
    },
    {
      "epoch": 0.882698439335459,
      "grad_norm": 1.3677794933319092,
      "learning_rate": 5.890250041953348e-06,
      "loss": 1.9327,
      "step": 5260
    },
    {
      "epoch": 0.8860547071656318,
      "grad_norm": 1.312015175819397,
      "learning_rate": 5.722436650444705e-06,
      "loss": 1.9032,
      "step": 5280
    },
    {
      "epoch": 0.8894109749958047,
      "grad_norm": 1.2666916847229004,
      "learning_rate": 5.554623258936063e-06,
      "loss": 1.9014,
      "step": 5300
    },
    {
      "epoch": 0.8927672428259775,
      "grad_norm": 1.2646892070770264,
      "learning_rate": 5.3868098674274214e-06,
      "loss": 1.9586,
      "step": 5320
    },
    {
      "epoch": 0.8961235106561504,
      "grad_norm": 1.2514413595199585,
      "learning_rate": 5.218996475918779e-06,
      "loss": 1.959,
      "step": 5340
    },
    {
      "epoch": 0.8994797784863232,
      "grad_norm": 1.28076171875,
      "learning_rate": 5.051183084410136e-06,
      "loss": 1.9393,
      "step": 5360
    },
    {
      "epoch": 0.902836046316496,
      "grad_norm": 1.3480361700057983,
      "learning_rate": 4.883369692901494e-06,
      "loss": 1.9074,
      "step": 5380
    },
    {
      "epoch": 0.9061923141466689,
      "grad_norm": 1.3494782447814941,
      "learning_rate": 4.715556301392851e-06,
      "loss": 1.9162,
      "step": 5400
    },
    {
      "epoch": 0.9095485819768417,
      "grad_norm": 1.3703207969665527,
      "learning_rate": 4.547742909884209e-06,
      "loss": 1.9728,
      "step": 5420
    },
    {
      "epoch": 0.9129048498070146,
      "grad_norm": 1.2115719318389893,
      "learning_rate": 4.379929518375567e-06,
      "loss": 1.9381,
      "step": 5440
    },
    {
      "epoch": 0.9162611176371874,
      "grad_norm": 1.2599093914031982,
      "learning_rate": 4.212116126866924e-06,
      "loss": 1.9405,
      "step": 5460
    },
    {
      "epoch": 0.9196173854673603,
      "grad_norm": 1.1931716203689575,
      "learning_rate": 4.044302735358282e-06,
      "loss": 1.9205,
      "step": 5480
    },
    {
      "epoch": 0.9229736532975331,
      "grad_norm": 1.2630369663238525,
      "learning_rate": 3.876489343849639e-06,
      "loss": 1.9257,
      "step": 5500
    },
    {
      "epoch": 0.926329921127706,
      "grad_norm": 1.26536226272583,
      "learning_rate": 3.7086759523409966e-06,
      "loss": 1.9136,
      "step": 5520
    },
    {
      "epoch": 0.9296861889578788,
      "grad_norm": 1.25338876247406,
      "learning_rate": 3.5408625608323547e-06,
      "loss": 1.875,
      "step": 5540
    },
    {
      "epoch": 0.9330424567880516,
      "grad_norm": 1.2348542213439941,
      "learning_rate": 3.3730491693237124e-06,
      "loss": 1.959,
      "step": 5560
    },
    {
      "epoch": 0.9363987246182245,
      "grad_norm": 1.3011400699615479,
      "learning_rate": 3.2052357778150696e-06,
      "loss": 1.9571,
      "step": 5580
    },
    {
      "epoch": 0.9397549924483973,
      "grad_norm": 1.343929409980774,
      "learning_rate": 3.0374223863064273e-06,
      "loss": 1.9492,
      "step": 5600
    },
    {
      "epoch": 0.9431112602785703,
      "grad_norm": 1.281829833984375,
      "learning_rate": 2.869608994797785e-06,
      "loss": 1.9429,
      "step": 5620
    },
    {
      "epoch": 0.9464675281087431,
      "grad_norm": 1.1878955364227295,
      "learning_rate": 2.7017956032891427e-06,
      "loss": 1.9288,
      "step": 5640
    },
    {
      "epoch": 0.949823795938916,
      "grad_norm": 1.2426503896713257,
      "learning_rate": 2.5339822117805e-06,
      "loss": 1.9498,
      "step": 5660
    },
    {
      "epoch": 0.9531800637690888,
      "grad_norm": 1.3009203672409058,
      "learning_rate": 2.366168820271858e-06,
      "loss": 1.9254,
      "step": 5680
    },
    {
      "epoch": 0.9565363315992617,
      "grad_norm": 1.3365298509597778,
      "learning_rate": 2.1983554287632153e-06,
      "loss": 1.9745,
      "step": 5700
    },
    {
      "epoch": 0.9598925994294345,
      "grad_norm": 1.2553895711898804,
      "learning_rate": 2.030542037254573e-06,
      "loss": 1.9636,
      "step": 5720
    },
    {
      "epoch": 0.9632488672596073,
      "grad_norm": 1.168750286102295,
      "learning_rate": 1.8627286457459307e-06,
      "loss": 1.9333,
      "step": 5740
    },
    {
      "epoch": 0.9666051350897802,
      "grad_norm": 1.3157403469085693,
      "learning_rate": 1.6949152542372882e-06,
      "loss": 1.9635,
      "step": 5760
    },
    {
      "epoch": 0.969961402919953,
      "grad_norm": 1.2229833602905273,
      "learning_rate": 1.5271018627286458e-06,
      "loss": 1.9534,
      "step": 5780
    },
    {
      "epoch": 0.9733176707501259,
      "grad_norm": 1.3683537244796753,
      "learning_rate": 1.3592884712200033e-06,
      "loss": 1.9916,
      "step": 5800
    },
    {
      "epoch": 0.9766739385802987,
      "grad_norm": 1.1630358695983887,
      "learning_rate": 1.191475079711361e-06,
      "loss": 1.9098,
      "step": 5820
    },
    {
      "epoch": 0.9800302064104716,
      "grad_norm": 1.249770164489746,
      "learning_rate": 1.0236616882027187e-06,
      "loss": 1.9411,
      "step": 5840
    },
    {
      "epoch": 0.9833864742406444,
      "grad_norm": 1.2611877918243408,
      "learning_rate": 8.558482966940762e-07,
      "loss": 1.9093,
      "step": 5860
    },
    {
      "epoch": 0.9867427420708172,
      "grad_norm": 1.2660034894943237,
      "learning_rate": 6.880349051854338e-07,
      "loss": 1.9,
      "step": 5880
    },
    {
      "epoch": 0.9900990099009901,
      "grad_norm": 1.263708233833313,
      "learning_rate": 5.202215136767914e-07,
      "loss": 1.9256,
      "step": 5900
    },
    {
      "epoch": 0.9934552777311629,
      "grad_norm": 1.257265329360962,
      "learning_rate": 3.5240812216814904e-07,
      "loss": 1.9474,
      "step": 5920
    },
    {
      "epoch": 0.9968115455613358,
      "grad_norm": 1.3897913694381714,
      "learning_rate": 1.8459473065950665e-07,
      "loss": 1.9567,
      "step": 5940
    }
  ],
  "logging_steps": 20,
  "max_steps": 5959,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 5959,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.81749288803369e+17,
  "train_batch_size": 18,
  "trial_name": null,
  "trial_params": null
}