|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.7054574185902139, |
|
"eval_steps": 500, |
|
"global_step": 25000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0028218296743608554, |
|
"grad_norm": 175.4037628173828, |
|
"learning_rate": 1.3261851015801355e-07, |
|
"loss": 3.1599, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.005643659348721711, |
|
"grad_norm": 24.302709579467773, |
|
"learning_rate": 2.737020316027088e-07, |
|
"loss": 1.8481, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.008465489023082567, |
|
"grad_norm": 39.38914489746094, |
|
"learning_rate": 4.147855530474041e-07, |
|
"loss": 1.2574, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.011287318697443422, |
|
"grad_norm": 28.08403205871582, |
|
"learning_rate": 5.558690744920993e-07, |
|
"loss": 1.0333, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.014109148371804279, |
|
"grad_norm": 22.54578399658203, |
|
"learning_rate": 6.969525959367947e-07, |
|
"loss": 0.9413, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.016930978046165134, |
|
"grad_norm": 17.940231323242188, |
|
"learning_rate": 8.3803611738149e-07, |
|
"loss": 0.7995, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.01975280772052599, |
|
"grad_norm": 26.575462341308594, |
|
"learning_rate": 9.79119638826185e-07, |
|
"loss": 0.7403, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.022574637394886844, |
|
"grad_norm": 51.95569610595703, |
|
"learning_rate": 1.1202031602708804e-06, |
|
"loss": 0.7774, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.0253964670692477, |
|
"grad_norm": 140.2630157470703, |
|
"learning_rate": 1.2612866817155757e-06, |
|
"loss": 0.6903, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.028218296743608557, |
|
"grad_norm": 28.976390838623047, |
|
"learning_rate": 1.402370203160271e-06, |
|
"loss": 0.6727, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03104012641796941, |
|
"grad_norm": 36.016754150390625, |
|
"learning_rate": 1.5434537246049664e-06, |
|
"loss": 0.6351, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.03386195609233027, |
|
"grad_norm": 41.480491638183594, |
|
"learning_rate": 1.6845372460496615e-06, |
|
"loss": 0.6555, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.036683785766691124, |
|
"grad_norm": 24.90089225769043, |
|
"learning_rate": 1.8256207674943568e-06, |
|
"loss": 0.6714, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.03950561544105198, |
|
"grad_norm": 31.070730209350586, |
|
"learning_rate": 1.966704288939052e-06, |
|
"loss": 0.6781, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.04232744511541283, |
|
"grad_norm": 33.644046783447266, |
|
"learning_rate": 2.1077878103837474e-06, |
|
"loss": 0.6284, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.04514927478977369, |
|
"grad_norm": 22.074399948120117, |
|
"learning_rate": 2.2488713318284427e-06, |
|
"loss": 0.578, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.047971104464134544, |
|
"grad_norm": 30.381807327270508, |
|
"learning_rate": 2.389954853273138e-06, |
|
"loss": 0.631, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.0507929341384954, |
|
"grad_norm": 31.510488510131836, |
|
"learning_rate": 2.531038374717833e-06, |
|
"loss": 0.61, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.05361476381285626, |
|
"grad_norm": 27.364355087280273, |
|
"learning_rate": 2.6721218961625283e-06, |
|
"loss": 0.5953, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.056436593487217114, |
|
"grad_norm": 48.54792404174805, |
|
"learning_rate": 2.8132054176072236e-06, |
|
"loss": 0.6046, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.059258423161577964, |
|
"grad_norm": 49.58477020263672, |
|
"learning_rate": 2.9542889390519193e-06, |
|
"loss": 0.5609, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.06208025283593882, |
|
"grad_norm": 26.104825973510742, |
|
"learning_rate": 3.095372460496614e-06, |
|
"loss": 0.5931, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.06490208251029968, |
|
"grad_norm": 36.350685119628906, |
|
"learning_rate": 3.2364559819413096e-06, |
|
"loss": 0.5884, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.06772391218466053, |
|
"grad_norm": 36.55471420288086, |
|
"learning_rate": 3.377539503386005e-06, |
|
"loss": 0.5957, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.07054574185902139, |
|
"grad_norm": 35.742225646972656, |
|
"learning_rate": 3.5186230248307e-06, |
|
"loss": 0.5958, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.07336757153338225, |
|
"grad_norm": 20.766326904296875, |
|
"learning_rate": 3.6597065462753955e-06, |
|
"loss": 0.561, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.0761894012077431, |
|
"grad_norm": 43.46022033691406, |
|
"learning_rate": 3.8007900677200904e-06, |
|
"loss": 0.5751, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.07901123088210396, |
|
"grad_norm": 35.60908889770508, |
|
"learning_rate": 3.941873589164786e-06, |
|
"loss": 0.6054, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.08183306055646482, |
|
"grad_norm": 25.768211364746094, |
|
"learning_rate": 4.082957110609481e-06, |
|
"loss": 0.5796, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.08465489023082566, |
|
"grad_norm": 38.575496673583984, |
|
"learning_rate": 4.224040632054177e-06, |
|
"loss": 0.5874, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.08747671990518652, |
|
"grad_norm": 23.993473052978516, |
|
"learning_rate": 4.363713318284425e-06, |
|
"loss": 0.5564, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.09029854957954737, |
|
"grad_norm": 42.830509185791016, |
|
"learning_rate": 4.50479683972912e-06, |
|
"loss": 0.5645, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.09312037925390823, |
|
"grad_norm": 34.766197204589844, |
|
"learning_rate": 4.6458803611738155e-06, |
|
"loss": 0.5433, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.09594220892826909, |
|
"grad_norm": 32.30384826660156, |
|
"learning_rate": 4.78696388261851e-06, |
|
"loss": 0.5691, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.09876403860262994, |
|
"grad_norm": 77.13898468017578, |
|
"learning_rate": 4.928047404063206e-06, |
|
"loss": 0.5685, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.1015858682769908, |
|
"grad_norm": 42.225563049316406, |
|
"learning_rate": 5.069130925507901e-06, |
|
"loss": 0.5381, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.10440769795135166, |
|
"grad_norm": 32.32414245605469, |
|
"learning_rate": 5.210214446952596e-06, |
|
"loss": 0.5406, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.10722952762571251, |
|
"grad_norm": 25.47852325439453, |
|
"learning_rate": 5.3512979683972925e-06, |
|
"loss": 0.5309, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.11005135730007337, |
|
"grad_norm": 31.160581588745117, |
|
"learning_rate": 5.4923814898419865e-06, |
|
"loss": 0.5259, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.11287318697443423, |
|
"grad_norm": 36.84406661987305, |
|
"learning_rate": 5.632054176072235e-06, |
|
"loss": 0.5297, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.11569501664879508, |
|
"grad_norm": 10.82895565032959, |
|
"learning_rate": 5.77313769751693e-06, |
|
"loss": 0.4846, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.11851684632315593, |
|
"grad_norm": 21.587764739990234, |
|
"learning_rate": 5.914221218961625e-06, |
|
"loss": 0.5333, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.12133867599751678, |
|
"grad_norm": 22.369434356689453, |
|
"learning_rate": 6.055304740406322e-06, |
|
"loss": 0.5372, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.12416050567187764, |
|
"grad_norm": 33.766910552978516, |
|
"learning_rate": 6.196388261851016e-06, |
|
"loss": 0.5652, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.1269823353462385, |
|
"grad_norm": 27.229183197021484, |
|
"learning_rate": 6.337471783295711e-06, |
|
"loss": 0.531, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.12980416502059935, |
|
"grad_norm": 25.017833709716797, |
|
"learning_rate": 6.478555304740407e-06, |
|
"loss": 0.5483, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.13262599469496023, |
|
"grad_norm": 27.347057342529297, |
|
"learning_rate": 6.619638826185102e-06, |
|
"loss": 0.5289, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.13544782436932107, |
|
"grad_norm": 32.57697296142578, |
|
"learning_rate": 6.760722347629798e-06, |
|
"loss": 0.5225, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.1382696540436819, |
|
"grad_norm": 31.288978576660156, |
|
"learning_rate": 6.901805869074493e-06, |
|
"loss": 0.4876, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.14109148371804278, |
|
"grad_norm": 14.970600128173828, |
|
"learning_rate": 7.042889390519188e-06, |
|
"loss": 0.4723, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.14391331339240362, |
|
"grad_norm": 50.01182556152344, |
|
"learning_rate": 7.1839729119638835e-06, |
|
"loss": 0.518, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.1467351430667645, |
|
"grad_norm": 28.612035751342773, |
|
"learning_rate": 7.325056433408578e-06, |
|
"loss": 0.4673, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.14955697274112534, |
|
"grad_norm": 79.7120132446289, |
|
"learning_rate": 7.466139954853274e-06, |
|
"loss": 0.5274, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.1523788024154862, |
|
"grad_norm": 15.11323070526123, |
|
"learning_rate": 7.607223476297969e-06, |
|
"loss": 0.5082, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.15520063208984705, |
|
"grad_norm": 18.369705200195312, |
|
"learning_rate": 7.748306997742663e-06, |
|
"loss": 0.4996, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.15802246176420792, |
|
"grad_norm": 36.83811950683594, |
|
"learning_rate": 7.88939051918736e-06, |
|
"loss": 0.5587, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.16084429143856876, |
|
"grad_norm": 45.13284683227539, |
|
"learning_rate": 8.030474040632055e-06, |
|
"loss": 0.5121, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.16366612111292964, |
|
"grad_norm": 22.508358001708984, |
|
"learning_rate": 8.17155756207675e-06, |
|
"loss": 0.5167, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.16648795078729048, |
|
"grad_norm": 37.16364288330078, |
|
"learning_rate": 8.312641083521446e-06, |
|
"loss": 0.524, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.16930978046165132, |
|
"grad_norm": 25.45941925048828, |
|
"learning_rate": 8.453724604966141e-06, |
|
"loss": 0.4932, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.1721316101360122, |
|
"grad_norm": 29.486251831054688, |
|
"learning_rate": 8.594808126410836e-06, |
|
"loss": 0.5069, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.17495343981037303, |
|
"grad_norm": 36.87773513793945, |
|
"learning_rate": 8.73589164785553e-06, |
|
"loss": 0.5286, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.1777752694847339, |
|
"grad_norm": 27.660303115844727, |
|
"learning_rate": 8.876975169300226e-06, |
|
"loss": 0.5015, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.18059709915909475, |
|
"grad_norm": 20.193119049072266, |
|
"learning_rate": 9.018058690744922e-06, |
|
"loss": 0.5353, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.18341892883345562, |
|
"grad_norm": 46.997806549072266, |
|
"learning_rate": 9.159142212189617e-06, |
|
"loss": 0.4702, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.18624075850781646, |
|
"grad_norm": 33.448543548583984, |
|
"learning_rate": 9.300225733634312e-06, |
|
"loss": 0.51, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.18906258818217733, |
|
"grad_norm": 36.22298049926758, |
|
"learning_rate": 9.441309255079007e-06, |
|
"loss": 0.4891, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.19188441785653818, |
|
"grad_norm": 37.21258544921875, |
|
"learning_rate": 9.582392776523702e-06, |
|
"loss": 0.5353, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.19470624753089905, |
|
"grad_norm": 14.430715560913086, |
|
"learning_rate": 9.723476297968398e-06, |
|
"loss": 0.5091, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.1975280772052599, |
|
"grad_norm": 37.424869537353516, |
|
"learning_rate": 9.864559819413093e-06, |
|
"loss": 0.5326, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.20034990687962076, |
|
"grad_norm": 12.207716941833496, |
|
"learning_rate": 9.999372922806798e-06, |
|
"loss": 0.517, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.2031717365539816, |
|
"grad_norm": 10.985408782958984, |
|
"learning_rate": 9.983695992976736e-06, |
|
"loss": 0.5263, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.20599356622834245, |
|
"grad_norm": 32.765968322753906, |
|
"learning_rate": 9.968019063146674e-06, |
|
"loss": 0.5073, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.20881539590270332, |
|
"grad_norm": 24.93343734741211, |
|
"learning_rate": 9.952342133316612e-06, |
|
"loss": 0.5049, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.21163722557706416, |
|
"grad_norm": 42.219844818115234, |
|
"learning_rate": 9.93666520348655e-06, |
|
"loss": 0.5039, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.21445905525142503, |
|
"grad_norm": 23.783781051635742, |
|
"learning_rate": 9.920988273656488e-06, |
|
"loss": 0.5179, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.21728088492578587, |
|
"grad_norm": 29.034082412719727, |
|
"learning_rate": 9.905311343826426e-06, |
|
"loss": 0.5053, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.22010271460014674, |
|
"grad_norm": 33.75339126586914, |
|
"learning_rate": 9.889634413996364e-06, |
|
"loss": 0.4951, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.22292454427450759, |
|
"grad_norm": 13.03099536895752, |
|
"learning_rate": 9.873957484166302e-06, |
|
"loss": 0.5285, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.22574637394886846, |
|
"grad_norm": 31.321319580078125, |
|
"learning_rate": 9.858280554336239e-06, |
|
"loss": 0.5172, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.2285682036232293, |
|
"grad_norm": 16.2780704498291, |
|
"learning_rate": 9.842603624506177e-06, |
|
"loss": 0.5214, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.23139003329759017, |
|
"grad_norm": 23.63935661315918, |
|
"learning_rate": 9.826926694676115e-06, |
|
"loss": 0.4817, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.234211862971951, |
|
"grad_norm": 28.826778411865234, |
|
"learning_rate": 9.811249764846053e-06, |
|
"loss": 0.5106, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.23703369264631186, |
|
"grad_norm": 23.31501007080078, |
|
"learning_rate": 9.795572835015993e-06, |
|
"loss": 0.5101, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.23985552232067273, |
|
"grad_norm": 22.932710647583008, |
|
"learning_rate": 9.779895905185929e-06, |
|
"loss": 0.4779, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.24267735199503357, |
|
"grad_norm": 32.443641662597656, |
|
"learning_rate": 9.764218975355867e-06, |
|
"loss": 0.5057, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.24549918166939444, |
|
"grad_norm": 60.96305847167969, |
|
"learning_rate": 9.748542045525805e-06, |
|
"loss": 0.4986, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.24832101134375528, |
|
"grad_norm": 27.693511962890625, |
|
"learning_rate": 9.732865115695743e-06, |
|
"loss": 0.5099, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.2511428410181161, |
|
"grad_norm": 16.883127212524414, |
|
"learning_rate": 9.717188185865681e-06, |
|
"loss": 0.5244, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.253964670692477, |
|
"grad_norm": 30.549161911010742, |
|
"learning_rate": 9.701511256035619e-06, |
|
"loss": 0.5023, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.25678650036683787, |
|
"grad_norm": 23.576152801513672, |
|
"learning_rate": 9.685991095503856e-06, |
|
"loss": 0.5008, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.2596083300411987, |
|
"grad_norm": 21.88428497314453, |
|
"learning_rate": 9.670314165673796e-06, |
|
"loss": 0.5295, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.26243015971555955, |
|
"grad_norm": 21.60301971435547, |
|
"learning_rate": 9.654637235843734e-06, |
|
"loss": 0.5001, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.26525198938992045, |
|
"grad_norm": 36.93144607543945, |
|
"learning_rate": 9.638960306013672e-06, |
|
"loss": 0.4505, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.2680738190642813, |
|
"grad_norm": 17.426191329956055, |
|
"learning_rate": 9.623283376183608e-06, |
|
"loss": 0.4694, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.27089564873864214, |
|
"grad_norm": 22.8311710357666, |
|
"learning_rate": 9.607606446353546e-06, |
|
"loss": 0.488, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.273717478413003, |
|
"grad_norm": 32.39801025390625, |
|
"learning_rate": 9.591929516523485e-06, |
|
"loss": 0.5119, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.2765393080873638, |
|
"grad_norm": 14.913991928100586, |
|
"learning_rate": 9.576252586693423e-06, |
|
"loss": 0.4523, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.2793611377617247, |
|
"grad_norm": 28.448148727416992, |
|
"learning_rate": 9.56057565686336e-06, |
|
"loss": 0.4689, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.28218296743608556, |
|
"grad_norm": 49.07644271850586, |
|
"learning_rate": 9.544898727033299e-06, |
|
"loss": 0.5235, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.2850047971104464, |
|
"grad_norm": 31.61011505126953, |
|
"learning_rate": 9.529221797203237e-06, |
|
"loss": 0.5053, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.28782662678480725, |
|
"grad_norm": 23.562646865844727, |
|
"learning_rate": 9.513544867373175e-06, |
|
"loss": 0.485, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.29064845645916815, |
|
"grad_norm": 22.867277145385742, |
|
"learning_rate": 9.497867937543113e-06, |
|
"loss": 0.523, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.293470286133529, |
|
"grad_norm": 44.8724365234375, |
|
"learning_rate": 9.48219100771305e-06, |
|
"loss": 0.4691, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.29629211580788983, |
|
"grad_norm": 15.85916519165039, |
|
"learning_rate": 9.466514077882987e-06, |
|
"loss": 0.4646, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.2991139454822507, |
|
"grad_norm": 19.45340347290039, |
|
"learning_rate": 9.450837148052927e-06, |
|
"loss": 0.4907, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.3019357751566116, |
|
"grad_norm": 14.807464599609375, |
|
"learning_rate": 9.435160218222865e-06, |
|
"loss": 0.4539, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.3047576048309724, |
|
"grad_norm": 24.157548904418945, |
|
"learning_rate": 9.419483288392803e-06, |
|
"loss": 0.3937, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.30757943450533326, |
|
"grad_norm": 37.66196060180664, |
|
"learning_rate": 9.40380635856274e-06, |
|
"loss": 0.4346, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.3104012641796941, |
|
"grad_norm": 54.08269500732422, |
|
"learning_rate": 9.388129428732677e-06, |
|
"loss": 0.4566, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.31322309385405495, |
|
"grad_norm": 37.25579833984375, |
|
"learning_rate": 9.372452498902615e-06, |
|
"loss": 0.4333, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.31604492352841584, |
|
"grad_norm": 32.52021026611328, |
|
"learning_rate": 9.356775569072553e-06, |
|
"loss": 0.4971, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.3188667532027767, |
|
"grad_norm": 24.20098876953125, |
|
"learning_rate": 9.341098639242491e-06, |
|
"loss": 0.4936, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.32168858287713753, |
|
"grad_norm": 37.67250061035156, |
|
"learning_rate": 9.32542170941243e-06, |
|
"loss": 0.4925, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.3245104125514984, |
|
"grad_norm": 51.37995147705078, |
|
"learning_rate": 9.309744779582367e-06, |
|
"loss": 0.4567, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.32733224222585927, |
|
"grad_norm": 2.77247953414917, |
|
"learning_rate": 9.294067849752305e-06, |
|
"loss": 0.4273, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.3301540719002201, |
|
"grad_norm": 26.4583740234375, |
|
"learning_rate": 9.278390919922243e-06, |
|
"loss": 0.4972, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.33297590157458096, |
|
"grad_norm": 23.889802932739258, |
|
"learning_rate": 9.262713990092181e-06, |
|
"loss": 0.4748, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.3357977312489418, |
|
"grad_norm": 26.13481903076172, |
|
"learning_rate": 9.24703706026212e-06, |
|
"loss": 0.4772, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.33861956092330264, |
|
"grad_norm": 32.484073638916016, |
|
"learning_rate": 9.231360130432057e-06, |
|
"loss": 0.4582, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.34144139059766354, |
|
"grad_norm": 12.51020336151123, |
|
"learning_rate": 9.215683200601995e-06, |
|
"loss": 0.466, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.3442632202720244, |
|
"grad_norm": 22.987064361572266, |
|
"learning_rate": 9.200006270771933e-06, |
|
"loss": 0.474, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.3470850499463852, |
|
"grad_norm": 19.731643676757812, |
|
"learning_rate": 9.184329340941871e-06, |
|
"loss": 0.428, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.34990687962074607, |
|
"grad_norm": 30.095190048217773, |
|
"learning_rate": 9.16865241111181e-06, |
|
"loss": 0.4448, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.35272870929510697, |
|
"grad_norm": 23.412023544311523, |
|
"learning_rate": 9.152975481281746e-06, |
|
"loss": 0.4607, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.3555505389694678, |
|
"grad_norm": 15.968676567077637, |
|
"learning_rate": 9.137298551451684e-06, |
|
"loss": 0.4774, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.35837236864382865, |
|
"grad_norm": 22.27809715270996, |
|
"learning_rate": 9.121621621621622e-06, |
|
"loss": 0.4796, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.3611941983181895, |
|
"grad_norm": 25.096717834472656, |
|
"learning_rate": 9.106101461089861e-06, |
|
"loss": 0.4238, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.3640160279925504, |
|
"grad_norm": 32.24757766723633, |
|
"learning_rate": 9.090424531259799e-06, |
|
"loss": 0.4719, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.36683785766691124, |
|
"grad_norm": 43.72540283203125, |
|
"learning_rate": 9.074747601429737e-06, |
|
"loss": 0.4571, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.3696596873412721, |
|
"grad_norm": 25.39431381225586, |
|
"learning_rate": 9.059070671599675e-06, |
|
"loss": 0.464, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.3724815170156329, |
|
"grad_norm": 15.996291160583496, |
|
"learning_rate": 9.043393741769613e-06, |
|
"loss": 0.4345, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.37530334668999377, |
|
"grad_norm": 108.50527954101562, |
|
"learning_rate": 9.027716811939551e-06, |
|
"loss": 0.4013, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.37812517636435466, |
|
"grad_norm": 28.973108291625977, |
|
"learning_rate": 9.012039882109487e-06, |
|
"loss": 0.4512, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.3809470060387155, |
|
"grad_norm": 74.1708984375, |
|
"learning_rate": 8.996362952279425e-06, |
|
"loss": 0.4343, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.38376883571307635, |
|
"grad_norm": 19.657316207885742, |
|
"learning_rate": 8.980686022449363e-06, |
|
"loss": 0.4125, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.3865906653874372, |
|
"grad_norm": 28.859350204467773, |
|
"learning_rate": 8.965165861917603e-06, |
|
"loss": 0.4746, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.3894124950617981, |
|
"grad_norm": 18.13363265991211, |
|
"learning_rate": 8.94948893208754e-06, |
|
"loss": 0.4771, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.39223432473615893, |
|
"grad_norm": 22.466726303100586, |
|
"learning_rate": 8.933812002257479e-06, |
|
"loss": 0.4613, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.3950561544105198, |
|
"grad_norm": 27.679174423217773, |
|
"learning_rate": 8.918135072427417e-06, |
|
"loss": 0.462, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.3978779840848806, |
|
"grad_norm": 27.347543716430664, |
|
"learning_rate": 8.902458142597355e-06, |
|
"loss": 0.4828, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.4006998137592415, |
|
"grad_norm": 16.224889755249023, |
|
"learning_rate": 8.886781212767293e-06, |
|
"loss": 0.4608, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.40352164343360236, |
|
"grad_norm": 23.845518112182617, |
|
"learning_rate": 8.87110428293723e-06, |
|
"loss": 0.4187, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.4063434731079632, |
|
"grad_norm": 18.049110412597656, |
|
"learning_rate": 8.855427353107167e-06, |
|
"loss": 0.4794, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.40916530278232405, |
|
"grad_norm": 30.752511978149414, |
|
"learning_rate": 8.839750423277105e-06, |
|
"loss": 0.473, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.4119871324566849, |
|
"grad_norm": 29.279788970947266, |
|
"learning_rate": 8.824073493447045e-06, |
|
"loss": 0.4564, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.4148089621310458, |
|
"grad_norm": 7.239739894866943, |
|
"learning_rate": 8.808396563616983e-06, |
|
"loss": 0.4363, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.41763079180540663, |
|
"grad_norm": 19.604881286621094, |
|
"learning_rate": 8.79271963378692e-06, |
|
"loss": 0.4745, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.4204526214797675, |
|
"grad_norm": 21.136322021484375, |
|
"learning_rate": 8.777042703956857e-06, |
|
"loss": 0.4552, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.4232744511541283, |
|
"grad_norm": 23.589292526245117, |
|
"learning_rate": 8.761365774126795e-06, |
|
"loss": 0.4245, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.4260962808284892, |
|
"grad_norm": 9.279696464538574, |
|
"learning_rate": 8.745688844296733e-06, |
|
"loss": 0.4583, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.42891811050285006, |
|
"grad_norm": 23.76906394958496, |
|
"learning_rate": 8.730011914466671e-06, |
|
"loss": 0.4873, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.4317399401772109, |
|
"grad_norm": 29.270069122314453, |
|
"learning_rate": 8.71433498463661e-06, |
|
"loss": 0.453, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.43456176985157174, |
|
"grad_norm": 31.86260414123535, |
|
"learning_rate": 8.698658054806547e-06, |
|
"loss": 0.4939, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.4373835995259326, |
|
"grad_norm": 36.27268600463867, |
|
"learning_rate": 8.682981124976485e-06, |
|
"loss": 0.447, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.4402054292002935, |
|
"grad_norm": 26.660816192626953, |
|
"learning_rate": 8.667304195146423e-06, |
|
"loss": 0.5021, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.44302725887465433, |
|
"grad_norm": 24.500545501708984, |
|
"learning_rate": 8.651627265316361e-06, |
|
"loss": 0.4443, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.44584908854901517, |
|
"grad_norm": 30.132831573486328, |
|
"learning_rate": 8.6359503354863e-06, |
|
"loss": 0.4095, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.448670918223376, |
|
"grad_norm": 33.71192169189453, |
|
"learning_rate": 8.620273405656236e-06, |
|
"loss": 0.442, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.4514927478977369, |
|
"grad_norm": 59.854530334472656, |
|
"learning_rate": 8.604596475826175e-06, |
|
"loss": 0.4319, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.45431457757209776, |
|
"grad_norm": 23.0345516204834, |
|
"learning_rate": 8.588919545996113e-06, |
|
"loss": 0.4537, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.4571364072464586, |
|
"grad_norm": 12.112003326416016, |
|
"learning_rate": 8.573242616166051e-06, |
|
"loss": 0.4462, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.45995823692081944, |
|
"grad_norm": 24.53924560546875, |
|
"learning_rate": 8.55756568633599e-06, |
|
"loss": 0.4354, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.46278006659518034, |
|
"grad_norm": 13.580459594726562, |
|
"learning_rate": 8.541888756505926e-06, |
|
"loss": 0.4662, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.4656018962695412, |
|
"grad_norm": 14.42153263092041, |
|
"learning_rate": 8.526211826675864e-06, |
|
"loss": 0.4699, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.468423725943902, |
|
"grad_norm": 31.8900089263916, |
|
"learning_rate": 8.510534896845802e-06, |
|
"loss": 0.4473, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.47124555561826287, |
|
"grad_norm": 22.49077606201172, |
|
"learning_rate": 8.49485796701574e-06, |
|
"loss": 0.4516, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.4740673852926237, |
|
"grad_norm": 41.829689025878906, |
|
"learning_rate": 8.479181037185678e-06, |
|
"loss": 0.3909, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.4768892149669846, |
|
"grad_norm": 31.608774185180664, |
|
"learning_rate": 8.463504107355616e-06, |
|
"loss": 0.4598, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.47971104464134545, |
|
"grad_norm": 16.948047637939453, |
|
"learning_rate": 8.447827177525554e-06, |
|
"loss": 0.4696, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.4825328743157063, |
|
"grad_norm": 26.435142517089844, |
|
"learning_rate": 8.432150247695492e-06, |
|
"loss": 0.4523, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 0.48535470399006714, |
|
"grad_norm": 29.971567153930664, |
|
"learning_rate": 8.41647331786543e-06, |
|
"loss": 0.4253, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.48817653366442804, |
|
"grad_norm": 16.34187126159668, |
|
"learning_rate": 8.400796388035368e-06, |
|
"loss": 0.4364, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 0.4909983633387889, |
|
"grad_norm": 17.09442710876465, |
|
"learning_rate": 8.385119458205306e-06, |
|
"loss": 0.4243, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.4938201930131497, |
|
"grad_norm": 17.122453689575195, |
|
"learning_rate": 8.369442528375244e-06, |
|
"loss": 0.4456, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.49664202268751056, |
|
"grad_norm": 19.6232852935791, |
|
"learning_rate": 8.353765598545182e-06, |
|
"loss": 0.4297, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 0.49946385236187146, |
|
"grad_norm": 26.52530288696289, |
|
"learning_rate": 8.33808866871512e-06, |
|
"loss": 0.442, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 0.5022856820362323, |
|
"grad_norm": 23.909753799438477, |
|
"learning_rate": 8.322411738885058e-06, |
|
"loss": 0.4656, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 0.5051075117105931, |
|
"grad_norm": 28.525676727294922, |
|
"learning_rate": 8.306734809054994e-06, |
|
"loss": 0.4366, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 0.507929341384954, |
|
"grad_norm": 18.39270782470703, |
|
"learning_rate": 8.291057879224932e-06, |
|
"loss": 0.4473, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.5107511710593149, |
|
"grad_norm": 17.035900115966797, |
|
"learning_rate": 8.27538094939487e-06, |
|
"loss": 0.3899, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 0.5135730007336757, |
|
"grad_norm": 34.1118049621582, |
|
"learning_rate": 8.259704019564809e-06, |
|
"loss": 0.4569, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 0.5163948304080366, |
|
"grad_norm": 46.11088562011719, |
|
"learning_rate": 8.244027089734748e-06, |
|
"loss": 0.4155, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 0.5192166600823974, |
|
"grad_norm": 13.6370267868042, |
|
"learning_rate": 8.228350159904685e-06, |
|
"loss": 0.4137, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 0.5220384897567583, |
|
"grad_norm": 27.46001434326172, |
|
"learning_rate": 8.212673230074623e-06, |
|
"loss": 0.4355, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.5248603194311191, |
|
"grad_norm": 50.026153564453125, |
|
"learning_rate": 8.19699630024456e-06, |
|
"loss": 0.4602, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 0.52768214910548, |
|
"grad_norm": 10.866439819335938, |
|
"learning_rate": 8.181319370414499e-06, |
|
"loss": 0.4433, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 0.5305039787798409, |
|
"grad_norm": 25.32647705078125, |
|
"learning_rate": 8.165642440584437e-06, |
|
"loss": 0.4633, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 0.5333258084542017, |
|
"grad_norm": 24.343759536743164, |
|
"learning_rate": 8.149965510754375e-06, |
|
"loss": 0.4046, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 0.5361476381285626, |
|
"grad_norm": 6.707170009613037, |
|
"learning_rate": 8.134288580924313e-06, |
|
"loss": 0.371, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.5389694678029234, |
|
"grad_norm": 31.448177337646484, |
|
"learning_rate": 8.11861165109425e-06, |
|
"loss": 0.4444, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 0.5417912974772843, |
|
"grad_norm": 27.307594299316406, |
|
"learning_rate": 8.102934721264189e-06, |
|
"loss": 0.4495, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 0.5446131271516451, |
|
"grad_norm": 33.58913040161133, |
|
"learning_rate": 8.087257791434127e-06, |
|
"loss": 0.4265, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 0.547434956826006, |
|
"grad_norm": 10.490382194519043, |
|
"learning_rate": 8.071580861604063e-06, |
|
"loss": 0.4098, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 0.5502567865003668, |
|
"grad_norm": 34.94256591796875, |
|
"learning_rate": 8.055903931774001e-06, |
|
"loss": 0.4345, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.5530786161747276, |
|
"grad_norm": 15.932448387145996, |
|
"learning_rate": 8.040227001943939e-06, |
|
"loss": 0.4064, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 0.5559004458490886, |
|
"grad_norm": 43.63095474243164, |
|
"learning_rate": 8.024550072113879e-06, |
|
"loss": 0.4644, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 0.5587222755234494, |
|
"grad_norm": 16.069515228271484, |
|
"learning_rate": 8.008873142283817e-06, |
|
"loss": 0.4064, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 0.5615441051978103, |
|
"grad_norm": 21.504106521606445, |
|
"learning_rate": 7.993196212453753e-06, |
|
"loss": 0.4134, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 0.5643659348721711, |
|
"grad_norm": 30.280887603759766, |
|
"learning_rate": 7.977519282623691e-06, |
|
"loss": 0.4066, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.567187764546532, |
|
"grad_norm": 35.03623580932617, |
|
"learning_rate": 7.96184235279363e-06, |
|
"loss": 0.4568, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 0.5700095942208928, |
|
"grad_norm": 33.643226623535156, |
|
"learning_rate": 7.946165422963567e-06, |
|
"loss": 0.4131, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 0.5728314238952537, |
|
"grad_norm": 17.969562530517578, |
|
"learning_rate": 7.930488493133505e-06, |
|
"loss": 0.447, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 0.5756532535696145, |
|
"grad_norm": 24.389982223510742, |
|
"learning_rate": 7.914811563303443e-06, |
|
"loss": 0.448, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 0.5784750832439753, |
|
"grad_norm": 28.242773056030273, |
|
"learning_rate": 7.899134633473381e-06, |
|
"loss": 0.4822, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.5812969129183363, |
|
"grad_norm": 13.920737266540527, |
|
"learning_rate": 7.88345770364332e-06, |
|
"loss": 0.432, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 0.5841187425926971, |
|
"grad_norm": 24.488445281982422, |
|
"learning_rate": 7.867780773813257e-06, |
|
"loss": 0.4167, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 0.586940572267058, |
|
"grad_norm": 13.902533531188965, |
|
"learning_rate": 7.852103843983195e-06, |
|
"loss": 0.4354, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 0.5897624019414188, |
|
"grad_norm": 13.963842391967773, |
|
"learning_rate": 7.836426914153132e-06, |
|
"loss": 0.4484, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 0.5925842316157797, |
|
"grad_norm": 17.448009490966797, |
|
"learning_rate": 7.82074998432307e-06, |
|
"loss": 0.38, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.5954060612901405, |
|
"grad_norm": 16.734331130981445, |
|
"learning_rate": 7.80507305449301e-06, |
|
"loss": 0.4409, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 0.5982278909645014, |
|
"grad_norm": 29.44654655456543, |
|
"learning_rate": 7.789396124662948e-06, |
|
"loss": 0.4482, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 0.6010497206388622, |
|
"grad_norm": 21.073148727416992, |
|
"learning_rate": 7.773719194832886e-06, |
|
"loss": 0.4136, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 0.6038715503132231, |
|
"grad_norm": 8.795435905456543, |
|
"learning_rate": 7.758042265002822e-06, |
|
"loss": 0.4726, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 0.606693379987584, |
|
"grad_norm": 18.364139556884766, |
|
"learning_rate": 7.74236533517276e-06, |
|
"loss": 0.4165, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.6095152096619448, |
|
"grad_norm": 16.08391761779785, |
|
"learning_rate": 7.726688405342698e-06, |
|
"loss": 0.3864, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 0.6123370393363057, |
|
"grad_norm": 20.25522804260254, |
|
"learning_rate": 7.711011475512636e-06, |
|
"loss": 0.4517, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 0.6151588690106665, |
|
"grad_norm": 8.111662864685059, |
|
"learning_rate": 7.695334545682574e-06, |
|
"loss": 0.3994, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 0.6179806986850274, |
|
"grad_norm": 20.919527053833008, |
|
"learning_rate": 7.679657615852512e-06, |
|
"loss": 0.4073, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 0.6208025283593882, |
|
"grad_norm": 17.961687088012695, |
|
"learning_rate": 7.66398068602245e-06, |
|
"loss": 0.4139, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.623624358033749, |
|
"grad_norm": 28.198232650756836, |
|
"learning_rate": 7.648303756192388e-06, |
|
"loss": 0.4406, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 0.6264461877081099, |
|
"grad_norm": 9.413177490234375, |
|
"learning_rate": 7.632626826362326e-06, |
|
"loss": 0.4488, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 0.6292680173824708, |
|
"grad_norm": 15.02452278137207, |
|
"learning_rate": 7.616949896532264e-06, |
|
"loss": 0.4523, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 0.6320898470568317, |
|
"grad_norm": 37.498958587646484, |
|
"learning_rate": 7.601272966702201e-06, |
|
"loss": 0.4219, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 0.6349116767311925, |
|
"grad_norm": 14.869726181030273, |
|
"learning_rate": 7.585596036872139e-06, |
|
"loss": 0.4111, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.6377335064055534, |
|
"grad_norm": 19.956878662109375, |
|
"learning_rate": 7.569919107042077e-06, |
|
"loss": 0.4143, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 0.6405553360799142, |
|
"grad_norm": 20.845680236816406, |
|
"learning_rate": 7.554242177212015e-06, |
|
"loss": 0.4326, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 0.6433771657542751, |
|
"grad_norm": 7.167665958404541, |
|
"learning_rate": 7.538565247381953e-06, |
|
"loss": 0.3882, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 0.6461989954286359, |
|
"grad_norm": 24.01291847229004, |
|
"learning_rate": 7.5228883175518905e-06, |
|
"loss": 0.4506, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 0.6490208251029967, |
|
"grad_norm": 18.37822151184082, |
|
"learning_rate": 7.5072113877218286e-06, |
|
"loss": 0.4495, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.6518426547773576, |
|
"grad_norm": 17.8455810546875, |
|
"learning_rate": 7.4915344578917674e-06, |
|
"loss": 0.4631, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 0.6546644844517185, |
|
"grad_norm": 21.360794067382812, |
|
"learning_rate": 7.4758575280617055e-06, |
|
"loss": 0.3775, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 0.6574863141260794, |
|
"grad_norm": 25.73183250427246, |
|
"learning_rate": 7.4601805982316435e-06, |
|
"loss": 0.4126, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 0.6603081438004402, |
|
"grad_norm": 23.874588012695312, |
|
"learning_rate": 7.444503668401581e-06, |
|
"loss": 0.4415, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 0.6631299734748011, |
|
"grad_norm": 23.876819610595703, |
|
"learning_rate": 7.428826738571519e-06, |
|
"loss": 0.3982, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.6659518031491619, |
|
"grad_norm": 12.06535816192627, |
|
"learning_rate": 7.413149808741457e-06, |
|
"loss": 0.4357, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 0.6687736328235228, |
|
"grad_norm": 26.34955406188965, |
|
"learning_rate": 7.397472878911395e-06, |
|
"loss": 0.4377, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 0.6715954624978836, |
|
"grad_norm": 22.41992950439453, |
|
"learning_rate": 7.381795949081333e-06, |
|
"loss": 0.434, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 0.6744172921722444, |
|
"grad_norm": 33.86873245239258, |
|
"learning_rate": 7.36611901925127e-06, |
|
"loss": 0.4248, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 0.6772391218466053, |
|
"grad_norm": 10.177706718444824, |
|
"learning_rate": 7.350598858719509e-06, |
|
"loss": 0.3836, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.6800609515209662, |
|
"grad_norm": 16.718257904052734, |
|
"learning_rate": 7.334921928889447e-06, |
|
"loss": 0.4107, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 0.6828827811953271, |
|
"grad_norm": 27.072444915771484, |
|
"learning_rate": 7.319244999059385e-06, |
|
"loss": 0.3875, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 0.6857046108696879, |
|
"grad_norm": 14.283991813659668, |
|
"learning_rate": 7.303568069229323e-06, |
|
"loss": 0.4267, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 0.6885264405440488, |
|
"grad_norm": 12.753026008605957, |
|
"learning_rate": 7.28789113939926e-06, |
|
"loss": 0.4146, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 0.6913482702184096, |
|
"grad_norm": 40.42727279663086, |
|
"learning_rate": 7.272214209569198e-06, |
|
"loss": 0.4194, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.6941700998927705, |
|
"grad_norm": 13.08806324005127, |
|
"learning_rate": 7.256537279739136e-06, |
|
"loss": 0.3789, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 0.6969919295671313, |
|
"grad_norm": 31.469749450683594, |
|
"learning_rate": 7.240860349909074e-06, |
|
"loss": 0.3996, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 0.6998137592414921, |
|
"grad_norm": 12.432112693786621, |
|
"learning_rate": 7.225183420079012e-06, |
|
"loss": 0.4347, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 0.7026355889158531, |
|
"grad_norm": 22.593225479125977, |
|
"learning_rate": 7.2095064902489495e-06, |
|
"loss": 0.3896, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 0.7054574185902139, |
|
"grad_norm": 28.22087860107422, |
|
"learning_rate": 7.1938295604188876e-06, |
|
"loss": 0.4072, |
|
"step": 25000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 70876, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|