{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7054574185902139,
"eval_steps": 500,
"global_step": 25000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0028218296743608554,
"grad_norm": 175.4037628173828,
"learning_rate": 1.3261851015801355e-07,
"loss": 3.1599,
"step": 100
},
{
"epoch": 0.005643659348721711,
"grad_norm": 24.302709579467773,
"learning_rate": 2.737020316027088e-07,
"loss": 1.8481,
"step": 200
},
{
"epoch": 0.008465489023082567,
"grad_norm": 39.38914489746094,
"learning_rate": 4.147855530474041e-07,
"loss": 1.2574,
"step": 300
},
{
"epoch": 0.011287318697443422,
"grad_norm": 28.08403205871582,
"learning_rate": 5.558690744920993e-07,
"loss": 1.0333,
"step": 400
},
{
"epoch": 0.014109148371804279,
"grad_norm": 22.54578399658203,
"learning_rate": 6.969525959367947e-07,
"loss": 0.9413,
"step": 500
},
{
"epoch": 0.016930978046165134,
"grad_norm": 17.940231323242188,
"learning_rate": 8.3803611738149e-07,
"loss": 0.7995,
"step": 600
},
{
"epoch": 0.01975280772052599,
"grad_norm": 26.575462341308594,
"learning_rate": 9.79119638826185e-07,
"loss": 0.7403,
"step": 700
},
{
"epoch": 0.022574637394886844,
"grad_norm": 51.95569610595703,
"learning_rate": 1.1202031602708804e-06,
"loss": 0.7774,
"step": 800
},
{
"epoch": 0.0253964670692477,
"grad_norm": 140.2630157470703,
"learning_rate": 1.2612866817155757e-06,
"loss": 0.6903,
"step": 900
},
{
"epoch": 0.028218296743608557,
"grad_norm": 28.976390838623047,
"learning_rate": 1.402370203160271e-06,
"loss": 0.6727,
"step": 1000
},
{
"epoch": 0.03104012641796941,
"grad_norm": 36.016754150390625,
"learning_rate": 1.5434537246049664e-06,
"loss": 0.6351,
"step": 1100
},
{
"epoch": 0.03386195609233027,
"grad_norm": 41.480491638183594,
"learning_rate": 1.6845372460496615e-06,
"loss": 0.6555,
"step": 1200
},
{
"epoch": 0.036683785766691124,
"grad_norm": 24.90089225769043,
"learning_rate": 1.8256207674943568e-06,
"loss": 0.6714,
"step": 1300
},
{
"epoch": 0.03950561544105198,
"grad_norm": 31.070730209350586,
"learning_rate": 1.966704288939052e-06,
"loss": 0.6781,
"step": 1400
},
{
"epoch": 0.04232744511541283,
"grad_norm": 33.644046783447266,
"learning_rate": 2.1077878103837474e-06,
"loss": 0.6284,
"step": 1500
},
{
"epoch": 0.04514927478977369,
"grad_norm": 22.074399948120117,
"learning_rate": 2.2488713318284427e-06,
"loss": 0.578,
"step": 1600
},
{
"epoch": 0.047971104464134544,
"grad_norm": 30.381807327270508,
"learning_rate": 2.389954853273138e-06,
"loss": 0.631,
"step": 1700
},
{
"epoch": 0.0507929341384954,
"grad_norm": 31.510488510131836,
"learning_rate": 2.531038374717833e-06,
"loss": 0.61,
"step": 1800
},
{
"epoch": 0.05361476381285626,
"grad_norm": 27.364355087280273,
"learning_rate": 2.6721218961625283e-06,
"loss": 0.5953,
"step": 1900
},
{
"epoch": 0.056436593487217114,
"grad_norm": 48.54792404174805,
"learning_rate": 2.8132054176072236e-06,
"loss": 0.6046,
"step": 2000
},
{
"epoch": 0.059258423161577964,
"grad_norm": 49.58477020263672,
"learning_rate": 2.9542889390519193e-06,
"loss": 0.5609,
"step": 2100
},
{
"epoch": 0.06208025283593882,
"grad_norm": 26.104825973510742,
"learning_rate": 3.095372460496614e-06,
"loss": 0.5931,
"step": 2200
},
{
"epoch": 0.06490208251029968,
"grad_norm": 36.350685119628906,
"learning_rate": 3.2364559819413096e-06,
"loss": 0.5884,
"step": 2300
},
{
"epoch": 0.06772391218466053,
"grad_norm": 36.55471420288086,
"learning_rate": 3.377539503386005e-06,
"loss": 0.5957,
"step": 2400
},
{
"epoch": 0.07054574185902139,
"grad_norm": 35.742225646972656,
"learning_rate": 3.5186230248307e-06,
"loss": 0.5958,
"step": 2500
},
{
"epoch": 0.07336757153338225,
"grad_norm": 20.766326904296875,
"learning_rate": 3.6597065462753955e-06,
"loss": 0.561,
"step": 2600
},
{
"epoch": 0.0761894012077431,
"grad_norm": 43.46022033691406,
"learning_rate": 3.8007900677200904e-06,
"loss": 0.5751,
"step": 2700
},
{
"epoch": 0.07901123088210396,
"grad_norm": 35.60908889770508,
"learning_rate": 3.941873589164786e-06,
"loss": 0.6054,
"step": 2800
},
{
"epoch": 0.08183306055646482,
"grad_norm": 25.768211364746094,
"learning_rate": 4.082957110609481e-06,
"loss": 0.5796,
"step": 2900
},
{
"epoch": 0.08465489023082566,
"grad_norm": 38.575496673583984,
"learning_rate": 4.224040632054177e-06,
"loss": 0.5874,
"step": 3000
},
{
"epoch": 0.08747671990518652,
"grad_norm": 23.993473052978516,
"learning_rate": 4.363713318284425e-06,
"loss": 0.5564,
"step": 3100
},
{
"epoch": 0.09029854957954737,
"grad_norm": 42.830509185791016,
"learning_rate": 4.50479683972912e-06,
"loss": 0.5645,
"step": 3200
},
{
"epoch": 0.09312037925390823,
"grad_norm": 34.766197204589844,
"learning_rate": 4.6458803611738155e-06,
"loss": 0.5433,
"step": 3300
},
{
"epoch": 0.09594220892826909,
"grad_norm": 32.30384826660156,
"learning_rate": 4.78696388261851e-06,
"loss": 0.5691,
"step": 3400
},
{
"epoch": 0.09876403860262994,
"grad_norm": 77.13898468017578,
"learning_rate": 4.928047404063206e-06,
"loss": 0.5685,
"step": 3500
},
{
"epoch": 0.1015858682769908,
"grad_norm": 42.225563049316406,
"learning_rate": 5.069130925507901e-06,
"loss": 0.5381,
"step": 3600
},
{
"epoch": 0.10440769795135166,
"grad_norm": 32.32414245605469,
"learning_rate": 5.210214446952596e-06,
"loss": 0.5406,
"step": 3700
},
{
"epoch": 0.10722952762571251,
"grad_norm": 25.47852325439453,
"learning_rate": 5.3512979683972925e-06,
"loss": 0.5309,
"step": 3800
},
{
"epoch": 0.11005135730007337,
"grad_norm": 31.160581588745117,
"learning_rate": 5.4923814898419865e-06,
"loss": 0.5259,
"step": 3900
},
{
"epoch": 0.11287318697443423,
"grad_norm": 36.84406661987305,
"learning_rate": 5.632054176072235e-06,
"loss": 0.5297,
"step": 4000
},
{
"epoch": 0.11569501664879508,
"grad_norm": 10.82895565032959,
"learning_rate": 5.77313769751693e-06,
"loss": 0.4846,
"step": 4100
},
{
"epoch": 0.11851684632315593,
"grad_norm": 21.587764739990234,
"learning_rate": 5.914221218961625e-06,
"loss": 0.5333,
"step": 4200
},
{
"epoch": 0.12133867599751678,
"grad_norm": 22.369434356689453,
"learning_rate": 6.055304740406322e-06,
"loss": 0.5372,
"step": 4300
},
{
"epoch": 0.12416050567187764,
"grad_norm": 33.766910552978516,
"learning_rate": 6.196388261851016e-06,
"loss": 0.5652,
"step": 4400
},
{
"epoch": 0.1269823353462385,
"grad_norm": 27.229183197021484,
"learning_rate": 6.337471783295711e-06,
"loss": 0.531,
"step": 4500
},
{
"epoch": 0.12980416502059935,
"grad_norm": 25.017833709716797,
"learning_rate": 6.478555304740407e-06,
"loss": 0.5483,
"step": 4600
},
{
"epoch": 0.13262599469496023,
"grad_norm": 27.347057342529297,
"learning_rate": 6.619638826185102e-06,
"loss": 0.5289,
"step": 4700
},
{
"epoch": 0.13544782436932107,
"grad_norm": 32.57697296142578,
"learning_rate": 6.760722347629798e-06,
"loss": 0.5225,
"step": 4800
},
{
"epoch": 0.1382696540436819,
"grad_norm": 31.288978576660156,
"learning_rate": 6.901805869074493e-06,
"loss": 0.4876,
"step": 4900
},
{
"epoch": 0.14109148371804278,
"grad_norm": 14.970600128173828,
"learning_rate": 7.042889390519188e-06,
"loss": 0.4723,
"step": 5000
},
{
"epoch": 0.14391331339240362,
"grad_norm": 50.01182556152344,
"learning_rate": 7.1839729119638835e-06,
"loss": 0.518,
"step": 5100
},
{
"epoch": 0.1467351430667645,
"grad_norm": 28.612035751342773,
"learning_rate": 7.325056433408578e-06,
"loss": 0.4673,
"step": 5200
},
{
"epoch": 0.14955697274112534,
"grad_norm": 79.7120132446289,
"learning_rate": 7.466139954853274e-06,
"loss": 0.5274,
"step": 5300
},
{
"epoch": 0.1523788024154862,
"grad_norm": 15.11323070526123,
"learning_rate": 7.607223476297969e-06,
"loss": 0.5082,
"step": 5400
},
{
"epoch": 0.15520063208984705,
"grad_norm": 18.369705200195312,
"learning_rate": 7.748306997742663e-06,
"loss": 0.4996,
"step": 5500
},
{
"epoch": 0.15802246176420792,
"grad_norm": 36.83811950683594,
"learning_rate": 7.88939051918736e-06,
"loss": 0.5587,
"step": 5600
},
{
"epoch": 0.16084429143856876,
"grad_norm": 45.13284683227539,
"learning_rate": 8.030474040632055e-06,
"loss": 0.5121,
"step": 5700
},
{
"epoch": 0.16366612111292964,
"grad_norm": 22.508358001708984,
"learning_rate": 8.17155756207675e-06,
"loss": 0.5167,
"step": 5800
},
{
"epoch": 0.16648795078729048,
"grad_norm": 37.16364288330078,
"learning_rate": 8.312641083521446e-06,
"loss": 0.524,
"step": 5900
},
{
"epoch": 0.16930978046165132,
"grad_norm": 25.45941925048828,
"learning_rate": 8.453724604966141e-06,
"loss": 0.4932,
"step": 6000
},
{
"epoch": 0.1721316101360122,
"grad_norm": 29.486251831054688,
"learning_rate": 8.594808126410836e-06,
"loss": 0.5069,
"step": 6100
},
{
"epoch": 0.17495343981037303,
"grad_norm": 36.87773513793945,
"learning_rate": 8.73589164785553e-06,
"loss": 0.5286,
"step": 6200
},
{
"epoch": 0.1777752694847339,
"grad_norm": 27.660303115844727,
"learning_rate": 8.876975169300226e-06,
"loss": 0.5015,
"step": 6300
},
{
"epoch": 0.18059709915909475,
"grad_norm": 20.193119049072266,
"learning_rate": 9.018058690744922e-06,
"loss": 0.5353,
"step": 6400
},
{
"epoch": 0.18341892883345562,
"grad_norm": 46.997806549072266,
"learning_rate": 9.159142212189617e-06,
"loss": 0.4702,
"step": 6500
},
{
"epoch": 0.18624075850781646,
"grad_norm": 33.448543548583984,
"learning_rate": 9.300225733634312e-06,
"loss": 0.51,
"step": 6600
},
{
"epoch": 0.18906258818217733,
"grad_norm": 36.22298049926758,
"learning_rate": 9.441309255079007e-06,
"loss": 0.4891,
"step": 6700
},
{
"epoch": 0.19188441785653818,
"grad_norm": 37.21258544921875,
"learning_rate": 9.582392776523702e-06,
"loss": 0.5353,
"step": 6800
},
{
"epoch": 0.19470624753089905,
"grad_norm": 14.430715560913086,
"learning_rate": 9.723476297968398e-06,
"loss": 0.5091,
"step": 6900
},
{
"epoch": 0.1975280772052599,
"grad_norm": 37.424869537353516,
"learning_rate": 9.864559819413093e-06,
"loss": 0.5326,
"step": 7000
},
{
"epoch": 0.20034990687962076,
"grad_norm": 12.207716941833496,
"learning_rate": 9.999372922806798e-06,
"loss": 0.517,
"step": 7100
},
{
"epoch": 0.2031717365539816,
"grad_norm": 10.985408782958984,
"learning_rate": 9.983695992976736e-06,
"loss": 0.5263,
"step": 7200
},
{
"epoch": 0.20599356622834245,
"grad_norm": 32.765968322753906,
"learning_rate": 9.968019063146674e-06,
"loss": 0.5073,
"step": 7300
},
{
"epoch": 0.20881539590270332,
"grad_norm": 24.93343734741211,
"learning_rate": 9.952342133316612e-06,
"loss": 0.5049,
"step": 7400
},
{
"epoch": 0.21163722557706416,
"grad_norm": 42.219844818115234,
"learning_rate": 9.93666520348655e-06,
"loss": 0.5039,
"step": 7500
},
{
"epoch": 0.21445905525142503,
"grad_norm": 23.783781051635742,
"learning_rate": 9.920988273656488e-06,
"loss": 0.5179,
"step": 7600
},
{
"epoch": 0.21728088492578587,
"grad_norm": 29.034082412719727,
"learning_rate": 9.905311343826426e-06,
"loss": 0.5053,
"step": 7700
},
{
"epoch": 0.22010271460014674,
"grad_norm": 33.75339126586914,
"learning_rate": 9.889634413996364e-06,
"loss": 0.4951,
"step": 7800
},
{
"epoch": 0.22292454427450759,
"grad_norm": 13.03099536895752,
"learning_rate": 9.873957484166302e-06,
"loss": 0.5285,
"step": 7900
},
{
"epoch": 0.22574637394886846,
"grad_norm": 31.321319580078125,
"learning_rate": 9.858280554336239e-06,
"loss": 0.5172,
"step": 8000
},
{
"epoch": 0.2285682036232293,
"grad_norm": 16.2780704498291,
"learning_rate": 9.842603624506177e-06,
"loss": 0.5214,
"step": 8100
},
{
"epoch": 0.23139003329759017,
"grad_norm": 23.63935661315918,
"learning_rate": 9.826926694676115e-06,
"loss": 0.4817,
"step": 8200
},
{
"epoch": 0.234211862971951,
"grad_norm": 28.826778411865234,
"learning_rate": 9.811249764846053e-06,
"loss": 0.5106,
"step": 8300
},
{
"epoch": 0.23703369264631186,
"grad_norm": 23.31501007080078,
"learning_rate": 9.795572835015993e-06,
"loss": 0.5101,
"step": 8400
},
{
"epoch": 0.23985552232067273,
"grad_norm": 22.932710647583008,
"learning_rate": 9.779895905185929e-06,
"loss": 0.4779,
"step": 8500
},
{
"epoch": 0.24267735199503357,
"grad_norm": 32.443641662597656,
"learning_rate": 9.764218975355867e-06,
"loss": 0.5057,
"step": 8600
},
{
"epoch": 0.24549918166939444,
"grad_norm": 60.96305847167969,
"learning_rate": 9.748542045525805e-06,
"loss": 0.4986,
"step": 8700
},
{
"epoch": 0.24832101134375528,
"grad_norm": 27.693511962890625,
"learning_rate": 9.732865115695743e-06,
"loss": 0.5099,
"step": 8800
},
{
"epoch": 0.2511428410181161,
"grad_norm": 16.883127212524414,
"learning_rate": 9.717188185865681e-06,
"loss": 0.5244,
"step": 8900
},
{
"epoch": 0.253964670692477,
"grad_norm": 30.549161911010742,
"learning_rate": 9.701511256035619e-06,
"loss": 0.5023,
"step": 9000
},
{
"epoch": 0.25678650036683787,
"grad_norm": 23.576152801513672,
"learning_rate": 9.685991095503856e-06,
"loss": 0.5008,
"step": 9100
},
{
"epoch": 0.2596083300411987,
"grad_norm": 21.88428497314453,
"learning_rate": 9.670314165673796e-06,
"loss": 0.5295,
"step": 9200
},
{
"epoch": 0.26243015971555955,
"grad_norm": 21.60301971435547,
"learning_rate": 9.654637235843734e-06,
"loss": 0.5001,
"step": 9300
},
{
"epoch": 0.26525198938992045,
"grad_norm": 36.93144607543945,
"learning_rate": 9.638960306013672e-06,
"loss": 0.4505,
"step": 9400
},
{
"epoch": 0.2680738190642813,
"grad_norm": 17.426191329956055,
"learning_rate": 9.623283376183608e-06,
"loss": 0.4694,
"step": 9500
},
{
"epoch": 0.27089564873864214,
"grad_norm": 22.8311710357666,
"learning_rate": 9.607606446353546e-06,
"loss": 0.488,
"step": 9600
},
{
"epoch": 0.273717478413003,
"grad_norm": 32.39801025390625,
"learning_rate": 9.591929516523485e-06,
"loss": 0.5119,
"step": 9700
},
{
"epoch": 0.2765393080873638,
"grad_norm": 14.913991928100586,
"learning_rate": 9.576252586693423e-06,
"loss": 0.4523,
"step": 9800
},
{
"epoch": 0.2793611377617247,
"grad_norm": 28.448148727416992,
"learning_rate": 9.56057565686336e-06,
"loss": 0.4689,
"step": 9900
},
{
"epoch": 0.28218296743608556,
"grad_norm": 49.07644271850586,
"learning_rate": 9.544898727033299e-06,
"loss": 0.5235,
"step": 10000
},
{
"epoch": 0.2850047971104464,
"grad_norm": 31.61011505126953,
"learning_rate": 9.529221797203237e-06,
"loss": 0.5053,
"step": 10100
},
{
"epoch": 0.28782662678480725,
"grad_norm": 23.562646865844727,
"learning_rate": 9.513544867373175e-06,
"loss": 0.485,
"step": 10200
},
{
"epoch": 0.29064845645916815,
"grad_norm": 22.867277145385742,
"learning_rate": 9.497867937543113e-06,
"loss": 0.523,
"step": 10300
},
{
"epoch": 0.293470286133529,
"grad_norm": 44.8724365234375,
"learning_rate": 9.48219100771305e-06,
"loss": 0.4691,
"step": 10400
},
{
"epoch": 0.29629211580788983,
"grad_norm": 15.85916519165039,
"learning_rate": 9.466514077882987e-06,
"loss": 0.4646,
"step": 10500
},
{
"epoch": 0.2991139454822507,
"grad_norm": 19.45340347290039,
"learning_rate": 9.450837148052927e-06,
"loss": 0.4907,
"step": 10600
},
{
"epoch": 0.3019357751566116,
"grad_norm": 14.807464599609375,
"learning_rate": 9.435160218222865e-06,
"loss": 0.4539,
"step": 10700
},
{
"epoch": 0.3047576048309724,
"grad_norm": 24.157548904418945,
"learning_rate": 9.419483288392803e-06,
"loss": 0.3937,
"step": 10800
},
{
"epoch": 0.30757943450533326,
"grad_norm": 37.66196060180664,
"learning_rate": 9.40380635856274e-06,
"loss": 0.4346,
"step": 10900
},
{
"epoch": 0.3104012641796941,
"grad_norm": 54.08269500732422,
"learning_rate": 9.388129428732677e-06,
"loss": 0.4566,
"step": 11000
},
{
"epoch": 0.31322309385405495,
"grad_norm": 37.25579833984375,
"learning_rate": 9.372452498902615e-06,
"loss": 0.4333,
"step": 11100
},
{
"epoch": 0.31604492352841584,
"grad_norm": 32.52021026611328,
"learning_rate": 9.356775569072553e-06,
"loss": 0.4971,
"step": 11200
},
{
"epoch": 0.3188667532027767,
"grad_norm": 24.20098876953125,
"learning_rate": 9.341098639242491e-06,
"loss": 0.4936,
"step": 11300
},
{
"epoch": 0.32168858287713753,
"grad_norm": 37.67250061035156,
"learning_rate": 9.32542170941243e-06,
"loss": 0.4925,
"step": 11400
},
{
"epoch": 0.3245104125514984,
"grad_norm": 51.37995147705078,
"learning_rate": 9.309744779582367e-06,
"loss": 0.4567,
"step": 11500
},
{
"epoch": 0.32733224222585927,
"grad_norm": 2.77247953414917,
"learning_rate": 9.294067849752305e-06,
"loss": 0.4273,
"step": 11600
},
{
"epoch": 0.3301540719002201,
"grad_norm": 26.4583740234375,
"learning_rate": 9.278390919922243e-06,
"loss": 0.4972,
"step": 11700
},
{
"epoch": 0.33297590157458096,
"grad_norm": 23.889802932739258,
"learning_rate": 9.262713990092181e-06,
"loss": 0.4748,
"step": 11800
},
{
"epoch": 0.3357977312489418,
"grad_norm": 26.13481903076172,
"learning_rate": 9.24703706026212e-06,
"loss": 0.4772,
"step": 11900
},
{
"epoch": 0.33861956092330264,
"grad_norm": 32.484073638916016,
"learning_rate": 9.231360130432057e-06,
"loss": 0.4582,
"step": 12000
},
{
"epoch": 0.34144139059766354,
"grad_norm": 12.51020336151123,
"learning_rate": 9.215683200601995e-06,
"loss": 0.466,
"step": 12100
},
{
"epoch": 0.3442632202720244,
"grad_norm": 22.987064361572266,
"learning_rate": 9.200006270771933e-06,
"loss": 0.474,
"step": 12200
},
{
"epoch": 0.3470850499463852,
"grad_norm": 19.731643676757812,
"learning_rate": 9.184329340941871e-06,
"loss": 0.428,
"step": 12300
},
{
"epoch": 0.34990687962074607,
"grad_norm": 30.095190048217773,
"learning_rate": 9.16865241111181e-06,
"loss": 0.4448,
"step": 12400
},
{
"epoch": 0.35272870929510697,
"grad_norm": 23.412023544311523,
"learning_rate": 9.152975481281746e-06,
"loss": 0.4607,
"step": 12500
},
{
"epoch": 0.3555505389694678,
"grad_norm": 15.968676567077637,
"learning_rate": 9.137298551451684e-06,
"loss": 0.4774,
"step": 12600
},
{
"epoch": 0.35837236864382865,
"grad_norm": 22.27809715270996,
"learning_rate": 9.121621621621622e-06,
"loss": 0.4796,
"step": 12700
},
{
"epoch": 0.3611941983181895,
"grad_norm": 25.096717834472656,
"learning_rate": 9.106101461089861e-06,
"loss": 0.4238,
"step": 12800
},
{
"epoch": 0.3640160279925504,
"grad_norm": 32.24757766723633,
"learning_rate": 9.090424531259799e-06,
"loss": 0.4719,
"step": 12900
},
{
"epoch": 0.36683785766691124,
"grad_norm": 43.72540283203125,
"learning_rate": 9.074747601429737e-06,
"loss": 0.4571,
"step": 13000
},
{
"epoch": 0.3696596873412721,
"grad_norm": 25.39431381225586,
"learning_rate": 9.059070671599675e-06,
"loss": 0.464,
"step": 13100
},
{
"epoch": 0.3724815170156329,
"grad_norm": 15.996291160583496,
"learning_rate": 9.043393741769613e-06,
"loss": 0.4345,
"step": 13200
},
{
"epoch": 0.37530334668999377,
"grad_norm": 108.50527954101562,
"learning_rate": 9.027716811939551e-06,
"loss": 0.4013,
"step": 13300
},
{
"epoch": 0.37812517636435466,
"grad_norm": 28.973108291625977,
"learning_rate": 9.012039882109487e-06,
"loss": 0.4512,
"step": 13400
},
{
"epoch": 0.3809470060387155,
"grad_norm": 74.1708984375,
"learning_rate": 8.996362952279425e-06,
"loss": 0.4343,
"step": 13500
},
{
"epoch": 0.38376883571307635,
"grad_norm": 19.657316207885742,
"learning_rate": 8.980686022449363e-06,
"loss": 0.4125,
"step": 13600
},
{
"epoch": 0.3865906653874372,
"grad_norm": 28.859350204467773,
"learning_rate": 8.965165861917603e-06,
"loss": 0.4746,
"step": 13700
},
{
"epoch": 0.3894124950617981,
"grad_norm": 18.13363265991211,
"learning_rate": 8.94948893208754e-06,
"loss": 0.4771,
"step": 13800
},
{
"epoch": 0.39223432473615893,
"grad_norm": 22.466726303100586,
"learning_rate": 8.933812002257479e-06,
"loss": 0.4613,
"step": 13900
},
{
"epoch": 0.3950561544105198,
"grad_norm": 27.679174423217773,
"learning_rate": 8.918135072427417e-06,
"loss": 0.462,
"step": 14000
},
{
"epoch": 0.3978779840848806,
"grad_norm": 27.347543716430664,
"learning_rate": 8.902458142597355e-06,
"loss": 0.4828,
"step": 14100
},
{
"epoch": 0.4006998137592415,
"grad_norm": 16.224889755249023,
"learning_rate": 8.886781212767293e-06,
"loss": 0.4608,
"step": 14200
},
{
"epoch": 0.40352164343360236,
"grad_norm": 23.845518112182617,
"learning_rate": 8.87110428293723e-06,
"loss": 0.4187,
"step": 14300
},
{
"epoch": 0.4063434731079632,
"grad_norm": 18.049110412597656,
"learning_rate": 8.855427353107167e-06,
"loss": 0.4794,
"step": 14400
},
{
"epoch": 0.40916530278232405,
"grad_norm": 30.752511978149414,
"learning_rate": 8.839750423277105e-06,
"loss": 0.473,
"step": 14500
},
{
"epoch": 0.4119871324566849,
"grad_norm": 29.279788970947266,
"learning_rate": 8.824073493447045e-06,
"loss": 0.4564,
"step": 14600
},
{
"epoch": 0.4148089621310458,
"grad_norm": 7.239739894866943,
"learning_rate": 8.808396563616983e-06,
"loss": 0.4363,
"step": 14700
},
{
"epoch": 0.41763079180540663,
"grad_norm": 19.604881286621094,
"learning_rate": 8.79271963378692e-06,
"loss": 0.4745,
"step": 14800
},
{
"epoch": 0.4204526214797675,
"grad_norm": 21.136322021484375,
"learning_rate": 8.777042703956857e-06,
"loss": 0.4552,
"step": 14900
},
{
"epoch": 0.4232744511541283,
"grad_norm": 23.589292526245117,
"learning_rate": 8.761365774126795e-06,
"loss": 0.4245,
"step": 15000
},
{
"epoch": 0.4260962808284892,
"grad_norm": 9.279696464538574,
"learning_rate": 8.745688844296733e-06,
"loss": 0.4583,
"step": 15100
},
{
"epoch": 0.42891811050285006,
"grad_norm": 23.76906394958496,
"learning_rate": 8.730011914466671e-06,
"loss": 0.4873,
"step": 15200
},
{
"epoch": 0.4317399401772109,
"grad_norm": 29.270069122314453,
"learning_rate": 8.71433498463661e-06,
"loss": 0.453,
"step": 15300
},
{
"epoch": 0.43456176985157174,
"grad_norm": 31.86260414123535,
"learning_rate": 8.698658054806547e-06,
"loss": 0.4939,
"step": 15400
},
{
"epoch": 0.4373835995259326,
"grad_norm": 36.27268600463867,
"learning_rate": 8.682981124976485e-06,
"loss": 0.447,
"step": 15500
},
{
"epoch": 0.4402054292002935,
"grad_norm": 26.660816192626953,
"learning_rate": 8.667304195146423e-06,
"loss": 0.5021,
"step": 15600
},
{
"epoch": 0.44302725887465433,
"grad_norm": 24.500545501708984,
"learning_rate": 8.651627265316361e-06,
"loss": 0.4443,
"step": 15700
},
{
"epoch": 0.44584908854901517,
"grad_norm": 30.132831573486328,
"learning_rate": 8.6359503354863e-06,
"loss": 0.4095,
"step": 15800
},
{
"epoch": 0.448670918223376,
"grad_norm": 33.71192169189453,
"learning_rate": 8.620273405656236e-06,
"loss": 0.442,
"step": 15900
},
{
"epoch": 0.4514927478977369,
"grad_norm": 59.854530334472656,
"learning_rate": 8.604596475826175e-06,
"loss": 0.4319,
"step": 16000
},
{
"epoch": 0.45431457757209776,
"grad_norm": 23.0345516204834,
"learning_rate": 8.588919545996113e-06,
"loss": 0.4537,
"step": 16100
},
{
"epoch": 0.4571364072464586,
"grad_norm": 12.112003326416016,
"learning_rate": 8.573242616166051e-06,
"loss": 0.4462,
"step": 16200
},
{
"epoch": 0.45995823692081944,
"grad_norm": 24.53924560546875,
"learning_rate": 8.55756568633599e-06,
"loss": 0.4354,
"step": 16300
},
{
"epoch": 0.46278006659518034,
"grad_norm": 13.580459594726562,
"learning_rate": 8.541888756505926e-06,
"loss": 0.4662,
"step": 16400
},
{
"epoch": 0.4656018962695412,
"grad_norm": 14.42153263092041,
"learning_rate": 8.526211826675864e-06,
"loss": 0.4699,
"step": 16500
},
{
"epoch": 0.468423725943902,
"grad_norm": 31.8900089263916,
"learning_rate": 8.510534896845802e-06,
"loss": 0.4473,
"step": 16600
},
{
"epoch": 0.47124555561826287,
"grad_norm": 22.49077606201172,
"learning_rate": 8.49485796701574e-06,
"loss": 0.4516,
"step": 16700
},
{
"epoch": 0.4740673852926237,
"grad_norm": 41.829689025878906,
"learning_rate": 8.479181037185678e-06,
"loss": 0.3909,
"step": 16800
},
{
"epoch": 0.4768892149669846,
"grad_norm": 31.608774185180664,
"learning_rate": 8.463504107355616e-06,
"loss": 0.4598,
"step": 16900
},
{
"epoch": 0.47971104464134545,
"grad_norm": 16.948047637939453,
"learning_rate": 8.447827177525554e-06,
"loss": 0.4696,
"step": 17000
},
{
"epoch": 0.4825328743157063,
"grad_norm": 26.435142517089844,
"learning_rate": 8.432150247695492e-06,
"loss": 0.4523,
"step": 17100
},
{
"epoch": 0.48535470399006714,
"grad_norm": 29.971567153930664,
"learning_rate": 8.41647331786543e-06,
"loss": 0.4253,
"step": 17200
},
{
"epoch": 0.48817653366442804,
"grad_norm": 16.34187126159668,
"learning_rate": 8.400796388035368e-06,
"loss": 0.4364,
"step": 17300
},
{
"epoch": 0.4909983633387889,
"grad_norm": 17.09442710876465,
"learning_rate": 8.385119458205306e-06,
"loss": 0.4243,
"step": 17400
},
{
"epoch": 0.4938201930131497,
"grad_norm": 17.122453689575195,
"learning_rate": 8.369442528375244e-06,
"loss": 0.4456,
"step": 17500
},
{
"epoch": 0.49664202268751056,
"grad_norm": 19.6232852935791,
"learning_rate": 8.353765598545182e-06,
"loss": 0.4297,
"step": 17600
},
{
"epoch": 0.49946385236187146,
"grad_norm": 26.52530288696289,
"learning_rate": 8.33808866871512e-06,
"loss": 0.442,
"step": 17700
},
{
"epoch": 0.5022856820362323,
"grad_norm": 23.909753799438477,
"learning_rate": 8.322411738885058e-06,
"loss": 0.4656,
"step": 17800
},
{
"epoch": 0.5051075117105931,
"grad_norm": 28.525676727294922,
"learning_rate": 8.306734809054994e-06,
"loss": 0.4366,
"step": 17900
},
{
"epoch": 0.507929341384954,
"grad_norm": 18.39270782470703,
"learning_rate": 8.291057879224932e-06,
"loss": 0.4473,
"step": 18000
},
{
"epoch": 0.5107511710593149,
"grad_norm": 17.035900115966797,
"learning_rate": 8.27538094939487e-06,
"loss": 0.3899,
"step": 18100
},
{
"epoch": 0.5135730007336757,
"grad_norm": 34.1118049621582,
"learning_rate": 8.259704019564809e-06,
"loss": 0.4569,
"step": 18200
},
{
"epoch": 0.5163948304080366,
"grad_norm": 46.11088562011719,
"learning_rate": 8.244027089734748e-06,
"loss": 0.4155,
"step": 18300
},
{
"epoch": 0.5192166600823974,
"grad_norm": 13.6370267868042,
"learning_rate": 8.228350159904685e-06,
"loss": 0.4137,
"step": 18400
},
{
"epoch": 0.5220384897567583,
"grad_norm": 27.46001434326172,
"learning_rate": 8.212673230074623e-06,
"loss": 0.4355,
"step": 18500
},
{
"epoch": 0.5248603194311191,
"grad_norm": 50.026153564453125,
"learning_rate": 8.19699630024456e-06,
"loss": 0.4602,
"step": 18600
},
{
"epoch": 0.52768214910548,
"grad_norm": 10.866439819335938,
"learning_rate": 8.181319370414499e-06,
"loss": 0.4433,
"step": 18700
},
{
"epoch": 0.5305039787798409,
"grad_norm": 25.32647705078125,
"learning_rate": 8.165642440584437e-06,
"loss": 0.4633,
"step": 18800
},
{
"epoch": 0.5333258084542017,
"grad_norm": 24.343759536743164,
"learning_rate": 8.149965510754375e-06,
"loss": 0.4046,
"step": 18900
},
{
"epoch": 0.5361476381285626,
"grad_norm": 6.707170009613037,
"learning_rate": 8.134288580924313e-06,
"loss": 0.371,
"step": 19000
},
{
"epoch": 0.5389694678029234,
"grad_norm": 31.448177337646484,
"learning_rate": 8.11861165109425e-06,
"loss": 0.4444,
"step": 19100
},
{
"epoch": 0.5417912974772843,
"grad_norm": 27.307594299316406,
"learning_rate": 8.102934721264189e-06,
"loss": 0.4495,
"step": 19200
},
{
"epoch": 0.5446131271516451,
"grad_norm": 33.58913040161133,
"learning_rate": 8.087257791434127e-06,
"loss": 0.4265,
"step": 19300
},
{
"epoch": 0.547434956826006,
"grad_norm": 10.490382194519043,
"learning_rate": 8.071580861604063e-06,
"loss": 0.4098,
"step": 19400
},
{
"epoch": 0.5502567865003668,
"grad_norm": 34.94256591796875,
"learning_rate": 8.055903931774001e-06,
"loss": 0.4345,
"step": 19500
},
{
"epoch": 0.5530786161747276,
"grad_norm": 15.932448387145996,
"learning_rate": 8.040227001943939e-06,
"loss": 0.4064,
"step": 19600
},
{
"epoch": 0.5559004458490886,
"grad_norm": 43.63095474243164,
"learning_rate": 8.024550072113879e-06,
"loss": 0.4644,
"step": 19700
},
{
"epoch": 0.5587222755234494,
"grad_norm": 16.069515228271484,
"learning_rate": 8.008873142283817e-06,
"loss": 0.4064,
"step": 19800
},
{
"epoch": 0.5615441051978103,
"grad_norm": 21.504106521606445,
"learning_rate": 7.993196212453753e-06,
"loss": 0.4134,
"step": 19900
},
{
"epoch": 0.5643659348721711,
"grad_norm": 30.280887603759766,
"learning_rate": 7.977519282623691e-06,
"loss": 0.4066,
"step": 20000
},
{
"epoch": 0.567187764546532,
"grad_norm": 35.03623580932617,
"learning_rate": 7.96184235279363e-06,
"loss": 0.4568,
"step": 20100
},
{
"epoch": 0.5700095942208928,
"grad_norm": 33.643226623535156,
"learning_rate": 7.946165422963567e-06,
"loss": 0.4131,
"step": 20200
},
{
"epoch": 0.5728314238952537,
"grad_norm": 17.969562530517578,
"learning_rate": 7.930488493133505e-06,
"loss": 0.447,
"step": 20300
},
{
"epoch": 0.5756532535696145,
"grad_norm": 24.389982223510742,
"learning_rate": 7.914811563303443e-06,
"loss": 0.448,
"step": 20400
},
{
"epoch": 0.5784750832439753,
"grad_norm": 28.242773056030273,
"learning_rate": 7.899134633473381e-06,
"loss": 0.4822,
"step": 20500
},
{
"epoch": 0.5812969129183363,
"grad_norm": 13.920737266540527,
"learning_rate": 7.88345770364332e-06,
"loss": 0.432,
"step": 20600
},
{
"epoch": 0.5841187425926971,
"grad_norm": 24.488445281982422,
"learning_rate": 7.867780773813257e-06,
"loss": 0.4167,
"step": 20700
},
{
"epoch": 0.586940572267058,
"grad_norm": 13.902533531188965,
"learning_rate": 7.852103843983195e-06,
"loss": 0.4354,
"step": 20800
},
{
"epoch": 0.5897624019414188,
"grad_norm": 13.963842391967773,
"learning_rate": 7.836426914153132e-06,
"loss": 0.4484,
"step": 20900
},
{
"epoch": 0.5925842316157797,
"grad_norm": 17.448009490966797,
"learning_rate": 7.82074998432307e-06,
"loss": 0.38,
"step": 21000
},
{
"epoch": 0.5954060612901405,
"grad_norm": 16.734331130981445,
"learning_rate": 7.80507305449301e-06,
"loss": 0.4409,
"step": 21100
},
{
"epoch": 0.5982278909645014,
"grad_norm": 29.44654655456543,
"learning_rate": 7.789396124662948e-06,
"loss": 0.4482,
"step": 21200
},
{
"epoch": 0.6010497206388622,
"grad_norm": 21.073148727416992,
"learning_rate": 7.773719194832886e-06,
"loss": 0.4136,
"step": 21300
},
{
"epoch": 0.6038715503132231,
"grad_norm": 8.795435905456543,
"learning_rate": 7.758042265002822e-06,
"loss": 0.4726,
"step": 21400
},
{
"epoch": 0.606693379987584,
"grad_norm": 18.364139556884766,
"learning_rate": 7.74236533517276e-06,
"loss": 0.4165,
"step": 21500
},
{
"epoch": 0.6095152096619448,
"grad_norm": 16.08391761779785,
"learning_rate": 7.726688405342698e-06,
"loss": 0.3864,
"step": 21600
},
{
"epoch": 0.6123370393363057,
"grad_norm": 20.25522804260254,
"learning_rate": 7.711011475512636e-06,
"loss": 0.4517,
"step": 21700
},
{
"epoch": 0.6151588690106665,
"grad_norm": 8.111662864685059,
"learning_rate": 7.695334545682574e-06,
"loss": 0.3994,
"step": 21800
},
{
"epoch": 0.6179806986850274,
"grad_norm": 20.919527053833008,
"learning_rate": 7.679657615852512e-06,
"loss": 0.4073,
"step": 21900
},
{
"epoch": 0.6208025283593882,
"grad_norm": 17.961687088012695,
"learning_rate": 7.66398068602245e-06,
"loss": 0.4139,
"step": 22000
},
{
"epoch": 0.623624358033749,
"grad_norm": 28.198232650756836,
"learning_rate": 7.648303756192388e-06,
"loss": 0.4406,
"step": 22100
},
{
"epoch": 0.6264461877081099,
"grad_norm": 9.413177490234375,
"learning_rate": 7.632626826362326e-06,
"loss": 0.4488,
"step": 22200
},
{
"epoch": 0.6292680173824708,
"grad_norm": 15.02452278137207,
"learning_rate": 7.616949896532264e-06,
"loss": 0.4523,
"step": 22300
},
{
"epoch": 0.6320898470568317,
"grad_norm": 37.498958587646484,
"learning_rate": 7.601272966702201e-06,
"loss": 0.4219,
"step": 22400
},
{
"epoch": 0.6349116767311925,
"grad_norm": 14.869726181030273,
"learning_rate": 7.585596036872139e-06,
"loss": 0.4111,
"step": 22500
},
{
"epoch": 0.6377335064055534,
"grad_norm": 19.956878662109375,
"learning_rate": 7.569919107042077e-06,
"loss": 0.4143,
"step": 22600
},
{
"epoch": 0.6405553360799142,
"grad_norm": 20.845680236816406,
"learning_rate": 7.554242177212015e-06,
"loss": 0.4326,
"step": 22700
},
{
"epoch": 0.6433771657542751,
"grad_norm": 7.167665958404541,
"learning_rate": 7.538565247381953e-06,
"loss": 0.3882,
"step": 22800
},
{
"epoch": 0.6461989954286359,
"grad_norm": 24.01291847229004,
"learning_rate": 7.5228883175518905e-06,
"loss": 0.4506,
"step": 22900
},
{
"epoch": 0.6490208251029967,
"grad_norm": 18.37822151184082,
"learning_rate": 7.5072113877218286e-06,
"loss": 0.4495,
"step": 23000
},
{
"epoch": 0.6518426547773576,
"grad_norm": 17.8455810546875,
"learning_rate": 7.4915344578917674e-06,
"loss": 0.4631,
"step": 23100
},
{
"epoch": 0.6546644844517185,
"grad_norm": 21.360794067382812,
"learning_rate": 7.4758575280617055e-06,
"loss": 0.3775,
"step": 23200
},
{
"epoch": 0.6574863141260794,
"grad_norm": 25.73183250427246,
"learning_rate": 7.4601805982316435e-06,
"loss": 0.4126,
"step": 23300
},
{
"epoch": 0.6603081438004402,
"grad_norm": 23.874588012695312,
"learning_rate": 7.444503668401581e-06,
"loss": 0.4415,
"step": 23400
},
{
"epoch": 0.6631299734748011,
"grad_norm": 23.876819610595703,
"learning_rate": 7.428826738571519e-06,
"loss": 0.3982,
"step": 23500
},
{
"epoch": 0.6659518031491619,
"grad_norm": 12.06535816192627,
"learning_rate": 7.413149808741457e-06,
"loss": 0.4357,
"step": 23600
},
{
"epoch": 0.6687736328235228,
"grad_norm": 26.34955406188965,
"learning_rate": 7.397472878911395e-06,
"loss": 0.4377,
"step": 23700
},
{
"epoch": 0.6715954624978836,
"grad_norm": 22.41992950439453,
"learning_rate": 7.381795949081333e-06,
"loss": 0.434,
"step": 23800
},
{
"epoch": 0.6744172921722444,
"grad_norm": 33.86873245239258,
"learning_rate": 7.36611901925127e-06,
"loss": 0.4248,
"step": 23900
},
{
"epoch": 0.6772391218466053,
"grad_norm": 10.177706718444824,
"learning_rate": 7.350598858719509e-06,
"loss": 0.3836,
"step": 24000
},
{
"epoch": 0.6800609515209662,
"grad_norm": 16.718257904052734,
"learning_rate": 7.334921928889447e-06,
"loss": 0.4107,
"step": 24100
},
{
"epoch": 0.6828827811953271,
"grad_norm": 27.072444915771484,
"learning_rate": 7.319244999059385e-06,
"loss": 0.3875,
"step": 24200
},
{
"epoch": 0.6857046108696879,
"grad_norm": 14.283991813659668,
"learning_rate": 7.303568069229323e-06,
"loss": 0.4267,
"step": 24300
},
{
"epoch": 0.6885264405440488,
"grad_norm": 12.753026008605957,
"learning_rate": 7.28789113939926e-06,
"loss": 0.4146,
"step": 24400
},
{
"epoch": 0.6913482702184096,
"grad_norm": 40.42727279663086,
"learning_rate": 7.272214209569198e-06,
"loss": 0.4194,
"step": 24500
},
{
"epoch": 0.6941700998927705,
"grad_norm": 13.08806324005127,
"learning_rate": 7.256537279739136e-06,
"loss": 0.3789,
"step": 24600
},
{
"epoch": 0.6969919295671313,
"grad_norm": 31.469749450683594,
"learning_rate": 7.240860349909074e-06,
"loss": 0.3996,
"step": 24700
},
{
"epoch": 0.6998137592414921,
"grad_norm": 12.432112693786621,
"learning_rate": 7.225183420079012e-06,
"loss": 0.4347,
"step": 24800
},
{
"epoch": 0.7026355889158531,
"grad_norm": 22.593225479125977,
"learning_rate": 7.2095064902489495e-06,
"loss": 0.3896,
"step": 24900
},
{
"epoch": 0.7054574185902139,
"grad_norm": 28.22087860107422,
"learning_rate": 7.1938295604188876e-06,
"loss": 0.4072,
"step": 25000
}
],
"logging_steps": 100,
"max_steps": 70876,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}