trongg's picture
Upload folder using huggingface_hub
4b10ccd verified
raw
history blame
61.3 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8616937691959715,
"eval_steps": 1005,
"global_step": 9048,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0038094331087355064,
"grad_norm": 1.1105036735534668,
"learning_rate": 1.2714024683199598e-06,
"loss": 1.9169,
"mean_token_accuracy": 0.5754982324317097,
"num_tokens": 213515.0,
"step": 40
},
{
"epoch": 0.007618866217471013,
"grad_norm": 0.8602012991905212,
"learning_rate": 2.575404999930175e-06,
"loss": 1.7612,
"mean_token_accuracy": 0.5973538164049387,
"num_tokens": 428291.0,
"step": 80
},
{
"epoch": 0.01142829932620652,
"grad_norm": 0.7212763428688049,
"learning_rate": 3.87940753154039e-06,
"loss": 1.7641,
"mean_token_accuracy": 0.6031968526542186,
"num_tokens": 649316.0,
"step": 120
},
{
"epoch": 0.015237732434942025,
"grad_norm": 0.9749614000320435,
"learning_rate": 5.183410063150605e-06,
"loss": 1.5732,
"mean_token_accuracy": 0.6395209338515997,
"num_tokens": 867038.0,
"step": 160
},
{
"epoch": 0.01904716554367753,
"grad_norm": 0.8746033310890198,
"learning_rate": 6.48741259476082e-06,
"loss": 1.5567,
"mean_token_accuracy": 0.6250597681850195,
"num_tokens": 1084477.0,
"step": 200
},
{
"epoch": 0.02285659865241304,
"grad_norm": 0.6283444762229919,
"learning_rate": 7.791415126371035e-06,
"loss": 1.4563,
"mean_token_accuracy": 0.6498559027910232,
"num_tokens": 1304254.0,
"step": 240
},
{
"epoch": 0.026666031761148543,
"grad_norm": 0.8473705053329468,
"learning_rate": 9.09541765798125e-06,
"loss": 1.5125,
"mean_token_accuracy": 0.640368782542646,
"num_tokens": 1518177.0,
"step": 280
},
{
"epoch": 0.03047546486988405,
"grad_norm": 0.8421404361724854,
"learning_rate": 1.0399420189591466e-05,
"loss": 1.43,
"mean_token_accuracy": 0.6589812656864524,
"num_tokens": 1745573.0,
"step": 320
},
{
"epoch": 0.034284897978619555,
"grad_norm": 0.7308921813964844,
"learning_rate": 1.1703422721201682e-05,
"loss": 1.3957,
"mean_token_accuracy": 0.6596783269196749,
"num_tokens": 1970323.0,
"step": 360
},
{
"epoch": 0.03809433108735506,
"grad_norm": 1.024109125137329,
"learning_rate": 1.3007425252811896e-05,
"loss": 1.2906,
"mean_token_accuracy": 0.6734623985365034,
"num_tokens": 2209851.0,
"step": 400
},
{
"epoch": 0.04190376419609057,
"grad_norm": 0.8211472034454346,
"learning_rate": 1.4311427784422112e-05,
"loss": 1.4547,
"mean_token_accuracy": 0.65384187027812,
"num_tokens": 2419422.0,
"step": 440
},
{
"epoch": 0.04571319730482608,
"grad_norm": 0.9193497896194458,
"learning_rate": 1.5615430316032328e-05,
"loss": 1.3763,
"mean_token_accuracy": 0.6668608419597148,
"num_tokens": 2650647.0,
"step": 480
},
{
"epoch": 0.04952263041356158,
"grad_norm": 0.9686090350151062,
"learning_rate": 1.6919432847642544e-05,
"loss": 1.3092,
"mean_token_accuracy": 0.6819563843309879,
"num_tokens": 2895742.0,
"step": 520
},
{
"epoch": 0.053332063522297087,
"grad_norm": 0.6123139262199402,
"learning_rate": 1.8223435379252756e-05,
"loss": 1.3106,
"mean_token_accuracy": 0.6761070437729358,
"num_tokens": 3131131.0,
"step": 560
},
{
"epoch": 0.057141496631032594,
"grad_norm": 0.8315911293029785,
"learning_rate": 1.952743791086297e-05,
"loss": 1.295,
"mean_token_accuracy": 0.6829990902915597,
"num_tokens": 3358554.0,
"step": 600
},
{
"epoch": 0.0609509297397681,
"grad_norm": 0.6099031567573547,
"learning_rate": 2.0831440442473187e-05,
"loss": 1.315,
"mean_token_accuracy": 0.6704990575090051,
"num_tokens": 3584968.0,
"step": 640
},
{
"epoch": 0.0647603628485036,
"grad_norm": 0.8671184778213501,
"learning_rate": 2.2135442974083403e-05,
"loss": 1.4007,
"mean_token_accuracy": 0.6610645942389966,
"num_tokens": 3789884.0,
"step": 680
},
{
"epoch": 0.06856979595723911,
"grad_norm": 0.8112688064575195,
"learning_rate": 2.3439445505693616e-05,
"loss": 1.3192,
"mean_token_accuracy": 0.6786379875615239,
"num_tokens": 4010228.0,
"step": 720
},
{
"epoch": 0.07237922906597462,
"grad_norm": 0.7991048097610474,
"learning_rate": 2.474344803730383e-05,
"loss": 1.357,
"mean_token_accuracy": 0.6695337913930416,
"num_tokens": 4240601.0,
"step": 760
},
{
"epoch": 0.07618866217471013,
"grad_norm": 0.6952106356620789,
"learning_rate": 2.6047450568914047e-05,
"loss": 1.3805,
"mean_token_accuracy": 0.666619416512549,
"num_tokens": 4457613.0,
"step": 800
},
{
"epoch": 0.07999809528344563,
"grad_norm": 0.5898691415786743,
"learning_rate": 2.7351453100524263e-05,
"loss": 1.3645,
"mean_token_accuracy": 0.6681723784655332,
"num_tokens": 4675575.0,
"step": 840
},
{
"epoch": 0.08380752839218114,
"grad_norm": 0.8592577576637268,
"learning_rate": 2.865545563213448e-05,
"loss": 1.3007,
"mean_token_accuracy": 0.6759131707251071,
"num_tokens": 4914793.0,
"step": 880
},
{
"epoch": 0.08761696150091665,
"grad_norm": 0.47871366143226624,
"learning_rate": 2.9959458163744694e-05,
"loss": 1.3316,
"mean_token_accuracy": 0.6775730215013027,
"num_tokens": 5145053.0,
"step": 920
},
{
"epoch": 0.09142639460965216,
"grad_norm": 0.6131187081336975,
"learning_rate": 3.126346069535491e-05,
"loss": 1.3631,
"mean_token_accuracy": 0.6744197152554989,
"num_tokens": 5353224.0,
"step": 960
},
{
"epoch": 0.09523582771838766,
"grad_norm": 0.6329927444458008,
"learning_rate": 3.256746322696512e-05,
"loss": 1.3321,
"mean_token_accuracy": 0.677939809858799,
"num_tokens": 5574886.0,
"step": 1000
},
{
"epoch": 0.0957120068569796,
"eval_mean_token_accuracy": 0.6180267786704393,
"eval_num_tokens": 5598115.0,
"eval_test_loss": 1.8654624223709106,
"eval_test_runtime": 94.5921,
"eval_test_samples_per_second": 21.048,
"eval_test_steps_per_second": 10.529,
"step": 1005
},
{
"epoch": 0.09904526082712316,
"grad_norm": 0.5825843214988708,
"learning_rate": 3.387146575857534e-05,
"loss": 1.3293,
"mean_token_accuracy": 0.6768566837534309,
"num_tokens": 5800189.0,
"step": 1040
},
{
"epoch": 0.10285469393585867,
"grad_norm": 0.47114425897598267,
"learning_rate": 3.5175468290185554e-05,
"loss": 1.273,
"mean_token_accuracy": 0.6856468811631202,
"num_tokens": 6035590.0,
"step": 1080
},
{
"epoch": 0.10666412704459417,
"grad_norm": 0.6536944508552551,
"learning_rate": 3.647947082179577e-05,
"loss": 1.2521,
"mean_token_accuracy": 0.6856056058779358,
"num_tokens": 6271507.0,
"step": 1120
},
{
"epoch": 0.11047356015332968,
"grad_norm": 0.49625030159950256,
"learning_rate": 3.7783473353405986e-05,
"loss": 1.3219,
"mean_token_accuracy": 0.6740474671125412,
"num_tokens": 6485242.0,
"step": 1160
},
{
"epoch": 0.11428299326206519,
"grad_norm": 0.4760020971298218,
"learning_rate": 3.90874758850162e-05,
"loss": 1.3866,
"mean_token_accuracy": 0.666905522160232,
"num_tokens": 6694201.0,
"step": 1200
},
{
"epoch": 0.1180924263708007,
"grad_norm": 0.6702063083648682,
"learning_rate": 4.039147841662642e-05,
"loss": 1.3396,
"mean_token_accuracy": 0.673108272999525,
"num_tokens": 6920931.0,
"step": 1240
},
{
"epoch": 0.1219018594795362,
"grad_norm": 0.5828559398651123,
"learning_rate": 4.169548094823663e-05,
"loss": 1.2849,
"mean_token_accuracy": 0.6773701002821326,
"num_tokens": 7147355.0,
"step": 1280
},
{
"epoch": 0.1257112925882717,
"grad_norm": 0.558748185634613,
"learning_rate": 4.299948347984684e-05,
"loss": 1.3391,
"mean_token_accuracy": 0.6739463916048407,
"num_tokens": 7365364.0,
"step": 1320
},
{
"epoch": 0.1295207256970072,
"grad_norm": 0.5812460780143738,
"learning_rate": 4.430348601145706e-05,
"loss": 1.3572,
"mean_token_accuracy": 0.6651050833985209,
"num_tokens": 7575079.0,
"step": 1360
},
{
"epoch": 0.13333015880574273,
"grad_norm": 0.7141745686531067,
"learning_rate": 4.5607488543067274e-05,
"loss": 1.2944,
"mean_token_accuracy": 0.681589861959219,
"num_tokens": 7804608.0,
"step": 1400
},
{
"epoch": 0.13713959191447822,
"grad_norm": 0.5530197024345398,
"learning_rate": 4.691149107467749e-05,
"loss": 1.3375,
"mean_token_accuracy": 0.6718398928642273,
"num_tokens": 8017882.0,
"step": 1440
},
{
"epoch": 0.14094902502321374,
"grad_norm": 0.48236706852912903,
"learning_rate": 4.8215493606287705e-05,
"loss": 1.335,
"mean_token_accuracy": 0.6747591784223914,
"num_tokens": 8238637.0,
"step": 1480
},
{
"epoch": 0.14475845813194924,
"grad_norm": 0.5896688103675842,
"learning_rate": 4.919348145872209e-05,
"loss": 1.359,
"mean_token_accuracy": 0.6685425175353885,
"num_tokens": 8448265.0,
"step": 1520
},
{
"epoch": 0.14856789124068476,
"grad_norm": 0.6068539619445801,
"learning_rate": 4.9193144349007555e-05,
"loss": 1.3475,
"mean_token_accuracy": 0.6698486657813192,
"num_tokens": 8663046.0,
"step": 1560
},
{
"epoch": 0.15237732434942025,
"grad_norm": 0.4848249852657318,
"learning_rate": 4.91923577659781e-05,
"loss": 1.3355,
"mean_token_accuracy": 0.6719444127753377,
"num_tokens": 8882966.0,
"step": 1600
},
{
"epoch": 0.15618675745815574,
"grad_norm": 0.5089111924171448,
"learning_rate": 4.9191121724764224e-05,
"loss": 1.3455,
"mean_token_accuracy": 0.6727668078616261,
"num_tokens": 9099039.0,
"step": 1640
},
{
"epoch": 0.15999619056689127,
"grad_norm": 0.5716065168380737,
"learning_rate": 4.9189436249142116e-05,
"loss": 1.3555,
"mean_token_accuracy": 0.6711426375433802,
"num_tokens": 9320395.0,
"step": 1680
},
{
"epoch": 0.16380562367562676,
"grad_norm": 0.7265896797180176,
"learning_rate": 4.918730137153316e-05,
"loss": 1.3418,
"mean_token_accuracy": 0.6719378443434835,
"num_tokens": 9535056.0,
"step": 1720
},
{
"epoch": 0.16761505678436228,
"grad_norm": 0.6088116765022278,
"learning_rate": 4.9184717133003326e-05,
"loss": 1.306,
"mean_token_accuracy": 0.6769841868430376,
"num_tokens": 9753473.0,
"step": 1760
},
{
"epoch": 0.17142448989309778,
"grad_norm": 0.3883936405181885,
"learning_rate": 4.918168358326239e-05,
"loss": 1.2348,
"mean_token_accuracy": 0.6958686344325542,
"num_tokens": 9977829.0,
"step": 1800
},
{
"epoch": 0.1752339230018333,
"grad_norm": 0.5694851279258728,
"learning_rate": 4.917820078066296e-05,
"loss": 1.3571,
"mean_token_accuracy": 0.6749148614704609,
"num_tokens": 10202796.0,
"step": 1840
},
{
"epoch": 0.1790433561105688,
"grad_norm": 0.4964405298233032,
"learning_rate": 4.9174268792199355e-05,
"loss": 1.292,
"mean_token_accuracy": 0.682447912171483,
"num_tokens": 10446726.0,
"step": 1880
},
{
"epoch": 0.1828527892193043,
"grad_norm": 0.585851788520813,
"learning_rate": 4.916988769350633e-05,
"loss": 1.3271,
"mean_token_accuracy": 0.6706608653068542,
"num_tokens": 10675299.0,
"step": 1920
},
{
"epoch": 0.1866622223280398,
"grad_norm": 0.4936956763267517,
"learning_rate": 4.91650575688576e-05,
"loss": 1.3212,
"mean_token_accuracy": 0.6725098427385092,
"num_tokens": 10908277.0,
"step": 1960
},
{
"epoch": 0.19047165543677533,
"grad_norm": 0.4473949670791626,
"learning_rate": 4.9159778511164254e-05,
"loss": 1.2485,
"mean_token_accuracy": 0.6835329543799162,
"num_tokens": 11138801.0,
"step": 2000
},
{
"epoch": 0.1914240137139592,
"eval_mean_token_accuracy": 0.6230506894878115,
"eval_num_tokens": 11194791.0,
"eval_test_loss": 1.832930564880371,
"eval_test_runtime": 95.4667,
"eval_test_samples_per_second": 20.855,
"eval_test_steps_per_second": 10.433,
"step": 2010
},
{
"epoch": 0.19428108854551082,
"grad_norm": 0.5689784288406372,
"learning_rate": 4.915405062197292e-05,
"loss": 1.2842,
"mean_token_accuracy": 0.6809853713959455,
"num_tokens": 11368252.0,
"step": 2040
},
{
"epoch": 0.19809052165424632,
"grad_norm": 0.5339015126228333,
"learning_rate": 4.914787401146385e-05,
"loss": 1.3224,
"mean_token_accuracy": 0.67562406193465,
"num_tokens": 11590368.0,
"step": 2080
},
{
"epoch": 0.20189995476298184,
"grad_norm": 0.6560924053192139,
"learning_rate": 4.91412487984488e-05,
"loss": 1.3732,
"mean_token_accuracy": 0.6702003039419651,
"num_tokens": 11798414.0,
"step": 2120
},
{
"epoch": 0.20570938787171733,
"grad_norm": 0.5972558259963989,
"learning_rate": 4.91341751103687e-05,
"loss": 1.2983,
"mean_token_accuracy": 0.6805376719683409,
"num_tokens": 12013463.0,
"step": 2160
},
{
"epoch": 0.20951882098045285,
"grad_norm": 0.4506518542766571,
"learning_rate": 4.912665308329125e-05,
"loss": 1.2653,
"mean_token_accuracy": 0.6877350242808461,
"num_tokens": 12236318.0,
"step": 2200
},
{
"epoch": 0.21332825408918835,
"grad_norm": 0.5365843772888184,
"learning_rate": 4.91186828619083e-05,
"loss": 1.3285,
"mean_token_accuracy": 0.6712553380057216,
"num_tokens": 12458126.0,
"step": 2240
},
{
"epoch": 0.21713768719792387,
"grad_norm": 0.4755626916885376,
"learning_rate": 4.9110264599533055e-05,
"loss": 1.3315,
"mean_token_accuracy": 0.6608016982674598,
"num_tokens": 12678890.0,
"step": 2280
},
{
"epoch": 0.22094712030665936,
"grad_norm": 0.43261516094207764,
"learning_rate": 4.9101398458097093e-05,
"loss": 1.3166,
"mean_token_accuracy": 0.6768794044852257,
"num_tokens": 12900305.0,
"step": 2320
},
{
"epoch": 0.22475655341539488,
"grad_norm": 0.5397626161575317,
"learning_rate": 4.9092084608147315e-05,
"loss": 1.3296,
"mean_token_accuracy": 0.6792715717107057,
"num_tokens": 13109419.0,
"step": 2360
},
{
"epoch": 0.22856598652413038,
"grad_norm": 0.4163241386413574,
"learning_rate": 4.908232322884261e-05,
"loss": 1.2938,
"mean_token_accuracy": 0.6871387459337711,
"num_tokens": 13335797.0,
"step": 2400
},
{
"epoch": 0.23237541963286587,
"grad_norm": 0.7546857595443726,
"learning_rate": 4.907211450795045e-05,
"loss": 1.2872,
"mean_token_accuracy": 0.6850869571790099,
"num_tokens": 13565988.0,
"step": 2440
},
{
"epoch": 0.2361848527416014,
"grad_norm": 0.6345241069793701,
"learning_rate": 4.906145864184325e-05,
"loss": 1.3234,
"mean_token_accuracy": 0.6739947069436312,
"num_tokens": 13779004.0,
"step": 2480
},
{
"epoch": 0.23999428585033689,
"grad_norm": 0.5710681080818176,
"learning_rate": 4.905035583549459e-05,
"loss": 1.3,
"mean_token_accuracy": 0.6780852235853672,
"num_tokens": 14005597.0,
"step": 2520
},
{
"epoch": 0.2438037189590724,
"grad_norm": 0.5527021288871765,
"learning_rate": 4.903880630247529e-05,
"loss": 1.3019,
"mean_token_accuracy": 0.6793860571458936,
"num_tokens": 14235179.0,
"step": 2560
},
{
"epoch": 0.2476131520678079,
"grad_norm": 0.4566735327243805,
"learning_rate": 4.902681026494929e-05,
"loss": 1.2927,
"mean_token_accuracy": 0.6824004529044032,
"num_tokens": 14464950.0,
"step": 2600
},
{
"epoch": 0.2514225851765434,
"grad_norm": 0.5023188591003418,
"learning_rate": 4.901436795366938e-05,
"loss": 1.2996,
"mean_token_accuracy": 0.6823519911617041,
"num_tokens": 14686771.0,
"step": 2640
},
{
"epoch": 0.2552320182852789,
"grad_norm": 0.6048426628112793,
"learning_rate": 4.900147960797276e-05,
"loss": 1.3523,
"mean_token_accuracy": 0.6745322810485959,
"num_tokens": 14896388.0,
"step": 2680
},
{
"epoch": 0.2590414513940144,
"grad_norm": 0.5594220757484436,
"learning_rate": 4.898814547577645e-05,
"loss": 1.2733,
"mean_token_accuracy": 0.685325993783772,
"num_tokens": 15115431.0,
"step": 2720
},
{
"epoch": 0.26285088450274996,
"grad_norm": 0.5266786813735962,
"learning_rate": 4.89743658135725e-05,
"loss": 1.378,
"mean_token_accuracy": 0.661252242513001,
"num_tokens": 15318046.0,
"step": 2760
},
{
"epoch": 0.26666031761148545,
"grad_norm": 0.33945998549461365,
"learning_rate": 4.896014088642304e-05,
"loss": 1.2849,
"mean_token_accuracy": 0.68093207962811,
"num_tokens": 15546171.0,
"step": 2800
},
{
"epoch": 0.27046975072022095,
"grad_norm": 0.38815465569496155,
"learning_rate": 4.8945470967955234e-05,
"loss": 1.3622,
"mean_token_accuracy": 0.6677591029554606,
"num_tokens": 15763709.0,
"step": 2840
},
{
"epoch": 0.27427918382895644,
"grad_norm": 0.5876178741455078,
"learning_rate": 4.8930356340355986e-05,
"loss": 1.2209,
"mean_token_accuracy": 0.7016260443255306,
"num_tokens": 15994580.0,
"step": 2880
},
{
"epoch": 0.27808861693769193,
"grad_norm": 0.6305630207061768,
"learning_rate": 4.8914797294366514e-05,
"loss": 1.3553,
"mean_token_accuracy": 0.6636421175673604,
"num_tokens": 16208325.0,
"step": 2920
},
{
"epoch": 0.2818980500464275,
"grad_norm": 0.47434547543525696,
"learning_rate": 4.889879412927675e-05,
"loss": 1.3085,
"mean_token_accuracy": 0.6797755250707269,
"num_tokens": 16428418.0,
"step": 2960
},
{
"epoch": 0.285707483155163,
"grad_norm": 0.5395858883857727,
"learning_rate": 4.888234715291958e-05,
"loss": 1.2705,
"mean_token_accuracy": 0.6883305061608553,
"num_tokens": 16658226.0,
"step": 3000
},
{
"epoch": 0.2871360205709388,
"eval_mean_token_accuracy": 0.6258668910906496,
"eval_num_tokens": 16745761.0,
"eval_test_loss": 1.8088411092758179,
"eval_test_runtime": 94.958,
"eval_test_samples_per_second": 20.967,
"eval_test_steps_per_second": 10.489,
"step": 3015
},
{
"epoch": 0.28951691626389847,
"grad_norm": 0.5518168210983276,
"learning_rate": 4.8865456681664965e-05,
"loss": 1.2997,
"mean_token_accuracy": 0.6776028882712126,
"num_tokens": 16876039.0,
"step": 3040
},
{
"epoch": 0.29332634937263397,
"grad_norm": 0.6039868593215942,
"learning_rate": 4.88481230404138e-05,
"loss": 1.2621,
"mean_token_accuracy": 0.6928930662572383,
"num_tokens": 17104229.0,
"step": 3080
},
{
"epoch": 0.2971357824813695,
"grad_norm": 0.5715156197547913,
"learning_rate": 4.883034656259167e-05,
"loss": 1.3461,
"mean_token_accuracy": 0.6714908115565776,
"num_tokens": 17312134.0,
"step": 3120
},
{
"epoch": 0.300945215590105,
"grad_norm": 0.5405182838439941,
"learning_rate": 4.8812127590142505e-05,
"loss": 1.2837,
"mean_token_accuracy": 0.6896362887695432,
"num_tokens": 17550198.0,
"step": 3160
},
{
"epoch": 0.3047546486988405,
"grad_norm": 0.5504481792449951,
"learning_rate": 4.8793466473521904e-05,
"loss": 1.2181,
"mean_token_accuracy": 0.6988515704870224,
"num_tokens": 17778838.0,
"step": 3200
},
{
"epoch": 0.308564081807576,
"grad_norm": 0.621837317943573,
"learning_rate": 4.877436357169048e-05,
"loss": 1.3323,
"mean_token_accuracy": 0.6710341470316052,
"num_tokens": 17985533.0,
"step": 3240
},
{
"epoch": 0.3123735149163115,
"grad_norm": 0.6566672325134277,
"learning_rate": 4.8754819252106876e-05,
"loss": 1.2744,
"mean_token_accuracy": 0.683073416352272,
"num_tokens": 18208416.0,
"step": 3280
},
{
"epoch": 0.31618294802504704,
"grad_norm": 0.5615633130073547,
"learning_rate": 4.873483389072076e-05,
"loss": 1.315,
"mean_token_accuracy": 0.684354678913951,
"num_tokens": 18426223.0,
"step": 3320
},
{
"epoch": 0.31999238113378253,
"grad_norm": 0.4581094980239868,
"learning_rate": 4.871440787196558e-05,
"loss": 1.3269,
"mean_token_accuracy": 0.6761635078117252,
"num_tokens": 18635703.0,
"step": 3360
},
{
"epoch": 0.323801814242518,
"grad_norm": 0.5478665232658386,
"learning_rate": 4.8693541588751144e-05,
"loss": 1.2554,
"mean_token_accuracy": 0.688594707287848,
"num_tokens": 18859074.0,
"step": 3400
},
{
"epoch": 0.3276112473512535,
"grad_norm": 0.39972299337387085,
"learning_rate": 4.867223544245607e-05,
"loss": 1.2497,
"mean_token_accuracy": 0.6876592069864274,
"num_tokens": 19084486.0,
"step": 3440
},
{
"epoch": 0.33142068045998907,
"grad_norm": 0.6696220636367798,
"learning_rate": 4.865048984292009e-05,
"loss": 1.2776,
"mean_token_accuracy": 0.6786478063091635,
"num_tokens": 19305228.0,
"step": 3480
},
{
"epoch": 0.33523011356872456,
"grad_norm": 0.5517823100090027,
"learning_rate": 4.8628305208436137e-05,
"loss": 1.2922,
"mean_token_accuracy": 0.6886478485539556,
"num_tokens": 19538363.0,
"step": 3520
},
{
"epoch": 0.33903954667746006,
"grad_norm": 0.44766560196876526,
"learning_rate": 4.860568196574232e-05,
"loss": 1.2851,
"mean_token_accuracy": 0.6843299470841885,
"num_tokens": 19774720.0,
"step": 3560
},
{
"epoch": 0.34284897978619555,
"grad_norm": 0.38793668150901794,
"learning_rate": 4.858262055001371e-05,
"loss": 1.3057,
"mean_token_accuracy": 0.6804726634174585,
"num_tokens": 19998879.0,
"step": 3600
},
{
"epoch": 0.3466584128949311,
"grad_norm": 0.45702189207077026,
"learning_rate": 4.8559121404853944e-05,
"loss": 1.1826,
"mean_token_accuracy": 0.7049577981233597,
"num_tokens": 20243495.0,
"step": 3640
},
{
"epoch": 0.3504678460036666,
"grad_norm": 0.5189054608345032,
"learning_rate": 4.853518498228674e-05,
"loss": 1.2659,
"mean_token_accuracy": 0.6826557310298085,
"num_tokens": 20462476.0,
"step": 3680
},
{
"epoch": 0.3542772791124021,
"grad_norm": 0.4221845269203186,
"learning_rate": 4.851081174274715e-05,
"loss": 1.3576,
"mean_token_accuracy": 0.6631780494004488,
"num_tokens": 20679798.0,
"step": 3720
},
{
"epoch": 0.3580867122211376,
"grad_norm": 0.4048372209072113,
"learning_rate": 4.8486002155072754e-05,
"loss": 1.2332,
"mean_token_accuracy": 0.7002108607441186,
"num_tokens": 20913935.0,
"step": 3760
},
{
"epoch": 0.3618961453298731,
"grad_norm": 0.5621294379234314,
"learning_rate": 4.8460756696494594e-05,
"loss": 1.2625,
"mean_token_accuracy": 0.6872183892875909,
"num_tokens": 21138524.0,
"step": 3800
},
{
"epoch": 0.3657055784386086,
"grad_norm": 0.5211415886878967,
"learning_rate": 4.843507585262804e-05,
"loss": 1.2533,
"mean_token_accuracy": 0.6849848669022321,
"num_tokens": 21373326.0,
"step": 3840
},
{
"epoch": 0.3695150115473441,
"grad_norm": 0.551052987575531,
"learning_rate": 4.8408960117463386e-05,
"loss": 1.304,
"mean_token_accuracy": 0.6765170315280556,
"num_tokens": 21590656.0,
"step": 3880
},
{
"epoch": 0.3733244446560796,
"grad_norm": 0.5423365831375122,
"learning_rate": 4.838240999335643e-05,
"loss": 1.3441,
"mean_token_accuracy": 0.6701787773519754,
"num_tokens": 21795838.0,
"step": 3920
},
{
"epoch": 0.3771338777648151,
"grad_norm": 0.46691930294036865,
"learning_rate": 4.835542599101873e-05,
"loss": 1.3163,
"mean_token_accuracy": 0.6742871653288602,
"num_tokens": 22007062.0,
"step": 3960
},
{
"epoch": 0.38094331087355066,
"grad_norm": 0.6379289627075195,
"learning_rate": 4.8328008629507863e-05,
"loss": 1.3094,
"mean_token_accuracy": 0.6776679191738367,
"num_tokens": 22232306.0,
"step": 4000
},
{
"epoch": 0.3828480274279184,
"eval_mean_token_accuracy": 0.6281169285496555,
"eval_num_tokens": 22348477.0,
"eval_test_loss": 1.793502926826477,
"eval_test_runtime": 95.0461,
"eval_test_samples_per_second": 20.948,
"eval_test_steps_per_second": 10.479,
"step": 4020
},
{
"epoch": 0.38475274398228615,
"grad_norm": 0.6131725311279297,
"learning_rate": 4.830015843621735e-05,
"loss": 1.2705,
"mean_token_accuracy": 0.6874665239825845,
"num_tokens": 22461894.0,
"step": 4040
},
{
"epoch": 0.38856217709102164,
"grad_norm": 0.4669947326183319,
"learning_rate": 4.8271875946866574e-05,
"loss": 1.3255,
"mean_token_accuracy": 0.6841361073777079,
"num_tokens": 22684394.0,
"step": 4080
},
{
"epoch": 0.39237161019975714,
"grad_norm": 0.42204299569129944,
"learning_rate": 4.824316170549047e-05,
"loss": 1.2425,
"mean_token_accuracy": 0.6886863965541125,
"num_tokens": 22918777.0,
"step": 4120
},
{
"epoch": 0.39618104330849263,
"grad_norm": 0.5980663895606995,
"learning_rate": 4.821401626442903e-05,
"loss": 1.2868,
"mean_token_accuracy": 0.6745259683579207,
"num_tokens": 23139486.0,
"step": 4160
},
{
"epoch": 0.3999904764172282,
"grad_norm": 0.46430954337120056,
"learning_rate": 4.8184440184316695e-05,
"loss": 1.228,
"mean_token_accuracy": 0.6927272006869316,
"num_tokens": 23372563.0,
"step": 4200
},
{
"epoch": 0.4037999095259637,
"grad_norm": 0.5982958078384399,
"learning_rate": 4.815443403407159e-05,
"loss": 1.2377,
"mean_token_accuracy": 0.687408297508955,
"num_tokens": 23601002.0,
"step": 4240
},
{
"epoch": 0.40760934263469917,
"grad_norm": 0.5242788195610046,
"learning_rate": 4.812399839088453e-05,
"loss": 1.2926,
"mean_token_accuracy": 0.6808499401435256,
"num_tokens": 23814142.0,
"step": 4280
},
{
"epoch": 0.41141877574343466,
"grad_norm": 0.44663509726524353,
"learning_rate": 4.809313384020799e-05,
"loss": 1.2768,
"mean_token_accuracy": 0.6787567920982838,
"num_tokens": 24034008.0,
"step": 4320
},
{
"epoch": 0.4152282088521702,
"grad_norm": 0.571322500705719,
"learning_rate": 4.806184097574478e-05,
"loss": 1.299,
"mean_token_accuracy": 0.6753746373578906,
"num_tokens": 24241179.0,
"step": 4360
},
{
"epoch": 0.4190376419609057,
"grad_norm": 0.6444421410560608,
"learning_rate": 4.8030120399436636e-05,
"loss": 1.2922,
"mean_token_accuracy": 0.6790545796975493,
"num_tokens": 24463001.0,
"step": 4400
},
{
"epoch": 0.4228470750696412,
"grad_norm": 0.5884628891944885,
"learning_rate": 4.799797272145267e-05,
"loss": 1.3354,
"mean_token_accuracy": 0.6655034508556128,
"num_tokens": 24672669.0,
"step": 4440
},
{
"epoch": 0.4266565081783767,
"grad_norm": 0.6725838780403137,
"learning_rate": 4.796539856017762e-05,
"loss": 1.2866,
"mean_token_accuracy": 0.6829743100330233,
"num_tokens": 24895076.0,
"step": 4480
},
{
"epoch": 0.4304659412871122,
"grad_norm": 0.5089847445487976,
"learning_rate": 4.7932398542199935e-05,
"loss": 1.3312,
"mean_token_accuracy": 0.6777703372761608,
"num_tokens": 25105078.0,
"step": 4520
},
{
"epoch": 0.43427537439584774,
"grad_norm": 0.6170702576637268,
"learning_rate": 4.7898973302299746e-05,
"loss": 1.2308,
"mean_token_accuracy": 0.6913123942911625,
"num_tokens": 25330411.0,
"step": 4560
},
{
"epoch": 0.43808480750458323,
"grad_norm": 0.47749921679496765,
"learning_rate": 4.786512348343664e-05,
"loss": 1.2394,
"mean_token_accuracy": 0.6930743562057614,
"num_tokens": 25558075.0,
"step": 4600
},
{
"epoch": 0.4418942406133187,
"grad_norm": 0.5179374814033508,
"learning_rate": 4.783084973673732e-05,
"loss": 1.3121,
"mean_token_accuracy": 0.6737616384401918,
"num_tokens": 25787633.0,
"step": 4640
},
{
"epoch": 0.4457036737220542,
"grad_norm": 0.4957481920719147,
"learning_rate": 4.7796152721483024e-05,
"loss": 1.2679,
"mean_token_accuracy": 0.6782409870997071,
"num_tokens": 26010384.0,
"step": 4680
},
{
"epoch": 0.44951310683078977,
"grad_norm": 0.46905845403671265,
"learning_rate": 4.776103310509691e-05,
"loss": 1.3583,
"mean_token_accuracy": 0.6669555108994245,
"num_tokens": 26220009.0,
"step": 4720
},
{
"epoch": 0.45332253993952526,
"grad_norm": 0.6176592707633972,
"learning_rate": 4.7725491563131164e-05,
"loss": 1.3376,
"mean_token_accuracy": 0.6773041613399983,
"num_tokens": 26437068.0,
"step": 4760
},
{
"epoch": 0.45713197304826075,
"grad_norm": 0.641156017780304,
"learning_rate": 4.7689528779254057e-05,
"loss": 1.338,
"mean_token_accuracy": 0.6686659481376409,
"num_tokens": 26646369.0,
"step": 4800
},
{
"epoch": 0.46094140615699625,
"grad_norm": 0.5560479760169983,
"learning_rate": 4.7653145445236725e-05,
"loss": 1.3156,
"mean_token_accuracy": 0.6741218714043498,
"num_tokens": 26872767.0,
"step": 4840
},
{
"epoch": 0.46475083926573174,
"grad_norm": 0.6000213027000427,
"learning_rate": 4.761634226093993e-05,
"loss": 1.2637,
"mean_token_accuracy": 0.6870461646467447,
"num_tokens": 27092463.0,
"step": 4880
},
{
"epoch": 0.4685602723744673,
"grad_norm": 0.3717496693134308,
"learning_rate": 4.757911993430057e-05,
"loss": 1.3356,
"mean_token_accuracy": 0.6666051434352994,
"num_tokens": 27314300.0,
"step": 4920
},
{
"epoch": 0.4723697054832028,
"grad_norm": 0.839055061340332,
"learning_rate": 4.754147918131804e-05,
"loss": 1.2933,
"mean_token_accuracy": 0.6818767448887229,
"num_tokens": 27534317.0,
"step": 4960
},
{
"epoch": 0.4761791385919383,
"grad_norm": 0.5968820452690125,
"learning_rate": 4.7503420726040496e-05,
"loss": 1.3218,
"mean_token_accuracy": 0.6756005242466927,
"num_tokens": 27744710.0,
"step": 5000
},
{
"epoch": 0.47856003428489796,
"eval_mean_token_accuracy": 0.6309991637506638,
"eval_num_tokens": 27879664.0,
"eval_test_loss": 1.7806874513626099,
"eval_test_runtime": 94.8719,
"eval_test_samples_per_second": 20.986,
"eval_test_steps_per_second": 10.498,
"step": 5025
},
{
"epoch": 0.47998857170067377,
"grad_norm": 0.555782675743103,
"learning_rate": 4.74649453005509e-05,
"loss": 1.2746,
"mean_token_accuracy": 0.684971004165709,
"num_tokens": 27964540.0,
"step": 5040
},
{
"epoch": 0.4837980048094093,
"grad_norm": 0.5832619071006775,
"learning_rate": 4.742605364495298e-05,
"loss": 1.2391,
"mean_token_accuracy": 0.691305941529572,
"num_tokens": 28195211.0,
"step": 5080
},
{
"epoch": 0.4876074379181448,
"grad_norm": 0.4528619349002838,
"learning_rate": 4.738674650735692e-05,
"loss": 1.2295,
"mean_token_accuracy": 0.683625971712172,
"num_tokens": 28427726.0,
"step": 5120
},
{
"epoch": 0.4914168710268803,
"grad_norm": 0.8959270119667053,
"learning_rate": 4.734702464386503e-05,
"loss": 1.3764,
"mean_token_accuracy": 0.6636357877403498,
"num_tokens": 28631998.0,
"step": 5160
},
{
"epoch": 0.4952263041356158,
"grad_norm": 0.43296220898628235,
"learning_rate": 4.73068888185572e-05,
"loss": 1.2929,
"mean_token_accuracy": 0.6792870191857219,
"num_tokens": 28846791.0,
"step": 5200
},
{
"epoch": 0.49903573724435135,
"grad_norm": 0.722977340221405,
"learning_rate": 4.726633980347616e-05,
"loss": 1.3176,
"mean_token_accuracy": 0.6724835351109505,
"num_tokens": 29054506.0,
"step": 5240
},
{
"epoch": 0.5028451703530868,
"grad_norm": 0.41740506887435913,
"learning_rate": 4.722537837861267e-05,
"loss": 1.2455,
"mean_token_accuracy": 0.688827583193779,
"num_tokens": 29270620.0,
"step": 5280
},
{
"epoch": 0.5066546034618223,
"grad_norm": 0.38050729036331177,
"learning_rate": 4.718400533189051e-05,
"loss": 1.2743,
"mean_token_accuracy": 0.6829100279137492,
"num_tokens": 29490565.0,
"step": 5320
},
{
"epoch": 0.5104640365705578,
"grad_norm": 0.5724825263023376,
"learning_rate": 4.7142221459151294e-05,
"loss": 1.2773,
"mean_token_accuracy": 0.6881643293425441,
"num_tokens": 29704265.0,
"step": 5360
},
{
"epoch": 0.5142734696792933,
"grad_norm": 0.6543896198272705,
"learning_rate": 4.7100027564139196e-05,
"loss": 1.3307,
"mean_token_accuracy": 0.6777999725192785,
"num_tokens": 29924515.0,
"step": 5400
},
{
"epoch": 0.5180829027880288,
"grad_norm": 0.43163400888442993,
"learning_rate": 4.705742445848548e-05,
"loss": 1.2653,
"mean_token_accuracy": 0.682813161984086,
"num_tokens": 30148026.0,
"step": 5440
},
{
"epoch": 0.5218923358967643,
"grad_norm": 0.47830525040626526,
"learning_rate": 4.7014412961692864e-05,
"loss": 1.1955,
"mean_token_accuracy": 0.7004892747849226,
"num_tokens": 30372129.0,
"step": 5480
},
{
"epoch": 0.5257017690054999,
"grad_norm": 0.5903618931770325,
"learning_rate": 4.697099390111981e-05,
"loss": 1.2973,
"mean_token_accuracy": 0.6818130781874061,
"num_tokens": 30596623.0,
"step": 5520
},
{
"epoch": 0.5295112021142354,
"grad_norm": 0.43726322054862976,
"learning_rate": 4.6927168111964555e-05,
"loss": 1.2095,
"mean_token_accuracy": 0.696308933570981,
"num_tokens": 30834116.0,
"step": 5560
},
{
"epoch": 0.5333206352229709,
"grad_norm": 0.7869614958763123,
"learning_rate": 4.6882936437249056e-05,
"loss": 1.2275,
"mean_token_accuracy": 0.6923372825607658,
"num_tokens": 31066239.0,
"step": 5600
},
{
"epoch": 0.5371300683317064,
"grad_norm": 0.4349427819252014,
"learning_rate": 4.6838299727802786e-05,
"loss": 1.2973,
"mean_token_accuracy": 0.688424677029252,
"num_tokens": 31302823.0,
"step": 5640
},
{
"epoch": 0.5409395014404419,
"grad_norm": 0.5851991772651672,
"learning_rate": 4.67932588422464e-05,
"loss": 1.237,
"mean_token_accuracy": 0.6886244036257267,
"num_tokens": 31530684.0,
"step": 5680
},
{
"epoch": 0.5447489345491774,
"grad_norm": 0.6614839434623718,
"learning_rate": 4.6747814646975134e-05,
"loss": 1.3017,
"mean_token_accuracy": 0.6840987723320723,
"num_tokens": 31741759.0,
"step": 5720
},
{
"epoch": 0.5485583676579129,
"grad_norm": 0.4596734642982483,
"learning_rate": 4.670196801614224e-05,
"loss": 1.1938,
"mean_token_accuracy": 0.7047518376260996,
"num_tokens": 31987926.0,
"step": 5760
},
{
"epoch": 0.5523678007666484,
"grad_norm": 0.5416800379753113,
"learning_rate": 4.665571983164207e-05,
"loss": 1.2874,
"mean_token_accuracy": 0.6859219571575522,
"num_tokens": 32213674.0,
"step": 5800
},
{
"epoch": 0.5561772338753839,
"grad_norm": 0.4644794464111328,
"learning_rate": 4.660907098309319e-05,
"loss": 1.2893,
"mean_token_accuracy": 0.6782131699845195,
"num_tokens": 32429679.0,
"step": 5840
},
{
"epoch": 0.5599866669841195,
"grad_norm": 0.5027694702148438,
"learning_rate": 4.6562022367821244e-05,
"loss": 1.2955,
"mean_token_accuracy": 0.6798365904018283,
"num_tokens": 32646061.0,
"step": 5880
},
{
"epoch": 0.563796100092855,
"grad_norm": 0.5148358345031738,
"learning_rate": 4.651457489084167e-05,
"loss": 1.2533,
"mean_token_accuracy": 0.684299885481596,
"num_tokens": 32867990.0,
"step": 5920
},
{
"epoch": 0.5676055332015905,
"grad_norm": 0.4006965756416321,
"learning_rate": 4.646672946484232e-05,
"loss": 1.2246,
"mean_token_accuracy": 0.6978412168100476,
"num_tokens": 33100846.0,
"step": 5960
},
{
"epoch": 0.571414966310326,
"grad_norm": 0.5946612358093262,
"learning_rate": 4.641848701016592e-05,
"loss": 1.3112,
"mean_token_accuracy": 0.682670296356082,
"num_tokens": 33316830.0,
"step": 6000
},
{
"epoch": 0.5742720411418776,
"eval_mean_token_accuracy": 0.6325134164597614,
"eval_num_tokens": 33490786.0,
"eval_test_loss": 1.7651844024658203,
"eval_test_runtime": 95.3517,
"eval_test_samples_per_second": 20.881,
"eval_test_steps_per_second": 10.446,
"step": 6030
},
{
"epoch": 0.5752243994190614,
"grad_norm": 0.5004485249519348,
"learning_rate": 4.636984845479229e-05,
"loss": 1.3189,
"mean_token_accuracy": 0.6787041410803795,
"num_tokens": 33547057.0,
"step": 6040
},
{
"epoch": 0.5790338325277969,
"grad_norm": 0.6013615727424622,
"learning_rate": 4.6320814734320574e-05,
"loss": 1.2999,
"mean_token_accuracy": 0.6829198809340596,
"num_tokens": 33751128.0,
"step": 6080
},
{
"epoch": 0.5828432656365324,
"grad_norm": 0.46301329135894775,
"learning_rate": 4.627138679195122e-05,
"loss": 1.2979,
"mean_token_accuracy": 0.6834639564156533,
"num_tokens": 33958374.0,
"step": 6120
},
{
"epoch": 0.5866526987452679,
"grad_norm": 0.5537028908729553,
"learning_rate": 4.622156557846782e-05,
"loss": 1.2919,
"mean_token_accuracy": 0.6853473074734211,
"num_tokens": 34173776.0,
"step": 6160
},
{
"epoch": 0.5904621318540034,
"grad_norm": 0.5172299146652222,
"learning_rate": 4.617135205221882e-05,
"loss": 1.2535,
"mean_token_accuracy": 0.6891471687704325,
"num_tokens": 34402012.0,
"step": 6200
},
{
"epoch": 0.594271564962739,
"grad_norm": 0.45443201065063477,
"learning_rate": 4.6120747179099115e-05,
"loss": 1.3631,
"mean_token_accuracy": 0.6746680632233619,
"num_tokens": 34619563.0,
"step": 6240
},
{
"epoch": 0.5980809980714745,
"grad_norm": 0.5318917632102966,
"learning_rate": 4.606975193253145e-05,
"loss": 1.2875,
"mean_token_accuracy": 0.6842667568475008,
"num_tokens": 34842995.0,
"step": 6280
},
{
"epoch": 0.60189043118021,
"grad_norm": 0.41732507944107056,
"learning_rate": 4.6018367293447696e-05,
"loss": 1.2011,
"mean_token_accuracy": 0.6991326250135899,
"num_tokens": 35074476.0,
"step": 6320
},
{
"epoch": 0.6056998642889455,
"grad_norm": 0.4196927845478058,
"learning_rate": 4.5966594250269964e-05,
"loss": 1.3135,
"mean_token_accuracy": 0.6805538948625326,
"num_tokens": 35296363.0,
"step": 6360
},
{
"epoch": 0.609509297397681,
"grad_norm": 0.42691197991371155,
"learning_rate": 4.5914433798891605e-05,
"loss": 1.246,
"mean_token_accuracy": 0.6876704445108771,
"num_tokens": 35523754.0,
"step": 6400
},
{
"epoch": 0.6133187305064165,
"grad_norm": 0.3611372113227844,
"learning_rate": 4.5861886942658106e-05,
"loss": 1.2269,
"mean_token_accuracy": 0.6875138944014907,
"num_tokens": 35758849.0,
"step": 6440
},
{
"epoch": 0.617128163615152,
"grad_norm": 0.7133765816688538,
"learning_rate": 4.580895469234769e-05,
"loss": 1.2242,
"mean_token_accuracy": 0.6886107694357634,
"num_tokens": 35988842.0,
"step": 6480
},
{
"epoch": 0.6209375967238875,
"grad_norm": 0.44684138894081116,
"learning_rate": 4.575563806615196e-05,
"loss": 1.2675,
"mean_token_accuracy": 0.6812176534906029,
"num_tokens": 36201602.0,
"step": 6520
},
{
"epoch": 0.624747029832623,
"grad_norm": 0.5846595168113708,
"learning_rate": 4.5701938089656256e-05,
"loss": 1.2703,
"mean_token_accuracy": 0.6853839591145515,
"num_tokens": 36403261.0,
"step": 6560
},
{
"epoch": 0.6285564629413586,
"grad_norm": 0.3759899437427521,
"learning_rate": 4.5647855795819943e-05,
"loss": 1.3111,
"mean_token_accuracy": 0.6767927626147866,
"num_tokens": 36619797.0,
"step": 6600
},
{
"epoch": 0.6323658960500941,
"grad_norm": 0.6737694144248962,
"learning_rate": 4.5593392224956576e-05,
"loss": 1.3165,
"mean_token_accuracy": 0.6752157468348742,
"num_tokens": 36828768.0,
"step": 6640
},
{
"epoch": 0.6361753291588296,
"grad_norm": 0.5695846676826477,
"learning_rate": 4.5538548424713835e-05,
"loss": 1.2397,
"mean_token_accuracy": 0.6919887445867061,
"num_tokens": 37053251.0,
"step": 6680
},
{
"epoch": 0.6399847622675651,
"grad_norm": 0.5237749218940735,
"learning_rate": 4.5483325450053406e-05,
"loss": 1.2553,
"mean_token_accuracy": 0.6820298057049513,
"num_tokens": 37268972.0,
"step": 6720
},
{
"epoch": 0.6437941953763006,
"grad_norm": 0.6554339528083801,
"learning_rate": 4.5427724363230683e-05,
"loss": 1.2682,
"mean_token_accuracy": 0.6873216938227416,
"num_tokens": 37492541.0,
"step": 6760
},
{
"epoch": 0.647603628485036,
"grad_norm": 0.5178593397140503,
"learning_rate": 4.537174623377432e-05,
"loss": 1.2537,
"mean_token_accuracy": 0.6826527412980795,
"num_tokens": 37705009.0,
"step": 6800
},
{
"epoch": 0.6514130615937715,
"grad_norm": 0.4857373833656311,
"learning_rate": 4.53153921384657e-05,
"loss": 1.3497,
"mean_token_accuracy": 0.6726470049470663,
"num_tokens": 37915354.0,
"step": 6840
},
{
"epoch": 0.655222494702507,
"grad_norm": 0.48057419061660767,
"learning_rate": 4.5258663161318136e-05,
"loss": 1.2267,
"mean_token_accuracy": 0.6866666225716471,
"num_tokens": 38146196.0,
"step": 6880
},
{
"epoch": 0.6590319278112425,
"grad_norm": 0.5138698220252991,
"learning_rate": 4.5201560393556134e-05,
"loss": 1.2439,
"mean_token_accuracy": 0.6901180801913143,
"num_tokens": 38373540.0,
"step": 6920
},
{
"epoch": 0.6628413609199781,
"grad_norm": 0.47547754645347595,
"learning_rate": 4.5144084933594303e-05,
"loss": 1.2706,
"mean_token_accuracy": 0.6800282135605812,
"num_tokens": 38589036.0,
"step": 6960
},
{
"epoch": 0.6666507940287136,
"grad_norm": 0.6765561103820801,
"learning_rate": 4.508623788701628e-05,
"loss": 1.2929,
"mean_token_accuracy": 0.6894892951473593,
"num_tokens": 38802771.0,
"step": 7000
},
{
"epoch": 0.6699840479988571,
"eval_mean_token_accuracy": 0.6329328228970129,
"eval_num_tokens": 38980983.0,
"eval_test_loss": 1.7534339427947998,
"eval_test_runtime": 95.0805,
"eval_test_samples_per_second": 20.94,
"eval_test_steps_per_second": 10.475,
"step": 7035
},
{
"epoch": 0.6704602271374491,
"grad_norm": 0.40170061588287354,
"learning_rate": 4.502802036655346e-05,
"loss": 1.3205,
"mean_token_accuracy": 0.675545847415924,
"num_tokens": 39011417.0,
"step": 7040
},
{
"epoch": 0.6742696602461846,
"grad_norm": 0.49673372507095337,
"learning_rate": 4.4969433492063564e-05,
"loss": 1.256,
"mean_token_accuracy": 0.6876663960516453,
"num_tokens": 39231629.0,
"step": 7080
},
{
"epoch": 0.6780790933549201,
"grad_norm": 0.45921561121940613,
"learning_rate": 4.491047839050912e-05,
"loss": 1.2542,
"mean_token_accuracy": 0.6906937116757035,
"num_tokens": 39457882.0,
"step": 7120
},
{
"epoch": 0.6818885264636556,
"grad_norm": 0.49605095386505127,
"learning_rate": 4.4851156195935785e-05,
"loss": 1.3266,
"mean_token_accuracy": 0.673082504235208,
"num_tokens": 39675712.0,
"step": 7160
},
{
"epoch": 0.6856979595723911,
"grad_norm": 0.5613274574279785,
"learning_rate": 4.479146804945053e-05,
"loss": 1.3061,
"mean_token_accuracy": 0.6837763454765081,
"num_tokens": 39892449.0,
"step": 7200
},
{
"epoch": 0.6895073926811266,
"grad_norm": 0.40411025285720825,
"learning_rate": 4.473141509919966e-05,
"loss": 1.2459,
"mean_token_accuracy": 0.6935942692682147,
"num_tokens": 40122187.0,
"step": 7240
},
{
"epoch": 0.6933168257898622,
"grad_norm": 0.66608726978302,
"learning_rate": 4.4670998500346795e-05,
"loss": 1.263,
"mean_token_accuracy": 0.6886698173359036,
"num_tokens": 40339718.0,
"step": 7280
},
{
"epoch": 0.6971262588985977,
"grad_norm": 0.5249150395393372,
"learning_rate": 4.461021941505057e-05,
"loss": 1.2077,
"mean_token_accuracy": 0.6970062265172601,
"num_tokens": 40568071.0,
"step": 7320
},
{
"epoch": 0.7009356920073332,
"grad_norm": 0.5321863293647766,
"learning_rate": 4.454907901244236e-05,
"loss": 1.2908,
"mean_token_accuracy": 0.6871679758653044,
"num_tokens": 40781022.0,
"step": 7360
},
{
"epoch": 0.7047451251160687,
"grad_norm": 0.5298981666564941,
"learning_rate": 4.44875784686037e-05,
"loss": 1.2256,
"mean_token_accuracy": 0.6957620535045862,
"num_tokens": 41015922.0,
"step": 7400
},
{
"epoch": 0.7085545582248042,
"grad_norm": 0.6805555820465088,
"learning_rate": 4.442571896654375e-05,
"loss": 1.2059,
"mean_token_accuracy": 0.6951308185234666,
"num_tokens": 41247452.0,
"step": 7440
},
{
"epoch": 0.7123639913335397,
"grad_norm": 0.3826749622821808,
"learning_rate": 4.4363501696176494e-05,
"loss": 1.2866,
"mean_token_accuracy": 0.6792947178706527,
"num_tokens": 41470415.0,
"step": 7480
},
{
"epoch": 0.7161734244422752,
"grad_norm": 0.5363141298294067,
"learning_rate": 4.4300927854297856e-05,
"loss": 1.2691,
"mean_token_accuracy": 0.682473830319941,
"num_tokens": 41701000.0,
"step": 7520
},
{
"epoch": 0.7199828575510107,
"grad_norm": 0.5626250505447388,
"learning_rate": 4.423799864456266e-05,
"loss": 1.3133,
"mean_token_accuracy": 0.6788626465946436,
"num_tokens": 41915333.0,
"step": 7560
},
{
"epoch": 0.7237922906597462,
"grad_norm": 0.6899204850196838,
"learning_rate": 4.417471527746152e-05,
"loss": 1.365,
"mean_token_accuracy": 0.6754604885354638,
"num_tokens": 42139999.0,
"step": 7600
},
{
"epoch": 0.7276017237684818,
"grad_norm": 0.5523748993873596,
"learning_rate": 4.411107897029755e-05,
"loss": 1.2776,
"mean_token_accuracy": 0.6912444988265634,
"num_tokens": 42367874.0,
"step": 7640
},
{
"epoch": 0.7314111568772173,
"grad_norm": 0.5411382913589478,
"learning_rate": 4.404709094716289e-05,
"loss": 1.2326,
"mean_token_accuracy": 0.6946828311309219,
"num_tokens": 42594804.0,
"step": 7680
},
{
"epoch": 0.7352205899859527,
"grad_norm": 0.6533775329589844,
"learning_rate": 4.398275243891522e-05,
"loss": 1.2828,
"mean_token_accuracy": 0.6842706995084882,
"num_tokens": 42814558.0,
"step": 7720
},
{
"epoch": 0.7390300230946882,
"grad_norm": 0.3408315181732178,
"learning_rate": 4.391806468315408e-05,
"loss": 1.2614,
"mean_token_accuracy": 0.690892388857901,
"num_tokens": 43040642.0,
"step": 7760
},
{
"epoch": 0.7428394562034237,
"grad_norm": 0.4586963355541229,
"learning_rate": 4.385302892419702e-05,
"loss": 1.2984,
"mean_token_accuracy": 0.6800136685371398,
"num_tokens": 43259728.0,
"step": 7800
},
{
"epoch": 0.7466488893121592,
"grad_norm": 0.6477558016777039,
"learning_rate": 4.37876464130557e-05,
"loss": 1.3249,
"mean_token_accuracy": 0.6777775203809142,
"num_tokens": 43467264.0,
"step": 7840
},
{
"epoch": 0.7504583224208947,
"grad_norm": 0.4374445080757141,
"learning_rate": 4.3721918407411845e-05,
"loss": 1.2729,
"mean_token_accuracy": 0.689405850879848,
"num_tokens": 43679068.0,
"step": 7880
},
{
"epoch": 0.7542677555296302,
"grad_norm": 0.5325660109519958,
"learning_rate": 4.3655846171592994e-05,
"loss": 1.2863,
"mean_token_accuracy": 0.6884377462789416,
"num_tokens": 43899283.0,
"step": 7920
},
{
"epoch": 0.7580771886383657,
"grad_norm": 0.7073720097541809,
"learning_rate": 4.358943097654823e-05,
"loss": 1.245,
"mean_token_accuracy": 0.6904063617810607,
"num_tokens": 44113559.0,
"step": 7960
},
{
"epoch": 0.7618866217471013,
"grad_norm": 0.44697853922843933,
"learning_rate": 4.3522674099823705e-05,
"loss": 1.2699,
"mean_token_accuracy": 0.6843698143959045,
"num_tokens": 44334374.0,
"step": 8000
},
{
"epoch": 0.7656960548558368,
"grad_norm": 0.4306088984012604,
"learning_rate": 4.345557682553807e-05,
"loss": 1.2319,
"step": 8040
},
{
"epoch": 0.7656960548558368,
"eval_mean_token_accuracy": 0.6363147346070971,
"eval_num_tokens": 44575460.0,
"eval_test_loss": 1.7359126806259155,
"eval_test_runtime": 95.9212,
"eval_test_samples_per_second": 20.757,
"eval_test_steps_per_second": 10.384,
"step": 8040
},
{
"epoch": 0.7695054879645723,
"grad_norm": 0.6170753836631775,
"learning_rate": 4.3388140444357795e-05,
"loss": 1.2328,
"mean_token_accuracy": 0.6931531462818384,
"num_tokens": 44806038.0,
"step": 8080
},
{
"epoch": 0.7733149210733078,
"grad_norm": 0.7038352489471436,
"learning_rate": 4.332036625347232e-05,
"loss": 1.2929,
"mean_token_accuracy": 0.6844880169257521,
"num_tokens": 45016584.0,
"step": 8120
},
{
"epoch": 0.7771243541820433,
"grad_norm": 0.4224265515804291,
"learning_rate": 4.325225555656911e-05,
"loss": 1.2634,
"mean_token_accuracy": 0.6930087611079216,
"num_tokens": 45246046.0,
"step": 8160
},
{
"epoch": 0.7809337872907788,
"grad_norm": 0.46587294340133667,
"learning_rate": 4.3183809663808556e-05,
"loss": 1.2605,
"mean_token_accuracy": 0.687124558724463,
"num_tokens": 45464284.0,
"step": 8200
},
{
"epoch": 0.7847432203995143,
"grad_norm": 0.5252248644828796,
"learning_rate": 4.311502989179882e-05,
"loss": 1.2728,
"mean_token_accuracy": 0.6883155029267073,
"num_tokens": 45684249.0,
"step": 8240
},
{
"epoch": 0.7885526535082498,
"grad_norm": 0.4695100486278534,
"learning_rate": 4.304591756357046e-05,
"loss": 1.27,
"mean_token_accuracy": 0.6879516759887337,
"num_tokens": 45919177.0,
"step": 8280
},
{
"epoch": 0.7923620866169853,
"grad_norm": 0.46411824226379395,
"learning_rate": 4.297647400855103e-05,
"loss": 1.3028,
"mean_token_accuracy": 0.683055117353797,
"num_tokens": 46131789.0,
"step": 8320
},
{
"epoch": 0.7961715197257209,
"grad_norm": 0.7392385601997375,
"learning_rate": 4.290670056253944e-05,
"loss": 1.173,
"mean_token_accuracy": 0.6985200975090265,
"num_tokens": 46348383.0,
"step": 8360
},
{
"epoch": 0.7999809528344564,
"grad_norm": 0.5996558666229248,
"learning_rate": 4.283659856768036e-05,
"loss": 1.1934,
"mean_token_accuracy": 0.6913962122052908,
"num_tokens": 46572321.0,
"step": 8400
},
{
"epoch": 0.8037903859431919,
"grad_norm": 0.6277273297309875,
"learning_rate": 4.276616937243828e-05,
"loss": 1.2401,
"mean_token_accuracy": 0.6883677888661623,
"num_tokens": 46798263.0,
"step": 8440
},
{
"epoch": 0.8075998190519273,
"grad_norm": 0.7397225499153137,
"learning_rate": 4.2695414331571673e-05,
"loss": 1.3177,
"mean_token_accuracy": 0.6755096849054099,
"num_tokens": 47017444.0,
"step": 8480
},
{
"epoch": 0.8114092521606628,
"grad_norm": 0.650918185710907,
"learning_rate": 4.2624334806106894e-05,
"loss": 1.3164,
"mean_token_accuracy": 0.6786608980968595,
"num_tokens": 47230158.0,
"step": 8520
},
{
"epoch": 0.8152186852693983,
"grad_norm": 0.6894013285636902,
"learning_rate": 4.255293216331197e-05,
"loss": 1.2527,
"mean_token_accuracy": 0.6907643361017108,
"num_tokens": 47453276.0,
"step": 8560
},
{
"epoch": 0.8190281183781338,
"grad_norm": 0.6085829138755798,
"learning_rate": 4.2481207776670396e-05,
"loss": 1.2798,
"mean_token_accuracy": 0.6841305760666728,
"num_tokens": 47662402.0,
"step": 8600
},
{
"epoch": 0.8228375514868693,
"grad_norm": 0.47271421551704407,
"learning_rate": 4.24091630258546e-05,
"loss": 1.2038,
"mean_token_accuracy": 0.6984878290444613,
"num_tokens": 47889095.0,
"step": 8640
},
{
"epoch": 0.8266469845956048,
"grad_norm": 0.6806793212890625,
"learning_rate": 4.2336799296699454e-05,
"loss": 1.1881,
"mean_token_accuracy": 0.7019338620826602,
"num_tokens": 48117494.0,
"step": 8680
},
{
"epoch": 0.8304564177043404,
"grad_norm": 0.5494069457054138,
"learning_rate": 4.2264117981175665e-05,
"loss": 1.2331,
"mean_token_accuracy": 0.6901240289211273,
"num_tokens": 48340935.0,
"step": 8720
},
{
"epoch": 0.8342658508130759,
"grad_norm": 0.6391497254371643,
"learning_rate": 4.21911204773629e-05,
"loss": 1.2149,
"mean_token_accuracy": 0.6961538307368755,
"num_tokens": 48573668.0,
"step": 8760
},
{
"epoch": 0.8380752839218114,
"grad_norm": 0.3758852481842041,
"learning_rate": 4.211780818942297e-05,
"loss": 1.251,
"mean_token_accuracy": 0.6919752093032002,
"num_tokens": 48795852.0,
"step": 8800
},
{
"epoch": 0.8418847170305469,
"grad_norm": 0.5446637868881226,
"learning_rate": 4.2044182527572795e-05,
"loss": 1.3,
"mean_token_accuracy": 0.6852457968518137,
"num_tokens": 49032650.0,
"step": 8840
},
{
"epoch": 0.8456941501392824,
"grad_norm": 0.3522646427154541,
"learning_rate": 4.197024490805727e-05,
"loss": 1.2702,
"mean_token_accuracy": 0.6935782097280025,
"num_tokens": 49250248.0,
"step": 8880
},
{
"epoch": 0.8495035832480179,
"grad_norm": 0.652208149433136,
"learning_rate": 4.189599675312204e-05,
"loss": 1.2844,
"mean_token_accuracy": 0.6847639823332429,
"num_tokens": 49476266.0,
"step": 8920
},
{
"epoch": 0.8533130163567534,
"grad_norm": 0.41794517636299133,
"learning_rate": 4.182143949098612e-05,
"loss": 1.2659,
"mean_token_accuracy": 0.6905384896323085,
"num_tokens": 49700173.0,
"step": 8960
},
{
"epoch": 0.8571224494654889,
"grad_norm": 0.532646656036377,
"learning_rate": 4.1746574555814455e-05,
"loss": 1.282,
"mean_token_accuracy": 0.6890167951583862,
"num_tokens": 49930863.0,
"step": 9000
},
{
"epoch": 0.8609318825742244,
"grad_norm": 0.5750353932380676,
"learning_rate": 4.1671403387690284e-05,
"loss": 1.1884,
"mean_token_accuracy": 0.7017395876348018,
"num_tokens": 50166067.0,
"step": 9040
},
{
"epoch": 0.8614080617128164,
"eval_mean_token_accuracy": 0.6381066715621565,
"eval_num_tokens": 50195310.0,
"eval_test_loss": 1.723710060119629,
"eval_test_runtime": 96.0763,
"eval_test_samples_per_second": 20.723,
"eval_test_steps_per_second": 10.367,
"step": 9045
}
],
"logging_steps": 40,
"max_steps": 30161,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 3016,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.1928985580997181e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}