|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.8616937691959715, |
|
"eval_steps": 1005, |
|
"global_step": 9048, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0038094331087355064, |
|
"grad_norm": 1.1105036735534668, |
|
"learning_rate": 1.2714024683199598e-06, |
|
"loss": 1.9169, |
|
"mean_token_accuracy": 0.5754982324317097, |
|
"num_tokens": 213515.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.007618866217471013, |
|
"grad_norm": 0.8602012991905212, |
|
"learning_rate": 2.575404999930175e-06, |
|
"loss": 1.7612, |
|
"mean_token_accuracy": 0.5973538164049387, |
|
"num_tokens": 428291.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01142829932620652, |
|
"grad_norm": 0.7212763428688049, |
|
"learning_rate": 3.87940753154039e-06, |
|
"loss": 1.7641, |
|
"mean_token_accuracy": 0.6031968526542186, |
|
"num_tokens": 649316.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.015237732434942025, |
|
"grad_norm": 0.9749614000320435, |
|
"learning_rate": 5.183410063150605e-06, |
|
"loss": 1.5732, |
|
"mean_token_accuracy": 0.6395209338515997, |
|
"num_tokens": 867038.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.01904716554367753, |
|
"grad_norm": 0.8746033310890198, |
|
"learning_rate": 6.48741259476082e-06, |
|
"loss": 1.5567, |
|
"mean_token_accuracy": 0.6250597681850195, |
|
"num_tokens": 1084477.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02285659865241304, |
|
"grad_norm": 0.6283444762229919, |
|
"learning_rate": 7.791415126371035e-06, |
|
"loss": 1.4563, |
|
"mean_token_accuracy": 0.6498559027910232, |
|
"num_tokens": 1304254.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.026666031761148543, |
|
"grad_norm": 0.8473705053329468, |
|
"learning_rate": 9.09541765798125e-06, |
|
"loss": 1.5125, |
|
"mean_token_accuracy": 0.640368782542646, |
|
"num_tokens": 1518177.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.03047546486988405, |
|
"grad_norm": 0.8421404361724854, |
|
"learning_rate": 1.0399420189591466e-05, |
|
"loss": 1.43, |
|
"mean_token_accuracy": 0.6589812656864524, |
|
"num_tokens": 1745573.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.034284897978619555, |
|
"grad_norm": 0.7308921813964844, |
|
"learning_rate": 1.1703422721201682e-05, |
|
"loss": 1.3957, |
|
"mean_token_accuracy": 0.6596783269196749, |
|
"num_tokens": 1970323.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.03809433108735506, |
|
"grad_norm": 1.024109125137329, |
|
"learning_rate": 1.3007425252811896e-05, |
|
"loss": 1.2906, |
|
"mean_token_accuracy": 0.6734623985365034, |
|
"num_tokens": 2209851.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04190376419609057, |
|
"grad_norm": 0.8211472034454346, |
|
"learning_rate": 1.4311427784422112e-05, |
|
"loss": 1.4547, |
|
"mean_token_accuracy": 0.65384187027812, |
|
"num_tokens": 2419422.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.04571319730482608, |
|
"grad_norm": 0.9193497896194458, |
|
"learning_rate": 1.5615430316032328e-05, |
|
"loss": 1.3763, |
|
"mean_token_accuracy": 0.6668608419597148, |
|
"num_tokens": 2650647.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.04952263041356158, |
|
"grad_norm": 0.9686090350151062, |
|
"learning_rate": 1.6919432847642544e-05, |
|
"loss": 1.3092, |
|
"mean_token_accuracy": 0.6819563843309879, |
|
"num_tokens": 2895742.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.053332063522297087, |
|
"grad_norm": 0.6123139262199402, |
|
"learning_rate": 1.8223435379252756e-05, |
|
"loss": 1.3106, |
|
"mean_token_accuracy": 0.6761070437729358, |
|
"num_tokens": 3131131.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.057141496631032594, |
|
"grad_norm": 0.8315911293029785, |
|
"learning_rate": 1.952743791086297e-05, |
|
"loss": 1.295, |
|
"mean_token_accuracy": 0.6829990902915597, |
|
"num_tokens": 3358554.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.0609509297397681, |
|
"grad_norm": 0.6099031567573547, |
|
"learning_rate": 2.0831440442473187e-05, |
|
"loss": 1.315, |
|
"mean_token_accuracy": 0.6704990575090051, |
|
"num_tokens": 3584968.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.0647603628485036, |
|
"grad_norm": 0.8671184778213501, |
|
"learning_rate": 2.2135442974083403e-05, |
|
"loss": 1.4007, |
|
"mean_token_accuracy": 0.6610645942389966, |
|
"num_tokens": 3789884.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.06856979595723911, |
|
"grad_norm": 0.8112688064575195, |
|
"learning_rate": 2.3439445505693616e-05, |
|
"loss": 1.3192, |
|
"mean_token_accuracy": 0.6786379875615239, |
|
"num_tokens": 4010228.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.07237922906597462, |
|
"grad_norm": 0.7991048097610474, |
|
"learning_rate": 2.474344803730383e-05, |
|
"loss": 1.357, |
|
"mean_token_accuracy": 0.6695337913930416, |
|
"num_tokens": 4240601.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.07618866217471013, |
|
"grad_norm": 0.6952106356620789, |
|
"learning_rate": 2.6047450568914047e-05, |
|
"loss": 1.3805, |
|
"mean_token_accuracy": 0.666619416512549, |
|
"num_tokens": 4457613.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07999809528344563, |
|
"grad_norm": 0.5898691415786743, |
|
"learning_rate": 2.7351453100524263e-05, |
|
"loss": 1.3645, |
|
"mean_token_accuracy": 0.6681723784655332, |
|
"num_tokens": 4675575.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.08380752839218114, |
|
"grad_norm": 0.8592577576637268, |
|
"learning_rate": 2.865545563213448e-05, |
|
"loss": 1.3007, |
|
"mean_token_accuracy": 0.6759131707251071, |
|
"num_tokens": 4914793.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.08761696150091665, |
|
"grad_norm": 0.47871366143226624, |
|
"learning_rate": 2.9959458163744694e-05, |
|
"loss": 1.3316, |
|
"mean_token_accuracy": 0.6775730215013027, |
|
"num_tokens": 5145053.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.09142639460965216, |
|
"grad_norm": 0.6131187081336975, |
|
"learning_rate": 3.126346069535491e-05, |
|
"loss": 1.3631, |
|
"mean_token_accuracy": 0.6744197152554989, |
|
"num_tokens": 5353224.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.09523582771838766, |
|
"grad_norm": 0.6329927444458008, |
|
"learning_rate": 3.256746322696512e-05, |
|
"loss": 1.3321, |
|
"mean_token_accuracy": 0.677939809858799, |
|
"num_tokens": 5574886.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0957120068569796, |
|
"eval_mean_token_accuracy": 0.6180267786704393, |
|
"eval_num_tokens": 5598115.0, |
|
"eval_test_loss": 1.8654624223709106, |
|
"eval_test_runtime": 94.5921, |
|
"eval_test_samples_per_second": 21.048, |
|
"eval_test_steps_per_second": 10.529, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.09904526082712316, |
|
"grad_norm": 0.5825843214988708, |
|
"learning_rate": 3.387146575857534e-05, |
|
"loss": 1.3293, |
|
"mean_token_accuracy": 0.6768566837534309, |
|
"num_tokens": 5800189.0, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.10285469393585867, |
|
"grad_norm": 0.47114425897598267, |
|
"learning_rate": 3.5175468290185554e-05, |
|
"loss": 1.273, |
|
"mean_token_accuracy": 0.6856468811631202, |
|
"num_tokens": 6035590.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.10666412704459417, |
|
"grad_norm": 0.6536944508552551, |
|
"learning_rate": 3.647947082179577e-05, |
|
"loss": 1.2521, |
|
"mean_token_accuracy": 0.6856056058779358, |
|
"num_tokens": 6271507.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.11047356015332968, |
|
"grad_norm": 0.49625030159950256, |
|
"learning_rate": 3.7783473353405986e-05, |
|
"loss": 1.3219, |
|
"mean_token_accuracy": 0.6740474671125412, |
|
"num_tokens": 6485242.0, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.11428299326206519, |
|
"grad_norm": 0.4760020971298218, |
|
"learning_rate": 3.90874758850162e-05, |
|
"loss": 1.3866, |
|
"mean_token_accuracy": 0.666905522160232, |
|
"num_tokens": 6694201.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.1180924263708007, |
|
"grad_norm": 0.6702063083648682, |
|
"learning_rate": 4.039147841662642e-05, |
|
"loss": 1.3396, |
|
"mean_token_accuracy": 0.673108272999525, |
|
"num_tokens": 6920931.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.1219018594795362, |
|
"grad_norm": 0.5828559398651123, |
|
"learning_rate": 4.169548094823663e-05, |
|
"loss": 1.2849, |
|
"mean_token_accuracy": 0.6773701002821326, |
|
"num_tokens": 7147355.0, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.1257112925882717, |
|
"grad_norm": 0.558748185634613, |
|
"learning_rate": 4.299948347984684e-05, |
|
"loss": 1.3391, |
|
"mean_token_accuracy": 0.6739463916048407, |
|
"num_tokens": 7365364.0, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.1295207256970072, |
|
"grad_norm": 0.5812460780143738, |
|
"learning_rate": 4.430348601145706e-05, |
|
"loss": 1.3572, |
|
"mean_token_accuracy": 0.6651050833985209, |
|
"num_tokens": 7575079.0, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.13333015880574273, |
|
"grad_norm": 0.7141745686531067, |
|
"learning_rate": 4.5607488543067274e-05, |
|
"loss": 1.2944, |
|
"mean_token_accuracy": 0.681589861959219, |
|
"num_tokens": 7804608.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.13713959191447822, |
|
"grad_norm": 0.5530197024345398, |
|
"learning_rate": 4.691149107467749e-05, |
|
"loss": 1.3375, |
|
"mean_token_accuracy": 0.6718398928642273, |
|
"num_tokens": 8017882.0, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.14094902502321374, |
|
"grad_norm": 0.48236706852912903, |
|
"learning_rate": 4.8215493606287705e-05, |
|
"loss": 1.335, |
|
"mean_token_accuracy": 0.6747591784223914, |
|
"num_tokens": 8238637.0, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.14475845813194924, |
|
"grad_norm": 0.5896688103675842, |
|
"learning_rate": 4.919348145872209e-05, |
|
"loss": 1.359, |
|
"mean_token_accuracy": 0.6685425175353885, |
|
"num_tokens": 8448265.0, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.14856789124068476, |
|
"grad_norm": 0.6068539619445801, |
|
"learning_rate": 4.9193144349007555e-05, |
|
"loss": 1.3475, |
|
"mean_token_accuracy": 0.6698486657813192, |
|
"num_tokens": 8663046.0, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.15237732434942025, |
|
"grad_norm": 0.4848249852657318, |
|
"learning_rate": 4.91923577659781e-05, |
|
"loss": 1.3355, |
|
"mean_token_accuracy": 0.6719444127753377, |
|
"num_tokens": 8882966.0, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.15618675745815574, |
|
"grad_norm": 0.5089111924171448, |
|
"learning_rate": 4.9191121724764224e-05, |
|
"loss": 1.3455, |
|
"mean_token_accuracy": 0.6727668078616261, |
|
"num_tokens": 9099039.0, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.15999619056689127, |
|
"grad_norm": 0.5716065168380737, |
|
"learning_rate": 4.9189436249142116e-05, |
|
"loss": 1.3555, |
|
"mean_token_accuracy": 0.6711426375433802, |
|
"num_tokens": 9320395.0, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.16380562367562676, |
|
"grad_norm": 0.7265896797180176, |
|
"learning_rate": 4.918730137153316e-05, |
|
"loss": 1.3418, |
|
"mean_token_accuracy": 0.6719378443434835, |
|
"num_tokens": 9535056.0, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.16761505678436228, |
|
"grad_norm": 0.6088116765022278, |
|
"learning_rate": 4.9184717133003326e-05, |
|
"loss": 1.306, |
|
"mean_token_accuracy": 0.6769841868430376, |
|
"num_tokens": 9753473.0, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.17142448989309778, |
|
"grad_norm": 0.3883936405181885, |
|
"learning_rate": 4.918168358326239e-05, |
|
"loss": 1.2348, |
|
"mean_token_accuracy": 0.6958686344325542, |
|
"num_tokens": 9977829.0, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.1752339230018333, |
|
"grad_norm": 0.5694851279258728, |
|
"learning_rate": 4.917820078066296e-05, |
|
"loss": 1.3571, |
|
"mean_token_accuracy": 0.6749148614704609, |
|
"num_tokens": 10202796.0, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.1790433561105688, |
|
"grad_norm": 0.4964405298233032, |
|
"learning_rate": 4.9174268792199355e-05, |
|
"loss": 1.292, |
|
"mean_token_accuracy": 0.682447912171483, |
|
"num_tokens": 10446726.0, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.1828527892193043, |
|
"grad_norm": 0.585851788520813, |
|
"learning_rate": 4.916988769350633e-05, |
|
"loss": 1.3271, |
|
"mean_token_accuracy": 0.6706608653068542, |
|
"num_tokens": 10675299.0, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.1866622223280398, |
|
"grad_norm": 0.4936956763267517, |
|
"learning_rate": 4.91650575688576e-05, |
|
"loss": 1.3212, |
|
"mean_token_accuracy": 0.6725098427385092, |
|
"num_tokens": 10908277.0, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.19047165543677533, |
|
"grad_norm": 0.4473949670791626, |
|
"learning_rate": 4.9159778511164254e-05, |
|
"loss": 1.2485, |
|
"mean_token_accuracy": 0.6835329543799162, |
|
"num_tokens": 11138801.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1914240137139592, |
|
"eval_mean_token_accuracy": 0.6230506894878115, |
|
"eval_num_tokens": 11194791.0, |
|
"eval_test_loss": 1.832930564880371, |
|
"eval_test_runtime": 95.4667, |
|
"eval_test_samples_per_second": 20.855, |
|
"eval_test_steps_per_second": 10.433, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.19428108854551082, |
|
"grad_norm": 0.5689784288406372, |
|
"learning_rate": 4.915405062197292e-05, |
|
"loss": 1.2842, |
|
"mean_token_accuracy": 0.6809853713959455, |
|
"num_tokens": 11368252.0, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.19809052165424632, |
|
"grad_norm": 0.5339015126228333, |
|
"learning_rate": 4.914787401146385e-05, |
|
"loss": 1.3224, |
|
"mean_token_accuracy": 0.67562406193465, |
|
"num_tokens": 11590368.0, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.20189995476298184, |
|
"grad_norm": 0.6560924053192139, |
|
"learning_rate": 4.91412487984488e-05, |
|
"loss": 1.3732, |
|
"mean_token_accuracy": 0.6702003039419651, |
|
"num_tokens": 11798414.0, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.20570938787171733, |
|
"grad_norm": 0.5972558259963989, |
|
"learning_rate": 4.91341751103687e-05, |
|
"loss": 1.2983, |
|
"mean_token_accuracy": 0.6805376719683409, |
|
"num_tokens": 12013463.0, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.20951882098045285, |
|
"grad_norm": 0.4506518542766571, |
|
"learning_rate": 4.912665308329125e-05, |
|
"loss": 1.2653, |
|
"mean_token_accuracy": 0.6877350242808461, |
|
"num_tokens": 12236318.0, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.21332825408918835, |
|
"grad_norm": 0.5365843772888184, |
|
"learning_rate": 4.91186828619083e-05, |
|
"loss": 1.3285, |
|
"mean_token_accuracy": 0.6712553380057216, |
|
"num_tokens": 12458126.0, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.21713768719792387, |
|
"grad_norm": 0.4755626916885376, |
|
"learning_rate": 4.9110264599533055e-05, |
|
"loss": 1.3315, |
|
"mean_token_accuracy": 0.6608016982674598, |
|
"num_tokens": 12678890.0, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.22094712030665936, |
|
"grad_norm": 0.43261516094207764, |
|
"learning_rate": 4.9101398458097093e-05, |
|
"loss": 1.3166, |
|
"mean_token_accuracy": 0.6768794044852257, |
|
"num_tokens": 12900305.0, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.22475655341539488, |
|
"grad_norm": 0.5397626161575317, |
|
"learning_rate": 4.9092084608147315e-05, |
|
"loss": 1.3296, |
|
"mean_token_accuracy": 0.6792715717107057, |
|
"num_tokens": 13109419.0, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.22856598652413038, |
|
"grad_norm": 0.4163241386413574, |
|
"learning_rate": 4.908232322884261e-05, |
|
"loss": 1.2938, |
|
"mean_token_accuracy": 0.6871387459337711, |
|
"num_tokens": 13335797.0, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.23237541963286587, |
|
"grad_norm": 0.7546857595443726, |
|
"learning_rate": 4.907211450795045e-05, |
|
"loss": 1.2872, |
|
"mean_token_accuracy": 0.6850869571790099, |
|
"num_tokens": 13565988.0, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.2361848527416014, |
|
"grad_norm": 0.6345241069793701, |
|
"learning_rate": 4.906145864184325e-05, |
|
"loss": 1.3234, |
|
"mean_token_accuracy": 0.6739947069436312, |
|
"num_tokens": 13779004.0, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.23999428585033689, |
|
"grad_norm": 0.5710681080818176, |
|
"learning_rate": 4.905035583549459e-05, |
|
"loss": 1.3, |
|
"mean_token_accuracy": 0.6780852235853672, |
|
"num_tokens": 14005597.0, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.2438037189590724, |
|
"grad_norm": 0.5527021288871765, |
|
"learning_rate": 4.903880630247529e-05, |
|
"loss": 1.3019, |
|
"mean_token_accuracy": 0.6793860571458936, |
|
"num_tokens": 14235179.0, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.2476131520678079, |
|
"grad_norm": 0.4566735327243805, |
|
"learning_rate": 4.902681026494929e-05, |
|
"loss": 1.2927, |
|
"mean_token_accuracy": 0.6824004529044032, |
|
"num_tokens": 14464950.0, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.2514225851765434, |
|
"grad_norm": 0.5023188591003418, |
|
"learning_rate": 4.901436795366938e-05, |
|
"loss": 1.2996, |
|
"mean_token_accuracy": 0.6823519911617041, |
|
"num_tokens": 14686771.0, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.2552320182852789, |
|
"grad_norm": 0.6048426628112793, |
|
"learning_rate": 4.900147960797276e-05, |
|
"loss": 1.3523, |
|
"mean_token_accuracy": 0.6745322810485959, |
|
"num_tokens": 14896388.0, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.2590414513940144, |
|
"grad_norm": 0.5594220757484436, |
|
"learning_rate": 4.898814547577645e-05, |
|
"loss": 1.2733, |
|
"mean_token_accuracy": 0.685325993783772, |
|
"num_tokens": 15115431.0, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.26285088450274996, |
|
"grad_norm": 0.5266786813735962, |
|
"learning_rate": 4.89743658135725e-05, |
|
"loss": 1.378, |
|
"mean_token_accuracy": 0.661252242513001, |
|
"num_tokens": 15318046.0, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.26666031761148545, |
|
"grad_norm": 0.33945998549461365, |
|
"learning_rate": 4.896014088642304e-05, |
|
"loss": 1.2849, |
|
"mean_token_accuracy": 0.68093207962811, |
|
"num_tokens": 15546171.0, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.27046975072022095, |
|
"grad_norm": 0.38815465569496155, |
|
"learning_rate": 4.8945470967955234e-05, |
|
"loss": 1.3622, |
|
"mean_token_accuracy": 0.6677591029554606, |
|
"num_tokens": 15763709.0, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.27427918382895644, |
|
"grad_norm": 0.5876178741455078, |
|
"learning_rate": 4.8930356340355986e-05, |
|
"loss": 1.2209, |
|
"mean_token_accuracy": 0.7016260443255306, |
|
"num_tokens": 15994580.0, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.27808861693769193, |
|
"grad_norm": 0.6305630207061768, |
|
"learning_rate": 4.8914797294366514e-05, |
|
"loss": 1.3553, |
|
"mean_token_accuracy": 0.6636421175673604, |
|
"num_tokens": 16208325.0, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.2818980500464275, |
|
"grad_norm": 0.47434547543525696, |
|
"learning_rate": 4.889879412927675e-05, |
|
"loss": 1.3085, |
|
"mean_token_accuracy": 0.6797755250707269, |
|
"num_tokens": 16428418.0, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.285707483155163, |
|
"grad_norm": 0.5395858883857727, |
|
"learning_rate": 4.888234715291958e-05, |
|
"loss": 1.2705, |
|
"mean_token_accuracy": 0.6883305061608553, |
|
"num_tokens": 16658226.0, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2871360205709388, |
|
"eval_mean_token_accuracy": 0.6258668910906496, |
|
"eval_num_tokens": 16745761.0, |
|
"eval_test_loss": 1.8088411092758179, |
|
"eval_test_runtime": 94.958, |
|
"eval_test_samples_per_second": 20.967, |
|
"eval_test_steps_per_second": 10.489, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.28951691626389847, |
|
"grad_norm": 0.5518168210983276, |
|
"learning_rate": 4.8865456681664965e-05, |
|
"loss": 1.2997, |
|
"mean_token_accuracy": 0.6776028882712126, |
|
"num_tokens": 16876039.0, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.29332634937263397, |
|
"grad_norm": 0.6039868593215942, |
|
"learning_rate": 4.88481230404138e-05, |
|
"loss": 1.2621, |
|
"mean_token_accuracy": 0.6928930662572383, |
|
"num_tokens": 17104229.0, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.2971357824813695, |
|
"grad_norm": 0.5715156197547913, |
|
"learning_rate": 4.883034656259167e-05, |
|
"loss": 1.3461, |
|
"mean_token_accuracy": 0.6714908115565776, |
|
"num_tokens": 17312134.0, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.300945215590105, |
|
"grad_norm": 0.5405182838439941, |
|
"learning_rate": 4.8812127590142505e-05, |
|
"loss": 1.2837, |
|
"mean_token_accuracy": 0.6896362887695432, |
|
"num_tokens": 17550198.0, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.3047546486988405, |
|
"grad_norm": 0.5504481792449951, |
|
"learning_rate": 4.8793466473521904e-05, |
|
"loss": 1.2181, |
|
"mean_token_accuracy": 0.6988515704870224, |
|
"num_tokens": 17778838.0, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.308564081807576, |
|
"grad_norm": 0.621837317943573, |
|
"learning_rate": 4.877436357169048e-05, |
|
"loss": 1.3323, |
|
"mean_token_accuracy": 0.6710341470316052, |
|
"num_tokens": 17985533.0, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.3123735149163115, |
|
"grad_norm": 0.6566672325134277, |
|
"learning_rate": 4.8754819252106876e-05, |
|
"loss": 1.2744, |
|
"mean_token_accuracy": 0.683073416352272, |
|
"num_tokens": 18208416.0, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.31618294802504704, |
|
"grad_norm": 0.5615633130073547, |
|
"learning_rate": 4.873483389072076e-05, |
|
"loss": 1.315, |
|
"mean_token_accuracy": 0.684354678913951, |
|
"num_tokens": 18426223.0, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.31999238113378253, |
|
"grad_norm": 0.4581094980239868, |
|
"learning_rate": 4.871440787196558e-05, |
|
"loss": 1.3269, |
|
"mean_token_accuracy": 0.6761635078117252, |
|
"num_tokens": 18635703.0, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.323801814242518, |
|
"grad_norm": 0.5478665232658386, |
|
"learning_rate": 4.8693541588751144e-05, |
|
"loss": 1.2554, |
|
"mean_token_accuracy": 0.688594707287848, |
|
"num_tokens": 18859074.0, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.3276112473512535, |
|
"grad_norm": 0.39972299337387085, |
|
"learning_rate": 4.867223544245607e-05, |
|
"loss": 1.2497, |
|
"mean_token_accuracy": 0.6876592069864274, |
|
"num_tokens": 19084486.0, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.33142068045998907, |
|
"grad_norm": 0.6696220636367798, |
|
"learning_rate": 4.865048984292009e-05, |
|
"loss": 1.2776, |
|
"mean_token_accuracy": 0.6786478063091635, |
|
"num_tokens": 19305228.0, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.33523011356872456, |
|
"grad_norm": 0.5517823100090027, |
|
"learning_rate": 4.8628305208436137e-05, |
|
"loss": 1.2922, |
|
"mean_token_accuracy": 0.6886478485539556, |
|
"num_tokens": 19538363.0, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.33903954667746006, |
|
"grad_norm": 0.44766560196876526, |
|
"learning_rate": 4.860568196574232e-05, |
|
"loss": 1.2851, |
|
"mean_token_accuracy": 0.6843299470841885, |
|
"num_tokens": 19774720.0, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.34284897978619555, |
|
"grad_norm": 0.38793668150901794, |
|
"learning_rate": 4.858262055001371e-05, |
|
"loss": 1.3057, |
|
"mean_token_accuracy": 0.6804726634174585, |
|
"num_tokens": 19998879.0, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.3466584128949311, |
|
"grad_norm": 0.45702189207077026, |
|
"learning_rate": 4.8559121404853944e-05, |
|
"loss": 1.1826, |
|
"mean_token_accuracy": 0.7049577981233597, |
|
"num_tokens": 20243495.0, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.3504678460036666, |
|
"grad_norm": 0.5189054608345032, |
|
"learning_rate": 4.853518498228674e-05, |
|
"loss": 1.2659, |
|
"mean_token_accuracy": 0.6826557310298085, |
|
"num_tokens": 20462476.0, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.3542772791124021, |
|
"grad_norm": 0.4221845269203186, |
|
"learning_rate": 4.851081174274715e-05, |
|
"loss": 1.3576, |
|
"mean_token_accuracy": 0.6631780494004488, |
|
"num_tokens": 20679798.0, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.3580867122211376, |
|
"grad_norm": 0.4048372209072113, |
|
"learning_rate": 4.8486002155072754e-05, |
|
"loss": 1.2332, |
|
"mean_token_accuracy": 0.7002108607441186, |
|
"num_tokens": 20913935.0, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.3618961453298731, |
|
"grad_norm": 0.5621294379234314, |
|
"learning_rate": 4.8460756696494594e-05, |
|
"loss": 1.2625, |
|
"mean_token_accuracy": 0.6872183892875909, |
|
"num_tokens": 21138524.0, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.3657055784386086, |
|
"grad_norm": 0.5211415886878967, |
|
"learning_rate": 4.843507585262804e-05, |
|
"loss": 1.2533, |
|
"mean_token_accuracy": 0.6849848669022321, |
|
"num_tokens": 21373326.0, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.3695150115473441, |
|
"grad_norm": 0.551052987575531, |
|
"learning_rate": 4.8408960117463386e-05, |
|
"loss": 1.304, |
|
"mean_token_accuracy": 0.6765170315280556, |
|
"num_tokens": 21590656.0, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.3733244446560796, |
|
"grad_norm": 0.5423365831375122, |
|
"learning_rate": 4.838240999335643e-05, |
|
"loss": 1.3441, |
|
"mean_token_accuracy": 0.6701787773519754, |
|
"num_tokens": 21795838.0, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.3771338777648151, |
|
"grad_norm": 0.46691930294036865, |
|
"learning_rate": 4.835542599101873e-05, |
|
"loss": 1.3163, |
|
"mean_token_accuracy": 0.6742871653288602, |
|
"num_tokens": 22007062.0, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.38094331087355066, |
|
"grad_norm": 0.6379289627075195, |
|
"learning_rate": 4.8328008629507863e-05, |
|
"loss": 1.3094, |
|
"mean_token_accuracy": 0.6776679191738367, |
|
"num_tokens": 22232306.0, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3828480274279184, |
|
"eval_mean_token_accuracy": 0.6281169285496555, |
|
"eval_num_tokens": 22348477.0, |
|
"eval_test_loss": 1.793502926826477, |
|
"eval_test_runtime": 95.0461, |
|
"eval_test_samples_per_second": 20.948, |
|
"eval_test_steps_per_second": 10.479, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.38475274398228615, |
|
"grad_norm": 0.6131725311279297, |
|
"learning_rate": 4.830015843621735e-05, |
|
"loss": 1.2705, |
|
"mean_token_accuracy": 0.6874665239825845, |
|
"num_tokens": 22461894.0, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.38856217709102164, |
|
"grad_norm": 0.4669947326183319, |
|
"learning_rate": 4.8271875946866574e-05, |
|
"loss": 1.3255, |
|
"mean_token_accuracy": 0.6841361073777079, |
|
"num_tokens": 22684394.0, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.39237161019975714, |
|
"grad_norm": 0.42204299569129944, |
|
"learning_rate": 4.824316170549047e-05, |
|
"loss": 1.2425, |
|
"mean_token_accuracy": 0.6886863965541125, |
|
"num_tokens": 22918777.0, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.39618104330849263, |
|
"grad_norm": 0.5980663895606995, |
|
"learning_rate": 4.821401626442903e-05, |
|
"loss": 1.2868, |
|
"mean_token_accuracy": 0.6745259683579207, |
|
"num_tokens": 23139486.0, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.3999904764172282, |
|
"grad_norm": 0.46430954337120056, |
|
"learning_rate": 4.8184440184316695e-05, |
|
"loss": 1.228, |
|
"mean_token_accuracy": 0.6927272006869316, |
|
"num_tokens": 23372563.0, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.4037999095259637, |
|
"grad_norm": 0.5982958078384399, |
|
"learning_rate": 4.815443403407159e-05, |
|
"loss": 1.2377, |
|
"mean_token_accuracy": 0.687408297508955, |
|
"num_tokens": 23601002.0, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.40760934263469917, |
|
"grad_norm": 0.5242788195610046, |
|
"learning_rate": 4.812399839088453e-05, |
|
"loss": 1.2926, |
|
"mean_token_accuracy": 0.6808499401435256, |
|
"num_tokens": 23814142.0, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.41141877574343466, |
|
"grad_norm": 0.44663509726524353, |
|
"learning_rate": 4.809313384020799e-05, |
|
"loss": 1.2768, |
|
"mean_token_accuracy": 0.6787567920982838, |
|
"num_tokens": 24034008.0, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.4152282088521702, |
|
"grad_norm": 0.571322500705719, |
|
"learning_rate": 4.806184097574478e-05, |
|
"loss": 1.299, |
|
"mean_token_accuracy": 0.6753746373578906, |
|
"num_tokens": 24241179.0, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.4190376419609057, |
|
"grad_norm": 0.6444421410560608, |
|
"learning_rate": 4.8030120399436636e-05, |
|
"loss": 1.2922, |
|
"mean_token_accuracy": 0.6790545796975493, |
|
"num_tokens": 24463001.0, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.4228470750696412, |
|
"grad_norm": 0.5884628891944885, |
|
"learning_rate": 4.799797272145267e-05, |
|
"loss": 1.3354, |
|
"mean_token_accuracy": 0.6655034508556128, |
|
"num_tokens": 24672669.0, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.4266565081783767, |
|
"grad_norm": 0.6725838780403137, |
|
"learning_rate": 4.796539856017762e-05, |
|
"loss": 1.2866, |
|
"mean_token_accuracy": 0.6829743100330233, |
|
"num_tokens": 24895076.0, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.4304659412871122, |
|
"grad_norm": 0.5089847445487976, |
|
"learning_rate": 4.7932398542199935e-05, |
|
"loss": 1.3312, |
|
"mean_token_accuracy": 0.6777703372761608, |
|
"num_tokens": 25105078.0, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.43427537439584774, |
|
"grad_norm": 0.6170702576637268, |
|
"learning_rate": 4.7898973302299746e-05, |
|
"loss": 1.2308, |
|
"mean_token_accuracy": 0.6913123942911625, |
|
"num_tokens": 25330411.0, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.43808480750458323, |
|
"grad_norm": 0.47749921679496765, |
|
"learning_rate": 4.786512348343664e-05, |
|
"loss": 1.2394, |
|
"mean_token_accuracy": 0.6930743562057614, |
|
"num_tokens": 25558075.0, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.4418942406133187, |
|
"grad_norm": 0.5179374814033508, |
|
"learning_rate": 4.783084973673732e-05, |
|
"loss": 1.3121, |
|
"mean_token_accuracy": 0.6737616384401918, |
|
"num_tokens": 25787633.0, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.4457036737220542, |
|
"grad_norm": 0.4957481920719147, |
|
"learning_rate": 4.7796152721483024e-05, |
|
"loss": 1.2679, |
|
"mean_token_accuracy": 0.6782409870997071, |
|
"num_tokens": 26010384.0, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.44951310683078977, |
|
"grad_norm": 0.46905845403671265, |
|
"learning_rate": 4.776103310509691e-05, |
|
"loss": 1.3583, |
|
"mean_token_accuracy": 0.6669555108994245, |
|
"num_tokens": 26220009.0, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.45332253993952526, |
|
"grad_norm": 0.6176592707633972, |
|
"learning_rate": 4.7725491563131164e-05, |
|
"loss": 1.3376, |
|
"mean_token_accuracy": 0.6773041613399983, |
|
"num_tokens": 26437068.0, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.45713197304826075, |
|
"grad_norm": 0.641156017780304, |
|
"learning_rate": 4.7689528779254057e-05, |
|
"loss": 1.338, |
|
"mean_token_accuracy": 0.6686659481376409, |
|
"num_tokens": 26646369.0, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.46094140615699625, |
|
"grad_norm": 0.5560479760169983, |
|
"learning_rate": 4.7653145445236725e-05, |
|
"loss": 1.3156, |
|
"mean_token_accuracy": 0.6741218714043498, |
|
"num_tokens": 26872767.0, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.46475083926573174, |
|
"grad_norm": 0.6000213027000427, |
|
"learning_rate": 4.761634226093993e-05, |
|
"loss": 1.2637, |
|
"mean_token_accuracy": 0.6870461646467447, |
|
"num_tokens": 27092463.0, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.4685602723744673, |
|
"grad_norm": 0.3717496693134308, |
|
"learning_rate": 4.757911993430057e-05, |
|
"loss": 1.3356, |
|
"mean_token_accuracy": 0.6666051434352994, |
|
"num_tokens": 27314300.0, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.4723697054832028, |
|
"grad_norm": 0.839055061340332, |
|
"learning_rate": 4.754147918131804e-05, |
|
"loss": 1.2933, |
|
"mean_token_accuracy": 0.6818767448887229, |
|
"num_tokens": 27534317.0, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.4761791385919383, |
|
"grad_norm": 0.5968820452690125, |
|
"learning_rate": 4.7503420726040496e-05, |
|
"loss": 1.3218, |
|
"mean_token_accuracy": 0.6756005242466927, |
|
"num_tokens": 27744710.0, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.47856003428489796, |
|
"eval_mean_token_accuracy": 0.6309991637506638, |
|
"eval_num_tokens": 27879664.0, |
|
"eval_test_loss": 1.7806874513626099, |
|
"eval_test_runtime": 94.8719, |
|
"eval_test_samples_per_second": 20.986, |
|
"eval_test_steps_per_second": 10.498, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 0.47998857170067377, |
|
"grad_norm": 0.555782675743103, |
|
"learning_rate": 4.74649453005509e-05, |
|
"loss": 1.2746, |
|
"mean_token_accuracy": 0.684971004165709, |
|
"num_tokens": 27964540.0, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.4837980048094093, |
|
"grad_norm": 0.5832619071006775, |
|
"learning_rate": 4.742605364495298e-05, |
|
"loss": 1.2391, |
|
"mean_token_accuracy": 0.691305941529572, |
|
"num_tokens": 28195211.0, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.4876074379181448, |
|
"grad_norm": 0.4528619349002838, |
|
"learning_rate": 4.738674650735692e-05, |
|
"loss": 1.2295, |
|
"mean_token_accuracy": 0.683625971712172, |
|
"num_tokens": 28427726.0, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.4914168710268803, |
|
"grad_norm": 0.8959270119667053, |
|
"learning_rate": 4.734702464386503e-05, |
|
"loss": 1.3764, |
|
"mean_token_accuracy": 0.6636357877403498, |
|
"num_tokens": 28631998.0, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.4952263041356158, |
|
"grad_norm": 0.43296220898628235, |
|
"learning_rate": 4.73068888185572e-05, |
|
"loss": 1.2929, |
|
"mean_token_accuracy": 0.6792870191857219, |
|
"num_tokens": 28846791.0, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.49903573724435135, |
|
"grad_norm": 0.722977340221405, |
|
"learning_rate": 4.726633980347616e-05, |
|
"loss": 1.3176, |
|
"mean_token_accuracy": 0.6724835351109505, |
|
"num_tokens": 29054506.0, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.5028451703530868, |
|
"grad_norm": 0.41740506887435913, |
|
"learning_rate": 4.722537837861267e-05, |
|
"loss": 1.2455, |
|
"mean_token_accuracy": 0.688827583193779, |
|
"num_tokens": 29270620.0, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.5066546034618223, |
|
"grad_norm": 0.38050729036331177, |
|
"learning_rate": 4.718400533189051e-05, |
|
"loss": 1.2743, |
|
"mean_token_accuracy": 0.6829100279137492, |
|
"num_tokens": 29490565.0, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.5104640365705578, |
|
"grad_norm": 0.5724825263023376, |
|
"learning_rate": 4.7142221459151294e-05, |
|
"loss": 1.2773, |
|
"mean_token_accuracy": 0.6881643293425441, |
|
"num_tokens": 29704265.0, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.5142734696792933, |
|
"grad_norm": 0.6543896198272705, |
|
"learning_rate": 4.7100027564139196e-05, |
|
"loss": 1.3307, |
|
"mean_token_accuracy": 0.6777999725192785, |
|
"num_tokens": 29924515.0, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.5180829027880288, |
|
"grad_norm": 0.43163400888442993, |
|
"learning_rate": 4.705742445848548e-05, |
|
"loss": 1.2653, |
|
"mean_token_accuracy": 0.682813161984086, |
|
"num_tokens": 30148026.0, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.5218923358967643, |
|
"grad_norm": 0.47830525040626526, |
|
"learning_rate": 4.7014412961692864e-05, |
|
"loss": 1.1955, |
|
"mean_token_accuracy": 0.7004892747849226, |
|
"num_tokens": 30372129.0, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.5257017690054999, |
|
"grad_norm": 0.5903618931770325, |
|
"learning_rate": 4.697099390111981e-05, |
|
"loss": 1.2973, |
|
"mean_token_accuracy": 0.6818130781874061, |
|
"num_tokens": 30596623.0, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.5295112021142354, |
|
"grad_norm": 0.43726322054862976, |
|
"learning_rate": 4.6927168111964555e-05, |
|
"loss": 1.2095, |
|
"mean_token_accuracy": 0.696308933570981, |
|
"num_tokens": 30834116.0, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.5333206352229709, |
|
"grad_norm": 0.7869614958763123, |
|
"learning_rate": 4.6882936437249056e-05, |
|
"loss": 1.2275, |
|
"mean_token_accuracy": 0.6923372825607658, |
|
"num_tokens": 31066239.0, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.5371300683317064, |
|
"grad_norm": 0.4349427819252014, |
|
"learning_rate": 4.6838299727802786e-05, |
|
"loss": 1.2973, |
|
"mean_token_accuracy": 0.688424677029252, |
|
"num_tokens": 31302823.0, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.5409395014404419, |
|
"grad_norm": 0.5851991772651672, |
|
"learning_rate": 4.67932588422464e-05, |
|
"loss": 1.237, |
|
"mean_token_accuracy": 0.6886244036257267, |
|
"num_tokens": 31530684.0, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.5447489345491774, |
|
"grad_norm": 0.6614839434623718, |
|
"learning_rate": 4.6747814646975134e-05, |
|
"loss": 1.3017, |
|
"mean_token_accuracy": 0.6840987723320723, |
|
"num_tokens": 31741759.0, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.5485583676579129, |
|
"grad_norm": 0.4596734642982483, |
|
"learning_rate": 4.670196801614224e-05, |
|
"loss": 1.1938, |
|
"mean_token_accuracy": 0.7047518376260996, |
|
"num_tokens": 31987926.0, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.5523678007666484, |
|
"grad_norm": 0.5416800379753113, |
|
"learning_rate": 4.665571983164207e-05, |
|
"loss": 1.2874, |
|
"mean_token_accuracy": 0.6859219571575522, |
|
"num_tokens": 32213674.0, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.5561772338753839, |
|
"grad_norm": 0.4644794464111328, |
|
"learning_rate": 4.660907098309319e-05, |
|
"loss": 1.2893, |
|
"mean_token_accuracy": 0.6782131699845195, |
|
"num_tokens": 32429679.0, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.5599866669841195, |
|
"grad_norm": 0.5027694702148438, |
|
"learning_rate": 4.6562022367821244e-05, |
|
"loss": 1.2955, |
|
"mean_token_accuracy": 0.6798365904018283, |
|
"num_tokens": 32646061.0, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.563796100092855, |
|
"grad_norm": 0.5148358345031738, |
|
"learning_rate": 4.651457489084167e-05, |
|
"loss": 1.2533, |
|
"mean_token_accuracy": 0.684299885481596, |
|
"num_tokens": 32867990.0, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.5676055332015905, |
|
"grad_norm": 0.4006965756416321, |
|
"learning_rate": 4.646672946484232e-05, |
|
"loss": 1.2246, |
|
"mean_token_accuracy": 0.6978412168100476, |
|
"num_tokens": 33100846.0, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.571414966310326, |
|
"grad_norm": 0.5946612358093262, |
|
"learning_rate": 4.641848701016592e-05, |
|
"loss": 1.3112, |
|
"mean_token_accuracy": 0.682670296356082, |
|
"num_tokens": 33316830.0, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5742720411418776, |
|
"eval_mean_token_accuracy": 0.6325134164597614, |
|
"eval_num_tokens": 33490786.0, |
|
"eval_test_loss": 1.7651844024658203, |
|
"eval_test_runtime": 95.3517, |
|
"eval_test_samples_per_second": 20.881, |
|
"eval_test_steps_per_second": 10.446, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.5752243994190614, |
|
"grad_norm": 0.5004485249519348, |
|
"learning_rate": 4.636984845479229e-05, |
|
"loss": 1.3189, |
|
"mean_token_accuracy": 0.6787041410803795, |
|
"num_tokens": 33547057.0, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.5790338325277969, |
|
"grad_norm": 0.6013615727424622, |
|
"learning_rate": 4.6320814734320574e-05, |
|
"loss": 1.2999, |
|
"mean_token_accuracy": 0.6829198809340596, |
|
"num_tokens": 33751128.0, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.5828432656365324, |
|
"grad_norm": 0.46301329135894775, |
|
"learning_rate": 4.627138679195122e-05, |
|
"loss": 1.2979, |
|
"mean_token_accuracy": 0.6834639564156533, |
|
"num_tokens": 33958374.0, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.5866526987452679, |
|
"grad_norm": 0.5537028908729553, |
|
"learning_rate": 4.622156557846782e-05, |
|
"loss": 1.2919, |
|
"mean_token_accuracy": 0.6853473074734211, |
|
"num_tokens": 34173776.0, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.5904621318540034, |
|
"grad_norm": 0.5172299146652222, |
|
"learning_rate": 4.617135205221882e-05, |
|
"loss": 1.2535, |
|
"mean_token_accuracy": 0.6891471687704325, |
|
"num_tokens": 34402012.0, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.594271564962739, |
|
"grad_norm": 0.45443201065063477, |
|
"learning_rate": 4.6120747179099115e-05, |
|
"loss": 1.3631, |
|
"mean_token_accuracy": 0.6746680632233619, |
|
"num_tokens": 34619563.0, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.5980809980714745, |
|
"grad_norm": 0.5318917632102966, |
|
"learning_rate": 4.606975193253145e-05, |
|
"loss": 1.2875, |
|
"mean_token_accuracy": 0.6842667568475008, |
|
"num_tokens": 34842995.0, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.60189043118021, |
|
"grad_norm": 0.41732507944107056, |
|
"learning_rate": 4.6018367293447696e-05, |
|
"loss": 1.2011, |
|
"mean_token_accuracy": 0.6991326250135899, |
|
"num_tokens": 35074476.0, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.6056998642889455, |
|
"grad_norm": 0.4196927845478058, |
|
"learning_rate": 4.5966594250269964e-05, |
|
"loss": 1.3135, |
|
"mean_token_accuracy": 0.6805538948625326, |
|
"num_tokens": 35296363.0, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.609509297397681, |
|
"grad_norm": 0.42691197991371155, |
|
"learning_rate": 4.5914433798891605e-05, |
|
"loss": 1.246, |
|
"mean_token_accuracy": 0.6876704445108771, |
|
"num_tokens": 35523754.0, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.6133187305064165, |
|
"grad_norm": 0.3611372113227844, |
|
"learning_rate": 4.5861886942658106e-05, |
|
"loss": 1.2269, |
|
"mean_token_accuracy": 0.6875138944014907, |
|
"num_tokens": 35758849.0, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.617128163615152, |
|
"grad_norm": 0.7133765816688538, |
|
"learning_rate": 4.580895469234769e-05, |
|
"loss": 1.2242, |
|
"mean_token_accuracy": 0.6886107694357634, |
|
"num_tokens": 35988842.0, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.6209375967238875, |
|
"grad_norm": 0.44684138894081116, |
|
"learning_rate": 4.575563806615196e-05, |
|
"loss": 1.2675, |
|
"mean_token_accuracy": 0.6812176534906029, |
|
"num_tokens": 36201602.0, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.624747029832623, |
|
"grad_norm": 0.5846595168113708, |
|
"learning_rate": 4.5701938089656256e-05, |
|
"loss": 1.2703, |
|
"mean_token_accuracy": 0.6853839591145515, |
|
"num_tokens": 36403261.0, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.6285564629413586, |
|
"grad_norm": 0.3759899437427521, |
|
"learning_rate": 4.5647855795819943e-05, |
|
"loss": 1.3111, |
|
"mean_token_accuracy": 0.6767927626147866, |
|
"num_tokens": 36619797.0, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.6323658960500941, |
|
"grad_norm": 0.6737694144248962, |
|
"learning_rate": 4.5593392224956576e-05, |
|
"loss": 1.3165, |
|
"mean_token_accuracy": 0.6752157468348742, |
|
"num_tokens": 36828768.0, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.6361753291588296, |
|
"grad_norm": 0.5695846676826477, |
|
"learning_rate": 4.5538548424713835e-05, |
|
"loss": 1.2397, |
|
"mean_token_accuracy": 0.6919887445867061, |
|
"num_tokens": 37053251.0, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 0.6399847622675651, |
|
"grad_norm": 0.5237749218940735, |
|
"learning_rate": 4.5483325450053406e-05, |
|
"loss": 1.2553, |
|
"mean_token_accuracy": 0.6820298057049513, |
|
"num_tokens": 37268972.0, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.6437941953763006, |
|
"grad_norm": 0.6554339528083801, |
|
"learning_rate": 4.5427724363230683e-05, |
|
"loss": 1.2682, |
|
"mean_token_accuracy": 0.6873216938227416, |
|
"num_tokens": 37492541.0, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 0.647603628485036, |
|
"grad_norm": 0.5178593397140503, |
|
"learning_rate": 4.537174623377432e-05, |
|
"loss": 1.2537, |
|
"mean_token_accuracy": 0.6826527412980795, |
|
"num_tokens": 37705009.0, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.6514130615937715, |
|
"grad_norm": 0.4857373833656311, |
|
"learning_rate": 4.53153921384657e-05, |
|
"loss": 1.3497, |
|
"mean_token_accuracy": 0.6726470049470663, |
|
"num_tokens": 37915354.0, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.655222494702507, |
|
"grad_norm": 0.48057419061660767, |
|
"learning_rate": 4.5258663161318136e-05, |
|
"loss": 1.2267, |
|
"mean_token_accuracy": 0.6866666225716471, |
|
"num_tokens": 38146196.0, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 0.6590319278112425, |
|
"grad_norm": 0.5138698220252991, |
|
"learning_rate": 4.5201560393556134e-05, |
|
"loss": 1.2439, |
|
"mean_token_accuracy": 0.6901180801913143, |
|
"num_tokens": 38373540.0, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 0.6628413609199781, |
|
"grad_norm": 0.47547754645347595, |
|
"learning_rate": 4.5144084933594303e-05, |
|
"loss": 1.2706, |
|
"mean_token_accuracy": 0.6800282135605812, |
|
"num_tokens": 38589036.0, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.6666507940287136, |
|
"grad_norm": 0.6765561103820801, |
|
"learning_rate": 4.508623788701628e-05, |
|
"loss": 1.2929, |
|
"mean_token_accuracy": 0.6894892951473593, |
|
"num_tokens": 38802771.0, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.6699840479988571, |
|
"eval_mean_token_accuracy": 0.6329328228970129, |
|
"eval_num_tokens": 38980983.0, |
|
"eval_test_loss": 1.7534339427947998, |
|
"eval_test_runtime": 95.0805, |
|
"eval_test_samples_per_second": 20.94, |
|
"eval_test_steps_per_second": 10.475, |
|
"step": 7035 |
|
}, |
|
{ |
|
"epoch": 0.6704602271374491, |
|
"grad_norm": 0.40170061588287354, |
|
"learning_rate": 4.502802036655346e-05, |
|
"loss": 1.3205, |
|
"mean_token_accuracy": 0.675545847415924, |
|
"num_tokens": 39011417.0, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.6742696602461846, |
|
"grad_norm": 0.49673372507095337, |
|
"learning_rate": 4.4969433492063564e-05, |
|
"loss": 1.256, |
|
"mean_token_accuracy": 0.6876663960516453, |
|
"num_tokens": 39231629.0, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.6780790933549201, |
|
"grad_norm": 0.45921561121940613, |
|
"learning_rate": 4.491047839050912e-05, |
|
"loss": 1.2542, |
|
"mean_token_accuracy": 0.6906937116757035, |
|
"num_tokens": 39457882.0, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.6818885264636556, |
|
"grad_norm": 0.49605095386505127, |
|
"learning_rate": 4.4851156195935785e-05, |
|
"loss": 1.3266, |
|
"mean_token_accuracy": 0.673082504235208, |
|
"num_tokens": 39675712.0, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 0.6856979595723911, |
|
"grad_norm": 0.5613274574279785, |
|
"learning_rate": 4.479146804945053e-05, |
|
"loss": 1.3061, |
|
"mean_token_accuracy": 0.6837763454765081, |
|
"num_tokens": 39892449.0, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.6895073926811266, |
|
"grad_norm": 0.40411025285720825, |
|
"learning_rate": 4.473141509919966e-05, |
|
"loss": 1.2459, |
|
"mean_token_accuracy": 0.6935942692682147, |
|
"num_tokens": 40122187.0, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 0.6933168257898622, |
|
"grad_norm": 0.66608726978302, |
|
"learning_rate": 4.4670998500346795e-05, |
|
"loss": 1.263, |
|
"mean_token_accuracy": 0.6886698173359036, |
|
"num_tokens": 40339718.0, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 0.6971262588985977, |
|
"grad_norm": 0.5249150395393372, |
|
"learning_rate": 4.461021941505057e-05, |
|
"loss": 1.2077, |
|
"mean_token_accuracy": 0.6970062265172601, |
|
"num_tokens": 40568071.0, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 0.7009356920073332, |
|
"grad_norm": 0.5321863293647766, |
|
"learning_rate": 4.454907901244236e-05, |
|
"loss": 1.2908, |
|
"mean_token_accuracy": 0.6871679758653044, |
|
"num_tokens": 40781022.0, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 0.7047451251160687, |
|
"grad_norm": 0.5298981666564941, |
|
"learning_rate": 4.44875784686037e-05, |
|
"loss": 1.2256, |
|
"mean_token_accuracy": 0.6957620535045862, |
|
"num_tokens": 41015922.0, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.7085545582248042, |
|
"grad_norm": 0.6805555820465088, |
|
"learning_rate": 4.442571896654375e-05, |
|
"loss": 1.2059, |
|
"mean_token_accuracy": 0.6951308185234666, |
|
"num_tokens": 41247452.0, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 0.7123639913335397, |
|
"grad_norm": 0.3826749622821808, |
|
"learning_rate": 4.4363501696176494e-05, |
|
"loss": 1.2866, |
|
"mean_token_accuracy": 0.6792947178706527, |
|
"num_tokens": 41470415.0, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 0.7161734244422752, |
|
"grad_norm": 0.5363141298294067, |
|
"learning_rate": 4.4300927854297856e-05, |
|
"loss": 1.2691, |
|
"mean_token_accuracy": 0.682473830319941, |
|
"num_tokens": 41701000.0, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 0.7199828575510107, |
|
"grad_norm": 0.5626250505447388, |
|
"learning_rate": 4.423799864456266e-05, |
|
"loss": 1.3133, |
|
"mean_token_accuracy": 0.6788626465946436, |
|
"num_tokens": 41915333.0, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 0.7237922906597462, |
|
"grad_norm": 0.6899204850196838, |
|
"learning_rate": 4.417471527746152e-05, |
|
"loss": 1.365, |
|
"mean_token_accuracy": 0.6754604885354638, |
|
"num_tokens": 42139999.0, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.7276017237684818, |
|
"grad_norm": 0.5523748993873596, |
|
"learning_rate": 4.411107897029755e-05, |
|
"loss": 1.2776, |
|
"mean_token_accuracy": 0.6912444988265634, |
|
"num_tokens": 42367874.0, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 0.7314111568772173, |
|
"grad_norm": 0.5411382913589478, |
|
"learning_rate": 4.404709094716289e-05, |
|
"loss": 1.2326, |
|
"mean_token_accuracy": 0.6946828311309219, |
|
"num_tokens": 42594804.0, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 0.7352205899859527, |
|
"grad_norm": 0.6533775329589844, |
|
"learning_rate": 4.398275243891522e-05, |
|
"loss": 1.2828, |
|
"mean_token_accuracy": 0.6842706995084882, |
|
"num_tokens": 42814558.0, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 0.7390300230946882, |
|
"grad_norm": 0.3408315181732178, |
|
"learning_rate": 4.391806468315408e-05, |
|
"loss": 1.2614, |
|
"mean_token_accuracy": 0.690892388857901, |
|
"num_tokens": 43040642.0, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 0.7428394562034237, |
|
"grad_norm": 0.4586963355541229, |
|
"learning_rate": 4.385302892419702e-05, |
|
"loss": 1.2984, |
|
"mean_token_accuracy": 0.6800136685371398, |
|
"num_tokens": 43259728.0, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.7466488893121592, |
|
"grad_norm": 0.6477558016777039, |
|
"learning_rate": 4.37876464130557e-05, |
|
"loss": 1.3249, |
|
"mean_token_accuracy": 0.6777775203809142, |
|
"num_tokens": 43467264.0, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 0.7504583224208947, |
|
"grad_norm": 0.4374445080757141, |
|
"learning_rate": 4.3721918407411845e-05, |
|
"loss": 1.2729, |
|
"mean_token_accuracy": 0.689405850879848, |
|
"num_tokens": 43679068.0, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 0.7542677555296302, |
|
"grad_norm": 0.5325660109519958, |
|
"learning_rate": 4.3655846171592994e-05, |
|
"loss": 1.2863, |
|
"mean_token_accuracy": 0.6884377462789416, |
|
"num_tokens": 43899283.0, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.7580771886383657, |
|
"grad_norm": 0.7073720097541809, |
|
"learning_rate": 4.358943097654823e-05, |
|
"loss": 1.245, |
|
"mean_token_accuracy": 0.6904063617810607, |
|
"num_tokens": 44113559.0, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 0.7618866217471013, |
|
"grad_norm": 0.44697853922843933, |
|
"learning_rate": 4.3522674099823705e-05, |
|
"loss": 1.2699, |
|
"mean_token_accuracy": 0.6843698143959045, |
|
"num_tokens": 44334374.0, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.7656960548558368, |
|
"grad_norm": 0.4306088984012604, |
|
"learning_rate": 4.345557682553807e-05, |
|
"loss": 1.2319, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 0.7656960548558368, |
|
"eval_mean_token_accuracy": 0.6363147346070971, |
|
"eval_num_tokens": 44575460.0, |
|
"eval_test_loss": 1.7359126806259155, |
|
"eval_test_runtime": 95.9212, |
|
"eval_test_samples_per_second": 20.757, |
|
"eval_test_steps_per_second": 10.384, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 0.7695054879645723, |
|
"grad_norm": 0.6170753836631775, |
|
"learning_rate": 4.3388140444357795e-05, |
|
"loss": 1.2328, |
|
"mean_token_accuracy": 0.6931531462818384, |
|
"num_tokens": 44806038.0, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 0.7733149210733078, |
|
"grad_norm": 0.7038352489471436, |
|
"learning_rate": 4.332036625347232e-05, |
|
"loss": 1.2929, |
|
"mean_token_accuracy": 0.6844880169257521, |
|
"num_tokens": 45016584.0, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 0.7771243541820433, |
|
"grad_norm": 0.4224265515804291, |
|
"learning_rate": 4.325225555656911e-05, |
|
"loss": 1.2634, |
|
"mean_token_accuracy": 0.6930087611079216, |
|
"num_tokens": 45246046.0, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 0.7809337872907788, |
|
"grad_norm": 0.46587294340133667, |
|
"learning_rate": 4.3183809663808556e-05, |
|
"loss": 1.2605, |
|
"mean_token_accuracy": 0.687124558724463, |
|
"num_tokens": 45464284.0, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.7847432203995143, |
|
"grad_norm": 0.5252248644828796, |
|
"learning_rate": 4.311502989179882e-05, |
|
"loss": 1.2728, |
|
"mean_token_accuracy": 0.6883155029267073, |
|
"num_tokens": 45684249.0, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 0.7885526535082498, |
|
"grad_norm": 0.4695100486278534, |
|
"learning_rate": 4.304591756357046e-05, |
|
"loss": 1.27, |
|
"mean_token_accuracy": 0.6879516759887337, |
|
"num_tokens": 45919177.0, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 0.7923620866169853, |
|
"grad_norm": 0.46411824226379395, |
|
"learning_rate": 4.297647400855103e-05, |
|
"loss": 1.3028, |
|
"mean_token_accuracy": 0.683055117353797, |
|
"num_tokens": 46131789.0, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 0.7961715197257209, |
|
"grad_norm": 0.7392385601997375, |
|
"learning_rate": 4.290670056253944e-05, |
|
"loss": 1.173, |
|
"mean_token_accuracy": 0.6985200975090265, |
|
"num_tokens": 46348383.0, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 0.7999809528344564, |
|
"grad_norm": 0.5996558666229248, |
|
"learning_rate": 4.283659856768036e-05, |
|
"loss": 1.1934, |
|
"mean_token_accuracy": 0.6913962122052908, |
|
"num_tokens": 46572321.0, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.8037903859431919, |
|
"grad_norm": 0.6277273297309875, |
|
"learning_rate": 4.276616937243828e-05, |
|
"loss": 1.2401, |
|
"mean_token_accuracy": 0.6883677888661623, |
|
"num_tokens": 46798263.0, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 0.8075998190519273, |
|
"grad_norm": 0.7397225499153137, |
|
"learning_rate": 4.2695414331571673e-05, |
|
"loss": 1.3177, |
|
"mean_token_accuracy": 0.6755096849054099, |
|
"num_tokens": 47017444.0, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 0.8114092521606628, |
|
"grad_norm": 0.650918185710907, |
|
"learning_rate": 4.2624334806106894e-05, |
|
"loss": 1.3164, |
|
"mean_token_accuracy": 0.6786608980968595, |
|
"num_tokens": 47230158.0, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 0.8152186852693983, |
|
"grad_norm": 0.6894013285636902, |
|
"learning_rate": 4.255293216331197e-05, |
|
"loss": 1.2527, |
|
"mean_token_accuracy": 0.6907643361017108, |
|
"num_tokens": 47453276.0, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 0.8190281183781338, |
|
"grad_norm": 0.6085829138755798, |
|
"learning_rate": 4.2481207776670396e-05, |
|
"loss": 1.2798, |
|
"mean_token_accuracy": 0.6841305760666728, |
|
"num_tokens": 47662402.0, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.8228375514868693, |
|
"grad_norm": 0.47271421551704407, |
|
"learning_rate": 4.24091630258546e-05, |
|
"loss": 1.2038, |
|
"mean_token_accuracy": 0.6984878290444613, |
|
"num_tokens": 47889095.0, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.8266469845956048, |
|
"grad_norm": 0.6806793212890625, |
|
"learning_rate": 4.2336799296699454e-05, |
|
"loss": 1.1881, |
|
"mean_token_accuracy": 0.7019338620826602, |
|
"num_tokens": 48117494.0, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 0.8304564177043404, |
|
"grad_norm": 0.5494069457054138, |
|
"learning_rate": 4.2264117981175665e-05, |
|
"loss": 1.2331, |
|
"mean_token_accuracy": 0.6901240289211273, |
|
"num_tokens": 48340935.0, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 0.8342658508130759, |
|
"grad_norm": 0.6391497254371643, |
|
"learning_rate": 4.21911204773629e-05, |
|
"loss": 1.2149, |
|
"mean_token_accuracy": 0.6961538307368755, |
|
"num_tokens": 48573668.0, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 0.8380752839218114, |
|
"grad_norm": 0.3758852481842041, |
|
"learning_rate": 4.211780818942297e-05, |
|
"loss": 1.251, |
|
"mean_token_accuracy": 0.6919752093032002, |
|
"num_tokens": 48795852.0, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.8418847170305469, |
|
"grad_norm": 0.5446637868881226, |
|
"learning_rate": 4.2044182527572795e-05, |
|
"loss": 1.3, |
|
"mean_token_accuracy": 0.6852457968518137, |
|
"num_tokens": 49032650.0, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 0.8456941501392824, |
|
"grad_norm": 0.3522646427154541, |
|
"learning_rate": 4.197024490805727e-05, |
|
"loss": 1.2702, |
|
"mean_token_accuracy": 0.6935782097280025, |
|
"num_tokens": 49250248.0, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 0.8495035832480179, |
|
"grad_norm": 0.652208149433136, |
|
"learning_rate": 4.189599675312204e-05, |
|
"loss": 1.2844, |
|
"mean_token_accuracy": 0.6847639823332429, |
|
"num_tokens": 49476266.0, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 0.8533130163567534, |
|
"grad_norm": 0.41794517636299133, |
|
"learning_rate": 4.182143949098612e-05, |
|
"loss": 1.2659, |
|
"mean_token_accuracy": 0.6905384896323085, |
|
"num_tokens": 49700173.0, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 0.8571224494654889, |
|
"grad_norm": 0.532646656036377, |
|
"learning_rate": 4.1746574555814455e-05, |
|
"loss": 1.282, |
|
"mean_token_accuracy": 0.6890167951583862, |
|
"num_tokens": 49930863.0, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.8609318825742244, |
|
"grad_norm": 0.5750353932380676, |
|
"learning_rate": 4.1671403387690284e-05, |
|
"loss": 1.1884, |
|
"mean_token_accuracy": 0.7017395876348018, |
|
"num_tokens": 50166067.0, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 0.8614080617128164, |
|
"eval_mean_token_accuracy": 0.6381066715621565, |
|
"eval_num_tokens": 50195310.0, |
|
"eval_test_loss": 1.723710060119629, |
|
"eval_test_runtime": 96.0763, |
|
"eval_test_samples_per_second": 20.723, |
|
"eval_test_steps_per_second": 10.367, |
|
"step": 9045 |
|
} |
|
], |
|
"logging_steps": 40, |
|
"max_steps": 30161, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 3016, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.1928985580997181e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|