{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9995732696082615, |
|
"eval_steps": 500, |
|
"global_step": 1464, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003413843133907997, |
|
"grad_norm": 4.705833074023066, |
|
"learning_rate": 2.702702702702703e-06, |
|
"loss": 0.8155, |
|
"num_tokens": 1939277.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.006827686267815994, |
|
"grad_norm": 4.012926640069732, |
|
"learning_rate": 6.081081081081082e-06, |
|
"loss": 0.71, |
|
"num_tokens": 3766520.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01024152940172399, |
|
"grad_norm": 2.280161297786124, |
|
"learning_rate": 9.45945945945946e-06, |
|
"loss": 0.6608, |
|
"num_tokens": 5654095.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.013655372535631987, |
|
"grad_norm": 1.143701205704835, |
|
"learning_rate": 1.2837837837837838e-05, |
|
"loss": 0.5888, |
|
"num_tokens": 7684648.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.017069215669539985, |
|
"grad_norm": 0.9015302385209565, |
|
"learning_rate": 1.6216216216216218e-05, |
|
"loss": 0.5422, |
|
"num_tokens": 9531309.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.02048305880344798, |
|
"grad_norm": 0.8842795138478181, |
|
"learning_rate": 1.9594594594594595e-05, |
|
"loss": 0.5045, |
|
"num_tokens": 11396996.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02389690193735598, |
|
"grad_norm": 0.8574020610099237, |
|
"learning_rate": 2.2972972972972976e-05, |
|
"loss": 0.5068, |
|
"num_tokens": 13252705.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.027310745071263975, |
|
"grad_norm": 0.8271667971649893, |
|
"learning_rate": 2.635135135135135e-05, |
|
"loss": 0.4881, |
|
"num_tokens": 15158631.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.030724588205171974, |
|
"grad_norm": 0.6442503629860594, |
|
"learning_rate": 2.9729729729729733e-05, |
|
"loss": 0.4917, |
|
"num_tokens": 17089810.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.03413843133907997, |
|
"grad_norm": 1.0354213580125469, |
|
"learning_rate": 3.310810810810811e-05, |
|
"loss": 0.4841, |
|
"num_tokens": 18959052.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.037552274472987965, |
|
"grad_norm": 0.9482071468568299, |
|
"learning_rate": 3.648648648648649e-05, |
|
"loss": 0.4743, |
|
"num_tokens": 20852973.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.04096611760689596, |
|
"grad_norm": 1.0587081263012454, |
|
"learning_rate": 3.986486486486487e-05, |
|
"loss": 0.4808, |
|
"num_tokens": 22704402.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04437996074080396, |
|
"grad_norm": 0.8101784899967004, |
|
"learning_rate": 4.324324324324325e-05, |
|
"loss": 0.4859, |
|
"num_tokens": 24673530.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.04779380387471196, |
|
"grad_norm": 1.634279655733224, |
|
"learning_rate": 4.662162162162162e-05, |
|
"loss": 0.4832, |
|
"num_tokens": 26712591.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.051207647008619954, |
|
"grad_norm": 1.38458922388954, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4584, |
|
"num_tokens": 28520859.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.05462149014252795, |
|
"grad_norm": 1.01136396529527, |
|
"learning_rate": 4.9998563326589096e-05, |
|
"loss": 0.4788, |
|
"num_tokens": 30504282.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.058035333276435945, |
|
"grad_norm": 1.0066323926409173, |
|
"learning_rate": 4.9994253489825765e-05, |
|
"loss": 0.4719, |
|
"num_tokens": 32443081.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.06144917641034395, |
|
"grad_norm": 0.9287342359690847, |
|
"learning_rate": 4.998707104009471e-05, |
|
"loss": 0.4745, |
|
"num_tokens": 34326339.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.06486301954425194, |
|
"grad_norm": 0.9879821790567098, |
|
"learning_rate": 4.997701689462566e-05, |
|
"loss": 0.4735, |
|
"num_tokens": 36170185.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.06827686267815994, |
|
"grad_norm": 0.8069102268140448, |
|
"learning_rate": 4.996409233737627e-05, |
|
"loss": 0.4823, |
|
"num_tokens": 38065120.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07169070581206793, |
|
"grad_norm": 1.032813457346951, |
|
"learning_rate": 4.99482990188681e-05, |
|
"loss": 0.4807, |
|
"num_tokens": 39954377.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.07510454894597593, |
|
"grad_norm": 1.0693609940034285, |
|
"learning_rate": 4.992963895597589e-05, |
|
"loss": 0.4791, |
|
"num_tokens": 41945405.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.07851839207988393, |
|
"grad_norm": 0.966510416634568, |
|
"learning_rate": 4.990811453166999e-05, |
|
"loss": 0.476, |
|
"num_tokens": 43794631.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.08193223521379192, |
|
"grad_norm": 1.0423100991803456, |
|
"learning_rate": 4.9883728494711986e-05, |
|
"loss": 0.4688, |
|
"num_tokens": 45665940.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.08534607834769992, |
|
"grad_norm": 0.8310450354278927, |
|
"learning_rate": 4.985648395930374e-05, |
|
"loss": 0.475, |
|
"num_tokens": 47594004.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.08875992148160793, |
|
"grad_norm": 0.7185061502223402, |
|
"learning_rate": 4.9826384404689666e-05, |
|
"loss": 0.4725, |
|
"num_tokens": 49500082.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.09217376461551592, |
|
"grad_norm": 0.9610194838528363, |
|
"learning_rate": 4.9793433674712395e-05, |
|
"loss": 0.4679, |
|
"num_tokens": 51326229.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.09558760774942392, |
|
"grad_norm": 0.7899645948112779, |
|
"learning_rate": 4.9757635977321965e-05, |
|
"loss": 0.4724, |
|
"num_tokens": 53151114.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.09900145088333191, |
|
"grad_norm": 0.9979391758397096, |
|
"learning_rate": 4.971899588403836e-05, |
|
"loss": 0.4645, |
|
"num_tokens": 55066586.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.10241529401723991, |
|
"grad_norm": 0.727201189085688, |
|
"learning_rate": 4.9677518329367775e-05, |
|
"loss": 0.4776, |
|
"num_tokens": 57062268.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1058291371511479, |
|
"grad_norm": 0.957820374972616, |
|
"learning_rate": 4.963320861017242e-05, |
|
"loss": 0.4527, |
|
"num_tokens": 58912697.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.1092429802850559, |
|
"grad_norm": 0.6922629053527111, |
|
"learning_rate": 4.9586072384994126e-05, |
|
"loss": 0.4609, |
|
"num_tokens": 60809724.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1126568234189639, |
|
"grad_norm": 0.8360362963465879, |
|
"learning_rate": 4.953611567333166e-05, |
|
"loss": 0.4462, |
|
"num_tokens": 62827054.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.11607066655287189, |
|
"grad_norm": 0.7194003955094253, |
|
"learning_rate": 4.9483344854872096e-05, |
|
"loss": 0.4607, |
|
"num_tokens": 64735170.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.11948450968677989, |
|
"grad_norm": 0.612939116097357, |
|
"learning_rate": 4.942776666867602e-05, |
|
"loss": 0.4616, |
|
"num_tokens": 66662588.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.1228983528206879, |
|
"grad_norm": 0.7916296824208874, |
|
"learning_rate": 4.936938821231698e-05, |
|
"loss": 0.4717, |
|
"num_tokens": 68607450.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1263121959545959, |
|
"grad_norm": 1.0690262233247425, |
|
"learning_rate": 4.9308216940975075e-05, |
|
"loss": 0.4651, |
|
"num_tokens": 70515883.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.12972603908850389, |
|
"grad_norm": 0.6683242093385041, |
|
"learning_rate": 4.924426066648486e-05, |
|
"loss": 0.4645, |
|
"num_tokens": 72437556.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.13313988222241188, |
|
"grad_norm": 0.9539510507374334, |
|
"learning_rate": 4.9177527556337835e-05, |
|
"loss": 0.4444, |
|
"num_tokens": 74410081.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.13655372535631988, |
|
"grad_norm": 0.9252957776174132, |
|
"learning_rate": 4.910802613263931e-05, |
|
"loss": 0.4607, |
|
"num_tokens": 76352374.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.13996756849022787, |
|
"grad_norm": 0.6332331264138565, |
|
"learning_rate": 4.903576527102018e-05, |
|
"loss": 0.4536, |
|
"num_tokens": 2029573.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.14338141162413587, |
|
"grad_norm": 0.7431194368438976, |
|
"learning_rate": 4.896075419950342e-05, |
|
"loss": 0.4615, |
|
"num_tokens": 4026360.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.14679525475804386, |
|
"grad_norm": 0.9372438278269656, |
|
"learning_rate": 4.888300249732565e-05, |
|
"loss": 0.4536, |
|
"num_tokens": 5855288.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.15020909789195186, |
|
"grad_norm": 0.6587314361302145, |
|
"learning_rate": 4.880252009371382e-05, |
|
"loss": 0.4607, |
|
"num_tokens": 7786439.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.15362294102585985, |
|
"grad_norm": 0.7310394284137065, |
|
"learning_rate": 4.8719317266617206e-05, |
|
"loss": 0.4403, |
|
"num_tokens": 9571254.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.15703678415976785, |
|
"grad_norm": 0.8609393951227605, |
|
"learning_rate": 4.863340464139486e-05, |
|
"loss": 0.4769, |
|
"num_tokens": 11588067.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.16045062729367585, |
|
"grad_norm": 0.7326705125046628, |
|
"learning_rate": 4.854479318945873e-05, |
|
"loss": 0.4503, |
|
"num_tokens": 13478146.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.16386447042758384, |
|
"grad_norm": 0.7462315558110848, |
|
"learning_rate": 4.8453494226872526e-05, |
|
"loss": 0.4467, |
|
"num_tokens": 15442504.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.16727831356149184, |
|
"grad_norm": 0.9061873366679609, |
|
"learning_rate": 4.8359519412906656e-05, |
|
"loss": 0.4493, |
|
"num_tokens": 17409465.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.17069215669539983, |
|
"grad_norm": 0.7321388464328794, |
|
"learning_rate": 4.826288074854926e-05, |
|
"loss": 0.4672, |
|
"num_tokens": 19363531.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.17410599982930786, |
|
"grad_norm": 0.7665868699326416, |
|
"learning_rate": 4.816359057497363e-05, |
|
"loss": 0.4564, |
|
"num_tokens": 21233246.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.17751984296321585, |
|
"grad_norm": 0.812562841775717, |
|
"learning_rate": 4.806166157196218e-05, |
|
"loss": 0.449, |
|
"num_tokens": 23327219.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.18093368609712385, |
|
"grad_norm": 0.8688256441368103, |
|
"learning_rate": 4.795710675628724e-05, |
|
"loss": 0.4567, |
|
"num_tokens": 25306743.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.18434752923103184, |
|
"grad_norm": 0.9435753690607764, |
|
"learning_rate": 4.784993948004867e-05, |
|
"loss": 0.4517, |
|
"num_tokens": 27174263.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.18776137236493984, |
|
"grad_norm": 1.4362472881603188, |
|
"learning_rate": 4.774017342896881e-05, |
|
"loss": 0.4604, |
|
"num_tokens": 28998685.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.19117521549884783, |
|
"grad_norm": 3.7837849040471623, |
|
"learning_rate": 4.7627822620644735e-05, |
|
"loss": 0.5089, |
|
"num_tokens": 30861476.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.19458905863275583, |
|
"grad_norm": 1.6810142201752163, |
|
"learning_rate": 4.7512901402758135e-05, |
|
"loss": 0.4516, |
|
"num_tokens": 32762640.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.19800290176666382, |
|
"grad_norm": 1.2181895510266618, |
|
"learning_rate": 4.7395424451243056e-05, |
|
"loss": 0.4753, |
|
"num_tokens": 34810642.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.20141674490057182, |
|
"grad_norm": 0.8193936087136641, |
|
"learning_rate": 4.7275406768411736e-05, |
|
"loss": 0.4652, |
|
"num_tokens": 36683859.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.20483058803447982, |
|
"grad_norm": 1.0109058198551923, |
|
"learning_rate": 4.715286368103873e-05, |
|
"loss": 0.4537, |
|
"num_tokens": 38493680.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2082444311683878, |
|
"grad_norm": 0.7390004642372939, |
|
"learning_rate": 4.702781083840362e-05, |
|
"loss": 0.44, |
|
"num_tokens": 40420032.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.2116582743022958, |
|
"grad_norm": 0.9033348017381692, |
|
"learning_rate": 4.690026421029254e-05, |
|
"loss": 0.4371, |
|
"num_tokens": 42428881.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.2150721174362038, |
|
"grad_norm": 0.6122028277057487, |
|
"learning_rate": 4.677024008495876e-05, |
|
"loss": 0.4463, |
|
"num_tokens": 44293027.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.2184859605701118, |
|
"grad_norm": 0.5056169247540012, |
|
"learning_rate": 4.66377550670426e-05, |
|
"loss": 0.4469, |
|
"num_tokens": 46312487.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.2218998037040198, |
|
"grad_norm": 0.6403101500013877, |
|
"learning_rate": 4.650282607545096e-05, |
|
"loss": 0.451, |
|
"num_tokens": 48245722.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.2253136468379278, |
|
"grad_norm": 0.652067318740719, |
|
"learning_rate": 4.636547034119668e-05, |
|
"loss": 0.4458, |
|
"num_tokens": 50103512.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.22872748997183578, |
|
"grad_norm": 0.6724023128855483, |
|
"learning_rate": 4.622570540519811e-05, |
|
"loss": 0.4349, |
|
"num_tokens": 52090247.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.23214133310574378, |
|
"grad_norm": 0.5515342597831395, |
|
"learning_rate": 4.6083549116039e-05, |
|
"loss": 0.4548, |
|
"num_tokens": 53993921.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.23555517623965178, |
|
"grad_norm": 0.7100102164490761, |
|
"learning_rate": 4.59390196276892e-05, |
|
"loss": 0.4525, |
|
"num_tokens": 55882666.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.23896901937355977, |
|
"grad_norm": 0.8679862020361908, |
|
"learning_rate": 4.579213539718632e-05, |
|
"loss": 0.4607, |
|
"num_tokens": 57807639.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2423828625074678, |
|
"grad_norm": 0.6200007378003499, |
|
"learning_rate": 4.564291518227866e-05, |
|
"loss": 0.4351, |
|
"num_tokens": 59586573.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.2457967056413758, |
|
"grad_norm": 0.6889715253876092, |
|
"learning_rate": 4.549137803902978e-05, |
|
"loss": 0.4422, |
|
"num_tokens": 61574526.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.24921054877528379, |
|
"grad_norm": 0.7149528120875684, |
|
"learning_rate": 4.533754331938498e-05, |
|
"loss": 0.4333, |
|
"num_tokens": 63467351.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.2526243919091918, |
|
"grad_norm": 0.7467833331657708, |
|
"learning_rate": 4.5181430668699934e-05, |
|
"loss": 0.4391, |
|
"num_tokens": 65251754.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.25603823504309975, |
|
"grad_norm": 0.6233104186087223, |
|
"learning_rate": 4.5023060023231915e-05, |
|
"loss": 0.4542, |
|
"num_tokens": 67341024.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.25945207817700777, |
|
"grad_norm": 0.7036989801599275, |
|
"learning_rate": 4.486245160759385e-05, |
|
"loss": 0.4357, |
|
"num_tokens": 69203543.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.26286592131091574, |
|
"grad_norm": 0.6213336099848067, |
|
"learning_rate": 4.469962593217154e-05, |
|
"loss": 0.4312, |
|
"num_tokens": 71079356.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.26627976444482376, |
|
"grad_norm": 0.7761503823566354, |
|
"learning_rate": 4.453460379050441e-05, |
|
"loss": 0.4402, |
|
"num_tokens": 73014977.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.26969360757873173, |
|
"grad_norm": 0.7024337605986739, |
|
"learning_rate": 4.436740625663008e-05, |
|
"loss": 0.4492, |
|
"num_tokens": 74994274.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.27310745071263975, |
|
"grad_norm": 0.6710110629057495, |
|
"learning_rate": 4.41980546823931e-05, |
|
"loss": 0.4363, |
|
"num_tokens": 76893195.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2765212938465478, |
|
"grad_norm": 0.5751338135821263, |
|
"learning_rate": 4.4026570694718243e-05, |
|
"loss": 0.4229, |
|
"num_tokens": 78803696.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.27993513698045575, |
|
"grad_norm": 0.6675876320789876, |
|
"learning_rate": 4.385297619284868e-05, |
|
"loss": 0.4485, |
|
"num_tokens": 80775669.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.28334898011436377, |
|
"grad_norm": 0.5465180089446272, |
|
"learning_rate": 4.367729334554932e-05, |
|
"loss": 0.4285, |
|
"num_tokens": 82613548.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.28676282324827174, |
|
"grad_norm": 0.5408169168484701, |
|
"learning_rate": 4.3499544588275725e-05, |
|
"loss": 0.4476, |
|
"num_tokens": 84534417.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.29017666638217976, |
|
"grad_norm": 0.622446223545655, |
|
"learning_rate": 4.331975262030911e-05, |
|
"loss": 0.4334, |
|
"num_tokens": 86437083.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.2935905095160877, |
|
"grad_norm": 0.6346095791048567, |
|
"learning_rate": 4.3137940401857464e-05, |
|
"loss": 0.4445, |
|
"num_tokens": 88482847.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.29700435264999575, |
|
"grad_norm": 0.5790350174610308, |
|
"learning_rate": 4.295413115112345e-05, |
|
"loss": 0.4185, |
|
"num_tokens": 90375995.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.3004181957839037, |
|
"grad_norm": 0.5571697392013049, |
|
"learning_rate": 4.2768348341339356e-05, |
|
"loss": 0.4268, |
|
"num_tokens": 92352827.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.30383203891781174, |
|
"grad_norm": 0.5962036778698983, |
|
"learning_rate": 4.258061569776944e-05, |
|
"loss": 0.4443, |
|
"num_tokens": 94322928.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.3072458820517197, |
|
"grad_norm": 0.6651032961579146, |
|
"learning_rate": 4.239095719468015e-05, |
|
"loss": 0.4265, |
|
"num_tokens": 96272955.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.31065972518562773, |
|
"grad_norm": 0.6679632928061942, |
|
"learning_rate": 4.2199397052278467e-05, |
|
"loss": 0.4352, |
|
"num_tokens": 98232888.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.3140735683195357, |
|
"grad_norm": 0.5411345637950933, |
|
"learning_rate": 4.200595973361888e-05, |
|
"loss": 0.4262, |
|
"num_tokens": 100242672.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.3174874114534437, |
|
"grad_norm": 0.5125155486830999, |
|
"learning_rate": 4.1810669941479396e-05, |
|
"loss": 0.413, |
|
"num_tokens": 102186114.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.3209012545873517, |
|
"grad_norm": 0.6282116038580893, |
|
"learning_rate": 4.161355261520683e-05, |
|
"loss": 0.4328, |
|
"num_tokens": 104084923.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.3243150977212597, |
|
"grad_norm": 0.6010959699090055, |
|
"learning_rate": 4.141463292753199e-05, |
|
"loss": 0.439, |
|
"num_tokens": 106031358.0, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.3277289408551677, |
|
"grad_norm": 0.8772786749582367, |
|
"learning_rate": 4.121393628135498e-05, |
|
"loss": 0.431, |
|
"num_tokens": 107924869.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.3311427839890757, |
|
"grad_norm": 1.0201596112760478, |
|
"learning_rate": 4.101148830650114e-05, |
|
"loss": 0.4439, |
|
"num_tokens": 109870864.0, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.3345566271229837, |
|
"grad_norm": 0.559327637241473, |
|
"learning_rate": 4.080731485644804e-05, |
|
"loss": 0.435, |
|
"num_tokens": 111884766.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.3379704702568917, |
|
"grad_norm": 0.6096167733572574, |
|
"learning_rate": 4.0601442005023856e-05, |
|
"loss": 0.4242, |
|
"num_tokens": 113772365.0, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.34138431339079967, |
|
"grad_norm": 0.9108459316259976, |
|
"learning_rate": 4.039389604307762e-05, |
|
"loss": 0.4261, |
|
"num_tokens": 115712538.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3447981565247077, |
|
"grad_norm": 0.6525655395918979, |
|
"learning_rate": 4.018470347512177e-05, |
|
"loss": 0.4247, |
|
"num_tokens": 117621912.0, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.3482119996586157, |
|
"grad_norm": 0.5854044655049723, |
|
"learning_rate": 3.9973891015947444e-05, |
|
"loss": 0.4303, |
|
"num_tokens": 119575860.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.3516258427925237, |
|
"grad_norm": 0.6006841042494367, |
|
"learning_rate": 3.976148558721285e-05, |
|
"loss": 0.4121, |
|
"num_tokens": 121468214.0, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.3550396859264317, |
|
"grad_norm": 0.54247201922023, |
|
"learning_rate": 3.954751431400524e-05, |
|
"loss": 0.4195, |
|
"num_tokens": 123433327.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.35845352906033967, |
|
"grad_norm": 0.7322510070215393, |
|
"learning_rate": 3.933200452137698e-05, |
|
"loss": 0.4432, |
|
"num_tokens": 125322825.0, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.3618673721942477, |
|
"grad_norm": 0.5768815019253595, |
|
"learning_rate": 3.911498373085596e-05, |
|
"loss": 0.4258, |
|
"num_tokens": 127130221.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.36528121532815566, |
|
"grad_norm": 0.591651997040178, |
|
"learning_rate": 3.889647965693101e-05, |
|
"loss": 0.4244, |
|
"num_tokens": 128997742.0, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.3686950584620637, |
|
"grad_norm": 0.5344728835769015, |
|
"learning_rate": 3.867652020351264e-05, |
|
"loss": 0.4241, |
|
"num_tokens": 130995215.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.37210890159597165, |
|
"grad_norm": 0.5325744114113397, |
|
"learning_rate": 3.845513346036958e-05, |
|
"loss": 0.4228, |
|
"num_tokens": 133018839.0, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.3755227447298797, |
|
"grad_norm": 0.6631108435559407, |
|
"learning_rate": 3.823234769954158e-05, |
|
"loss": 0.421, |
|
"num_tokens": 134832930.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.37893658786378764, |
|
"grad_norm": 0.6928079184781692, |
|
"learning_rate": 3.8008191371729017e-05, |
|
"loss": 0.4281, |
|
"num_tokens": 136698114.0, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.38235043099769567, |
|
"grad_norm": 0.5955134031861724, |
|
"learning_rate": 3.778269310265952e-05, |
|
"loss": 0.4242, |
|
"num_tokens": 138517838.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.38576427413160363, |
|
"grad_norm": 0.5869430753929097, |
|
"learning_rate": 3.7555881689432424e-05, |
|
"loss": 0.4348, |
|
"num_tokens": 140490351.0, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.38917811726551166, |
|
"grad_norm": 0.8509776377325078, |
|
"learning_rate": 3.73277860968412e-05, |
|
"loss": 0.4263, |
|
"num_tokens": 142410473.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.3925919603994196, |
|
"grad_norm": 0.5861313037171644, |
|
"learning_rate": 3.709843545367456e-05, |
|
"loss": 0.4243, |
|
"num_tokens": 144380368.0, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.39600580353332765, |
|
"grad_norm": 0.4286584654592609, |
|
"learning_rate": 3.6867859048996595e-05, |
|
"loss": 0.426, |
|
"num_tokens": 146330816.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.3994196466672356, |
|
"grad_norm": 0.7753370232383693, |
|
"learning_rate": 3.663608632840638e-05, |
|
"loss": 0.4153, |
|
"num_tokens": 148170239.0, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.40283348980114364, |
|
"grad_norm": 0.7689293256752312, |
|
"learning_rate": 3.640314689027768e-05, |
|
"loss": 0.4241, |
|
"num_tokens": 150112731.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.4062473329350516, |
|
"grad_norm": 0.6665265300249069, |
|
"learning_rate": 3.616907048197917e-05, |
|
"loss": 0.4189, |
|
"num_tokens": 152035344.0, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.40966117606895963, |
|
"grad_norm": 0.5723446787640218, |
|
"learning_rate": 3.5933886996075435e-05, |
|
"loss": 0.4167, |
|
"num_tokens": 154002716.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.41307501920286765, |
|
"grad_norm": 0.6694354277387731, |
|
"learning_rate": 3.5697626466509663e-05, |
|
"loss": 0.4224, |
|
"num_tokens": 155973989.0, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.4164888623367756, |
|
"grad_norm": 0.5019251612975679, |
|
"learning_rate": 3.546031906476818e-05, |
|
"loss": 0.4367, |
|
"num_tokens": 157917749.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.41990270547068365, |
|
"grad_norm": 0.5366028805927413, |
|
"learning_rate": 3.5221995096027335e-05, |
|
"loss": 0.4223, |
|
"num_tokens": 159769673.0, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.4233165486045916, |
|
"grad_norm": 0.6307389093268616, |
|
"learning_rate": 3.498268499528351e-05, |
|
"loss": 0.4296, |
|
"num_tokens": 161702363.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.42673039173849964, |
|
"grad_norm": 0.5185000780295823, |
|
"learning_rate": 3.474241932346637e-05, |
|
"loss": 0.4229, |
|
"num_tokens": 163583753.0, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.4301442348724076, |
|
"grad_norm": 0.6074107938423772, |
|
"learning_rate": 3.450122876353609e-05, |
|
"loss": 0.423, |
|
"num_tokens": 165432734.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.43355807800631563, |
|
"grad_norm": 0.533346072388862, |
|
"learning_rate": 3.42591441165651e-05, |
|
"loss": 0.4341, |
|
"num_tokens": 167313920.0, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.4369719211402236, |
|
"grad_norm": 0.5550506682873664, |
|
"learning_rate": 3.4016196297804516e-05, |
|
"loss": 0.414, |
|
"num_tokens": 169298285.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.4403857642741316, |
|
"grad_norm": 0.5664369905865961, |
|
"learning_rate": 3.3772416332736266e-05, |
|
"loss": 0.4415, |
|
"num_tokens": 171315518.0, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.4437996074080396, |
|
"grad_norm": 0.5095443091018487, |
|
"learning_rate": 3.352783535311093e-05, |
|
"loss": 0.4144, |
|
"num_tokens": 173090546.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.4472134505419476, |
|
"grad_norm": 0.702407462685812, |
|
"learning_rate": 3.3282484592972064e-05, |
|
"loss": 0.4229, |
|
"num_tokens": 175013822.0, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.4506272936758556, |
|
"grad_norm": 0.4562607019189676, |
|
"learning_rate": 3.3036395384667545e-05, |
|
"loss": 0.4117, |
|
"num_tokens": 176955808.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.4540411368097636, |
|
"grad_norm": 0.5524213808614263, |
|
"learning_rate": 3.278959915484822e-05, |
|
"loss": 0.4228, |
|
"num_tokens": 178848221.0, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.45745497994367157, |
|
"grad_norm": 0.7175125912985042, |
|
"learning_rate": 3.2542127420454675e-05, |
|
"loss": 0.4205, |
|
"num_tokens": 180778091.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.4608688230775796, |
|
"grad_norm": 0.5768863903803467, |
|
"learning_rate": 3.229401178469231e-05, |
|
"loss": 0.4087, |
|
"num_tokens": 182763012.0, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.46428266621148756, |
|
"grad_norm": 0.5647802515717784, |
|
"learning_rate": 3.2045283932995465e-05, |
|
"loss": 0.4245, |
|
"num_tokens": 184824341.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.4676965093453956, |
|
"grad_norm": 0.4620392448650056, |
|
"learning_rate": 3.1795975628981164e-05, |
|
"loss": 0.4093, |
|
"num_tokens": 186756487.0, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.47111035247930355, |
|
"grad_norm": 0.5118022386772281, |
|
"learning_rate": 3.154611871039264e-05, |
|
"loss": 0.4115, |
|
"num_tokens": 188672319.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.4745241956132116, |
|
"grad_norm": 0.5561299324608565, |
|
"learning_rate": 3.1295745085033565e-05, |
|
"loss": 0.4215, |
|
"num_tokens": 190589583.0, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.47793803874711954, |
|
"grad_norm": 0.4691354993514919, |
|
"learning_rate": 3.104488672669332e-05, |
|
"loss": 0.418, |
|
"num_tokens": 192492174.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.48135188188102757, |
|
"grad_norm": 0.5755352685196231, |
|
"learning_rate": 3.079357567106375e-05, |
|
"loss": 0.4265, |
|
"num_tokens": 194432665.0, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.4847657250149356, |
|
"grad_norm": 0.4711522992107647, |
|
"learning_rate": 3.05418440116481e-05, |
|
"loss": 0.4245, |
|
"num_tokens": 196379920.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.48817956814884356, |
|
"grad_norm": 0.5520972585859143, |
|
"learning_rate": 3.0289723895662524e-05, |
|
"loss": 0.4285, |
|
"num_tokens": 198398055.0, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.4915934112827516, |
|
"grad_norm": 0.6345781690749602, |
|
"learning_rate": 3.0037247519930757e-05, |
|
"loss": 0.4242, |
|
"num_tokens": 200270832.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.49500725441665955, |
|
"grad_norm": 0.5825312523831861, |
|
"learning_rate": 2.9784447126772437e-05, |
|
"loss": 0.4003, |
|
"num_tokens": 202109491.0, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.49842109755056757, |
|
"grad_norm": 0.47408045365479284, |
|
"learning_rate": 2.9531354999885607e-05, |
|
"loss": 0.4178, |
|
"num_tokens": 203930534.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.5018349406844755, |
|
"grad_norm": 0.5325996103792026, |
|
"learning_rate": 2.9278003460223986e-05, |
|
"loss": 0.4077, |
|
"num_tokens": 205892837.0, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.5052487838183836, |
|
"grad_norm": 0.5298839794980272, |
|
"learning_rate": 2.902442486186941e-05, |
|
"loss": 0.4176, |
|
"num_tokens": 207899891.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.5086626269522916, |
|
"grad_norm": 0.4998105699659344, |
|
"learning_rate": 2.8770651587900078e-05, |
|
"loss": 0.4133, |
|
"num_tokens": 209877182.0, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.5120764700861995, |
|
"grad_norm": 0.5133764198248255, |
|
"learning_rate": 2.8516716046255115e-05, |
|
"loss": 0.423, |
|
"num_tokens": 211819088.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.5154903132201075, |
|
"grad_norm": 0.5310538979020579, |
|
"learning_rate": 2.8262650665595914e-05, |
|
"loss": 0.4019, |
|
"num_tokens": 213708565.0, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.5189041563540155, |
|
"grad_norm": 0.5445101655689776, |
|
"learning_rate": 2.800848789116489e-05, |
|
"loss": 0.4149, |
|
"num_tokens": 215508938.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5223179994879236, |
|
"grad_norm": 0.6540766089094884, |
|
"learning_rate": 2.775426018064205e-05, |
|
"loss": 0.4203, |
|
"num_tokens": 217491244.0, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.5257318426218315, |
|
"grad_norm": 0.5395770305665898, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.408, |
|
"num_tokens": 219339581.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.5291456857557395, |
|
"grad_norm": 0.48365089981209847, |
|
"learning_rate": 2.7245739819357964e-05, |
|
"loss": 0.4214, |
|
"num_tokens": 221254680.0, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.5325595288896475, |
|
"grad_norm": 0.460635944445485, |
|
"learning_rate": 2.699151210883512e-05, |
|
"loss": 0.4141, |
|
"num_tokens": 223144608.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.5359733720235555, |
|
"grad_norm": 0.4304625584138572, |
|
"learning_rate": 2.6737349334404087e-05, |
|
"loss": 0.4067, |
|
"num_tokens": 225087725.0, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.5393872151574635, |
|
"grad_norm": 0.44701240718703694, |
|
"learning_rate": 2.6483283953744897e-05, |
|
"loss": 0.4011, |
|
"num_tokens": 226868036.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.5428010582913715, |
|
"grad_norm": 0.5014347366716482, |
|
"learning_rate": 2.622934841209993e-05, |
|
"loss": 0.4171, |
|
"num_tokens": 228775587.0, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.5462149014252795, |
|
"grad_norm": 0.4975075619084584, |
|
"learning_rate": 2.5975575138130597e-05, |
|
"loss": 0.4181, |
|
"num_tokens": 230614734.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5496287445591875, |
|
"grad_norm": 0.5147269313286894, |
|
"learning_rate": 2.5721996539776023e-05, |
|
"loss": 0.4217, |
|
"num_tokens": 232514049.0, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.5530425876930956, |
|
"grad_norm": 0.4747545337319586, |
|
"learning_rate": 2.5468645000114395e-05, |
|
"loss": 0.4127, |
|
"num_tokens": 234444977.0, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.5564564308270035, |
|
"grad_norm": 0.4903248400626682, |
|
"learning_rate": 2.521555287322757e-05, |
|
"loss": 0.4043, |
|
"num_tokens": 236352441.0, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.5598702739609115, |
|
"grad_norm": 0.5597063633430578, |
|
"learning_rate": 2.496275248006925e-05, |
|
"loss": 0.4108, |
|
"num_tokens": 238222102.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.5632841170948195, |
|
"grad_norm": 0.5100590720976775, |
|
"learning_rate": 2.4710276104337482e-05, |
|
"loss": 0.4031, |
|
"num_tokens": 240193053.0, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.5666979602287275, |
|
"grad_norm": 0.5618375369863093, |
|
"learning_rate": 2.4458155988351907e-05, |
|
"loss": 0.4155, |
|
"num_tokens": 242035771.0, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.5701118033626354, |
|
"grad_norm": 0.517325444445414, |
|
"learning_rate": 2.420642432893625e-05, |
|
"loss": 0.4081, |
|
"num_tokens": 244022427.0, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.5735256464965435, |
|
"grad_norm": 0.42604811387441327, |
|
"learning_rate": 2.395511327330668e-05, |
|
"loss": 0.4099, |
|
"num_tokens": 245932222.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.5769394896304515, |
|
"grad_norm": 0.49013520367676244, |
|
"learning_rate": 2.370425491496644e-05, |
|
"loss": 0.4029, |
|
"num_tokens": 247827549.0, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.5803533327643595, |
|
"grad_norm": 0.4851808594715651, |
|
"learning_rate": 2.3453881289607372e-05, |
|
"loss": 0.4055, |
|
"num_tokens": 249632270.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.5837671758982674, |
|
"grad_norm": 0.5855416869416588, |
|
"learning_rate": 2.3204024371018844e-05, |
|
"loss": 0.4082, |
|
"num_tokens": 251437745.0, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.5871810190321755, |
|
"grad_norm": 0.6044854619444842, |
|
"learning_rate": 2.2954716067004534e-05, |
|
"loss": 0.4095, |
|
"num_tokens": 253372171.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.5905948621660835, |
|
"grad_norm": 0.5934417052542611, |
|
"learning_rate": 2.2705988215307704e-05, |
|
"loss": 0.4075, |
|
"num_tokens": 255368626.0, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.5940087052999915, |
|
"grad_norm": 0.5359964239833354, |
|
"learning_rate": 2.245787257954533e-05, |
|
"loss": 0.3938, |
|
"num_tokens": 257318060.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.5974225484338994, |
|
"grad_norm": 0.563887986329405, |
|
"learning_rate": 2.221040084515178e-05, |
|
"loss": 0.4065, |
|
"num_tokens": 259300427.0, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.6008363915678074, |
|
"grad_norm": 0.4443245904144499, |
|
"learning_rate": 2.1963604615332467e-05, |
|
"loss": 0.3872, |
|
"num_tokens": 261155573.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.6042502347017155, |
|
"grad_norm": 0.39956918103747635, |
|
"learning_rate": 2.1717515407027938e-05, |
|
"loss": 0.4014, |
|
"num_tokens": 262948392.0, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.6076640778356235, |
|
"grad_norm": 0.4591793649544128, |
|
"learning_rate": 2.147216464688907e-05, |
|
"loss": 0.4134, |
|
"num_tokens": 264928246.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.6110779209695314, |
|
"grad_norm": 0.5128192459967758, |
|
"learning_rate": 2.1227583667263733e-05, |
|
"loss": 0.4125, |
|
"num_tokens": 266763637.0, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.6144917641034394, |
|
"grad_norm": 0.5521676175422746, |
|
"learning_rate": 2.0983803702195486e-05, |
|
"loss": 0.4011, |
|
"num_tokens": 268823568.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6179056072373474, |
|
"grad_norm": 0.49824193142191775, |
|
"learning_rate": 2.0740855883434913e-05, |
|
"loss": 0.4119, |
|
"num_tokens": 270796270.0, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.6213194503712555, |
|
"grad_norm": 0.5108779953936516, |
|
"learning_rate": 2.049877123646391e-05, |
|
"loss": 0.4062, |
|
"num_tokens": 272675016.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.6247332935051635, |
|
"grad_norm": 0.5974402147887337, |
|
"learning_rate": 2.0257580676533637e-05, |
|
"loss": 0.4126, |
|
"num_tokens": 274528787.0, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.6281471366390714, |
|
"grad_norm": 0.49396513244608686, |
|
"learning_rate": 2.0017315004716493e-05, |
|
"loss": 0.3936, |
|
"num_tokens": 276418125.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.6315609797729794, |
|
"grad_norm": 0.487366764504166, |
|
"learning_rate": 1.9778004903972667e-05, |
|
"loss": 0.4004, |
|
"num_tokens": 278341722.0, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.6349748229068874, |
|
"grad_norm": 0.4711689645645803, |
|
"learning_rate": 1.953968093523183e-05, |
|
"loss": 0.397, |
|
"num_tokens": 280248165.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.6383886660407955, |
|
"grad_norm": 0.4537279287511967, |
|
"learning_rate": 1.9302373533490335e-05, |
|
"loss": 0.4129, |
|
"num_tokens": 282166215.0, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.6418025091747034, |
|
"grad_norm": 0.586245477819875, |
|
"learning_rate": 1.9066113003924574e-05, |
|
"loss": 0.411, |
|
"num_tokens": 284060529.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.6452163523086114, |
|
"grad_norm": 0.5613407219001563, |
|
"learning_rate": 1.8830929518020833e-05, |
|
"loss": 0.3931, |
|
"num_tokens": 285954023.0, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.6486301954425194, |
|
"grad_norm": 0.5399254636998788, |
|
"learning_rate": 1.8596853109722323e-05, |
|
"loss": 0.399, |
|
"num_tokens": 287819237.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.6520440385764275, |
|
"grad_norm": 0.4520412022868816, |
|
"learning_rate": 1.836391367159364e-05, |
|
"loss": 0.3962, |
|
"num_tokens": 289657591.0, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.6554578817103354, |
|
"grad_norm": 0.5520302927407486, |
|
"learning_rate": 1.8132140951003414e-05, |
|
"loss": 0.3897, |
|
"num_tokens": 291512633.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.6588717248442434, |
|
"grad_norm": 0.4729609184343458, |
|
"learning_rate": 1.790156454632544e-05, |
|
"loss": 0.4123, |
|
"num_tokens": 293487255.0, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.6622855679781514, |
|
"grad_norm": 0.4638796086042126, |
|
"learning_rate": 1.7672213903158813e-05, |
|
"loss": 0.4007, |
|
"num_tokens": 295378178.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.6656994111120594, |
|
"grad_norm": 0.4004041498754247, |
|
"learning_rate": 1.744411831056758e-05, |
|
"loss": 0.3868, |
|
"num_tokens": 297228147.0, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.6691132542459673, |
|
"grad_norm": 0.4693011092079871, |
|
"learning_rate": 1.721730689734049e-05, |
|
"loss": 0.4024, |
|
"num_tokens": 299233885.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.6725270973798754, |
|
"grad_norm": 0.5111518020676975, |
|
"learning_rate": 1.699180862827099e-05, |
|
"loss": 0.3877, |
|
"num_tokens": 301106661.0, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.6759409405137834, |
|
"grad_norm": 0.4806204434046691, |
|
"learning_rate": 1.6767652300458417e-05, |
|
"loss": 0.3903, |
|
"num_tokens": 303007424.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.6793547836476914, |
|
"grad_norm": 0.4566261498657525, |
|
"learning_rate": 1.654486653963043e-05, |
|
"loss": 0.384, |
|
"num_tokens": 304932074.0, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.6827686267815993, |
|
"grad_norm": 0.42269960844619753, |
|
"learning_rate": 1.632347979648737e-05, |
|
"loss": 0.4007, |
|
"num_tokens": 306794512.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6861824699155074, |
|
"grad_norm": 0.6082446933491564, |
|
"learning_rate": 1.6103520343068995e-05, |
|
"loss": 0.4114, |
|
"num_tokens": 308710513.0, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.6895963130494154, |
|
"grad_norm": 0.5745984360238439, |
|
"learning_rate": 1.588501626914404e-05, |
|
"loss": 0.3886, |
|
"num_tokens": 310538563.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.6930101561833234, |
|
"grad_norm": 0.688939868912826, |
|
"learning_rate": 1.5667995478623027e-05, |
|
"loss": 0.4006, |
|
"num_tokens": 312422847.0, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.6964239993172314, |
|
"grad_norm": 0.5455006930867037, |
|
"learning_rate": 1.5452485685994766e-05, |
|
"loss": 0.392, |
|
"num_tokens": 314304209.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.6998378424511393, |
|
"grad_norm": 0.49975087118690814, |
|
"learning_rate": 1.5238514412787158e-05, |
|
"loss": 0.4034, |
|
"num_tokens": 316294604.0, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.7032516855850474, |
|
"grad_norm": 0.39649261290017973, |
|
"learning_rate": 1.5026108984052565e-05, |
|
"loss": 0.3958, |
|
"num_tokens": 318158111.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.7066655287189554, |
|
"grad_norm": 0.5102610944466994, |
|
"learning_rate": 1.4815296524878236e-05, |
|
"loss": 0.4049, |
|
"num_tokens": 319996964.0, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.7100793718528634, |
|
"grad_norm": 0.49044740180990587, |
|
"learning_rate": 1.4606103956922388e-05, |
|
"loss": 0.4113, |
|
"num_tokens": 321897848.0, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.7134932149867713, |
|
"grad_norm": 0.5610768682384292, |
|
"learning_rate": 1.4398557994976153e-05, |
|
"loss": 0.3901, |
|
"num_tokens": 323836050.0, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.7169070581206793, |
|
"grad_norm": 0.45115509497097084, |
|
"learning_rate": 1.419268514355197e-05, |
|
"loss": 0.4047, |
|
"num_tokens": 325673763.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.7203209012545874, |
|
"grad_norm": 0.4284920867269521, |
|
"learning_rate": 1.3988511693498868e-05, |
|
"loss": 0.3869, |
|
"num_tokens": 327548131.0, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.7237347443884954, |
|
"grad_norm": 0.49464944285568574, |
|
"learning_rate": 1.3786063718645027e-05, |
|
"loss": 0.3977, |
|
"num_tokens": 329443563.0, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.7271485875224033, |
|
"grad_norm": 0.5710280894381398, |
|
"learning_rate": 1.3585367072468014e-05, |
|
"loss": 0.3973, |
|
"num_tokens": 331374265.0, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.7305624306563113, |
|
"grad_norm": 0.424309277242454, |
|
"learning_rate": 1.3386447384793166e-05, |
|
"loss": 0.3972, |
|
"num_tokens": 333352485.0, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.7339762737902193, |
|
"grad_norm": 0.4788996741106936, |
|
"learning_rate": 1.3189330058520605e-05, |
|
"loss": 0.4041, |
|
"num_tokens": 335215438.0, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.7373901169241274, |
|
"grad_norm": 0.4342833991078278, |
|
"learning_rate": 1.2994040266381124e-05, |
|
"loss": 0.4003, |
|
"num_tokens": 337161669.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.7408039600580353, |
|
"grad_norm": 0.43204091990934734, |
|
"learning_rate": 1.280060294772154e-05, |
|
"loss": 0.4081, |
|
"num_tokens": 339139569.0, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.7442178031919433, |
|
"grad_norm": 0.36722220839525554, |
|
"learning_rate": 1.2609042805319848e-05, |
|
"loss": 0.3982, |
|
"num_tokens": 341159421.0, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.7476316463258513, |
|
"grad_norm": 0.45921341384234504, |
|
"learning_rate": 1.2419384302230562e-05, |
|
"loss": 0.3941, |
|
"num_tokens": 343058673.0, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.7510454894597594, |
|
"grad_norm": 0.4734058543110404, |
|
"learning_rate": 1.2231651658660653e-05, |
|
"loss": 0.3853, |
|
"num_tokens": 344849449.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7544593325936674, |
|
"grad_norm": 0.4426908856722868, |
|
"learning_rate": 1.2045868848876554e-05, |
|
"loss": 0.3867, |
|
"num_tokens": 346706798.0, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.7578731757275753, |
|
"grad_norm": 0.5059105436600692, |
|
"learning_rate": 1.1862059598142537e-05, |
|
"loss": 0.3928, |
|
"num_tokens": 348493564.0, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.7612870188614833, |
|
"grad_norm": 0.4830390714316328, |
|
"learning_rate": 1.1680247379690893e-05, |
|
"loss": 0.3871, |
|
"num_tokens": 350327439.0, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.7647008619953913, |
|
"grad_norm": 0.383477127300196, |
|
"learning_rate": 1.1500455411724277e-05, |
|
"loss": 0.3839, |
|
"num_tokens": 352222780.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.7681147051292994, |
|
"grad_norm": 0.4192236038559293, |
|
"learning_rate": 1.1322706654450693e-05, |
|
"loss": 0.387, |
|
"num_tokens": 354158243.0, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.7715285482632073, |
|
"grad_norm": 0.4225409862113959, |
|
"learning_rate": 1.1147023807151319e-05, |
|
"loss": 0.3882, |
|
"num_tokens": 356121045.0, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.7749423913971153, |
|
"grad_norm": 0.4618680342672497, |
|
"learning_rate": 1.0973429305281755e-05, |
|
"loss": 0.392, |
|
"num_tokens": 358052013.0, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.7783562345310233, |
|
"grad_norm": 0.4145071434270498, |
|
"learning_rate": 1.080194531760691e-05, |
|
"loss": 0.3845, |
|
"num_tokens": 359922588.0, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.7817700776649313, |
|
"grad_norm": 0.4648820991648914, |
|
"learning_rate": 1.063259374336993e-05, |
|
"loss": 0.3907, |
|
"num_tokens": 361916248.0, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.7851839207988393, |
|
"grad_norm": 0.42609495826749355, |
|
"learning_rate": 1.0465396209495592e-05, |
|
"loss": 0.3858, |
|
"num_tokens": 363863339.0, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.7885977639327473, |
|
"grad_norm": 0.39598818557393706, |
|
"learning_rate": 1.0300374067828463e-05, |
|
"loss": 0.3971, |
|
"num_tokens": 365987286.0, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.7920116070666553, |
|
"grad_norm": 0.5378540325321663, |
|
"learning_rate": 1.0137548392406157e-05, |
|
"loss": 0.4006, |
|
"num_tokens": 367915269.0, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.7954254502005633, |
|
"grad_norm": 0.5261694039722628, |
|
"learning_rate": 9.976939976768094e-06, |
|
"loss": 0.391, |
|
"num_tokens": 369748560.0, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.7988392933344712, |
|
"grad_norm": 0.45871697845064263, |
|
"learning_rate": 9.81856933130007e-06, |
|
"loss": 0.3884, |
|
"num_tokens": 371728452.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.8022531364683793, |
|
"grad_norm": 0.3849106883948326, |
|
"learning_rate": 9.662456680615026e-06, |
|
"loss": 0.3919, |
|
"num_tokens": 373546125.0, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.8056669796022873, |
|
"grad_norm": 0.4069296296834249, |
|
"learning_rate": 9.50862196097022e-06, |
|
"loss": 0.411, |
|
"num_tokens": 375581054.0, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.8090808227361953, |
|
"grad_norm": 0.3774090848188763, |
|
"learning_rate": 9.357084817721343e-06, |
|
"loss": 0.3908, |
|
"num_tokens": 377458399.0, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.8124946658701032, |
|
"grad_norm": 0.46564290335514785, |
|
"learning_rate": 9.207864602813684e-06, |
|
"loss": 0.388, |
|
"num_tokens": 379400003.0, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.8159085090040112, |
|
"grad_norm": 0.38385099187145405, |
|
"learning_rate": 9.060980372310805e-06, |
|
"loss": 0.3774, |
|
"num_tokens": 381337873.0, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.8193223521379193, |
|
"grad_norm": 0.5024542029662248, |
|
"learning_rate": 8.916450883961005e-06, |
|
"loss": 0.3955, |
|
"num_tokens": 383250150.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8227361952718273, |
|
"grad_norm": 0.40892869928158526, |
|
"learning_rate": 8.77429459480189e-06, |
|
"loss": 0.4013, |
|
"num_tokens": 385155779.0, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.8261500384057353, |
|
"grad_norm": 0.3945484962557817, |
|
"learning_rate": 8.634529658803322e-06, |
|
"loss": 0.3802, |
|
"num_tokens": 386988785.0, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.8295638815396432, |
|
"grad_norm": 0.385950673206825, |
|
"learning_rate": 8.497173924549042e-06, |
|
"loss": 0.3825, |
|
"num_tokens": 388925343.0, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.8329777246735512, |
|
"grad_norm": 0.38739675307188914, |
|
"learning_rate": 8.362244932957402e-06, |
|
"loss": 0.3937, |
|
"num_tokens": 390919568.0, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.8363915678074593, |
|
"grad_norm": 0.4115485385386184, |
|
"learning_rate": 8.229759915041243e-06, |
|
"loss": 0.3883, |
|
"num_tokens": 392910905.0, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.8398054109413673, |
|
"grad_norm": 0.4475212067282627, |
|
"learning_rate": 8.099735789707462e-06, |
|
"loss": 0.3808, |
|
"num_tokens": 394743801.0, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.8432192540752752, |
|
"grad_norm": 0.4081804601752203, |
|
"learning_rate": 7.97218916159638e-06, |
|
"loss": 0.3852, |
|
"num_tokens": 396575417.0, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.8466330972091832, |
|
"grad_norm": 0.4183809315204868, |
|
"learning_rate": 7.847136318961276e-06, |
|
"loss": 0.3812, |
|
"num_tokens": 398541087.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.8500469403430913, |
|
"grad_norm": 0.40295579325617725, |
|
"learning_rate": 7.724593231588272e-06, |
|
"loss": 0.3934, |
|
"num_tokens": 400489921.0, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.8534607834769993, |
|
"grad_norm": 0.45804469948779925, |
|
"learning_rate": 7.604575548756949e-06, |
|
"loss": 0.3994, |
|
"num_tokens": 402379009.0, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.8568746266109072, |
|
"grad_norm": 0.38882818799193014, |
|
"learning_rate": 7.487098597241871e-06, |
|
"loss": 0.3908, |
|
"num_tokens": 404352878.0, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.8602884697448152, |
|
"grad_norm": 0.36846595203062904, |
|
"learning_rate": 7.372177379355269e-06, |
|
"loss": 0.3947, |
|
"num_tokens": 406330141.0, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.8637023128787232, |
|
"grad_norm": 0.42947478533888517, |
|
"learning_rate": 7.259826571031192e-06, |
|
"loss": 0.3862, |
|
"num_tokens": 408204838.0, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.8671161560126313, |
|
"grad_norm": 0.3878495259462103, |
|
"learning_rate": 7.150060519951341e-06, |
|
"loss": 0.4017, |
|
"num_tokens": 410160328.0, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.8705299991465392, |
|
"grad_norm": 0.37081146658298825, |
|
"learning_rate": 7.042893243712772e-06, |
|
"loss": 0.3815, |
|
"num_tokens": 412112871.0, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.8739438422804472, |
|
"grad_norm": 0.37469108073137014, |
|
"learning_rate": 6.938338428037822e-06, |
|
"loss": 0.3801, |
|
"num_tokens": 414056837.0, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.8773576854143552, |
|
"grad_norm": 0.4484507788307278, |
|
"learning_rate": 6.836409425026376e-06, |
|
"loss": 0.3815, |
|
"num_tokens": 415976407.0, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.8807715285482632, |
|
"grad_norm": 0.42468934286329335, |
|
"learning_rate": 6.737119251450741e-06, |
|
"loss": 0.3836, |
|
"num_tokens": 417897727.0, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.8841853716821712, |
|
"grad_norm": 0.3620037642681858, |
|
"learning_rate": 6.640480587093342e-06, |
|
"loss": 0.4026, |
|
"num_tokens": 419994110.0, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.8875992148160792, |
|
"grad_norm": 0.38344852481107256, |
|
"learning_rate": 6.546505773127476e-06, |
|
"loss": 0.3797, |
|
"num_tokens": 421924847.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8910130579499872, |
|
"grad_norm": 0.3584332646483648, |
|
"learning_rate": 6.455206810541276e-06, |
|
"loss": 0.387, |
|
"num_tokens": 423915098.0, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.8944269010838952, |
|
"grad_norm": 0.42116701870642653, |
|
"learning_rate": 6.366595358605142e-06, |
|
"loss": 0.3832, |
|
"num_tokens": 425893344.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.8978407442178032, |
|
"grad_norm": 0.3521494410121148, |
|
"learning_rate": 6.280682733382796e-06, |
|
"loss": 0.394, |
|
"num_tokens": 427909649.0, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.9012545873517112, |
|
"grad_norm": 0.3983072854935981, |
|
"learning_rate": 6.197479906286184e-06, |
|
"loss": 0.3819, |
|
"num_tokens": 429848299.0, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.9046684304856192, |
|
"grad_norm": 0.3595529616776975, |
|
"learning_rate": 6.116997502674356e-06, |
|
"loss": 0.3836, |
|
"num_tokens": 431823107.0, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.9080822736195272, |
|
"grad_norm": 0.42462919737239924, |
|
"learning_rate": 6.039245800496585e-06, |
|
"loss": 0.3842, |
|
"num_tokens": 433654195.0, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.9114961167534352, |
|
"grad_norm": 0.5008600053663176, |
|
"learning_rate": 5.964234728979824e-06, |
|
"loss": 0.3855, |
|
"num_tokens": 435602596.0, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.9149099598873431, |
|
"grad_norm": 0.4939861659729308, |
|
"learning_rate": 5.8919738673606936e-06, |
|
"loss": 0.3968, |
|
"num_tokens": 437539406.0, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.9183238030212512, |
|
"grad_norm": 0.418502345365578, |
|
"learning_rate": 5.8224724436621695e-06, |
|
"loss": 0.377, |
|
"num_tokens": 439392309.0, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.9217376461551592, |
|
"grad_norm": 0.3865370260878832, |
|
"learning_rate": 5.75573933351514e-06, |
|
"loss": 0.3907, |
|
"num_tokens": 441289639.0, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.9251514892890672, |
|
"grad_norm": 0.36155969808913657, |
|
"learning_rate": 5.6917830590249315e-06, |
|
"loss": 0.3939, |
|
"num_tokens": 443217058.0, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.9285653324229751, |
|
"grad_norm": 0.4469202947870825, |
|
"learning_rate": 5.63061178768302e-06, |
|
"loss": 0.3908, |
|
"num_tokens": 445046539.0, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.9319791755568831, |
|
"grad_norm": 0.36026663716494844, |
|
"learning_rate": 5.5722333313239804e-06, |
|
"loss": 0.3765, |
|
"num_tokens": 446908896.0, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.9353930186907912, |
|
"grad_norm": 0.4386391881600453, |
|
"learning_rate": 5.5166551451279065e-06, |
|
"loss": 0.3846, |
|
"num_tokens": 448884969.0, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.9388068618246992, |
|
"grad_norm": 0.3656587817230275, |
|
"learning_rate": 5.463884326668339e-06, |
|
"loss": 0.3884, |
|
"num_tokens": 450849803.0, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.9422207049586071, |
|
"grad_norm": 0.473339948542882, |
|
"learning_rate": 5.413927615005879e-06, |
|
"loss": 0.3918, |
|
"num_tokens": 452778895.0, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.9456345480925151, |
|
"grad_norm": 0.4327834800966883, |
|
"learning_rate": 5.366791389827578e-06, |
|
"loss": 0.3901, |
|
"num_tokens": 454738409.0, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.9490483912264231, |
|
"grad_norm": 0.38831303378458926, |
|
"learning_rate": 5.322481670632229e-06, |
|
"loss": 0.3868, |
|
"num_tokens": 456587467.0, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.9524622343603312, |
|
"grad_norm": 0.3682504260150772, |
|
"learning_rate": 5.281004115961642e-06, |
|
"loss": 0.397, |
|
"num_tokens": 458487641.0, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.9558760774942391, |
|
"grad_norm": 0.45399594742338995, |
|
"learning_rate": 5.242364022678038e-06, |
|
"loss": 0.3822, |
|
"num_tokens": 460386310.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.9592899206281471, |
|
"grad_norm": 0.36213676752021623, |
|
"learning_rate": 5.206566325287607e-06, |
|
"loss": 0.375, |
|
"num_tokens": 462390119.0, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.9627037637620551, |
|
"grad_norm": 0.3623211794036725, |
|
"learning_rate": 5.173615595310344e-06, |
|
"loss": 0.3797, |
|
"num_tokens": 464283440.0, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.9661176068959632, |
|
"grad_norm": 0.40375519775096763, |
|
"learning_rate": 5.143516040696265e-06, |
|
"loss": 0.3849, |
|
"num_tokens": 466103896.0, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.9695314500298712, |
|
"grad_norm": 0.3806537371766384, |
|
"learning_rate": 5.116271505288018e-06, |
|
"loss": 0.3876, |
|
"num_tokens": 467974167.0, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.9729452931637791, |
|
"grad_norm": 0.36856238291556104, |
|
"learning_rate": 5.0918854683300105e-06, |
|
"loss": 0.3965, |
|
"num_tokens": 469915807.0, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.9763591362976871, |
|
"grad_norm": 0.4291715872204905, |
|
"learning_rate": 5.070361044024103e-06, |
|
"loss": 0.3891, |
|
"num_tokens": 471734184.0, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.9797729794315951, |
|
"grad_norm": 0.3580402114466082, |
|
"learning_rate": 5.051700981131903e-06, |
|
"loss": 0.3829, |
|
"num_tokens": 473649866.0, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.9831868225655032, |
|
"grad_norm": 0.3475223629471517, |
|
"learning_rate": 5.035907662623737e-06, |
|
"loss": 0.3853, |
|
"num_tokens": 475543359.0, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.9866006656994111, |
|
"grad_norm": 0.39539394938410216, |
|
"learning_rate": 5.02298310537434e-06, |
|
"loss": 0.3862, |
|
"num_tokens": 477474338.0, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.9900145088333191, |
|
"grad_norm": 0.3692684131085188, |
|
"learning_rate": 5.0129289599052915e-06, |
|
"loss": 0.3717, |
|
"num_tokens": 479401233.0, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.9934283519672271, |
|
"grad_norm": 0.4177627041474193, |
|
"learning_rate": 5.005746510174235e-06, |
|
"loss": 0.3924, |
|
"num_tokens": 481235568.0, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.9968421951011351, |
|
"grad_norm": 0.3671477184652084, |
|
"learning_rate": 5.001436673410903e-06, |
|
"loss": 0.3852, |
|
"num_tokens": 483184723.0, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.9995732696082615, |
|
"step": 1464, |
|
"total_flos": 2.8225135896499847e+19, |
|
"train_loss": 0.0, |
|
"train_runtime": 1.6907, |
|
"train_samples_per_second": 55441.67, |
|
"train_steps_per_second": 865.934 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1464, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.8225135896499847e+19, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |