|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.97907949790795, |
|
"eval_steps": 500, |
|
"global_step": 357, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008368200836820083, |
|
"grad_norm": 1.1835436820983887, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 1.0482, |
|
"mean_token_accuracy": 0.8137692213058472, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.04184100418410042, |
|
"grad_norm": 1.143718957901001, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 1.1531, |
|
"mean_token_accuracy": 0.7920060083270073, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.08368200836820083, |
|
"grad_norm": 0.6482775807380676, |
|
"learning_rate": 5.555555555555556e-05, |
|
"loss": 0.9534, |
|
"mean_token_accuracy": 0.8159618139266968, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.12552301255230125, |
|
"grad_norm": 0.5585402250289917, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 0.8057, |
|
"mean_token_accuracy": 0.8243314325809479, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.16736401673640167, |
|
"grad_norm": 49.08249282836914, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 0.7266, |
|
"mean_token_accuracy": 0.8348599970340729, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.20920502092050208, |
|
"grad_norm": 0.2956574857234955, |
|
"learning_rate": 0.0001388888888888889, |
|
"loss": 0.5231, |
|
"mean_token_accuracy": 0.8722780823707581, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2510460251046025, |
|
"grad_norm": 0.33143389225006104, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.4544, |
|
"mean_token_accuracy": 0.8879955649375916, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2928870292887029, |
|
"grad_norm": 0.35626646876335144, |
|
"learning_rate": 0.00019444444444444446, |
|
"loss": 0.3863, |
|
"mean_token_accuracy": 0.9022103250026703, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.33472803347280333, |
|
"grad_norm": 0.26618582010269165, |
|
"learning_rate": 0.00019992338316586133, |
|
"loss": 0.3502, |
|
"mean_token_accuracy": 0.9123009383678437, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.37656903765690375, |
|
"grad_norm": 0.19758877158164978, |
|
"learning_rate": 0.0001996123284790336, |
|
"loss": 0.33, |
|
"mean_token_accuracy": 0.9169199109077454, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.41841004184100417, |
|
"grad_norm": 0.17389552295207977, |
|
"learning_rate": 0.00019906279151130337, |
|
"loss": 0.3597, |
|
"mean_token_accuracy": 0.9097823560237884, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4602510460251046, |
|
"grad_norm": 0.18199238181114197, |
|
"learning_rate": 0.000198276087911361, |
|
"loss": 0.331, |
|
"mean_token_accuracy": 0.91509507894516, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.502092050209205, |
|
"grad_norm": 0.17495596408843994, |
|
"learning_rate": 0.0001972541011294959, |
|
"loss": 0.3322, |
|
"mean_token_accuracy": 0.9161225736141205, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5439330543933054, |
|
"grad_norm": 0.15940091013908386, |
|
"learning_rate": 0.00019599927790842073, |
|
"loss": 0.3483, |
|
"mean_token_accuracy": 0.9118252336978913, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5857740585774058, |
|
"grad_norm": 0.1603671759366989, |
|
"learning_rate": 0.0001945146224255145, |
|
"loss": 0.3215, |
|
"mean_token_accuracy": 0.9172553181648254, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6276150627615062, |
|
"grad_norm": 0.16840051114559174, |
|
"learning_rate": 0.00019280368910050942, |
|
"loss": 0.3131, |
|
"mean_token_accuracy": 0.9191941678524017, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6694560669456067, |
|
"grad_norm": 0.16477249562740326, |
|
"learning_rate": 0.0001908705740858402, |
|
"loss": 0.3337, |
|
"mean_token_accuracy": 0.9134519279003144, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7112970711297071, |
|
"grad_norm": 0.18231800198554993, |
|
"learning_rate": 0.0001887199054600286, |
|
"loss": 0.288, |
|
"mean_token_accuracy": 0.9250746846199036, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.7531380753138075, |
|
"grad_norm": 0.14658205211162567, |
|
"learning_rate": 0.00018635683214758214, |
|
"loss": 0.3291, |
|
"mean_token_accuracy": 0.9156664907932281, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7949790794979079, |
|
"grad_norm": 0.14870484173297882, |
|
"learning_rate": 0.0001837870115919327, |
|
"loss": 0.3056, |
|
"mean_token_accuracy": 0.920881724357605, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.8368200836820083, |
|
"grad_norm": 0.1470540314912796, |
|
"learning_rate": 0.0001810165962109283, |
|
"loss": 0.3065, |
|
"mean_token_accuracy": 0.9206145107746124, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8786610878661087, |
|
"grad_norm": 0.1503264307975769, |
|
"learning_rate": 0.00017805221866730458, |
|
"loss": 0.3204, |
|
"mean_token_accuracy": 0.9173631191253662, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.9205020920502092, |
|
"grad_norm": 0.15052708983421326, |
|
"learning_rate": 0.0001749009759893999, |
|
"loss": 0.3065, |
|
"mean_token_accuracy": 0.9213023066520691, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.9623430962343096, |
|
"grad_norm": 0.13865287601947784, |
|
"learning_rate": 0.00017157041258013074, |
|
"loss": 0.2935, |
|
"mean_token_accuracy": 0.9245201706886291, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.2014499455690384, |
|
"learning_rate": 0.0001680685021549063, |
|
"loss": 0.3392, |
|
"mean_token_accuracy": 0.9127676884333292, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.3295583426952362, |
|
"eval_mean_token_accuracy": 0.9158889651298523, |
|
"eval_runtime": 188.907, |
|
"eval_samples_per_second": 1.906, |
|
"eval_steps_per_second": 0.238, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.0418410041841004, |
|
"grad_norm": 0.14540784060955048, |
|
"learning_rate": 0.00016440362865172373, |
|
"loss": 0.2842, |
|
"mean_token_accuracy": 0.9250402748584747, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.0836820083682008, |
|
"grad_norm": 0.15603987872600555, |
|
"learning_rate": 0.00016058456615914817, |
|
"loss": 0.2592, |
|
"mean_token_accuracy": 0.9306007921695709, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.1255230125523012, |
|
"grad_norm": 0.20846955478191376, |
|
"learning_rate": 0.00015662045791023173, |
|
"loss": 0.2977, |
|
"mean_token_accuracy": 0.9211509883403778, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.1673640167364017, |
|
"grad_norm": 0.1669706404209137, |
|
"learning_rate": 0.00015252079439266177, |
|
"loss": 0.3082, |
|
"mean_token_accuracy": 0.9184517621994018, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.209205020920502, |
|
"grad_norm": 0.20693209767341614, |
|
"learning_rate": 0.00014829539062754596, |
|
"loss": 0.2919, |
|
"mean_token_accuracy": 0.9223342299461365, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.2510460251046025, |
|
"grad_norm": 0.20154564082622528, |
|
"learning_rate": 0.00014395436267123016, |
|
"loss": 0.2819, |
|
"mean_token_accuracy": 0.9249872386455535, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.292887029288703, |
|
"grad_norm": 0.19385474920272827, |
|
"learning_rate": 0.00013950810339640688, |
|
"loss": 0.286, |
|
"mean_token_accuracy": 0.9242500007152558, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.3347280334728033, |
|
"grad_norm": 0.18098486959934235, |
|
"learning_rate": 0.00013496725761049638, |
|
"loss": 0.2797, |
|
"mean_token_accuracy": 0.9268094718456268, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.3765690376569037, |
|
"grad_norm": 0.1658986210823059, |
|
"learning_rate": 0.00013034269657086992, |
|
"loss": 0.2535, |
|
"mean_token_accuracy": 0.931826388835907, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.4184100418410042, |
|
"grad_norm": 0.16022320091724396, |
|
"learning_rate": 0.0001256454919579284, |
|
"loss": 0.2564, |
|
"mean_token_accuracy": 0.9323311626911164, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.4602510460251046, |
|
"grad_norm": 0.1953819990158081, |
|
"learning_rate": 0.00012088688936834704, |
|
"loss": 0.2459, |
|
"mean_token_accuracy": 0.9334764361381531, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.502092050209205, |
|
"grad_norm": 0.18393699824810028, |
|
"learning_rate": 0.00011607828139194683, |
|
"loss": 0.286, |
|
"mean_token_accuracy": 0.9224025547504425, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.5439330543933054, |
|
"grad_norm": 0.21144607663154602, |
|
"learning_rate": 0.00011123118033664876, |
|
"loss": 0.2604, |
|
"mean_token_accuracy": 0.9307535707950592, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.5857740585774058, |
|
"grad_norm": 0.18700766563415527, |
|
"learning_rate": 0.00010635719066681063, |
|
"loss": 0.2579, |
|
"mean_token_accuracy": 0.9308933317661285, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.6276150627615062, |
|
"grad_norm": 0.20680415630340576, |
|
"learning_rate": 0.00010146798122093166, |
|
"loss": 0.2745, |
|
"mean_token_accuracy": 0.9262791752815247, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.6694560669456067, |
|
"grad_norm": 0.2134338617324829, |
|
"learning_rate": 9.657525727523897e-05, |
|
"loss": 0.2838, |
|
"mean_token_accuracy": 0.9231680512428284, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.711297071129707, |
|
"grad_norm": 0.18799664080142975, |
|
"learning_rate": 9.169073252003779e-05, |
|
"loss": 0.2694, |
|
"mean_token_accuracy": 0.9275272071361542, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.7531380753138075, |
|
"grad_norm": 0.18628208339214325, |
|
"learning_rate": 8.682610101591814e-05, |
|
"loss": 0.2518, |
|
"mean_token_accuracy": 0.9325793802738189, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.794979079497908, |
|
"grad_norm": 0.18667584657669067, |
|
"learning_rate": 8.19930091969567e-05, |
|
"loss": 0.2671, |
|
"mean_token_accuracy": 0.9276439845561981, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.8368200836820083, |
|
"grad_norm": 0.2169790118932724, |
|
"learning_rate": 7.720302798794153e-05, |
|
"loss": 0.2608, |
|
"mean_token_accuracy": 0.9305820167064667, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.8786610878661087, |
|
"grad_norm": 0.20957927405834198, |
|
"learning_rate": 7.246762510237403e-05, |
|
"loss": 0.2592, |
|
"mean_token_accuracy": 0.9301635086536407, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.9205020920502092, |
|
"grad_norm": 0.21752670407295227, |
|
"learning_rate": 6.779813758756943e-05, |
|
"loss": 0.2683, |
|
"mean_token_accuracy": 0.9270337641239166, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.9623430962343096, |
|
"grad_norm": 0.21591052412986755, |
|
"learning_rate": 6.320574468258555e-05, |
|
"loss": 0.2555, |
|
"mean_token_accuracy": 0.930656349658966, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.2959473729133606, |
|
"learning_rate": 5.870144105396118e-05, |
|
"loss": 0.253, |
|
"mean_token_accuracy": 0.9325528542200724, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.3456202745437622, |
|
"eval_mean_token_accuracy": 0.914831633037991, |
|
"eval_runtime": 188.7768, |
|
"eval_samples_per_second": 1.907, |
|
"eval_steps_per_second": 0.238, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.0418410041841004, |
|
"grad_norm": 0.24203084409236908, |
|
"learning_rate": 5.429601047334022e-05, |
|
"loss": 0.1868, |
|
"mean_token_accuracy": 0.9483842253684998, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.083682008368201, |
|
"grad_norm": 0.2632400393486023, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 0.2226, |
|
"mean_token_accuracy": 0.9376705825328827, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.1255230125523012, |
|
"grad_norm": 0.24455517530441284, |
|
"learning_rate": 4.58236947300939e-05, |
|
"loss": 0.2038, |
|
"mean_token_accuracy": 0.9432938873767853, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.1673640167364017, |
|
"grad_norm": 0.27484190464019775, |
|
"learning_rate": 4.17770931730606e-05, |
|
"loss": 0.207, |
|
"mean_token_accuracy": 0.9414038896560669, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.209205020920502, |
|
"grad_norm": 0.22127795219421387, |
|
"learning_rate": 3.786988331415211e-05, |
|
"loss": 0.2139, |
|
"mean_token_accuracy": 0.939983355998993, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.2510460251046025, |
|
"grad_norm": 0.2719532549381256, |
|
"learning_rate": 3.41114194203889e-05, |
|
"loss": 0.2091, |
|
"mean_token_accuracy": 0.9410282075405121, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.292887029288703, |
|
"grad_norm": 0.27510541677474976, |
|
"learning_rate": 3.0510699645470984e-05, |
|
"loss": 0.204, |
|
"mean_token_accuracy": 0.9429362654685974, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.3347280334728033, |
|
"grad_norm": 0.2817744016647339, |
|
"learning_rate": 2.7076344487261697e-05, |
|
"loss": 0.2278, |
|
"mean_token_accuracy": 0.9364924848079681, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.3765690376569037, |
|
"grad_norm": 0.26610061526298523, |
|
"learning_rate": 2.381657614941858e-05, |
|
"loss": 0.1996, |
|
"mean_token_accuracy": 0.9442806720733643, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.418410041841004, |
|
"grad_norm": 0.2731967568397522, |
|
"learning_rate": 2.073919885658223e-05, |
|
"loss": 0.2217, |
|
"mean_token_accuracy": 0.9379707038402557, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.4602510460251046, |
|
"grad_norm": 0.2731280028820038, |
|
"learning_rate": 1.7851580170250304e-05, |
|
"loss": 0.1916, |
|
"mean_token_accuracy": 0.9465275406837463, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.502092050209205, |
|
"grad_norm": 0.2769322097301483, |
|
"learning_rate": 1.5160633350068509e-05, |
|
"loss": 0.232, |
|
"mean_token_accuracy": 0.9358445882797242, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.5439330543933054, |
|
"grad_norm": 0.27492755651474, |
|
"learning_rate": 1.2672800802767715e-05, |
|
"loss": 0.2094, |
|
"mean_token_accuracy": 0.9408862173557282, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.585774058577406, |
|
"grad_norm": 0.309391051530838, |
|
"learning_rate": 1.0394038658371574e-05, |
|
"loss": 0.1979, |
|
"mean_token_accuracy": 0.9446548879146576, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.6276150627615062, |
|
"grad_norm": 0.2758495509624481, |
|
"learning_rate": 8.329802510601559e-06, |
|
"loss": 0.2048, |
|
"mean_token_accuracy": 0.9433695197105407, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.6694560669456067, |
|
"grad_norm": 0.2817879617214203, |
|
"learning_rate": 6.485034355617747e-06, |
|
"loss": 0.1977, |
|
"mean_token_accuracy": 0.944527405500412, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.711297071129707, |
|
"grad_norm": 0.2818117141723633, |
|
"learning_rate": 4.864150760365771e-06, |
|
"loss": 0.2205, |
|
"mean_token_accuracy": 0.9380866944789886, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.7531380753138075, |
|
"grad_norm": 0.2880750894546509, |
|
"learning_rate": 3.471032288855869e-06, |
|
"loss": 0.2078, |
|
"mean_token_accuracy": 0.942375785112381, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.794979079497908, |
|
"grad_norm": 0.26675674319267273, |
|
"learning_rate": 2.3090142116888646e-06, |
|
"loss": 0.2037, |
|
"mean_token_accuracy": 0.9432387411594391, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.8368200836820083, |
|
"grad_norm": 0.2919078469276428, |
|
"learning_rate": 1.3808785210711606e-06, |
|
"loss": 0.2217, |
|
"mean_token_accuracy": 0.9386197745800018, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.8786610878661087, |
|
"grad_norm": 0.27360963821411133, |
|
"learning_rate": 6.888472704359661e-07, |
|
"loss": 0.2101, |
|
"mean_token_accuracy": 0.9409088790416718, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.920502092050209, |
|
"grad_norm": 0.2891521453857422, |
|
"learning_rate": 2.3457725461607517e-07, |
|
"loss": 0.2033, |
|
"mean_token_accuracy": 0.9430344641208649, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.9623430962343096, |
|
"grad_norm": 0.2698313593864441, |
|
"learning_rate": 1.915604330464671e-08, |
|
"loss": 0.1824, |
|
"mean_token_accuracy": 0.9485956370830536, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.97907949790795, |
|
"eval_loss": 0.37856200337409973, |
|
"eval_mean_token_accuracy": 0.9150180731500898, |
|
"eval_runtime": 188.9862, |
|
"eval_samples_per_second": 1.905, |
|
"eval_steps_per_second": 0.238, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.97907949790795, |
|
"step": 357, |
|
"total_flos": 2.503141107915817e+17, |
|
"train_loss": 0.3060124111609633, |
|
"train_runtime": 5934.9405, |
|
"train_samples_per_second": 0.483, |
|
"train_steps_per_second": 0.06 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 357, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.503141107915817e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|