{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 37,
  "global_step": 294,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003401360544217687,
      "grad_norm": 106.7094005171616,
      "learning_rate": 0.0,
      "loss": 2.9268,
      "step": 1
    },
    {
      "epoch": 0.003401360544217687,
      "eval_loss": 2.5302913188934326,
      "eval_runtime": 3.7953,
      "eval_samples_per_second": 14.492,
      "eval_steps_per_second": 1.054,
      "step": 1
    },
    {
      "epoch": 0.006802721088435374,
      "grad_norm": 57.97506009705182,
      "learning_rate": 6.89655172413793e-08,
      "loss": 2.0122,
      "step": 2
    },
    {
      "epoch": 0.01020408163265306,
      "grad_norm": 116.23413141145363,
      "learning_rate": 1.379310344827586e-07,
      "loss": 2.6743,
      "step": 3
    },
    {
      "epoch": 0.013605442176870748,
      "grad_norm": 21.262801374024775,
      "learning_rate": 2.0689655172413793e-07,
      "loss": 2.0743,
      "step": 4
    },
    {
      "epoch": 0.017006802721088437,
      "grad_norm": 59.319984755304056,
      "learning_rate": 2.758620689655172e-07,
      "loss": 2.2775,
      "step": 5
    },
    {
      "epoch": 0.02040816326530612,
      "grad_norm": 159.51320885432614,
      "learning_rate": 3.4482758620689656e-07,
      "loss": 2.1337,
      "step": 6
    },
    {
      "epoch": 0.023809523809523808,
      "grad_norm": 87.93970940325055,
      "learning_rate": 4.1379310344827586e-07,
      "loss": 1.9061,
      "step": 7
    },
    {
      "epoch": 0.027210884353741496,
      "grad_norm": 61.133777808660895,
      "learning_rate": 4.827586206896552e-07,
      "loss": 1.8118,
      "step": 8
    },
    {
      "epoch": 0.030612244897959183,
      "grad_norm": 48.65887299035499,
      "learning_rate": 5.517241379310344e-07,
      "loss": 3.4095,
      "step": 9
    },
    {
      "epoch": 0.034013605442176874,
      "grad_norm": 30.592687909719288,
      "learning_rate": 6.206896551724138e-07,
      "loss": 2.2398,
      "step": 10
    },
    {
      "epoch": 0.03741496598639456,
      "grad_norm": 74.15295766799099,
      "learning_rate": 6.896551724137931e-07,
      "loss": 3.4425,
      "step": 11
    },
    {
      "epoch": 0.04081632653061224,
      "grad_norm": 34.94892634385338,
      "learning_rate": 7.586206896551724e-07,
      "loss": 2.5405,
      "step": 12
    },
    {
      "epoch": 0.04421768707482993,
      "grad_norm": 26.538521745061775,
      "learning_rate": 8.275862068965517e-07,
      "loss": 1.9614,
      "step": 13
    },
    {
      "epoch": 0.047619047619047616,
      "grad_norm": 52.23979896259082,
      "learning_rate": 8.96551724137931e-07,
      "loss": 2.9785,
      "step": 14
    },
    {
      "epoch": 0.05102040816326531,
      "grad_norm": 30.812143999051266,
      "learning_rate": 9.655172413793103e-07,
      "loss": 2.0185,
      "step": 15
    },
    {
      "epoch": 0.05442176870748299,
      "grad_norm": 41.48478088374125,
      "learning_rate": 1.0344827586206896e-06,
      "loss": 2.1126,
      "step": 16
    },
    {
      "epoch": 0.05782312925170068,
      "grad_norm": 29.347588210089675,
      "learning_rate": 1.1034482758620688e-06,
      "loss": 2.2078,
      "step": 17
    },
    {
      "epoch": 0.061224489795918366,
      "grad_norm": 28.947554594850924,
      "learning_rate": 1.172413793103448e-06,
      "loss": 2.442,
      "step": 18
    },
    {
      "epoch": 0.06462585034013606,
      "grad_norm": 32.28592513881342,
      "learning_rate": 1.2413793103448275e-06,
      "loss": 2.8683,
      "step": 19
    },
    {
      "epoch": 0.06802721088435375,
      "grad_norm": 38.97631997775744,
      "learning_rate": 1.3103448275862068e-06,
      "loss": 2.4376,
      "step": 20
    },
    {
      "epoch": 0.07142857142857142,
      "grad_norm": 43.775478156068516,
      "learning_rate": 1.3793103448275862e-06,
      "loss": 2.4167,
      "step": 21
    },
    {
      "epoch": 0.07482993197278912,
      "grad_norm": 30.904260805899465,
      "learning_rate": 1.4482758620689655e-06,
      "loss": 2.6971,
      "step": 22
    },
    {
      "epoch": 0.0782312925170068,
      "grad_norm": 48.202871069183985,
      "learning_rate": 1.5172413793103447e-06,
      "loss": 2.5093,
      "step": 23
    },
    {
      "epoch": 0.08163265306122448,
      "grad_norm": 55.067186300198706,
      "learning_rate": 1.5862068965517242e-06,
      "loss": 2.0053,
      "step": 24
    },
    {
      "epoch": 0.08503401360544217,
      "grad_norm": 38.486811757681096,
      "learning_rate": 1.6551724137931035e-06,
      "loss": 2.2475,
      "step": 25
    },
    {
      "epoch": 0.08843537414965986,
      "grad_norm": 90.78568630900098,
      "learning_rate": 1.7241379310344825e-06,
      "loss": 3.8342,
      "step": 26
    },
    {
      "epoch": 0.09183673469387756,
      "grad_norm": 23.32050516158788,
      "learning_rate": 1.793103448275862e-06,
      "loss": 2.2496,
      "step": 27
    },
    {
      "epoch": 0.09523809523809523,
      "grad_norm": 25.01047005218693,
      "learning_rate": 1.8620689655172412e-06,
      "loss": 2.6991,
      "step": 28
    },
    {
      "epoch": 0.09863945578231292,
      "grad_norm": 27.40209208002175,
      "learning_rate": 1.9310344827586207e-06,
      "loss": 2.7017,
      "step": 29
    },
    {
      "epoch": 0.10204081632653061,
      "grad_norm": 16.372774250078056,
      "learning_rate": 2e-06,
      "loss": 2.1315,
      "step": 30
    },
    {
      "epoch": 0.1054421768707483,
      "grad_norm": 34.32100924763162,
      "learning_rate": 1.999984207714351e-06,
      "loss": 2.4298,
      "step": 31
    },
    {
      "epoch": 0.10884353741496598,
      "grad_norm": 49.15042168439896,
      "learning_rate": 1.9999368313561964e-06,
      "loss": 3.1687,
      "step": 32
    },
    {
      "epoch": 0.11224489795918367,
      "grad_norm": 27.553221322487154,
      "learning_rate": 1.9998578724218984e-06,
      "loss": 2.307,
      "step": 33
    },
    {
      "epoch": 0.11564625850340136,
      "grad_norm": 25.29898708562965,
      "learning_rate": 1.999747333405341e-06,
      "loss": 2.6711,
      "step": 34
    },
    {
      "epoch": 0.11904761904761904,
      "grad_norm": 35.13639034121329,
      "learning_rate": 1.9996052177978517e-06,
      "loss": 2.2923,
      "step": 35
    },
    {
      "epoch": 0.12244897959183673,
      "grad_norm": 61.904951168823246,
      "learning_rate": 1.999431530088091e-06,
      "loss": 3.0837,
      "step": 36
    },
    {
      "epoch": 0.12585034013605442,
      "grad_norm": 43.72931173152359,
      "learning_rate": 1.9992262757619108e-06,
      "loss": 2.9055,
      "step": 37
    },
    {
      "epoch": 0.12585034013605442,
      "eval_loss": 2.2881884574890137,
      "eval_runtime": 3.7387,
      "eval_samples_per_second": 14.711,
      "eval_steps_per_second": 1.07,
      "step": 37
    },
    {
      "epoch": 0.1292517006802721,
      "grad_norm": 75.128224809043,
      "learning_rate": 1.9989894613021807e-06,
      "loss": 3.9717,
      "step": 38
    },
    {
      "epoch": 0.1326530612244898,
      "grad_norm": 6.423556290490496,
      "learning_rate": 1.998721094188584e-06,
      "loss": 1.6634,
      "step": 39
    },
    {
      "epoch": 0.1360544217687075,
      "grad_norm": 8.952452652609857,
      "learning_rate": 1.9984211828973816e-06,
      "loss": 2.1183,
      "step": 40
    },
    {
      "epoch": 0.13945578231292516,
      "grad_norm": 12.837161899787583,
      "learning_rate": 1.998089736901142e-06,
      "loss": 2.1306,
      "step": 41
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 7.2779063942957825,
      "learning_rate": 1.9977267666684456e-06,
      "loss": 1.9831,
      "step": 42
    },
    {
      "epoch": 0.14625850340136054,
      "grad_norm": 30.288569770228293,
      "learning_rate": 1.9973322836635515e-06,
      "loss": 2.1869,
      "step": 43
    },
    {
      "epoch": 0.14965986394557823,
      "grad_norm": 11.672608976353168,
      "learning_rate": 1.996906300346036e-06,
      "loss": 1.9566,
      "step": 44
    },
    {
      "epoch": 0.15306122448979592,
      "grad_norm": 14.837719065187358,
      "learning_rate": 1.9964488301704e-06,
      "loss": 2.2152,
      "step": 45
    },
    {
      "epoch": 0.1564625850340136,
      "grad_norm": 18.558600033713702,
      "learning_rate": 1.9959598875856427e-06,
      "loss": 2.06,
      "step": 46
    },
    {
      "epoch": 0.1598639455782313,
      "grad_norm": 17.161073648503006,
      "learning_rate": 1.995439488034806e-06,
      "loss": 2.0463,
      "step": 47
    },
    {
      "epoch": 0.16326530612244897,
      "grad_norm": 10.944090642041195,
      "learning_rate": 1.994887647954486e-06,
      "loss": 1.9676,
      "step": 48
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 30.260773919516463,
      "learning_rate": 1.9943043847743164e-06,
      "loss": 2.4235,
      "step": 49
    },
    {
      "epoch": 0.17006802721088435,
      "grad_norm": 17.95874457178673,
      "learning_rate": 1.9936897169164135e-06,
      "loss": 2.4211,
      "step": 50
    },
    {
      "epoch": 0.17346938775510204,
      "grad_norm": 29.32804844947439,
      "learning_rate": 1.993043663794799e-06,
      "loss": 2.2786,
      "step": 51
    },
    {
      "epoch": 0.17687074829931973,
      "grad_norm": 31.224760731119037,
      "learning_rate": 1.9923662458147826e-06,
      "loss": 2.8374,
      "step": 52
    },
    {
      "epoch": 0.18027210884353742,
      "grad_norm": 4.5045539325043205,
      "learning_rate": 1.9916574843723217e-06,
      "loss": 1.6301,
      "step": 53
    },
    {
      "epoch": 0.1836734693877551,
      "grad_norm": 10.827050277516674,
      "learning_rate": 1.9909174018533427e-06,
      "loss": 2.0554,
      "step": 54
    },
    {
      "epoch": 0.1870748299319728,
      "grad_norm": 17.063187262605883,
      "learning_rate": 1.990146021633034e-06,
      "loss": 2.4202,
      "step": 55
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 3.946679947433292,
      "learning_rate": 1.98934336807511e-06,
      "loss": 1.7808,
      "step": 56
    },
    {
      "epoch": 0.19387755102040816,
      "grad_norm": 8.431222224384186,
      "learning_rate": 1.9885094665310388e-06,
      "loss": 1.7766,
      "step": 57
    },
    {
      "epoch": 0.19727891156462585,
      "grad_norm": 32.28667139462841,
      "learning_rate": 1.9876443433392433e-06,
      "loss": 2.2299,
      "step": 58
    },
    {
      "epoch": 0.20068027210884354,
      "grad_norm": 11.950555724182584,
      "learning_rate": 1.986748025824268e-06,
      "loss": 1.928,
      "step": 59
    },
    {
      "epoch": 0.20408163265306123,
      "grad_norm": 3.6059136679066977,
      "learning_rate": 1.985820542295918e-06,
      "loss": 1.7761,
      "step": 60
    },
    {
      "epoch": 0.20748299319727892,
      "grad_norm": 41.40947345983446,
      "learning_rate": 1.984861922048363e-06,
      "loss": 2.6704,
      "step": 61
    },
    {
      "epoch": 0.2108843537414966,
      "grad_norm": 30.634237938465816,
      "learning_rate": 1.983872195359212e-06,
      "loss": 2.7336,
      "step": 62
    },
    {
      "epoch": 0.21428571428571427,
      "grad_norm": 3.760013022701194,
      "learning_rate": 1.9828513934885587e-06,
      "loss": 1.8831,
      "step": 63
    },
    {
      "epoch": 0.21768707482993196,
      "grad_norm": 37.34059674722221,
      "learning_rate": 1.981799548677993e-06,
      "loss": 2.27,
      "step": 64
    },
    {
      "epoch": 0.22108843537414966,
      "grad_norm": 11.009700618421736,
      "learning_rate": 1.980716694149581e-06,
      "loss": 1.9265,
      "step": 65
    },
    {
      "epoch": 0.22448979591836735,
      "grad_norm": 17.609147027884987,
      "learning_rate": 1.9796028641048194e-06,
      "loss": 2.3411,
      "step": 66
    },
    {
      "epoch": 0.22789115646258504,
      "grad_norm": 17.432142291951372,
      "learning_rate": 1.978458093723553e-06,
      "loss": 2.2213,
      "step": 67
    },
    {
      "epoch": 0.23129251700680273,
      "grad_norm": 14.11664326231067,
      "learning_rate": 1.9772824191628632e-06,
      "loss": 2.0831,
      "step": 68
    },
    {
      "epoch": 0.23469387755102042,
      "grad_norm": 37.456025944063875,
      "learning_rate": 1.9760758775559273e-06,
      "loss": 2.7494,
      "step": 69
    },
    {
      "epoch": 0.23809523809523808,
      "grad_norm": 16.30994509129653,
      "learning_rate": 1.974838507010844e-06,
      "loss": 2.118,
      "step": 70
    },
    {
      "epoch": 0.24149659863945577,
      "grad_norm": 25.92468917111241,
      "learning_rate": 1.9735703466094324e-06,
      "loss": 2.1656,
      "step": 71
    },
    {
      "epoch": 0.24489795918367346,
      "grad_norm": 17.23253832018251,
      "learning_rate": 1.972271436405994e-06,
      "loss": 2.0787,
      "step": 72
    },
    {
      "epoch": 0.24829931972789115,
      "grad_norm": 6.286286593272188,
      "learning_rate": 1.970941817426052e-06,
      "loss": 1.7458,
      "step": 73
    },
    {
      "epoch": 0.25170068027210885,
      "grad_norm": 20.87004487229478,
      "learning_rate": 1.969581531665051e-06,
      "loss": 2.364,
      "step": 74
    },
    {
      "epoch": 0.25170068027210885,
      "eval_loss": 2.240875482559204,
      "eval_runtime": 3.7328,
      "eval_samples_per_second": 14.734,
      "eval_steps_per_second": 1.072,
      "step": 74
    },
    {
      "epoch": 0.25510204081632654,
      "grad_norm": 22.83815781491435,
      "learning_rate": 1.968190622087034e-06,
      "loss": 2.2176,
      "step": 75
    },
    {
      "epoch": 0.2585034013605442,
      "grad_norm": 39.2204163613504,
      "learning_rate": 1.9667691326232835e-06,
      "loss": 2.605,
      "step": 76
    },
    {
      "epoch": 0.2619047619047619,
      "grad_norm": 9.599486970591897,
      "learning_rate": 1.965317108170935e-06,
      "loss": 2.1652,
      "step": 77
    },
    {
      "epoch": 0.2653061224489796,
      "grad_norm": 3.7571781853463175,
      "learning_rate": 1.9638345945915586e-06,
      "loss": 1.6055,
      "step": 78
    },
    {
      "epoch": 0.2687074829931973,
      "grad_norm": 7.064670527473922,
      "learning_rate": 1.962321638709709e-06,
      "loss": 1.9937,
      "step": 79
    },
    {
      "epoch": 0.272108843537415,
      "grad_norm": 28.207901160479654,
      "learning_rate": 1.9607782883114506e-06,
      "loss": 2.2552,
      "step": 80
    },
    {
      "epoch": 0.2755102040816326,
      "grad_norm": 15.991872570963396,
      "learning_rate": 1.959204592142843e-06,
      "loss": 2.1559,
      "step": 81
    },
    {
      "epoch": 0.2789115646258503,
      "grad_norm": 13.401822104278665,
      "learning_rate": 1.957600599908406e-06,
      "loss": 2.1652,
      "step": 82
    },
    {
      "epoch": 0.282312925170068,
      "grad_norm": 14.708704691038701,
      "learning_rate": 1.9559663622695455e-06,
      "loss": 1.9673,
      "step": 83
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 3.3458550475032105,
      "learning_rate": 1.954301930842958e-06,
      "loss": 1.6917,
      "step": 84
    },
    {
      "epoch": 0.2891156462585034,
      "grad_norm": 3.479853146114766,
      "learning_rate": 1.9526073581989955e-06,
      "loss": 1.624,
      "step": 85
    },
    {
      "epoch": 0.2925170068027211,
      "grad_norm": 25.10854427551898,
      "learning_rate": 1.950882697860009e-06,
      "loss": 2.3626,
      "step": 86
    },
    {
      "epoch": 0.29591836734693877,
      "grad_norm": 14.389114459997433,
      "learning_rate": 1.9491280042986562e-06,
      "loss": 2.0549,
      "step": 87
    },
    {
      "epoch": 0.29931972789115646,
      "grad_norm": 17.72897272235088,
      "learning_rate": 1.9473433329361802e-06,
      "loss": 2.4525,
      "step": 88
    },
    {
      "epoch": 0.30272108843537415,
      "grad_norm": 8.212788560084723,
      "learning_rate": 1.945528740140662e-06,
      "loss": 2.1368,
      "step": 89
    },
    {
      "epoch": 0.30612244897959184,
      "grad_norm": 26.76274867022125,
      "learning_rate": 1.943684283225236e-06,
      "loss": 2.3735,
      "step": 90
    },
    {
      "epoch": 0.30952380952380953,
      "grad_norm": 23.71630229663243,
      "learning_rate": 1.941810020446284e-06,
      "loss": 2.6005,
      "step": 91
    },
    {
      "epoch": 0.3129251700680272,
      "grad_norm": 22.889738702248234,
      "learning_rate": 1.9399060110015917e-06,
      "loss": 2.6924,
      "step": 92
    },
    {
      "epoch": 0.3163265306122449,
      "grad_norm": 32.54631787971477,
      "learning_rate": 1.9379723150284814e-06,
      "loss": 2.5301,
      "step": 93
    },
    {
      "epoch": 0.3197278911564626,
      "grad_norm": 3.6877224549117344,
      "learning_rate": 1.936008993601912e-06,
      "loss": 1.6556,
      "step": 94
    },
    {
      "epoch": 0.3231292517006803,
      "grad_norm": 33.682920637388364,
      "learning_rate": 1.934016108732548e-06,
      "loss": 2.3709,
      "step": 95
    },
    {
      "epoch": 0.32653061224489793,
      "grad_norm": 19.342157148675135,
      "learning_rate": 1.9319937233648045e-06,
      "loss": 1.8713,
      "step": 96
    },
    {
      "epoch": 0.3299319727891156,
      "grad_norm": 36.9446891807536,
      "learning_rate": 1.929941901374856e-06,
      "loss": 3.1666,
      "step": 97
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 12.769242612326224,
      "learning_rate": 1.9278607075686205e-06,
      "loss": 2.2024,
      "step": 98
    },
    {
      "epoch": 0.336734693877551,
      "grad_norm": 7.569149644914372,
      "learning_rate": 1.9257502076797123e-06,
      "loss": 1.8434,
      "step": 99
    },
    {
      "epoch": 0.3401360544217687,
      "grad_norm": 18.672166864254265,
      "learning_rate": 1.9236104683673653e-06,
      "loss": 2.6262,
      "step": 100
    },
    {
      "epoch": 0.3435374149659864,
      "grad_norm": 7.251393661314555,
      "learning_rate": 1.9214415572143284e-06,
      "loss": 1.8447,
      "step": 101
    },
    {
      "epoch": 0.3469387755102041,
      "grad_norm": 25.8588617341962,
      "learning_rate": 1.919243542724731e-06,
      "loss": 2.3528,
      "step": 102
    },
    {
      "epoch": 0.35034013605442177,
      "grad_norm": 21.00339285362203,
      "learning_rate": 1.917016494321918e-06,
      "loss": 2.462,
      "step": 103
    },
    {
      "epoch": 0.35374149659863946,
      "grad_norm": 19.533037226832878,
      "learning_rate": 1.9147604823462585e-06,
      "loss": 2.3057,
      "step": 104
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 3.1087327492999286,
      "learning_rate": 1.9124755780529243e-06,
      "loss": 1.6935,
      "step": 105
    },
    {
      "epoch": 0.36054421768707484,
      "grad_norm": 35.707396347148176,
      "learning_rate": 1.910161853609637e-06,
      "loss": 2.3652,
      "step": 106
    },
    {
      "epoch": 0.36394557823129253,
      "grad_norm": 16.694934440145225,
      "learning_rate": 1.9078193820943916e-06,
      "loss": 2.6014,
      "step": 107
    },
    {
      "epoch": 0.3673469387755102,
      "grad_norm": 12.946146725042743,
      "learning_rate": 1.9054482374931466e-06,
      "loss": 1.9379,
      "step": 108
    },
    {
      "epoch": 0.3707482993197279,
      "grad_norm": 8.740650008889842,
      "learning_rate": 1.9030484946974878e-06,
      "loss": 1.9414,
      "step": 109
    },
    {
      "epoch": 0.3741496598639456,
      "grad_norm": 23.13581690576701,
      "learning_rate": 1.9006202295022629e-06,
      "loss": 2.4563,
      "step": 110
    },
    {
      "epoch": 0.37755102040816324,
      "grad_norm": 10.00026809536462,
      "learning_rate": 1.8981635186031869e-06,
      "loss": 1.8384,
      "step": 111
    },
    {
      "epoch": 0.37755102040816324,
      "eval_loss": 2.2185332775115967,
      "eval_runtime": 3.7603,
      "eval_samples_per_second": 14.626,
      "eval_steps_per_second": 1.064,
      "step": 111
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 26.376801704138895,
      "learning_rate": 1.89567843959442e-06,
      "loss": 3.095,
      "step": 112
    },
    {
      "epoch": 0.3843537414965986,
      "grad_norm": 31.801160647661863,
      "learning_rate": 1.8931650709661176e-06,
      "loss": 2.4186,
      "step": 113
    },
    {
      "epoch": 0.3877551020408163,
      "grad_norm": 3.7202396333724406,
      "learning_rate": 1.8906234921019504e-06,
      "loss": 1.8483,
      "step": 114
    },
    {
      "epoch": 0.391156462585034,
      "grad_norm": 20.22060079238643,
      "learning_rate": 1.8880537832765975e-06,
      "loss": 2.1247,
      "step": 115
    },
    {
      "epoch": 0.3945578231292517,
      "grad_norm": 29.233218070907714,
      "learning_rate": 1.8854560256532098e-06,
      "loss": 2.3962,
      "step": 116
    },
    {
      "epoch": 0.3979591836734694,
      "grad_norm": 12.311196195760077,
      "learning_rate": 1.882830301280849e-06,
      "loss": 1.9291,
      "step": 117
    },
    {
      "epoch": 0.4013605442176871,
      "grad_norm": 24.022251844658836,
      "learning_rate": 1.880176693091893e-06,
      "loss": 2.0967,
      "step": 118
    },
    {
      "epoch": 0.40476190476190477,
      "grad_norm": 15.5145598820515,
      "learning_rate": 1.8774952848994193e-06,
      "loss": 2.0164,
      "step": 119
    },
    {
      "epoch": 0.40816326530612246,
      "grad_norm": 18.669552144287866,
      "learning_rate": 1.874786161394556e-06,
      "loss": 1.9074,
      "step": 120
    },
    {
      "epoch": 0.41156462585034015,
      "grad_norm": 20.221669243742017,
      "learning_rate": 1.8720494081438077e-06,
      "loss": 2.0693,
      "step": 121
    },
    {
      "epoch": 0.41496598639455784,
      "grad_norm": 40.16853982486705,
      "learning_rate": 1.8692851115863521e-06,
      "loss": 2.7133,
      "step": 122
    },
    {
      "epoch": 0.41836734693877553,
      "grad_norm": 28.130765299643805,
      "learning_rate": 1.8664933590313116e-06,
      "loss": 2.3678,
      "step": 123
    },
    {
      "epoch": 0.4217687074829932,
      "grad_norm": 3.285521259165442,
      "learning_rate": 1.8636742386549936e-06,
      "loss": 1.643,
      "step": 124
    },
    {
      "epoch": 0.42517006802721086,
      "grad_norm": 14.918765530830019,
      "learning_rate": 1.8608278394981065e-06,
      "loss": 2.2832,
      "step": 125
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 3.221047286582191,
      "learning_rate": 1.8579542514629471e-06,
      "loss": 1.7598,
      "step": 126
    },
    {
      "epoch": 0.43197278911564624,
      "grad_norm": 30.02563146393063,
      "learning_rate": 1.8550535653105621e-06,
      "loss": 2.2684,
      "step": 127
    },
    {
      "epoch": 0.43537414965986393,
      "grad_norm": 14.894051195947721,
      "learning_rate": 1.8521258726578802e-06,
      "loss": 2.2898,
      "step": 128
    },
    {
      "epoch": 0.4387755102040816,
      "grad_norm": 31.346174242632404,
      "learning_rate": 1.849171265974818e-06,
      "loss": 2.4443,
      "step": 129
    },
    {
      "epoch": 0.4421768707482993,
      "grad_norm": 18.396976082720574,
      "learning_rate": 1.846189838581362e-06,
      "loss": 2.4081,
      "step": 130
    },
    {
      "epoch": 0.445578231292517,
      "grad_norm": 11.300098238275778,
      "learning_rate": 1.843181684644617e-06,
      "loss": 1.9707,
      "step": 131
    },
    {
      "epoch": 0.4489795918367347,
      "grad_norm": 9.311622064720812,
      "learning_rate": 1.8401468991758364e-06,
      "loss": 2.0055,
      "step": 132
    },
    {
      "epoch": 0.4523809523809524,
      "grad_norm": 17.268118260619143,
      "learning_rate": 1.837085578027418e-06,
      "loss": 2.1029,
      "step": 133
    },
    {
      "epoch": 0.4557823129251701,
      "grad_norm": 13.534018757700077,
      "learning_rate": 1.833997817889878e-06,
      "loss": 1.6714,
      "step": 134
    },
    {
      "epoch": 0.45918367346938777,
      "grad_norm": 25.67291091851184,
      "learning_rate": 1.8308837162887962e-06,
      "loss": 2.0809,
      "step": 135
    },
    {
      "epoch": 0.46258503401360546,
      "grad_norm": 16.78554391811326,
      "learning_rate": 1.827743371581737e-06,
      "loss": 2.095,
      "step": 136
    },
    {
      "epoch": 0.46598639455782315,
      "grad_norm": 7.0895304724541175,
      "learning_rate": 1.8245768829551415e-06,
      "loss": 2.0924,
      "step": 137
    },
    {
      "epoch": 0.46938775510204084,
      "grad_norm": 28.325113542255774,
      "learning_rate": 1.8213843504211956e-06,
      "loss": 2.2312,
      "step": 138
    },
    {
      "epoch": 0.47278911564625853,
      "grad_norm": 19.627621449351967,
      "learning_rate": 1.8181658748146709e-06,
      "loss": 2.1092,
      "step": 139
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 3.253642214201976,
      "learning_rate": 1.8149215577897394e-06,
      "loss": 1.8119,
      "step": 140
    },
    {
      "epoch": 0.47959183673469385,
      "grad_norm": 22.194249754011054,
      "learning_rate": 1.8116515018167635e-06,
      "loss": 1.8086,
      "step": 141
    },
    {
      "epoch": 0.48299319727891155,
      "grad_norm": 3.291628206622755,
      "learning_rate": 1.8083558101790595e-06,
      "loss": 1.6961,
      "step": 142
    },
    {
      "epoch": 0.48639455782312924,
      "grad_norm": 30.333797331495706,
      "learning_rate": 1.8050345869696346e-06,
      "loss": 2.4649,
      "step": 143
    },
    {
      "epoch": 0.4897959183673469,
      "grad_norm": 35.46381155966904,
      "learning_rate": 1.8016879370879004e-06,
      "loss": 2.375,
      "step": 144
    },
    {
      "epoch": 0.4931972789115646,
      "grad_norm": 10.065027530577671,
      "learning_rate": 1.798315966236358e-06,
      "loss": 1.7088,
      "step": 145
    },
    {
      "epoch": 0.4965986394557823,
      "grad_norm": 31.969238069641904,
      "learning_rate": 1.794918780917262e-06,
      "loss": 2.2722,
      "step": 146
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.1706943713916287,
      "learning_rate": 1.791496488429254e-06,
      "loss": 1.5129,
      "step": 147
    },
    {
      "epoch": 0.5034013605442177,
      "grad_norm": 40.129409477941664,
      "learning_rate": 1.7880491968639751e-06,
      "loss": 2.8429,
      "step": 148
    },
    {
      "epoch": 0.5034013605442177,
      "eval_loss": 2.2053215503692627,
      "eval_runtime": 3.8702,
      "eval_samples_per_second": 14.211,
      "eval_steps_per_second": 1.034,
      "step": 148
    },
    {
      "epoch": 0.5068027210884354,
      "grad_norm": 26.985890370710862,
      "learning_rate": 1.7845770151026513e-06,
      "loss": 2.3221,
      "step": 149
    },
    {
      "epoch": 0.5102040816326531,
      "grad_norm": 34.746114296368646,
      "learning_rate": 1.7810800528126553e-06,
      "loss": 2.3499,
      "step": 150
    },
    {
      "epoch": 0.5136054421768708,
      "grad_norm": 3.902076154967714,
      "learning_rate": 1.7775584204440416e-06,
      "loss": 1.7411,
      "step": 151
    },
    {
      "epoch": 0.5170068027210885,
      "grad_norm": 27.80193827038684,
      "learning_rate": 1.7740122292260594e-06,
      "loss": 2.2895,
      "step": 152
    },
    {
      "epoch": 0.5204081632653061,
      "grad_norm": 3.4114906810600685,
      "learning_rate": 1.7704415911636375e-06,
      "loss": 1.5119,
      "step": 153
    },
    {
      "epoch": 0.5238095238095238,
      "grad_norm": 9.505522369554297,
      "learning_rate": 1.7668466190338483e-06,
      "loss": 1.844,
      "step": 154
    },
    {
      "epoch": 0.5272108843537415,
      "grad_norm": 36.46998151934392,
      "learning_rate": 1.7632274263823457e-06,
      "loss": 2.4713,
      "step": 155
    },
    {
      "epoch": 0.5306122448979592,
      "grad_norm": 17.765108257489125,
      "learning_rate": 1.759584127519778e-06,
      "loss": 2.2811,
      "step": 156
    },
    {
      "epoch": 0.5340136054421769,
      "grad_norm": 14.148223114236801,
      "learning_rate": 1.7559168375181775e-06,
      "loss": 1.8442,
      "step": 157
    },
    {
      "epoch": 0.5374149659863946,
      "grad_norm": 9.76402372234183,
      "learning_rate": 1.7522256722073273e-06,
      "loss": 1.8945,
      "step": 158
    },
    {
      "epoch": 0.5408163265306123,
      "grad_norm": 16.450896799860217,
      "learning_rate": 1.748510748171101e-06,
      "loss": 1.9574,
      "step": 159
    },
    {
      "epoch": 0.54421768707483,
      "grad_norm": 3.912613042056259,
      "learning_rate": 1.7447721827437819e-06,
      "loss": 1.6032,
      "step": 160
    },
    {
      "epoch": 0.5476190476190477,
      "grad_norm": 33.305605159021646,
      "learning_rate": 1.7410100940063558e-06,
      "loss": 2.4057,
      "step": 161
    },
    {
      "epoch": 0.5510204081632653,
      "grad_norm": 38.319973023280475,
      "learning_rate": 1.7372246007827833e-06,
      "loss": 2.5925,
      "step": 162
    },
    {
      "epoch": 0.5544217687074829,
      "grad_norm": 17.216523524482163,
      "learning_rate": 1.7334158226362446e-06,
      "loss": 2.0324,
      "step": 163
    },
    {
      "epoch": 0.5578231292517006,
      "grad_norm": 4.9862323362748535,
      "learning_rate": 1.7295838798653649e-06,
      "loss": 1.7436,
      "step": 164
    },
    {
      "epoch": 0.5612244897959183,
      "grad_norm": 4.0759355613648625,
      "learning_rate": 1.7257288935004132e-06,
      "loss": 1.7034,
      "step": 165
    },
    {
      "epoch": 0.564625850340136,
      "grad_norm": 16.519960341878562,
      "learning_rate": 1.7218509852994822e-06,
      "loss": 2.115,
      "step": 166
    },
    {
      "epoch": 0.5680272108843537,
      "grad_norm": 17.37824200525593,
      "learning_rate": 1.7179502777446392e-06,
      "loss": 2.0609,
      "step": 167
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 39.604264809847564,
      "learning_rate": 1.7140268940380605e-06,
      "loss": 2.3861,
      "step": 168
    },
    {
      "epoch": 0.5748299319727891,
      "grad_norm": 17.489048911326037,
      "learning_rate": 1.7100809580981384e-06,
      "loss": 1.9979,
      "step": 169
    },
    {
      "epoch": 0.5782312925170068,
      "grad_norm": 6.642641185839537,
      "learning_rate": 1.7061125945555679e-06,
      "loss": 1.7533,
      "step": 170
    },
    {
      "epoch": 0.5816326530612245,
      "grad_norm": 41.437166409250736,
      "learning_rate": 1.70212192874941e-06,
      "loss": 2.8676,
      "step": 171
    },
    {
      "epoch": 0.5850340136054422,
      "grad_norm": 12.285090452877482,
      "learning_rate": 1.6981090867231336e-06,
      "loss": 1.8715,
      "step": 172
    },
    {
      "epoch": 0.5884353741496599,
      "grad_norm": 20.351266920257437,
      "learning_rate": 1.694074195220634e-06,
      "loss": 2.5238,
      "step": 173
    },
    {
      "epoch": 0.5918367346938775,
      "grad_norm": 13.128678816386138,
      "learning_rate": 1.6900173816822289e-06,
      "loss": 1.7191,
      "step": 174
    },
    {
      "epoch": 0.5952380952380952,
      "grad_norm": 3.1331026154409565,
      "learning_rate": 1.6859387742406358e-06,
      "loss": 1.7885,
      "step": 175
    },
    {
      "epoch": 0.5986394557823129,
      "grad_norm": 12.273944679120639,
      "learning_rate": 1.6818385017169212e-06,
      "loss": 1.9361,
      "step": 176
    },
    {
      "epoch": 0.6020408163265306,
      "grad_norm": 18.988287394873876,
      "learning_rate": 1.6777166936164354e-06,
      "loss": 2.118,
      "step": 177
    },
    {
      "epoch": 0.6054421768707483,
      "grad_norm": 13.330413347581118,
      "learning_rate": 1.6735734801247202e-06,
      "loss": 1.9923,
      "step": 178
    },
    {
      "epoch": 0.608843537414966,
      "grad_norm": 8.528660885149025,
      "learning_rate": 1.6694089921033976e-06,
      "loss": 1.6938,
      "step": 179
    },
    {
      "epoch": 0.6122448979591837,
      "grad_norm": 28.049589150374253,
      "learning_rate": 1.6652233610860364e-06,
      "loss": 2.4092,
      "step": 180
    },
    {
      "epoch": 0.6156462585034014,
      "grad_norm": 19.077236893577115,
      "learning_rate": 1.6610167192739978e-06,
      "loss": 2.3235,
      "step": 181
    },
    {
      "epoch": 0.6190476190476191,
      "grad_norm": 23.109888095114325,
      "learning_rate": 1.6567891995322603e-06,
      "loss": 2.2678,
      "step": 182
    },
    {
      "epoch": 0.6224489795918368,
      "grad_norm": 19.456776496200867,
      "learning_rate": 1.6525409353852221e-06,
      "loss": 2.2764,
      "step": 183
    },
    {
      "epoch": 0.6258503401360545,
      "grad_norm": 9.82404206796416,
      "learning_rate": 1.6482720610124856e-06,
      "loss": 1.8034,
      "step": 184
    },
    {
      "epoch": 0.6292517006802721,
      "grad_norm": 24.2061776724548,
      "learning_rate": 1.6439827112446173e-06,
      "loss": 2.161,
      "step": 185
    },
    {
      "epoch": 0.6292517006802721,
      "eval_loss": 2.194326400756836,
      "eval_runtime": 3.7428,
      "eval_samples_per_second": 14.695,
      "eval_steps_per_second": 1.069,
      "step": 185
    },
    {
      "epoch": 0.6326530612244898,
      "grad_norm": 30.469163171671003,
      "learning_rate": 1.6396730215588912e-06,
      "loss": 2.2773,
      "step": 186
    },
    {
      "epoch": 0.6360544217687075,
      "grad_norm": 3.646917584621385,
      "learning_rate": 1.6353431280750082e-06,
      "loss": 1.5989,
      "step": 187
    },
    {
      "epoch": 0.6394557823129252,
      "grad_norm": 30.30266588230692,
      "learning_rate": 1.6309931675507978e-06,
      "loss": 2.6169,
      "step": 188
    },
    {
      "epoch": 0.6428571428571429,
      "grad_norm": 14.371186117614542,
      "learning_rate": 1.6266232773778983e-06,
      "loss": 1.9241,
      "step": 189
    },
    {
      "epoch": 0.6462585034013606,
      "grad_norm": 18.71258411403636,
      "learning_rate": 1.6222335955774176e-06,
      "loss": 2.1737,
      "step": 190
    },
    {
      "epoch": 0.6496598639455783,
      "grad_norm": 3.2723339662931585,
      "learning_rate": 1.617824260795573e-06,
      "loss": 1.8075,
      "step": 191
    },
    {
      "epoch": 0.6530612244897959,
      "grad_norm": 16.496061968286824,
      "learning_rate": 1.6133954122993139e-06,
      "loss": 2.0147,
      "step": 192
    },
    {
      "epoch": 0.6564625850340136,
      "grad_norm": 3.2013079969624805,
      "learning_rate": 1.608947189971921e-06,
      "loss": 1.6798,
      "step": 193
    },
    {
      "epoch": 0.6598639455782312,
      "grad_norm": 20.981814890242124,
      "learning_rate": 1.6044797343085898e-06,
      "loss": 2.0425,
      "step": 194
    },
    {
      "epoch": 0.6632653061224489,
      "grad_norm": 50.879018823375965,
      "learning_rate": 1.599993186411992e-06,
      "loss": 3.8504,
      "step": 195
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 3.283241794235971,
      "learning_rate": 1.59548768798782e-06,
      "loss": 1.4971,
      "step": 196
    },
    {
      "epoch": 0.6700680272108843,
      "grad_norm": 12.706772022061763,
      "learning_rate": 1.5909633813403092e-06,
      "loss": 1.9318,
      "step": 197
    },
    {
      "epoch": 0.673469387755102,
      "grad_norm": 7.747043673117189,
      "learning_rate": 1.5864204093677463e-06,
      "loss": 1.8641,
      "step": 198
    },
    {
      "epoch": 0.6768707482993197,
      "grad_norm": 12.685665761738797,
      "learning_rate": 1.5818589155579529e-06,
      "loss": 2.0781,
      "step": 199
    },
    {
      "epoch": 0.6802721088435374,
      "grad_norm": 8.183695796856302,
      "learning_rate": 1.5772790439837555e-06,
      "loss": 2.1112,
      "step": 200
    },
    {
      "epoch": 0.6836734693877551,
      "grad_norm": 3.6436475976280605,
      "learning_rate": 1.572680939298435e-06,
      "loss": 1.504,
      "step": 201
    },
    {
      "epoch": 0.6870748299319728,
      "grad_norm": 7.765753459491514,
      "learning_rate": 1.5680647467311555e-06,
      "loss": 1.6113,
      "step": 202
    },
    {
      "epoch": 0.6904761904761905,
      "grad_norm": 27.059590789587673,
      "learning_rate": 1.563430612082382e-06,
      "loss": 2.3797,
      "step": 203
    },
    {
      "epoch": 0.6938775510204082,
      "grad_norm": 17.865181616406808,
      "learning_rate": 1.5587786817192687e-06,
      "loss": 2.2287,
      "step": 204
    },
    {
      "epoch": 0.6972789115646258,
      "grad_norm": 11.50437842198177,
      "learning_rate": 1.5541091025710434e-06,
      "loss": 2.2926,
      "step": 205
    },
    {
      "epoch": 0.7006802721088435,
      "grad_norm": 18.03962056520961,
      "learning_rate": 1.5494220221243607e-06,
      "loss": 2.3374,
      "step": 206
    },
    {
      "epoch": 0.7040816326530612,
      "grad_norm": 19.808732477248256,
      "learning_rate": 1.5447175884186478e-06,
      "loss": 2.3215,
      "step": 207
    },
    {
      "epoch": 0.7074829931972789,
      "grad_norm": 21.35228597761302,
      "learning_rate": 1.539995950041426e-06,
      "loss": 2.2378,
      "step": 208
    },
    {
      "epoch": 0.7108843537414966,
      "grad_norm": 14.090932946927257,
      "learning_rate": 1.5352572561236197e-06,
      "loss": 2.22,
      "step": 209
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 22.22875395969964,
      "learning_rate": 1.5305016563348443e-06,
      "loss": 2.44,
      "step": 210
    },
    {
      "epoch": 0.717687074829932,
      "grad_norm": 12.732771656478363,
      "learning_rate": 1.5257293008786807e-06,
      "loss": 2.0598,
      "step": 211
    },
    {
      "epoch": 0.7210884353741497,
      "grad_norm": 3.3024595151809777,
      "learning_rate": 1.5209403404879303e-06,
      "loss": 1.8514,
      "step": 212
    },
    {
      "epoch": 0.7244897959183674,
      "grad_norm": 31.041628605811148,
      "learning_rate": 1.5161349264198535e-06,
      "loss": 2.4225,
      "step": 213
    },
    {
      "epoch": 0.7278911564625851,
      "grad_norm": 11.866017531018645,
      "learning_rate": 1.511313210451394e-06,
      "loss": 1.9747,
      "step": 214
    },
    {
      "epoch": 0.7312925170068028,
      "grad_norm": 23.77867996796224,
      "learning_rate": 1.5064753448743832e-06,
      "loss": 2.0971,
      "step": 215
    },
    {
      "epoch": 0.7346938775510204,
      "grad_norm": 28.640512428374876,
      "learning_rate": 1.5016214824907314e-06,
      "loss": 2.2247,
      "step": 216
    },
    {
      "epoch": 0.7380952380952381,
      "grad_norm": 9.463317499162777,
      "learning_rate": 1.4967517766076015e-06,
      "loss": 1.9511,
      "step": 217
    },
    {
      "epoch": 0.7414965986394558,
      "grad_norm": 3.6132074342008336,
      "learning_rate": 1.4918663810325659e-06,
      "loss": 1.5643,
      "step": 218
    },
    {
      "epoch": 0.7448979591836735,
      "grad_norm": 12.274299577611806,
      "learning_rate": 1.4869654500687492e-06,
      "loss": 2.0865,
      "step": 219
    },
    {
      "epoch": 0.7482993197278912,
      "grad_norm": 9.577269499797044,
      "learning_rate": 1.4820491385099555e-06,
      "loss": 2.1494,
      "step": 220
    },
    {
      "epoch": 0.7517006802721088,
      "grad_norm": 13.665325186622818,
      "learning_rate": 1.477117601635777e-06,
      "loss": 2.0676,
      "step": 221
    },
    {
      "epoch": 0.7551020408163265,
      "grad_norm": 12.044556166373619,
      "learning_rate": 1.4721709952066923e-06,
      "loss": 1.7408,
      "step": 222
    },
    {
      "epoch": 0.7551020408163265,
      "eval_loss": 2.1867611408233643,
      "eval_runtime": 3.7388,
      "eval_samples_per_second": 14.711,
      "eval_steps_per_second": 1.07,
      "step": 222
    },
    {
      "epoch": 0.7585034013605442,
      "grad_norm": 32.0866216128451,
      "learning_rate": 1.4672094754591449e-06,
      "loss": 2.6444,
      "step": 223
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 26.272890838528287,
      "learning_rate": 1.4622331991006082e-06,
      "loss": 2.0286,
      "step": 224
    },
    {
      "epoch": 0.7653061224489796,
      "grad_norm": 10.948966043777636,
      "learning_rate": 1.4572423233046385e-06,
      "loss": 1.8924,
      "step": 225
    },
    {
      "epoch": 0.7687074829931972,
      "grad_norm": 10.041220633719293,
      "learning_rate": 1.4522370057059079e-06,
      "loss": 1.8589,
      "step": 226
    },
    {
      "epoch": 0.7721088435374149,
      "grad_norm": 19.90849856575333,
      "learning_rate": 1.447217404395227e-06,
      "loss": 2.4632,
      "step": 227
    },
    {
      "epoch": 0.7755102040816326,
      "grad_norm": 3.3718807752757134,
      "learning_rate": 1.4421836779145511e-06,
      "loss": 1.7402,
      "step": 228
    },
    {
      "epoch": 0.7789115646258503,
      "grad_norm": 33.99543346002537,
      "learning_rate": 1.4371359852519734e-06,
      "loss": 2.9081,
      "step": 229
    },
    {
      "epoch": 0.782312925170068,
      "grad_norm": 12.446391408704297,
      "learning_rate": 1.4320744858367024e-06,
      "loss": 2.0828,
      "step": 230
    },
    {
      "epoch": 0.7857142857142857,
      "grad_norm": 26.19952152880794,
      "learning_rate": 1.4269993395340277e-06,
      "loss": 2.2178,
      "step": 231
    },
    {
      "epoch": 0.7891156462585034,
      "grad_norm": 36.07799078718175,
      "learning_rate": 1.4219107066402692e-06,
      "loss": 2.6926,
      "step": 232
    },
    {
      "epoch": 0.7925170068027211,
      "grad_norm": 11.216785179837261,
      "learning_rate": 1.4168087478777152e-06,
      "loss": 2.0393,
      "step": 233
    },
    {
      "epoch": 0.7959183673469388,
      "grad_norm": 17.659830496744974,
      "learning_rate": 1.4116936243895466e-06,
      "loss": 2.1082,
      "step": 234
    },
    {
      "epoch": 0.7993197278911565,
      "grad_norm": 17.001892765923902,
      "learning_rate": 1.406565497734745e-06,
      "loss": 1.9051,
      "step": 235
    },
    {
      "epoch": 0.8027210884353742,
      "grad_norm": 31.896056687773818,
      "learning_rate": 1.4014245298829935e-06,
      "loss": 2.702,
      "step": 236
    },
    {
      "epoch": 0.8061224489795918,
      "grad_norm": 6.972810630357569,
      "learning_rate": 1.3962708832095568e-06,
      "loss": 1.9466,
      "step": 237
    },
    {
      "epoch": 0.8095238095238095,
      "grad_norm": 17.689383441039308,
      "learning_rate": 1.3911047204901558e-06,
      "loss": 2.3425,
      "step": 238
    },
    {
      "epoch": 0.8129251700680272,
      "grad_norm": 16.46834046227904,
      "learning_rate": 1.385926204895826e-06,
      "loss": 2.1545,
      "step": 239
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 21.69161139742313,
      "learning_rate": 1.3807354999877614e-06,
      "loss": 2.3222,
      "step": 240
    },
    {
      "epoch": 0.8197278911564626,
      "grad_norm": 3.411794366451801,
      "learning_rate": 1.3755327697121522e-06,
      "loss": 1.6492,
      "step": 241
    },
    {
      "epoch": 0.8231292517006803,
      "grad_norm": 13.113564486849809,
      "learning_rate": 1.3703181783950031e-06,
      "loss": 2.0212,
      "step": 242
    },
    {
      "epoch": 0.826530612244898,
      "grad_norm": 14.798483657902382,
      "learning_rate": 1.3650918907369452e-06,
      "loss": 2.1974,
      "step": 243
    },
    {
      "epoch": 0.8299319727891157,
      "grad_norm": 10.19780084250851,
      "learning_rate": 1.3598540718080345e-06,
      "loss": 1.8543,
      "step": 244
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 30.023251305313995,
      "learning_rate": 1.3546048870425354e-06,
      "loss": 2.2387,
      "step": 245
    },
    {
      "epoch": 0.8367346938775511,
      "grad_norm": 22.321684071392564,
      "learning_rate": 1.3493445022336994e-06,
      "loss": 2.4305,
      "step": 246
    },
    {
      "epoch": 0.8401360544217688,
      "grad_norm": 34.98925650288134,
      "learning_rate": 1.3440730835285247e-06,
      "loss": 2.4364,
      "step": 247
    },
    {
      "epoch": 0.8435374149659864,
      "grad_norm": 3.161092974878791,
      "learning_rate": 1.3387907974225116e-06,
      "loss": 1.4885,
      "step": 248
    },
    {
      "epoch": 0.8469387755102041,
      "grad_norm": 50.11899935337027,
      "learning_rate": 1.3334978107544024e-06,
      "loss": 2.3332,
      "step": 249
    },
    {
      "epoch": 0.8503401360544217,
      "grad_norm": 15.05206270554561,
      "learning_rate": 1.3281942907009112e-06,
      "loss": 2.2131,
      "step": 250
    },
    {
      "epoch": 0.8537414965986394,
      "grad_norm": 24.869549840961,
      "learning_rate": 1.3228804047714462e-06,
      "loss": 2.2264,
      "step": 251
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 16.049594008906414,
      "learning_rate": 1.317556320802816e-06,
      "loss": 1.7228,
      "step": 252
    },
    {
      "epoch": 0.8605442176870748,
      "grad_norm": 14.258214783846427,
      "learning_rate": 1.31222220695393e-06,
      "loss": 1.999,
      "step": 253
    },
    {
      "epoch": 0.8639455782312925,
      "grad_norm": 3.3063413494205474,
      "learning_rate": 1.3068782317004874e-06,
      "loss": 1.4607,
      "step": 254
    },
    {
      "epoch": 0.8673469387755102,
      "grad_norm": 8.831787955552995,
      "learning_rate": 1.3015245638296563e-06,
      "loss": 2.1192,
      "step": 255
    },
    {
      "epoch": 0.8707482993197279,
      "grad_norm": 3.121872417027736,
      "learning_rate": 1.296161372434741e-06,
      "loss": 1.5467,
      "step": 256
    },
    {
      "epoch": 0.8741496598639455,
      "grad_norm": 33.22351218100941,
      "learning_rate": 1.2907888269098416e-06,
      "loss": 2.3588,
      "step": 257
    },
    {
      "epoch": 0.8775510204081632,
      "grad_norm": 3.188560179185641,
      "learning_rate": 1.2854070969445064e-06,
      "loss": 1.5405,
      "step": 258
    },
    {
      "epoch": 0.8809523809523809,
      "grad_norm": 21.318069352021737,
      "learning_rate": 1.2800163525183688e-06,
      "loss": 2.2063,
      "step": 259
    },
    {
      "epoch": 0.8809523809523809,
      "eval_loss": 2.1820290088653564,
      "eval_runtime": 3.8534,
      "eval_samples_per_second": 14.273,
      "eval_steps_per_second": 1.038,
      "step": 259
    },
    {
      "epoch": 0.8843537414965986,
      "grad_norm": 8.243323927611506,
      "learning_rate": 1.2746167638957805e-06,
      "loss": 1.8474,
      "step": 260
    },
    {
      "epoch": 0.8877551020408163,
      "grad_norm": 28.909948439715215,
      "learning_rate": 1.2692085016204333e-06,
      "loss": 2.2626,
      "step": 261
    },
    {
      "epoch": 0.891156462585034,
      "grad_norm": 3.0722449835450116,
      "learning_rate": 1.2637917365099725e-06,
      "loss": 1.6435,
      "step": 262
    },
    {
      "epoch": 0.8945578231292517,
      "grad_norm": 29.871491992872432,
      "learning_rate": 1.2583666396506023e-06,
      "loss": 2.1498,
      "step": 263
    },
    {
      "epoch": 0.8979591836734694,
      "grad_norm": 2.977539901133042,
      "learning_rate": 1.2529333823916806e-06,
      "loss": 1.7024,
      "step": 264
    },
    {
      "epoch": 0.9013605442176871,
      "grad_norm": 16.47476152363902,
      "learning_rate": 1.2474921363403094e-06,
      "loss": 2.532,
      "step": 265
    },
    {
      "epoch": 0.9047619047619048,
      "grad_norm": 13.022051400004793,
      "learning_rate": 1.2420430733559124e-06,
      "loss": 1.8884,
      "step": 266
    },
    {
      "epoch": 0.9081632653061225,
      "grad_norm": 8.97804602434911,
      "learning_rate": 1.2365863655448075e-06,
      "loss": 1.7885,
      "step": 267
    },
    {
      "epoch": 0.9115646258503401,
      "grad_norm": 16.047174726202446,
      "learning_rate": 1.2311221852547721e-06,
      "loss": 2.3363,
      "step": 268
    },
    {
      "epoch": 0.9149659863945578,
      "grad_norm": 3.5763323384852765,
      "learning_rate": 1.2256507050695977e-06,
      "loss": 1.701,
      "step": 269
    },
    {
      "epoch": 0.9183673469387755,
      "grad_norm": 26.929796973835796,
      "learning_rate": 1.220172097803641e-06,
      "loss": 2.3601,
      "step": 270
    },
    {
      "epoch": 0.9217687074829932,
      "grad_norm": 22.50281840057178,
      "learning_rate": 1.2146865364963633e-06,
      "loss": 2.0693,
      "step": 271
    },
    {
      "epoch": 0.9251700680272109,
      "grad_norm": 11.62602578923058,
      "learning_rate": 1.2091941944068665e-06,
      "loss": 1.9123,
      "step": 272
    },
    {
      "epoch": 0.9285714285714286,
      "grad_norm": 16.841220035990798,
      "learning_rate": 1.2036952450084214e-06,
      "loss": 2.2163,
      "step": 273
    },
    {
      "epoch": 0.9319727891156463,
      "grad_norm": 18.055133543008612,
      "learning_rate": 1.1981898619829879e-06,
      "loss": 2.2485,
      "step": 274
    },
    {
      "epoch": 0.935374149659864,
      "grad_norm": 26.45820099458286,
      "learning_rate": 1.1926782192157273e-06,
      "loss": 2.1845,
      "step": 275
    },
    {
      "epoch": 0.9387755102040817,
      "grad_norm": 3.334955291200548,
      "learning_rate": 1.1871604907895148e-06,
      "loss": 1.7059,
      "step": 276
    },
    {
      "epoch": 0.9421768707482994,
      "grad_norm": 19.511242339983163,
      "learning_rate": 1.1816368509794364e-06,
      "loss": 2.3601,
      "step": 277
    },
    {
      "epoch": 0.9455782312925171,
      "grad_norm": 21.146925953072365,
      "learning_rate": 1.1761074742472882e-06,
      "loss": 1.9957,
      "step": 278
    },
    {
      "epoch": 0.9489795918367347,
      "grad_norm": 3.5535024021194452,
      "learning_rate": 1.1705725352360633e-06,
      "loss": 1.9249,
      "step": 279
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 13.348912305071467,
      "learning_rate": 1.165032208764438e-06,
      "loss": 2.0641,
      "step": 280
    },
    {
      "epoch": 0.95578231292517,
      "grad_norm": 12.61033318044152,
      "learning_rate": 1.1594866698212483e-06,
      "loss": 2.169,
      "step": 281
    },
    {
      "epoch": 0.9591836734693877,
      "grad_norm": 28.256325358544956,
      "learning_rate": 1.1539360935599644e-06,
      "loss": 2.0952,
      "step": 282
    },
    {
      "epoch": 0.9625850340136054,
      "grad_norm": 12.61302060729169,
      "learning_rate": 1.1483806552931582e-06,
      "loss": 1.9411,
      "step": 283
    },
    {
      "epoch": 0.9659863945578231,
      "grad_norm": 8.711391665501074,
      "learning_rate": 1.142820530486966e-06,
      "loss": 1.7633,
      "step": 284
    },
    {
      "epoch": 0.9693877551020408,
      "grad_norm": 35.95958496013491,
      "learning_rate": 1.1372558947555455e-06,
      "loss": 2.1904,
      "step": 285
    },
    {
      "epoch": 0.9727891156462585,
      "grad_norm": 3.429092657849847,
      "learning_rate": 1.131686923855531e-06,
      "loss": 1.8276,
      "step": 286
    },
    {
      "epoch": 0.9761904761904762,
      "grad_norm": 12.871658288368948,
      "learning_rate": 1.1261137936804811e-06,
      "loss": 2.0911,
      "step": 287
    },
    {
      "epoch": 0.9795918367346939,
      "grad_norm": 13.217001333800638,
      "learning_rate": 1.1205366802553228e-06,
      "loss": 1.9614,
      "step": 288
    },
    {
      "epoch": 0.9829931972789115,
      "grad_norm": 24.712172909538513,
      "learning_rate": 1.1149557597307934e-06,
      "loss": 2.0412,
      "step": 289
    },
    {
      "epoch": 0.9863945578231292,
      "grad_norm": 10.412944718560512,
      "learning_rate": 1.1093712083778746e-06,
      "loss": 1.7787,
      "step": 290
    },
    {
      "epoch": 0.9897959183673469,
      "grad_norm": 15.631851389191027,
      "learning_rate": 1.1037832025822265e-06,
      "loss": 2.3362,
      "step": 291
    },
    {
      "epoch": 0.9931972789115646,
      "grad_norm": 12.135256117907334,
      "learning_rate": 1.098191918838617e-06,
      "loss": 2.0212,
      "step": 292
    },
    {
      "epoch": 0.9965986394557823,
      "grad_norm": 13.057522322919077,
      "learning_rate": 1.0925975337453462e-06,
      "loss": 2.2842,
      "step": 293
    },
    {
      "epoch": 1.0,
      "grad_norm": 17.565324685523922,
      "learning_rate": 1.0870002239986686e-06,
      "loss": 2.5002,
      "step": 294
    }
  ],
  "logging_steps": 1,
  "max_steps": 588,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 294,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 95887829237760.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}