{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 939,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0010649627263045794, "grad_norm": 49.5, "learning_rate": 3e-05, "loss": 2.4246, "step": 1 },
    { "epoch": 0.002129925452609159, "grad_norm": 37.75, "learning_rate": 2.9999916048314652e-05, "loss": 2.2391, "step": 2 },
    { "epoch": 0.003194888178913738, "grad_norm": 26.75, "learning_rate": 2.9999664194198307e-05, "loss": 1.8044, "step": 3 },
    { "epoch": 0.004259850905218318, "grad_norm": 22.5, "learning_rate": 2.9999244440470125e-05, "loss": 1.5917, "step": 4 },
    { "epoch": 0.005324813631522897, "grad_norm": 15.5, "learning_rate": 2.999865679182864e-05, "loss": 1.74, "step": 5 },
    { "epoch": 0.006389776357827476, "grad_norm": 14.8125, "learning_rate": 2.999790125485172e-05, "loss": 1.6569, "step": 6 },
    { "epoch": 0.007454739084132056, "grad_norm": 20.375, "learning_rate": 2.9996977837996533e-05, "loss": 2.0075, "step": 7 },
    { "epoch": 0.008519701810436636, "grad_norm": 14.9375, "learning_rate": 2.9995886551599382e-05, "loss": 1.8082, "step": 8 },
    { "epoch": 0.009584664536741214, "grad_norm": 14.875, "learning_rate": 2.9994627407875647e-05, "loss": 1.4614, "step": 9 },
    { "epoch": 0.010649627263045794, "grad_norm": 19.125, "learning_rate": 2.999320042091963e-05, "loss": 1.5376, "step": 10 },
    { "epoch": 0.011714589989350373, "grad_norm": 20.25, "learning_rate": 2.999160560670439e-05, "loss": 1.6331, "step": 11 },
    { "epoch": 0.012779552715654952, "grad_norm": 23.5, "learning_rate": 2.9989842983081574e-05, "loss": 1.3088, "step": 12 },
    { "epoch": 0.013844515441959531, "grad_norm": 25.5, "learning_rate": 2.9987912569781212e-05, "loss": 1.368, "step": 13 },
    { "epoch": 0.014909478168264111, "grad_norm": 12.5625, "learning_rate": 2.99858143884115e-05, "loss": 1.424, "step": 14 },
    { "epoch": 0.01597444089456869, "grad_norm": 18.625, "learning_rate": 2.9983548462458546e-05, "loss": 1.9811, "step": 15 },
    { "epoch": 0.01703940362087327, "grad_norm": 18.375, "learning_rate": 2.9981114817286128e-05, "loss": 1.6946, "step": 16 },
    { "epoch": 0.01810436634717785, "grad_norm": 17.875, "learning_rate": 2.9978513480135398e-05, "loss": 1.5809, "step": 17 },
    { "epoch": 0.019169329073482427, "grad_norm": 11.3125, "learning_rate": 2.9975744480124565e-05, "loss": 1.607, "step": 18 },
    { "epoch": 0.02023429179978701, "grad_norm": 12.5625, "learning_rate": 2.99728078482486e-05, "loss": 1.4597, "step": 19 },
    { "epoch": 0.021299254526091587, "grad_norm": 11.6875, "learning_rate": 2.996970361737886e-05, "loss": 1.4767, "step": 20 },
    { "epoch": 0.022364217252396165, "grad_norm": 11.875, "learning_rate": 2.9966431822262732e-05, "loss": 1.6979, "step": 21 },
    { "epoch": 0.023429179978700747, "grad_norm": 12.375, "learning_rate": 2.9962992499523246e-05, "loss": 1.5985, "step": 22 },
    { "epoch": 0.024494142705005325, "grad_norm": 15.0625, "learning_rate": 2.9959385687658655e-05, "loss": 1.3997, "step": 23 },
    { "epoch": 0.025559105431309903, "grad_norm": 14.75, "learning_rate": 2.9955611427042026e-05, "loss": 1.5398, "step": 24 },
    { "epoch": 0.026624068157614485, "grad_norm": 20.125, "learning_rate": 2.9951669759920757e-05, "loss": 1.5154, "step": 25 },
    { "epoch": 0.027689030883919063, "grad_norm": 11.25, "learning_rate": 2.9947560730416133e-05, "loss": 1.7137, "step": 26 },
    { "epoch": 0.02875399361022364, "grad_norm": 17.5, "learning_rate": 2.9943284384522815e-05, "loss": 1.5518, "step": 27 },
    { "epoch": 0.029818956336528223, "grad_norm": 15.625, "learning_rate": 2.9938840770108324e-05, "loss": 0.6355, "step": 28 },
    { "epoch": 0.0308839190628328, "grad_norm": 11.0, "learning_rate": 2.9934229936912516e-05, "loss": 1.5532, "step": 29 },
    { "epoch": 0.03194888178913738, "grad_norm": 12.5, "learning_rate": 2.992945193654702e-05, "loss": 1.5355, "step": 30 },
    { "epoch": 0.03301384451544196, "grad_norm": 12.25, "learning_rate": 2.9924506822494668e-05, "loss": 1.5092, "step": 31 },
    { "epoch": 0.03407880724174654, "grad_norm": 15.9375, "learning_rate": 2.9919394650108877e-05, "loss": 1.7472, "step": 32 },
    { "epoch": 0.03514376996805112, "grad_norm": 12.375, "learning_rate": 2.9914115476613035e-05, "loss": 1.5176, "step": 33 },
    { "epoch": 0.0362087326943557, "grad_norm": 11.5, "learning_rate": 2.9908669361099895e-05, "loss": 1.4667, "step": 34 },
    { "epoch": 0.03727369542066028, "grad_norm": 14.4375, "learning_rate": 2.9903056364530856e-05, "loss": 1.6553, "step": 35 },
    { "epoch": 0.038338658146964855, "grad_norm": 12.875, "learning_rate": 2.989727654973532e-05, "loss": 1.7434, "step": 36 },
    { "epoch": 0.039403620873269436, "grad_norm": 13.75, "learning_rate": 2.9891329981409983e-05, "loss": 1.8034, "step": 37 },
    { "epoch": 0.04046858359957402, "grad_norm": 15.875, "learning_rate": 2.9885216726118107e-05, "loss": 1.1681, "step": 38 },
    { "epoch": 0.04153354632587859, "grad_norm": 13.8125, "learning_rate": 2.987893685228876e-05, "loss": 1.6805, "step": 39 },
    { "epoch": 0.042598509052183174, "grad_norm": 12.6875, "learning_rate": 2.987249043021608e-05, "loss": 1.6899, "step": 40 },
    { "epoch": 0.043663471778487756, "grad_norm": 10.1875, "learning_rate": 2.986587753205847e-05, "loss": 1.773, "step": 41 },
    { "epoch": 0.04472843450479233, "grad_norm": 18.125, "learning_rate": 2.985909823183778e-05, "loss": 1.7827, "step": 42 },
    { "epoch": 0.04579339723109691, "grad_norm": 11.3125, "learning_rate": 2.985215260543851e-05, "loss": 1.4104, "step": 43 },
    { "epoch": 0.046858359957401494, "grad_norm": 13.3125, "learning_rate": 2.9845040730606926e-05, "loss": 1.6319, "step": 44 },
    { "epoch": 0.04792332268370607, "grad_norm": 11.0, "learning_rate": 2.9837762686950216e-05, "loss": 1.597, "step": 45 },
    { "epoch": 0.04898828541001065, "grad_norm": 16.25, "learning_rate": 2.9830318555935578e-05, "loss": 1.4282, "step": 46 },
    { "epoch": 0.05005324813631523, "grad_norm": 11.375, "learning_rate": 2.982270842088933e-05, "loss": 1.5651, "step": 47 },
    { "epoch": 0.051118210862619806, "grad_norm": 10.8125, "learning_rate": 2.9814932366995963e-05, "loss": 1.8754, "step": 48 },
    { "epoch": 0.05218317358892439, "grad_norm": 11.625, "learning_rate": 2.980699048129718e-05, "loss": 1.8102, "step": 49 },
    { "epoch": 0.05324813631522897, "grad_norm": 10.8125, "learning_rate": 2.9798882852690942e-05, "loss": 1.7686, "step": 50 },
    { "epoch": 0.054313099041533544, "grad_norm": 12.3125, "learning_rate": 2.979060957193047e-05, "loss": 1.6104, "step": 51 },
    { "epoch": 0.055378061767838126, "grad_norm": 15.0, "learning_rate": 2.9782170731623196e-05, "loss": 1.6611, "step": 52 },
    { "epoch": 0.05644302449414271, "grad_norm": 13.6875, "learning_rate": 2.977356642622978e-05, "loss": 1.5407, "step": 53 },
    { "epoch": 0.05750798722044728, "grad_norm": 12.0, "learning_rate": 2.9764796752063013e-05, "loss": 1.4936, "step": 54 },
    { "epoch": 0.058572949946751864, "grad_norm": 11.875, "learning_rate": 2.9755861807286744e-05, "loss": 1.7986, "step": 55 },
    { "epoch": 0.059637912673056445, "grad_norm": 11.625, "learning_rate": 2.9746761691914805e-05, "loss": 1.2507, "step": 56 },
    { "epoch": 0.06070287539936102, "grad_norm": 12.8125, "learning_rate": 2.9737496507809862e-05, "loss": 1.7383, "step": 57 },
    { "epoch": 0.0617678381256656, "grad_norm": 14.0625, "learning_rate": 2.9728066358682293e-05, "loss": 1.8381, "step": 58 },
    { "epoch": 0.06283280085197018, "grad_norm": 11.125, "learning_rate": 2.9718471350089018e-05, "loss": 1.4074, "step": 59 },
    { "epoch": 0.06389776357827476, "grad_norm": 11.6875, "learning_rate": 2.970871158943232e-05, "loss": 1.4428, "step": 60 },
    { "epoch": 0.06496272630457935, "grad_norm": 12.8125, "learning_rate": 2.9698787185958652e-05, "loss": 1.6872, "step": 61 },
    { "epoch": 0.06602768903088392, "grad_norm": 10.3125, "learning_rate": 2.9688698250757396e-05, "loss": 1.5911, "step": 62 },
    { "epoch": 0.0670926517571885, "grad_norm": 10.875, "learning_rate": 2.9678444896759637e-05, "loss": 1.8574, "step": 63 },
    { "epoch": 0.06815761448349308, "grad_norm": 11.5625, "learning_rate": 2.9668027238736885e-05, "loss": 1.6105, "step": 64 },
    { "epoch": 0.06922257720979766, "grad_norm": 14.1875, "learning_rate": 2.9657445393299805e-05, "loss": 1.4482, "step": 65 },
    { "epoch": 0.07028753993610223, "grad_norm": 13.0625, "learning_rate": 2.964669947889689e-05, "loss": 1.6329, "step": 66 },
    { "epoch": 0.07135250266240682, "grad_norm": 11.0, "learning_rate": 2.963578961581316e-05, "loss": 1.6117, "step": 67 },
    { "epoch": 0.0724174653887114, "grad_norm": 11.0, "learning_rate": 2.962471592616881e-05, "loss": 1.6114, "step": 68 },
    { "epoch": 0.07348242811501597, "grad_norm": 11.4375, "learning_rate": 2.9613478533917813e-05, "loss": 1.352, "step": 69 },
    { "epoch": 0.07454739084132056, "grad_norm": 12.6875, "learning_rate": 2.9602077564846577e-05, "loss": 1.6438, "step": 70 },
    { "epoch": 0.07561235356762513, "grad_norm": 10.375, "learning_rate": 2.9590513146572512e-05, "loss": 1.7608, "step": 71 },
    { "epoch": 0.07667731629392971, "grad_norm": 10.75, "learning_rate": 2.957878540854261e-05, "loss": 1.3708, "step": 72 },
    { "epoch": 0.0777422790202343, "grad_norm": 9.25, "learning_rate": 2.9566894482031983e-05, "loss": 1.5508, "step": 73 },
    { "epoch": 0.07880724174653887, "grad_norm": 10.625, "learning_rate": 2.95548405001424e-05, "loss": 1.9396, "step": 74 },
    { "epoch": 0.07987220447284345, "grad_norm": 15.4375, "learning_rate": 2.954262359780082e-05, "loss": 1.6615, "step": 75 },
    { "epoch": 0.08093716719914804, "grad_norm": 16.75, "learning_rate": 2.9530243911757843e-05, "loss": 1.7127, "step": 76 },
    { "epoch": 0.08200212992545261, "grad_norm": 11.375, "learning_rate": 2.95177015805862e-05, "loss": 1.6021, "step": 77 },
    { "epoch": 0.08306709265175719, "grad_norm": 12.875, "learning_rate": 2.950499674467921e-05, "loss": 1.6266, "step": 78 },
    { "epoch": 0.08413205537806177, "grad_norm": 13.875, "learning_rate": 2.949212954624918e-05, "loss": 1.3126, "step": 79 },
    { "epoch": 0.08519701810436635, "grad_norm": 13.8125, "learning_rate": 2.9479100129325855e-05, "loss": 1.3167, "step": 80 },
    { "epoch": 0.08626198083067092, "grad_norm": 9.875, "learning_rate": 2.9465908639754763e-05, "loss": 1.8433, "step": 81 },
    { "epoch": 0.08732694355697551, "grad_norm": 11.625, "learning_rate": 2.9452555225195608e-05, "loss": 1.467, "step": 82 },
    { "epoch": 0.08839190628328009, "grad_norm": 9.75, "learning_rate": 2.9439040035120615e-05, "loss": 1.8683, "step": 83 },
    { "epoch": 0.08945686900958466, "grad_norm": 17.5, "learning_rate": 2.9425363220812843e-05, "loss": 1.515, "step": 84 },
    { "epoch": 0.09052183173588925, "grad_norm": 12.6875, "learning_rate": 2.941152493536451e-05, "loss": 1.7103, "step": 85 },
    { "epoch": 0.09158679446219382, "grad_norm": 14.125, "learning_rate": 2.939752533367527e-05, "loss": 1.7114, "step": 86 },
    { "epoch": 0.0926517571884984, "grad_norm": 11.625, "learning_rate": 2.9383364572450472e-05, "loss": 1.3458, "step": 87 },
    { "epoch": 0.09371671991480299, "grad_norm": 10.875, "learning_rate": 2.9369042810199422e-05, "loss": 1.7802, "step": 88 },
    { "epoch": 0.09478168264110756, "grad_norm": 10.5, "learning_rate": 2.9354560207233596e-05, "loss": 1.4666, "step": 89 },
    { "epoch": 0.09584664536741214, "grad_norm": 14.875, "learning_rate": 2.9339916925664856e-05, "loss": 1.4141, "step": 90 },
    { "epoch": 0.09691160809371673, "grad_norm": 12.1875, "learning_rate": 2.9325113129403612e-05, "loss": 1.3912, "step": 91 },
    { "epoch": 0.0979765708200213, "grad_norm": 16.0, "learning_rate": 2.9310148984157028e-05, "loss": 1.7521, "step": 92 },
    { "epoch": 0.09904153354632587, "grad_norm": 9.375, "learning_rate": 2.9295024657427128e-05, "loss": 1.0351, "step": 93 },
    { "epoch": 0.10010649627263046, "grad_norm": 12.6875, "learning_rate": 2.927974031850894e-05, "loss": 1.545, "step": 94 },
    { "epoch": 0.10117145899893504, "grad_norm": 11.875, "learning_rate": 2.9264296138488606e-05, "loss": 1.8704, "step": 95 },
    { "epoch": 0.10223642172523961, "grad_norm": 16.75, "learning_rate": 2.9248692290241445e-05, "loss": 1.3282, "step": 96 },
    { "epoch": 0.1033013844515442, "grad_norm": 13.625, "learning_rate": 2.9232928948430037e-05, "loss": 1.6012, "step": 97 },
    { "epoch": 0.10436634717784878, "grad_norm": 10.125, "learning_rate": 2.9217006289502266e-05, "loss": 1.783, "step": 98 },
    { "epoch": 0.10543130990415335, "grad_norm": 10.8125, "learning_rate": 2.920092449168934e-05, "loss": 1.7447, "step": 99 },
    { "epoch": 0.10649627263045794, "grad_norm": 11.25, "learning_rate": 2.91846837350038e-05, "loss": 1.908, "step": 100 },
    { "epoch": 0.10756123535676251, "grad_norm": 10.125, "learning_rate": 2.9168284201237487e-05, "loss": 1.5631, "step": 101 },
    { "epoch": 0.10862619808306709, "grad_norm": 10.5, "learning_rate": 2.9151726073959544e-05, "loss": 1.9487, "step": 102 },
    { "epoch": 0.10969116080937168, "grad_norm": 13.3125, "learning_rate": 2.9135009538514325e-05, "loss": 1.1878, "step": 103 },
    { "epoch": 0.11075612353567625, "grad_norm": 11.0625, "learning_rate": 2.9118134782019345e-05, "loss": 1.7323, "step": 104 },
    { "epoch": 0.11182108626198083, "grad_norm": 11.1875, "learning_rate": 2.9101101993363162e-05, "loss": 1.4423, "step": 105 },
    { "epoch": 0.11288604898828541, "grad_norm": 10.9375, "learning_rate": 2.9083911363203294e-05, "loss": 1.429, "step": 106 },
    { "epoch": 0.11395101171458999, "grad_norm": 10.75, "learning_rate": 2.9066563083964054e-05, "loss": 1.3625, "step": 107 },
    { "epoch": 0.11501597444089456, "grad_norm": 11.25, "learning_rate": 2.904905734983441e-05, "loss": 1.8362, "step": 108 },
    { "epoch": 0.11608093716719915, "grad_norm": 11.0625, "learning_rate": 2.9031394356765817e-05, "loss": 1.5439, "step": 109 },
    { "epoch": 0.11714589989350373, "grad_norm": 9.375, "learning_rate": 2.901357430247001e-05, "loss": 1.8833, "step": 110 },
    { "epoch": 0.1182108626198083, "grad_norm": 10.5, "learning_rate": 2.89955973864168e-05, "loss": 1.6258, "step": 111 },
    { "epoch": 0.11927582534611289, "grad_norm": 10.8125, "learning_rate": 2.8977463809831847e-05, "loss": 1.8468, "step": 112 },
    { "epoch": 0.12034078807241747, "grad_norm": 14.4375, "learning_rate": 2.8959173775694387e-05, "loss": 1.501, "step": 113 },
    { "epoch": 0.12140575079872204, "grad_norm": 16.125, "learning_rate": 2.894072748873498e-05, "loss": 1.4962, "step": 114 },
    { "epoch": 0.12247071352502663, "grad_norm": 11.4375, "learning_rate": 2.892212515543321e-05, "loss": 1.9837, "step": 115 },
    { "epoch": 0.1235356762513312, "grad_norm": 10.8125, "learning_rate": 2.890336698401538e-05, "loss": 1.7728, "step": 116 },
    { "epoch": 0.12460063897763578, "grad_norm": 14.6875, "learning_rate": 2.888445318445216e-05, "loss": 2.0979, "step": 117 },
    { "epoch": 0.12566560170394037, "grad_norm": 15.625, "learning_rate": 2.8865383968456272e-05, "loss": 1.5003, "step": 118 },
    { "epoch": 0.12673056443024494, "grad_norm": 10.625, "learning_rate": 2.8846159549480088e-05, "loss": 2.2097, "step": 119 },
    { "epoch": 0.12779552715654952, "grad_norm": 10.9375, "learning_rate": 2.882678014271326e-05, "loss": 1.8599, "step": 120 },
    { "epoch": 0.1288604898828541, "grad_norm": 10.5, "learning_rate": 2.88072459650803e-05, "loss": 1.6879, "step": 121 },
    { "epoch": 0.1299254526091587, "grad_norm": 9.8125, "learning_rate": 2.8787557235238167e-05, "loss": 1.6777, "step": 122 },
    { "epoch": 0.13099041533546327, "grad_norm": 9.0, "learning_rate": 2.876771417357379e-05, "loss": 1.8459, "step": 123 },
    { "epoch": 0.13205537806176784, "grad_norm": 9.0625, "learning_rate": 2.8747717002201638e-05, "loss": 1.7704, "step": 124 },
    { "epoch": 0.13312034078807242, "grad_norm": 10.125, "learning_rate": 2.87275659449612e-05, "loss": 1.8606, "step": 125 },
    { "epoch": 0.134185303514377, "grad_norm": 15.9375, "learning_rate": 2.870726122741452e-05, "loss": 1.8471, "step": 126 },
    { "epoch": 0.13525026624068157, "grad_norm": 10.5625, "learning_rate": 2.868680307684363e-05, "loss": 1.7965, "step": 127 },
    { "epoch": 0.13631522896698617, "grad_norm": 9.125, "learning_rate": 2.866619172224802e-05, "loss": 1.3363, "step": 128 },
    { "epoch": 0.13738019169329074, "grad_norm": 9.0625, "learning_rate": 2.864542739434208e-05, "loss": 1.3873, "step": 129 },
    { "epoch": 0.13844515441959532, "grad_norm": 10.4375, "learning_rate": 2.862451032555253e-05, "loss": 1.3849, "step": 130 },
    { "epoch": 0.1395101171458999, "grad_norm": 9.75, "learning_rate": 2.8603440750015786e-05, "loss": 1.6095, "step": 131 },
    { "epoch": 0.14057507987220447, "grad_norm": 10.75, "learning_rate": 2.858221890357537e-05, "loss": 1.4683, "step": 132 },
    { "epoch": 0.14164004259850904, "grad_norm": 11.5625, "learning_rate": 2.856084502377925e-05, "loss": 1.7605, "step": 133 },
    { "epoch": 0.14270500532481364, "grad_norm": 12.0, "learning_rate": 2.853931934987719e-05, "loss": 1.8109, "step": 134 },
    { "epoch": 0.14376996805111822, "grad_norm": 12.3125, "learning_rate": 2.8517642122818067e-05, "loss": 1.7257, "step": 135 },
    { "epoch": 0.1448349307774228, "grad_norm": 10.5, "learning_rate": 2.849581358524719e-05, "loss": 1.649, "step": 136 },
    { "epoch": 0.14589989350372737, "grad_norm": 11.0, "learning_rate": 2.8473833981503553e-05, "loss": 1.365, "step": 137 },
    { "epoch": 0.14696485623003194, "grad_norm": 10.9375, "learning_rate": 2.8451703557617126e-05, "loss": 1.4037, "step": 138 },
    { "epoch": 0.14802981895633652, "grad_norm": 8.3125, "learning_rate": 2.84294225613061e-05, "loss": 1.9803, "step": 139 },
    { "epoch": 0.14909478168264112, "grad_norm": 12.5625, "learning_rate": 2.840699124197409e-05, "loss": 1.6368, "step": 140 },
    { "epoch": 0.1501597444089457, "grad_norm": 10.8125, "learning_rate": 2.8384409850707383e-05, "loss": 1.8567, "step": 141 },
    { "epoch": 0.15122470713525027, "grad_norm": 18.5, "learning_rate": 2.8361678640272086e-05, "loss": 1.8818, "step": 142 },
    { "epoch": 0.15228966986155484, "grad_norm": 11.0, "learning_rate": 2.8338797865111323e-05, "loss": 1.7529, "step": 143 },
    { "epoch": 0.15335463258785942, "grad_norm": 11.1875, "learning_rate": 2.831576778134238e-05, "loss": 1.5764, "step": 144 },
    { "epoch": 0.154419595314164, "grad_norm": 9.375, "learning_rate": 2.8292588646753838e-05, "loss": 1.7057, "step": 145 },
    { "epoch": 0.1554845580404686, "grad_norm": 9.0625, "learning_rate": 2.826926072080268e-05, "loss": 1.2397, "step": 146 },
    { "epoch": 0.15654952076677317, "grad_norm": 12.0625, "learning_rate": 2.8245784264611408e-05, "loss": 1.5406, "step": 147 },
    { "epoch": 0.15761448349307774, "grad_norm": 12.375, "learning_rate": 2.822215954096509e-05, "loss": 1.4422, "step": 148 },
    { "epoch": 0.15867944621938232, "grad_norm": 10.25, "learning_rate": 2.8198386814308442e-05, "loss": 1.8692, "step": 149 },
    { "epoch": 0.1597444089456869, "grad_norm": 13.5, "learning_rate": 2.8174466350742865e-05, "loss": 1.7064, "step": 150 },
    { "epoch": 0.16080937167199147, "grad_norm": 11.125, "learning_rate": 2.8150398418023447e-05, "loss": 1.7838, "step": 151 },
    { "epoch": 0.16187433439829607, "grad_norm": 11.875, "learning_rate": 2.8126183285556e-05, "loss": 1.9361, "step": 152 },
    { "epoch": 0.16293929712460065, "grad_norm": 10.5625, "learning_rate": 2.810182122439401e-05, "loss": 1.8139, "step": 153 },
    { "epoch": 0.16400425985090522, "grad_norm": 9.875, "learning_rate": 2.807731250723562e-05, "loss": 1.5778, "step": 154 },
    { "epoch": 0.1650692225772098, "grad_norm": 16.75, "learning_rate": 2.8052657408420587e-05, "loss": 1.5036, "step": 155 },
    { "epoch": 0.16613418530351437, "grad_norm": 11.75, "learning_rate": 2.8027856203927183e-05, "loss": 1.2629, "step": 156 },
    { "epoch": 0.16719914802981894, "grad_norm": 12.4375, "learning_rate": 2.800290917136913e-05, "loss": 1.7838, "step": 157 },
    { "epoch": 0.16826411075612355, "grad_norm": 11.0, "learning_rate": 2.7977816589992494e-05, "loss": 1.8775, "step": 158 },
    { "epoch": 0.16932907348242812, "grad_norm": 8.9375, "learning_rate": 2.795257874067253e-05, "loss": 1.4417, "step": 159 },
    { "epoch": 0.1703940362087327, "grad_norm": 11.8125, "learning_rate": 2.7927195905910576e-05, "loss": 2.1449, "step": 160 },
    { "epoch": 0.17145899893503727, "grad_norm": 9.625, "learning_rate": 2.790166836983086e-05, "loss": 1.8285, "step": 161 },
    { "epoch": 0.17252396166134185, "grad_norm": 11.6875, "learning_rate": 2.7875996418177348e-05, "loss": 1.4369, "step": 162 },
    { "epoch": 0.17358892438764642, "grad_norm": 12.625, "learning_rate": 2.7850180338310517e-05, "loss": 1.9668, "step": 163 },
    { "epoch": 0.17465388711395102, "grad_norm": 9.375, "learning_rate": 2.782422041920415e-05, "loss": 1.7926, "step": 164 },
    { "epoch": 0.1757188498402556, "grad_norm": 11.9375, "learning_rate": 2.779811695144212e-05, "loss": 1.945, "step": 165 },
    { "epoch": 0.17678381256656017, "grad_norm": 10.0, "learning_rate": 2.7771870227215096e-05, "loss": 1.9434, "step": 166 },
    { "epoch": 0.17784877529286475, "grad_norm": 15.3125, "learning_rate": 2.7745480540317315e-05, "loss": 1.6811, "step": 167 },
    { "epoch": 0.17891373801916932, "grad_norm": 10.75, "learning_rate": 2.771894818614327e-05, "loss": 1.8628, "step": 168 },
    { "epoch": 0.1799787007454739, "grad_norm": 14.125, "learning_rate": 2.7692273461684407e-05, "loss": 1.8834, "step": 169 },
    { "epoch": 0.1810436634717785, "grad_norm": 9.375, "learning_rate": 2.7665456665525805e-05, "loss": 1.8599, "step": 170 },
    { "epoch": 0.18210862619808307, "grad_norm": 10.75, "learning_rate": 2.7638498097842823e-05, "loss": 1.5488, "step": 171 },
    { "epoch": 0.18317358892438765, "grad_norm": 11.875, "learning_rate": 2.7611398060397755e-05, "loss": 1.6002, "step": 172 },
    { "epoch": 0.18423855165069222, "grad_norm": 14.3125, "learning_rate": 2.7584156856536446e-05, "loss": 1.6349, "step": 173 },
    { "epoch": 0.1853035143769968, "grad_norm": 11.4375, "learning_rate": 2.7556774791184893e-05, "loss": 1.7368, "step": 174 },
    { "epoch": 0.18636847710330137, "grad_norm": 14.4375, "learning_rate": 2.752925217084583e-05, "loss": 1.9945, "step": 175 },
    { "epoch": 0.18743343982960597, "grad_norm": 9.5, "learning_rate": 2.7501589303595305e-05, "loss": 1.1876, "step": 176 },
    { "epoch": 0.18849840255591055, "grad_norm": 11.4375, "learning_rate": 2.7473786499079232e-05, "loss": 1.8281, "step": 177 },
    { "epoch": 0.18956336528221512, "grad_norm": 10.9375, "learning_rate": 2.744584406850992e-05, "loss": 1.4349, "step": 178 },
    { "epoch": 0.1906283280085197, "grad_norm": 9.8125, "learning_rate": 2.741776232466258e-05, "loss": 1.8583, "step": 179 },
    { "epoch": 0.19169329073482427, "grad_norm": 9.1875, "learning_rate": 2.7389541581871843e-05, "loss": 2.0097, "step": 180 },
    { "epoch": 0.19275825346112885, "grad_norm": 12.1875, "learning_rate": 2.736118215602823e-05, "loss": 1.8045, "step": 181 },
    { "epoch": 0.19382321618743345, "grad_norm": 11.6875, "learning_rate": 2.7332684364574632e-05, "loss": 2.2613, "step": 182 },
    { "epoch": 0.19488817891373802, "grad_norm": 9.5, "learning_rate": 2.7304048526502723e-05, "loss": 1.443, "step": 183 },
    { "epoch": 0.1959531416400426, "grad_norm": 9.9375, "learning_rate": 2.7275274962349417e-05, "loss": 1.5373, "step": 184 },
    { "epoch": 0.19701810436634717, "grad_norm": 10.4375, "learning_rate": 2.7246363994193276e-05, "loss": 1.8491, "step": 185 },
    { "epoch": 0.19808306709265175, "grad_norm": 8.375, "learning_rate": 2.721731594565091e-05, "loss": 1.5917, "step": 186 },
    { "epoch": 0.19914802981895632, "grad_norm": 10.0, "learning_rate": 2.718813114187332e-05, "loss": 1.7208, "step": 187 },
    { "epoch": 0.20021299254526093, "grad_norm": 13.875, "learning_rate": 2.7158809909542308e-05, "loss": 1.5774, "step": 188 },
    { "epoch": 0.2012779552715655, "grad_norm": 10.4375, "learning_rate": 2.712935257686679e-05, "loss": 1.4742, "step": 189 },
    { "epoch": 0.20234291799787008, "grad_norm": 7.9375, "learning_rate": 2.709975947357914e-05, "loss": 1.7704, "step": 190 },
    { "epoch": 0.20340788072417465, "grad_norm": 10.25, "learning_rate": 2.707003093093146e-05, "loss": 1.3238, "step": 191 },
    { "epoch": 0.20447284345047922, "grad_norm": 9.5625, "learning_rate": 2.704016728169193e-05, "loss": 1.8382, "step": 192 },
    { "epoch": 0.2055378061767838, "grad_norm": 9.25, "learning_rate": 2.7010168860141033e-05, "loss": 1.6611, "step": 193 },
    { "epoch": 0.2066027689030884, "grad_norm": 12.0625, "learning_rate": 2.6980036002067846e-05, "loss": 1.4489, "step": 194 },
    { "epoch": 0.20766773162939298, "grad_norm": 11.0625, "learning_rate": 2.6949769044766266e-05, "loss": 1.6431, "step": 195 },
    { "epoch": 0.20873269435569755, "grad_norm": 10.6875, "learning_rate": 2.6919368327031236e-05, "loss": 1.6825, "step": 196 },
    { "epoch": 0.20979765708200213, "grad_norm": 9.625, "learning_rate": 2.6888834189154955e-05, "loss": 2.1797, "step": 197 },
    { "epoch": 0.2108626198083067, "grad_norm": 11.1875, "learning_rate": 2.6858166972923063e-05, "loss": 2.0067, "step": 198 },
    { "epoch": 0.21192758253461128, "grad_norm": 11.6875, "learning_rate": 2.6827367021610832e-05, "loss": 1.4995, "step": 199 },
    { "epoch": 0.21299254526091588, "grad_norm": 18.0, "learning_rate": 2.67964346799793e-05, "loss": 1.4391, "step": 200 },
    { "epoch": 0.21405750798722045, "grad_norm": 8.5, "learning_rate": 2.676537029427143e-05, "loss": 2.0381, "step": 201 },
    { "epoch": 0.21512247071352503, "grad_norm": 12.75, "learning_rate": 2.6734174212208226e-05, "loss": 1.7726, "step": 202 },
    { "epoch": 0.2161874334398296, "grad_norm": 12.5, "learning_rate": 2.6702846782984846e-05, "loss": 1.5714, "step": 203 },
    { "epoch": 0.21725239616613418, "grad_norm": 11.0625, "learning_rate": 2.6671388357266687e-05, "loss": 2.0508, "step": 204 },
    { "epoch": 0.21831735889243875, "grad_norm": 8.6875, "learning_rate": 2.6639799287185456e-05, "loss": 1.7225, "step": 205 },
    { "epoch": 0.21938232161874335, "grad_norm": 10.0625, "learning_rate": 2.660807992633525e-05, "loss": 1.433, "step": 206 },
    { "epoch": 0.22044728434504793, "grad_norm": 8.9375, "learning_rate": 2.657623062976858e-05, "loss": 1.9277, "step": 207 },
    { "epoch": 0.2215122470713525, "grad_norm": 9.0, "learning_rate": 2.6544251753992387e-05, "loss": 1.5421, "step": 208 },
    { "epoch": 0.22257720979765708, "grad_norm": 9.8125, "learning_rate": 2.6512143656964077e-05, "loss": 1.8624, "step": 209 },
    { "epoch": 0.22364217252396165, "grad_norm": 9.1875, "learning_rate": 2.6479906698087496e-05, "loss": 1.7282, "step": 210 },
    { "epoch": 0.22470713525026625, "grad_norm": 14.6875, "learning_rate": 2.6447541238208917e-05, "loss": 1.993, "step": 211 },
    { "epoch": 0.22577209797657083, "grad_norm": 13.875, "learning_rate": 2.6415047639612992e-05, "loss": 2.1598, "step": 212 },
    { "epoch": 0.2268370607028754, "grad_norm": 12.9375, "learning_rate": 2.6382426266018704e-05, "loss": 1.4964, "step": 213 },
    { "epoch": 0.22790202342917998, "grad_norm": 9.6875, "learning_rate": 2.6349677482575297e-05, "loss": 1.46, "step": 214 },
    { "epoch": 0.22896698615548455, "grad_norm": 9.125, "learning_rate": 2.6316801655858165e-05, "loss": 1.1583, "step": 215 },
    { "epoch": 0.23003194888178913, "grad_norm": 12.1875, "learning_rate": 2.6283799153864797e-05, "loss": 1.3867, "step": 216 },
    { "epoch": 0.23109691160809373, "grad_norm": 13.375, "learning_rate": 2.6250670346010608e-05, "loss": 1.3233, "step": 217 },
    { "epoch": 0.2321618743343983, "grad_norm": 12.9375, "learning_rate": 2.6217415603124835e-05, "loss": 1.414, "step": 218 },
    { "epoch": 0.23322683706070288, "grad_norm": 9.6875, "learning_rate": 2.618403529744637e-05, "loss": 1.8825, "step": 219 },
    { "epoch": 0.23429179978700745, "grad_norm": 13.6875, "learning_rate": 2.6150529802619604e-05, "loss": 1.6701, "step": 220 },
    { "epoch": 0.23535676251331203, "grad_norm": 11.875, "learning_rate": 2.6116899493690237e-05, "loss": 1.7026, "step": 221 },
    { "epoch": 0.2364217252396166, "grad_norm": 10.5, "learning_rate": 2.6083144747101086e-05, "loss": 2.3935, "step": 222 },
    { "epoch": 0.2374866879659212, "grad_norm": 9.0, "learning_rate": 2.6049265940687868e-05, "loss": 1.7679, "step": 223 },
    { "epoch": 0.23855165069222578, "grad_norm": 14.0, "learning_rate": 2.601526345367496e-05, "loss": 1.5758, "step": 224 },
    { "epoch": 0.23961661341853036, "grad_norm": 10.5625, "learning_rate": 2.5981137666671178e-05, "loss": 2.0644, "step": 225 },
    { "epoch": 0.24068157614483493, "grad_norm": 12.125, "learning_rate": 2.5946888961665512e-05, "loss": 1.6724, "step": 226 },
    { "epoch": 0.2417465388711395, "grad_norm": 9.375, "learning_rate": 2.5912517722022817e-05, "loss": 1.3376, "step": 227 },
    { "epoch": 0.24281150159744408, "grad_norm": 11.25, "learning_rate": 2.587802433247956e-05, "loss": 1.5454, "step": 228 },
    { "epoch": 0.24387646432374868, "grad_norm": 8.6875, "learning_rate": 2.5843409179139498e-05, "loss": 1.6634, "step": 229 },
    { "epoch": 0.24494142705005326, "grad_norm": 10.4375, "learning_rate": 2.580867264946936e-05, "loss": 1.9844, "step": 230 },
    { "epoch": 0.24600638977635783, "grad_norm": 9.1875, "learning_rate": 2.5773815132294517e-05, "loss": 1.8239, "step": 231 },
    { "epoch": 0.2470713525026624, "grad_norm": 9.1875, "learning_rate": 2.57388370177946e-05, "loss": 2.0781, "step": 232 },
    { "epoch": 0.24813631522896698, "grad_norm": 9.375, "learning_rate": 2.5703738697499167e-05, "loss": 2.0183, "step": 233 },
    { "epoch": 0.24920127795527156, "grad_norm": 21.5, "learning_rate": 2.5668520564283305e-05, "loss": 1.8053, "step": 234 },
    { "epoch": 0.25026624068157616, "grad_norm": 8.25, "learning_rate": 2.5633183012363226e-05, "loss": 1.4977, "step": 235 },
    { "epoch": 0.25133120340788073, "grad_norm": 10.5625, "learning_rate": 2.559772643729188e-05, "loss": 1.9078, "step": 236 },
    { "epoch": 0.2523961661341853, "grad_norm": 10.375, "learning_rate": 2.556215123595449e-05, "loss": 1.8836, "step": 237 },
    { "epoch": 0.2534611288604899, "grad_norm": 10.3125, "learning_rate": 2.5526457806564138e-05, "loss": 1.4115, "step": 238 },
    { "epoch": 0.25452609158679446, "grad_norm": 19.875, "learning_rate": 2.5490646548657296e-05, "loss": 1.8534, "step": 239 },
    { "epoch": 0.25559105431309903, "grad_norm": 10.125, "learning_rate": 2.5454717863089367e-05, "loss": 1.6013, "step": 240 },
    { "epoch": 0.2566560170394036, "grad_norm": 9.6875, "learning_rate": 2.5418672152030174e-05, "loss": 1.4595, "step": 241 },
    { "epoch": 0.2577209797657082, "grad_norm": 9.4375, "learning_rate": 2.5382509818959468e-05, "loss": 1.3814, "step": 242 },
    { "epoch": 0.25878594249201275, "grad_norm": 12.4375, "learning_rate": 2.5346231268662435e-05, "loss": 1.6796, "step": 243 },
    { "epoch": 0.2598509052183174, "grad_norm": 9.0, "learning_rate": 2.5309836907225126e-05, "loss": 1.5827, "step": 244 },
    { "epoch": 0.26091586794462196, "grad_norm": 10.125, "learning_rate": 2.527332714202994e-05, "loss": 1.7017, "step": 245 },
    { "epoch": 0.26198083067092653, "grad_norm": 14.875, "learning_rate": 2.523670238175106e-05, "loss": 1.5484, "step": 246 },
    { "epoch": 0.2630457933972311, "grad_norm": 10.375, "learning_rate": 2.519996303634985e-05, "loss": 1.7837, "step": 247 },
    { "epoch": 0.2641107561235357, "grad_norm": 13.4375, "learning_rate": 2.5163109517070322e-05, "loss": 1.6422, "step": 248 },
    { "epoch": 0.26517571884984026, "grad_norm": 9.0625, "learning_rate": 2.512614223643448e-05, "loss": 2.0394, "step": 249 },
    { "epoch": 0.26624068157614483, "grad_norm": 14.375, "learning_rate": 2.5089061608237717e-05, "loss": 1.5178, "step": 250 },
    { "epoch": 0.2673056443024494, "grad_norm": 11.5625, "learning_rate": 2.5051868047544206e-05, "loss": 1.6257, "step": 251 },
    { "epoch": 0.268370607028754, "grad_norm": 11.0, "learning_rate": 2.501456197068222e-05, "loss": 1.6878, "step": 252 },
    { "epoch": 0.26943556975505856, "grad_norm": 11.625, "learning_rate": 2.4977143795239504e-05, "loss": 1.7076, "step": 253 },
    { "epoch": 0.27050053248136313, "grad_norm": 18.0, "learning_rate": 2.493961394005857e-05, "loss": 1.3901, "step": 254 },
    { "epoch": 0.2715654952076677, "grad_norm": 10.875, "learning_rate": 2.4901972825232033e-05, "loss": 1.901, "step": 255 },
    { "epoch": 0.27263045793397234, "grad_norm": 10.5625, "learning_rate": 2.48642208720979e-05, "loss": 2.0679, "step": 256 },
    { "epoch": 0.2736954206602769, "grad_norm": 14.25, "learning_rate": 2.482635850323484e-05, "loss": 1.5161, "step": 257 },
    { "epoch": 0.2747603833865815, "grad_norm": 9.25, "learning_rate": 2.478838614245749e-05, "loss": 1.6164, "step": 258 },
    { "epoch": 0.27582534611288606, "grad_norm": 12.25, "learning_rate": 2.475030421481167e-05, "loss": 1.1729, "step": 259 },
    { "epoch": 0.27689030883919064, "grad_norm": 9.125, "learning_rate": 2.4712113146569638e-05, "loss": 1.6588, "step": 260 },
    { "epoch": 0.2779552715654952, "grad_norm": 12.4375, "learning_rate": 2.4673813365225346e-05, "loss": 1.404, "step": 261 },
    { "epoch": 0.2790202342917998, "grad_norm": 12.0625, "learning_rate": 2.463540529948961e-05, "loss": 1.7023, "step": 262 },
    { "epoch": 0.28008519701810436, "grad_norm": 9.125, "learning_rate": 2.4596889379285353e-05, "loss": 1.7676, "step": 263 },
    { "epoch": 0.28115015974440893, "grad_norm": 14.3125, "learning_rate": 2.455826603574276e-05, "loss": 1.6981, "step": 264 },
    { "epoch": 0.2822151224707135, "grad_norm": 10.9375, "learning_rate": 2.451953570119446e-05, "loss": 1.1314, "step": 265 },
    { "epoch": 0.2832800851970181, "grad_norm": 9.5625, "learning_rate": 2.4480698809170716e-05, "loss": 1.5507, "step": 266 },
    { "epoch": 0.28434504792332266, "grad_norm": 9.3125, "learning_rate": 2.4441755794394522e-05, "loss": 1.9222, "step": 267 },
    { "epoch": 0.2854100106496273, "grad_norm": 9.0625, "learning_rate": 2.4402707092776778e-05, "loss": 0.9189, "step": 268 },
    { "epoch": 0.28647497337593186, "grad_norm": 8.125, "learning_rate": 2.436355314141139e-05, "loss": 1.8083, "step": 269 },
    { "epoch": 0.28753993610223644, "grad_norm": 9.5, "learning_rate": 2.4324294378570385e-05, "loss": 1.4605, "step": 270 },
    { "epoch": 0.288604898828541, "grad_norm": 11.1875, "learning_rate": 2.428493124369902e-05, "loss": 1.653, "step": 271 },
    { "epoch": 0.2896698615548456, "grad_norm": 11.375, "learning_rate": 2.4245464177410802e-05, "loss": 1.3704, "step": 272 },
    { "epoch": 0.29073482428115016, "grad_norm": 11.0625, "learning_rate": 2.4205893621482648e-05, "loss": 1.3454, "step": 273 },
    { "epoch": 0.29179978700745474, "grad_norm": 11.75, "learning_rate": 2.416622001884987e-05, "loss": 1.9638, "step": 274 },
    { "epoch": 0.2928647497337593, "grad_norm": 15.125, "learning_rate": 2.4126443813601235e-05, "loss": 1.3706, "step": 275 },
    { "epoch": 0.2939297124600639, "grad_norm": 8.3125, "learning_rate": 2.408656545097401e-05, "loss": 1.2448, "step": 276 },
    { "epoch": 0.29499467518636846, "grad_norm": 20.25, "learning_rate": 2.4046585377348963e-05, "loss": 1.3458, "step": 277 },
    { "epoch": 0.29605963791267303, "grad_norm": 11.0625, "learning_rate": 2.400650404024537e-05, "loss": 1.7146, "step": 278 },
    { "epoch": 0.2971246006389776, "grad_norm": 11.6875, "learning_rate": 2.3966321888316e-05, "loss": 1.442, "step": 279 },
    { "epoch": 0.29818956336528224, "grad_norm": 14.0625, "learning_rate": 2.3926039371342105e-05, "loss": 1.6687, "step": 280 },
    { "epoch": 0.2992545260915868, "grad_norm": 11.375, "learning_rate": 2.3885656940228378e-05, "loss": 1.8262, "step": 281 },
    { "epoch": 0.3003194888178914, "grad_norm": 10.5625, "learning_rate": 2.3845175046997903e-05, "loss": 0.883, "step": 282 },
    { "epoch": 0.30138445154419596, "grad_norm": 9.9375, "learning_rate": 2.3804594144787105e-05, "loss": 1.842, "step": 283 },
    { "epoch": 0.30244941427050054, "grad_norm": 10.625, "learning_rate": 2.3763914687840663e-05, "loss": 1.7852, "step": 284 },
    { "epoch": 0.3035143769968051, "grad_norm": 9.1875, "learning_rate": 2.3723137131506454e-05, "loss": 1.6978, "step": 285 },
    { "epoch": 0.3045793397231097, "grad_norm": 10.0625, "learning_rate": 2.3682261932230403e-05, "loss": 1.9347, "step": 286 },
    { "epoch": 0.30564430244941426, "grad_norm": 10.6875, "learning_rate": 2.364128954755144e-05, "loss": 2.0528, "step": 287 },
    { "epoch": 0.30670926517571884, "grad_norm": 11.375, "learning_rate": 2.360022043609632e-05, "loss": 1.6198, "step": 288 },
    { "epoch": 0.3077742279020234, "grad_norm": 8.0625, "learning_rate": 2.3559055057574533e-05, "loss": 1.8185, "step": 289 },
    { "epoch": 0.308839190628328, "grad_norm": 9.5625, "learning_rate": 2.3517793872773135e-05, "loss": 1.2761, "step": 290 },
    { "epoch": 0.30990415335463256, "grad_norm": 21.5, "learning_rate": 2.3476437343551585e-05, "loss": 1.4168, "step": 291 },
    { "epoch": 0.3109691160809372, "grad_norm": 8.8125, "learning_rate": 2.3434985932836603e-05, "loss": 1.5111, "step": 292 },
    { "epoch": 0.31203407880724177, "grad_norm": 10.875, "learning_rate": 2.3393440104616953e-05, "loss": 1.2663, "step": 293 },
    { "epoch": 0.31309904153354634, "grad_norm": 10.1875, "learning_rate": 2.335180032393828e-05, "loss": 1.6099, "step": 294 },
    { "epoch": 0.3141640042598509, "grad_norm": 10.25, "learning_rate": 2.331006705689788e-05, "loss": 2.0177, "step": 295 },
    { "epoch": 0.3152289669861555, "grad_norm": 10.875, "learning_rate": 2.3268240770639508e-05, "loss": 1.3157, "step": 296 },
    { "epoch": 0.31629392971246006, "grad_norm": 11.75, "learning_rate": 2.322632193334812e-05, "loss": 1.5085, "step": 297 },
    { "epoch": 0.31735889243876464, "grad_norm": 12.625, "learning_rate": 2.3184311014244663e-05, "loss": 1.7265, "step": 298 },
    { "epoch": 0.3184238551650692, "grad_norm": 10.3125, "learning_rate": 2.314220848358079e-05, "loss": 1.5724, "step": 299 },
    { "epoch": 0.3194888178913738, "grad_norm": 9.1875, "learning_rate": 2.310001481263363e-05, "loss": 1.6024, "step": 300 },
    { "epoch": 0.32055378061767836, "grad_norm": 10.625, "learning_rate": 2.3057730473700472e-05, "loss": 1.6951, "step": 301 },
    { "epoch": 0.32161874334398294, "grad_norm": 10.875, "learning_rate": 2.3015355940093544e-05, "loss": 1.5714, "step": 302 },
    { "epoch": 0.3226837060702875, "grad_norm": 10.625, "learning_rate": 2.2972891686134624e-05, "loss": 1.6869, "step": 303 },
    { "epoch": 0.32374866879659214, "grad_norm": 11.9375, "learning_rate": 2.2930338187149816e-05, "loss": 1.9157, "step": 304 },
    { "epoch": 0.3248136315228967, "grad_norm": 11.375, "learning_rate": 2.2887695919464172e-05, "loss": 1.7153, "step": 305 },
    { "epoch": 0.3258785942492013, "grad_norm": 8.5625, "learning_rate": 2.2844965360396405e-05, "loss": 1.66, "step": 306 },
    { "epoch": 0.32694355697550587, "grad_norm": 8.75, "learning_rate": 2.2802146988253494e-05, "loss": 1.3719, "step": 307 },
    { "epoch": 0.32800851970181044, "grad_norm": 10.375, "learning_rate": 2.2759241282325384e-05, "loss": 1.8534, "step": 308 },
    { "epoch": 0.329073482428115, "grad_norm": 11.0, "learning_rate": 2.2716248722879577e-05, "loss": 1.6755, "step": 309 },
    { "epoch": 0.3301384451544196, "grad_norm": 8.9375, "learning_rate": 2.2673169791155787e-05, "loss": 1.713, "step": 310 },
    { "epoch": 0.33120340788072417, "grad_norm": 11.5, "learning_rate": 2.2630004969360534e-05, "loss": 1.6701, "step": 311 },
    { "epoch": 0.33226837060702874, "grad_norm": 16.25, "learning_rate": 2.2586754740661756e-05, "loss": 1.6506, "step": 312 },
    { "epoch": 0.3333333333333333, "grad_norm": 8.75, "learning_rate": 2.25434195891834e-05, "loss": 1.5584, "step": 313 },
    { "epoch": 0.3343982960596379, "grad_norm": 11.9375, "learning_rate": 2.25e-05, "loss": 1.5541, "step": 314 },
    { "epoch": 0.3354632587859425, "grad_norm": 9.0, "learning_rate": 2.245649645913125e-05, "loss": 1.6149, "step": 315 },
    { "epoch": 0.3365282215122471, "grad_norm": 8.375, "learning_rate": 2.2412909453536553e-05, "loss": 1.6719, "step": 316 },
    { "epoch": 0.33759318423855167, "grad_norm": 9.9375, "learning_rate": 2.2369239471109594e-05, "loss": 1.8576, "step": 317 },
    { "epoch": 0.33865814696485624, "grad_norm": 10.75, "learning_rate": 2.2325487000672855e-05, "loss": 1.4974, "step": 318 },
    { "epoch": 0.3397231096911608, "grad_norm": 9.875, "learning_rate": 2.2281652531972147e-05, "loss": 1.7744, "step": 319 },
    { "epoch": 0.3407880724174654, "grad_norm": 9.3125, "learning_rate": 2.223773655567115e-05, "loss": 0.9289, "step": 320 },
    { "epoch": 0.34185303514376997, "grad_norm": 8.875, "learning_rate": 2.2193739563345886e-05, "loss": 2.0212, "step": 321 },
    { "epoch": 0.34291799787007454, "grad_norm": 11.5, "learning_rate": 2.214966204747924e-05, "loss": 1.9651, "step": 322 },
    { "epoch": 0.3439829605963791, "grad_norm": 10.3125, "learning_rate": 2.2105504501455456e-05, "loss": 1.8089, "step": 323 },
    { "epoch": 0.3450479233226837, "grad_norm": 9.875, "learning_rate": 2.2061267419554577e-05, "loss": 2.0376, "step": 324 },
    { "epoch": 0.34611288604898827, "grad_norm": 10.125, "learning_rate": 2.2016951296946955e-05, "loss": 2.1548, "step": 325 },
    { "epoch": 0.34717784877529284, "grad_norm": 8.25, "learning_rate": 2.1972556629687674e-05, "loss": 1.478, "step": 326 },
    { "epoch": 0.34824281150159747, "grad_norm": 10.0625, "learning_rate": 2.1928083914711023e-05, "loss": 1.6681, "step": 327 },
    { "epoch": 0.34930777422790205, "grad_norm": 8.3125, "learning_rate": 2.1883533649824922e-05, "loss": 1.8026, "step": 328 },
    { "epoch": 0.3503727369542066, "grad_norm": 9.8125, "learning_rate": 2.1838906333705338e-05, "loss": 1.6024, "step": 329 },
    { "epoch": 0.3514376996805112, "grad_norm": 9.25, "learning_rate": 2.1794202465890734e-05, "loss": 1.3394, "step": 330 },
    { "epoch": 0.35250266240681577, "grad_norm": 8.0625, "learning_rate": 2.1749422546776446e-05, "loss": 1.7381, "step": 331 },
    { "epoch": 0.35356762513312034, "grad_norm": 7.78125, "learning_rate": 2.170456707760909e-05, "loss": 2.0204, "step": 332 },
    { "epoch": 0.3546325878594249, "grad_norm": 11.6875, "learning_rate": 2.165963656048098e-05, "loss": 1.9509, "step": 333 },
    { "epoch": 0.3556975505857295, "grad_norm": 10.5, "learning_rate": 2.1614631498324455e-05, "loss": 1.915, "step": 334 },
    { "epoch": 0.35676251331203407, "grad_norm": 11.375, "learning_rate": 2.1569552394906292e-05, "loss": 1.4493, "step": 335 },
    { "epoch": 0.35782747603833864, "grad_norm": 11.0, "learning_rate": 2.152439975482205e-05, "loss": 1.5415, "step": 336 },
    { "epoch": 0.3588924387646432, "grad_norm": 10.6875, "learning_rate": 2.1479174083490443e-05, "loss": 1.5783, "step": 337 },
    { "epoch": 0.3599574014909478, "grad_norm": 12.375, "learning_rate": 2.1433875887147628e-05, "loss": 1.5324, "step": 338 },
    { "epoch": 0.3610223642172524, "grad_norm": 7.90625, "learning_rate": 2.13885056728416e-05, "loss": 1.5981, "step": 339 },
    { "epoch": 0.362087326943557, "grad_norm": 7.90625, "learning_rate": 2.1343063948426495e-05, "loss": 1.6717, "step": 340 },
    { "epoch": 0.36315228966986157, "grad_norm": 15.6875, "learning_rate": 2.1297551222556887e-05, "loss": 1.8055, "step": 341 },
    { "epoch": 0.36421725239616615, "grad_norm": 11.3125, "learning_rate": 2.1251968004682112e-05, "loss": 1.9067, "step": 342 },
    { "epoch": 0.3652822151224707, "grad_norm": 9.5625, "learning_rate": 2.1206314805040573e-05, "loss": 1.4649, "step": 343 },
    { "epoch": 0.3663471778487753, "grad_norm": 10.75, "learning_rate": 2.1160592134654e-05, "loss": 1.5144, "step": 344 },
    { "epoch": 0.36741214057507987, "grad_norm": 9.6875, "learning_rate": 2.111480050532177e-05, "loss": 1.6677, "step": 345 },
    { "epoch": 0.36847710330138445, "grad_norm": 9.4375, "learning_rate": 2.1068940429615138e-05, "loss": 1.9054, "step": 346 },
    { "epoch": 0.369542066027689, "grad_norm": 12.75, "learning_rate": 2.102301242087152e-05, "loss": 1.3985, "step": 347 },
    { "epoch": 0.3706070287539936, "grad_norm": 9.9375, "learning_rate": 2.097701699318875e-05, "loss": 1.8731, "step": 348 },
    { "epoch": 0.37167199148029817, "grad_norm": 13.6875, "learning_rate": 2.0930954661419325e-05, "loss": 1.9324, "step": 349 },
    { "epoch": 0.37273695420660274, "grad_norm": 10.9375, "learning_rate": 2.088482594116462e-05, "loss": 1.3877, "step": 350 },
    { "epoch": 0.3738019169329074, "grad_norm": 11.25, "learning_rate": 2.0838631348769142e-05, "loss": 1.6139, "step": 351 },
    { "epoch": 0.37486687965921195, "grad_norm": 8.6875, "learning_rate": 2.079237140131475e-05, "loss": 1.5881, "step": 352 },
    { "epoch": 0.3759318423855165, "grad_norm": 10.1875, "learning_rate": 2.0746046616614846e-05, "loss": 2.0595, "step": 353 },
    { "epoch": 0.3769968051118211, "grad_norm": 8.6875, "learning_rate": 2.0699657513208603e-05, "loss": 1.4865, "step": 354 },
    { "epoch": 0.3780617678381257, "grad_norm": 10.6875, "learning_rate": 2.065320461035513e-05, "loss": 1.3383, "step": 355 },
    { "epoch": 0.37912673056443025, "grad_norm": 9.6875, "learning_rate": 2.0606688428027708e-05, "loss": 1.683, "step": 356 },
    { "epoch": 0.3801916932907348, "grad_norm": 12.0625, "learning_rate": 2.0560109486907912e-05, "loss": 1.2894, "step": 357 },
    { "epoch": 0.3812566560170394, "grad_norm": 14.375, "learning_rate": 2.0513468308379826e-05, "loss": 1.491, "step": 358 },
    { "epoch": 0.38232161874334397, "grad_norm": 10.5625, "learning_rate": 2.046676541452419e-05, "loss": 1.4899, "step": 359 },
    { "epoch": 0.38338658146964855, "grad_norm": 9.0625, "learning_rate": 2.0420001328112558e-05, "loss": 1.5506, "step": 360 },
    { "epoch": 0.3844515441959531, "grad_norm": 11.6875, "learning_rate": 2.0373176572601443e-05, "loss": 1.9362, "step": 361 },
    { "epoch": 0.3855165069222577, "grad_norm": 11.0, "learning_rate": 2.032629167212647e-05, "loss": 1.6449, "step": 362 },
    { "epoch": 0.3865814696485623, "grad_norm": 12.0, "learning_rate": 2.0279347151496484e-05, "loss": 1.5654, "step": 363 },
    { "epoch": 0.3876464323748669, "grad_norm": 9.375, "learning_rate": 2.023234353618771e-05, "loss": 2.1523, "step": 364 },
    { "epoch": 0.3887113951011715, "grad_norm": 9.4375, "learning_rate": 2.0185281352337845e-05, "loss": 1.4507, "step": 365 },
    { "epoch": 0.38977635782747605, "grad_norm": 12.4375, "learning_rate": 2.0138161126740167e-05, "loss": 1.7295, "step": 366 },
    { "epoch": 0.3908413205537806, "grad_norm": 11.8125, "learning_rate": 2.0090983386837668e-05, "loss": 1.4096, "step": 367 },
    { "epoch": 0.3919062832800852, "grad_norm": 11.125, "learning_rate": 2.0043748660717107e-05, "loss": 1.7948, "step": 368 },
    { "epoch": 0.3929712460063898, "grad_norm": 12.1875, "learning_rate": 1.999645747710314e-05, "loss": 1.6431, "step": 369 },
    { "epoch": 0.39403620873269435, "grad_norm": 9.0, "learning_rate": 1.9949110365352377e-05, "loss": 1.8226, "step": 370 },
    { "epoch": 0.3951011714589989, "grad_norm": 14.875, "learning_rate": 1.990170785544745e-05, "loss": 0.9145, "step": 371 },
    { "epoch": 0.3961661341853035, "grad_norm": 8.6875, "learning_rate": 1.985425047799112e-05, "loss": 1.7916, "step": 372 },
    { "epoch": 0.3972310969116081, "grad_norm": 11.5, "learning_rate": 1.9806738764200293e-05, "loss": 1.5905, "step": 373 },
    { "epoch": 0.39829605963791265, "grad_norm": 9.5625, "learning_rate": 1.975917324590009e-05, "loss": 1.5794, "step": 374 },
    { "epoch": 0.3993610223642173, "grad_norm": 8.75, "learning_rate": 1.97115544555179e-05, "loss": 1.4684, "step": 375 },
    { "epoch": 0.40042598509052185, "grad_norm": 11.375, "learning_rate": 1.966388292607742e-05, "loss": 1.4792, "step": 376 },
    { "epoch": 0.4014909478168264, "grad_norm": 8.9375, "learning_rate": 1.961615919119268e-05, "loss": 2.0911, "step": 377 },
    { "epoch": 0.402555910543131, "grad_norm": 10.0625, "learning_rate": 1.9568383785062086e-05, "loss": 2.1202, "step": 378 },
    { "epoch": 0.4036208732694356, "grad_norm": 11.5, "learning_rate": 1.9520557242462412e-05, "loss": 1.7334, "step": 379 },
    { "epoch": 0.40468583599574015, "grad_norm": 8.625, "learning_rate": 1.9472680098742838e-05, "loss": 1.7019, "step": 380 },
    { "epoch": 0.4057507987220447, "grad_norm": 10.125, "learning_rate": 1.9424752889818956e-05, "loss": 1.5539, "step": 381 },
    { "epoch": 0.4068157614483493, "grad_norm": 10.875, "learning_rate": 1.9376776152166757e-05, "loss": 1.2953, "step": 382 },
    { "epoch": 0.4078807241746539, "grad_norm": 14.4375, "learning_rate": 1.932875042281664e-05, "loss": 1.4313, "step": 383 },
    { "epoch": 0.40894568690095845, "grad_norm": 10.25, "learning_rate": 1.9280676239347392e-05, "loss": 1.6241, "step": 384 },
    { "epoch": 0.410010649627263, "grad_norm": 9.0625, "learning_rate": 1.923255413988018e-05, "loss": 1.2076, "step": 385 },
    { "epoch": 0.4110756123535676, "grad_norm": 10.6875, "learning_rate": 1.9184384663072514e-05, "loss": 1.6646, "step": 386 },
    { "epoch": 0.41214057507987223, "grad_norm": 8.75, "learning_rate": 1.9136168348112236e-05, "loss": 1.8323, "step": 387 },
    { "epoch": 0.4132055378061768, "grad_norm": 9.6875, "learning_rate": 1.9087905734711457e-05, "loss": 1.985, "step": 388 },
    { "epoch": 0.4142705005324814, "grad_norm": 10.1875, "learning_rate": 1.9039597363100542e-05, "loss": 1.2271, "step": 389 },
    { "epoch": 0.41533546325878595, "grad_norm": 11.4375, "learning_rate": 1.8991243774022065e-05, "loss": 1.7998, "step": 390 },
    { "epoch": 0.4164004259850905, "grad_norm": 11.1875, "learning_rate": 1.894284550872472e-05, "loss": 1.7686, "step": 391 },
    { "epoch": 0.4174653887113951, "grad_norm": 12.0, "learning_rate": 1.8894403108957305e-05, "loss": 1.4314, "step": 392 },
    { "epoch": 0.4185303514376997, "grad_norm": 10.25, "learning_rate": 1.884591711696263e-05, "loss": 1.6033, "step": 393 },
    { "epoch": 0.41959531416400425, "grad_norm": 9.375, "learning_rate": 1.879738807547146e-05, "loss": 1.613, "step": 394 },
    { "epoch": 0.4206602768903088, "grad_norm": 11.8125, "learning_rate": 1.8748816527696443e-05, "loss": 1.1745, "step": 395 },
    { "epoch": 0.4217252396166134, "grad_norm": 10.375, "learning_rate": 1.8700203017326017e-05, "loss": 1.3751, "step": 396 },
    { "epoch": 0.422790202342918, "grad_norm": 12.8125, "learning_rate": 1.8651548088518328e-05,
"loss": 1.3624, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.42385516506922255, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 1.8602852285895148e-05, |
|
"loss": 1.8956, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.4249201277955272, |
|
"grad_norm": 10.5625, |
|
"learning_rate": 1.8554116154535774e-05, |
|
"loss": 1.5782, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.42598509052183176, |
|
"grad_norm": 10.5, |
|
"learning_rate": 1.850534023997092e-05, |
|
"loss": 0.929, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.42705005324813633, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 1.8456525088176608e-05, |
|
"loss": 1.9057, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.4281150159744409, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 1.8407671245568086e-05, |
|
"loss": 1.5854, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.4291799787007455, |
|
"grad_norm": 12.5625, |
|
"learning_rate": 1.8358779258993673e-05, |
|
"loss": 1.5715, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.43024494142705005, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.8309849675728654e-05, |
|
"loss": 1.5288, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.43130990415335463, |
|
"grad_norm": 10.0, |
|
"learning_rate": 1.8260883043469165e-05, |
|
"loss": 1.9417, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.4323748668796592, |
|
"grad_norm": 10.5, |
|
"learning_rate": 1.8211879910326044e-05, |
|
"loss": 1.8962, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.4334398296059638, |
|
"grad_norm": 11.0, |
|
"learning_rate": 1.8162840824818706e-05, |
|
"loss": 1.5027, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.43450479233226835, |
|
"grad_norm": 9.625, |
|
"learning_rate": 1.8113766335869004e-05, |
|
"loss": 1.9446, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.4355697550585729, |
|
"grad_norm": 12.1875, |
|
"learning_rate": 1.8064656992795076e-05, |
|
"loss": 1.857, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.4366347177848775, |
|
"grad_norm": 14.125, |
|
"learning_rate": 1.8015513345305205e-05, |
|
"loss": 1.7554, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.43769968051118213, |
|
"grad_norm": 9.25, |
|
"learning_rate": 1.7966335943491664e-05, |
|
"loss": 1.769, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.4387646432374867, |
|
"grad_norm": 10.125, |
|
"learning_rate": 1.7917125337824546e-05, |
|
"loss": 1.4604, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.4398296059637913, |
|
"grad_norm": 9.875, |
|
"learning_rate": 1.786788207914563e-05, |
|
"loss": 1.7425, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.44089456869009586, |
|
"grad_norm": 8.375, |
|
"learning_rate": 1.7818606718662193e-05, |
|
"loss": 1.5833, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.44195953141640043, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 1.7769299807940835e-05, |
|
"loss": 1.7705, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.443024494142705, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 1.771996189890133e-05, |
|
"loss": 1.8412, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.4440894568690096, |
|
"grad_norm": 10.125, |
|
"learning_rate": 1.7670593543810427e-05, |
|
"loss": 1.1687, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.44515441959531415, |
|
"grad_norm": 11.75, |
|
"learning_rate": 1.7621195295275668e-05, |
|
"loss": 1.7284, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.44621938232161873, |
|
"grad_norm": 10.875, |
|
"learning_rate": 1.757176770623922e-05, |
|
"loss": 1.5151, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.4472843450479233, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 1.752231132997167e-05, |
|
"loss": 1.9066, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.4483493077742279, |
|
"grad_norm": 11.4375, |
|
"learning_rate": 1.7472826720065833e-05, |
|
"loss": 1.4873, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.4494142705005325, |
|
"grad_norm": 11.0, |
|
"learning_rate": 1.7423314430430564e-05, |
|
"loss": 1.6894, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.4504792332268371, |
|
"grad_norm": 9.875, |
|
"learning_rate": 1.737377501528455e-05, |
|
"loss": 1.9422, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.45154419595314166, |
|
"grad_norm": 10.4375, |
|
"learning_rate": 1.7324209029150118e-05, |
|
"loss": 1.7035, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.45260915867944623, |
|
"grad_norm": 11.625, |
|
"learning_rate": 1.7274617026847e-05, |
|
"loss": 1.6268, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.4536741214057508, |
|
"grad_norm": 10.0, |
|
"learning_rate": 1.7224999563486163e-05, |
|
"loss": 1.6705, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.4547390841320554, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 1.7175357194463556e-05, |
|
"loss": 1.8731, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.45580404685835996, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 1.7125690475453915e-05, |
|
"loss": 1.6503, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.45686900958466453, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 1.7075999962404548e-05, |
|
"loss": 2.3099, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.4579339723109691, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 1.70262862115291e-05, |
|
"loss": 1.5789, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4589989350372737, |
|
"grad_norm": 10.0, |
|
"learning_rate": 1.697654977930132e-05, |
|
"loss": 1.6735, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.46006389776357826, |
|
"grad_norm": 11.375, |
|
"learning_rate": 1.6926791222448854e-05, |
|
"loss": 1.5729, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.46112886048988283, |
|
"grad_norm": 10.125, |
|
"learning_rate": 1.6877011097946995e-05, |
|
"loss": 1.8057, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.46219382321618746, |
|
"grad_norm": 15.0, |
|
"learning_rate": 1.6827209963012454e-05, |
|
"loss": 1.3789, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.46325878594249204, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 1.6777388375097133e-05, |
|
"loss": 1.6362, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.4643237486687966, |
|
"grad_norm": 12.625, |
|
"learning_rate": 1.6727546891881862e-05, |
|
"loss": 1.7457, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.4653887113951012, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 1.6677686071270175e-05, |
|
"loss": 1.6304, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.46645367412140576, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 1.6627806471382065e-05, |
|
"loss": 1.6158, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.46751863684771033, |
|
"grad_norm": 13.5625, |
|
"learning_rate": 1.6577908650547732e-05, |
|
"loss": 1.5116, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.4685835995740149, |
|
"grad_norm": 7.875, |
|
"learning_rate": 1.6527993167301322e-05, |
|
"loss": 1.8969, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.4696485623003195, |
|
"grad_norm": 10.5, |
|
"learning_rate": 1.64780605803747e-05, |
|
"loss": 1.4656, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.47071352502662406, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 1.6428111448691177e-05, |
|
"loss": 2.104, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.47177848775292863, |
|
"grad_norm": 8.875, |
|
"learning_rate": 1.6378146331359252e-05, |
|
"loss": 1.7346, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.4728434504792332, |
|
"grad_norm": 12.0, |
|
"learning_rate": 1.6328165787666368e-05, |
|
"loss": 1.3809, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.4739084132055378, |
|
"grad_norm": 12.375, |
|
"learning_rate": 1.627817037707265e-05, |
|
"loss": 1.9848, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.4749733759318424, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 1.6228160659204623e-05, |
|
"loss": 1.6315, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.476038338658147, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.6178137193848956e-05, |
|
"loss": 1.552, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.47710330138445156, |
|
"grad_norm": 13.0625, |
|
"learning_rate": 1.6128100540946227e-05, |
|
"loss": 1.4892, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.47816826411075614, |
|
"grad_norm": 9.75, |
|
"learning_rate": 1.607805126058461e-05, |
|
"loss": 0.9454, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.4792332268370607, |
|
"grad_norm": 12.8125, |
|
"learning_rate": 1.6027989912993635e-05, |
|
"loss": 1.8403, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4802981895633653, |
|
"grad_norm": 9.125, |
|
"learning_rate": 1.5977917058537893e-05, |
|
"loss": 1.659, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.48136315228966986, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 1.592783325771079e-05, |
|
"loss": 1.1204, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.48242811501597443, |
|
"grad_norm": 10.75, |
|
"learning_rate": 1.5877739071128266e-05, |
|
"loss": 1.4801, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.483493077742279, |
|
"grad_norm": 11.625, |
|
"learning_rate": 1.5827635059522496e-05, |
|
"loss": 1.6075, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.4845580404685836, |
|
"grad_norm": 12.125, |
|
"learning_rate": 1.577752178373564e-05, |
|
"loss": 1.7549, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.48562300319488816, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 1.572739980471357e-05, |
|
"loss": 1.3698, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.48668796592119273, |
|
"grad_norm": 10.375, |
|
"learning_rate": 1.567726968349956e-05, |
|
"loss": 2.1374, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.48775292864749736, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.5627131981228035e-05, |
|
"loss": 1.8397, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.48881789137380194, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.557698725911827e-05, |
|
"loss": 1.791, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.4898828541001065, |
|
"grad_norm": 12.375, |
|
"learning_rate": 1.5526836078468133e-05, |
|
"loss": 1.6973, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.4909478168264111, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 1.5476679000647777e-05, |
|
"loss": 1.6784, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.49201277955271566, |
|
"grad_norm": 12.5, |
|
"learning_rate": 1.5426516587093348e-05, |
|
"loss": 1.6424, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.49307774227902024, |
|
"grad_norm": 15.8125, |
|
"learning_rate": 1.5376349399300748e-05, |
|
"loss": 1.3521, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.4941427050053248, |
|
"grad_norm": 9.25, |
|
"learning_rate": 1.53261779988193e-05, |
|
"loss": 1.5164, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.4952076677316294, |
|
"grad_norm": 15.625, |
|
"learning_rate": 1.5276002947245486e-05, |
|
"loss": 1.602, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.49627263045793396, |
|
"grad_norm": 8.25, |
|
"learning_rate": 1.5225824806216662e-05, |
|
"loss": 1.6166, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.49733759318423854, |
|
"grad_norm": 10.25, |
|
"learning_rate": 1.5175644137404763e-05, |
|
"loss": 1.9271, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.4984025559105431, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 1.5125461502510014e-05, |
|
"loss": 1.8779, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.4994675186368477, |
|
"grad_norm": 15.875, |
|
"learning_rate": 1.5075277463254655e-05, |
|
"loss": 1.2732, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.5005324813631523, |
|
"grad_norm": 13.0, |
|
"learning_rate": 1.5025092581376643e-05, |
|
"loss": 1.4483, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5015974440894568, |
|
"grad_norm": 11.9375, |
|
"learning_rate": 1.4974907418623361e-05, |
|
"loss": 1.5915, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.5026624068157615, |
|
"grad_norm": 13.4375, |
|
"learning_rate": 1.4924722536745351e-05, |
|
"loss": 1.5104, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.503727369542066, |
|
"grad_norm": 11.375, |
|
"learning_rate": 1.4874538497489989e-05, |
|
"loss": 1.7744, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.5047923322683706, |
|
"grad_norm": 16.625, |
|
"learning_rate": 1.4824355862595245e-05, |
|
"loss": 1.3057, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.5058572949946751, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 1.477417519378334e-05, |
|
"loss": 1.3718, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.5069222577209798, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 1.472399705275452e-05, |
|
"loss": 2.0112, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.5079872204472844, |
|
"grad_norm": 9.75, |
|
"learning_rate": 1.4673822001180703e-05, |
|
"loss": 1.5658, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.5090521831735889, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 1.4623650600699254e-05, |
|
"loss": 1.7109, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.5101171458998935, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 1.4573483412906653e-05, |
|
"loss": 1.5009, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.5111821086261981, |
|
"grad_norm": 9.375, |
|
"learning_rate": 1.4523320999352228e-05, |
|
"loss": 1.5631, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5122470713525027, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 1.4473163921531868e-05, |
|
"loss": 1.5273, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.5133120340788072, |
|
"grad_norm": 12.8125, |
|
"learning_rate": 1.4423012740881726e-05, |
|
"loss": 1.4918, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.5143769968051118, |
|
"grad_norm": 13.375, |
|
"learning_rate": 1.4372868018771971e-05, |
|
"loss": 1.6232, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.5154419595314164, |
|
"grad_norm": 8.125, |
|
"learning_rate": 1.4322730316500444e-05, |
|
"loss": 1.8868, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.516506922257721, |
|
"grad_norm": 10.125, |
|
"learning_rate": 1.4272600195286437e-05, |
|
"loss": 0.9475, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.5175718849840255, |
|
"grad_norm": 8.875, |
|
"learning_rate": 1.422247821626436e-05, |
|
"loss": 1.9696, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.5186368477103301, |
|
"grad_norm": 11.625, |
|
"learning_rate": 1.4172364940477512e-05, |
|
"loss": 1.8617, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.5197018104366348, |
|
"grad_norm": 11.6875, |
|
"learning_rate": 1.4122260928871737e-05, |
|
"loss": 1.7146, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.5207667731629393, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 1.4072166742289206e-05, |
|
"loss": 1.7215, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.5218317358892439, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 1.402208294146211e-05, |
|
"loss": 1.9426, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5228966986155484, |
|
"grad_norm": 13.375, |
|
"learning_rate": 1.3972010087006364e-05, |
|
"loss": 1.5129, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.5239616613418531, |
|
"grad_norm": 8.125, |
|
"learning_rate": 1.392194873941539e-05, |
|
"loss": 1.8564, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.5250266240681576, |
|
"grad_norm": 9.5, |
|
"learning_rate": 1.3871899459053769e-05, |
|
"loss": 1.674, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.5260915867944622, |
|
"grad_norm": 10.125, |
|
"learning_rate": 1.3821862806151046e-05, |
|
"loss": 1.5183, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.5271565495207667, |
|
"grad_norm": 9.625, |
|
"learning_rate": 1.3771839340795383e-05, |
|
"loss": 1.9528, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.5282215122470714, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.3721829622927354e-05, |
|
"loss": 1.8296, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.5292864749733759, |
|
"grad_norm": 11.375, |
|
"learning_rate": 1.3671834212333633e-05, |
|
"loss": 1.5489, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.5303514376996805, |
|
"grad_norm": 9.25, |
|
"learning_rate": 1.362185366864075e-05, |
|
"loss": 1.6517, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.531416400425985, |
|
"grad_norm": 17.25, |
|
"learning_rate": 1.3571888551308827e-05, |
|
"loss": 1.4663, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.5324813631522897, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 1.3521939419625304e-05, |
|
"loss": 1.661, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5335463258785943, |
|
"grad_norm": 9.0, |
|
"learning_rate": 1.347200683269868e-05, |
|
"loss": 1.3956, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.5346112886048988, |
|
"grad_norm": 10.25, |
|
"learning_rate": 1.3422091349452269e-05, |
|
"loss": 1.6099, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.5356762513312034, |
|
"grad_norm": 8.375, |
|
"learning_rate": 1.3372193528617936e-05, |
|
"loss": 1.6831, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.536741214057508, |
|
"grad_norm": 9.625, |
|
"learning_rate": 1.3322313928729824e-05, |
|
"loss": 1.2729, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.5378061767838126, |
|
"grad_norm": 9.0, |
|
"learning_rate": 1.3272453108118142e-05, |
|
"loss": 1.8599, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.5388711395101171, |
|
"grad_norm": 15.4375, |
|
"learning_rate": 1.322261162490287e-05, |
|
"loss": 1.7929, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.5399361022364217, |
|
"grad_norm": 8.125, |
|
"learning_rate": 1.3172790036987545e-05, |
|
"loss": 1.5644, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.5410010649627263, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 1.3122988902053007e-05, |
|
"loss": 1.4376, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.5420660276890309, |
|
"grad_norm": 10.5, |
|
"learning_rate": 1.3073208777551152e-05, |
|
"loss": 1.4278, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.5431309904153354, |
|
"grad_norm": 9.0, |
|
"learning_rate": 1.3023450220698683e-05, |
|
"loss": 1.926, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.54419595314164, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 1.2973713788470907e-05, |
|
"loss": 1.7574, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.5452609158679447, |
|
"grad_norm": 9.25, |
|
"learning_rate": 1.2924000037595453e-05, |
|
"loss": 2.1237, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.5463258785942492, |
|
"grad_norm": 14.5, |
|
"learning_rate": 1.2874309524546085e-05, |
|
"loss": 1.6285, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.5473908413205538, |
|
"grad_norm": 11.75, |
|
"learning_rate": 1.282464280553645e-05, |
|
"loss": 1.606, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.5484558040468583, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 1.277500043651384e-05, |
|
"loss": 1.4522, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.549520766773163, |
|
"grad_norm": 12.75, |
|
"learning_rate": 1.2725382973153003e-05, |
|
"loss": 1.6253, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.5505857294994675, |
|
"grad_norm": 9.875, |
|
"learning_rate": 1.2675790970849885e-05, |
|
"loss": 1.8144, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.5516506922257721, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.2626224984715451e-05, |
|
"loss": 1.6275, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.5527156549520766, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 1.2576685569569438e-05, |
|
"loss": 1.7244, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.5537806176783813, |
|
"grad_norm": 10.5, |
|
"learning_rate": 1.2527173279934173e-05, |
|
"loss": 1.7188, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5548455804046858, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 1.2477688670028331e-05, |
|
"loss": 1.5563, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.5559105431309904, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 1.2428232293760784e-05, |
|
"loss": 1.9121, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.556975505857295, |
|
"grad_norm": 8.375, |
|
"learning_rate": 1.2378804704724331e-05, |
|
"loss": 1.4518, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.5580404685835996, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 1.2329406456189574e-05, |
|
"loss": 1.3654, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.5591054313099042, |
|
"grad_norm": 8.625, |
|
"learning_rate": 1.2280038101098671e-05, |
|
"loss": 1.6945, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.5601703940362087, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 1.2230700192059162e-05, |
|
"loss": 1.3019, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.5612353567625133, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 1.218139328133781e-05, |
|
"loss": 2.0614, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.5623003194888179, |
|
"grad_norm": 12.125, |
|
"learning_rate": 1.213211792085437e-05, |
|
"loss": 1.4044, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.5633652822151225, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 1.208287466217546e-05, |
|
"loss": 1.6953, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.564430244941427, |
|
"grad_norm": 7.9375, |
|
"learning_rate": 1.203366405650834e-05, |
|
"loss": 1.8574, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5654952076677316, |
|
"grad_norm": 9.125, |
|
"learning_rate": 1.19844866546948e-05, |
|
"loss": 1.4788, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.5665601703940362, |
|
"grad_norm": 10.75, |
|
"learning_rate": 1.1935343007204925e-05, |
|
"loss": 1.8245, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.5676251331203408, |
|
"grad_norm": 8.625, |
|
"learning_rate": 1.1886233664130999e-05, |
|
"loss": 1.5421, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.5686900958466453, |
|
"grad_norm": 11.6875, |
|
"learning_rate": 1.1837159175181296e-05, |
|
"loss": 1.1918, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.56975505857295, |
|
"grad_norm": 13.75, |
|
"learning_rate": 1.1788120089673963e-05, |
|
"loss": 1.3899, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.5708200212992546, |
|
"grad_norm": 12.0, |
|
"learning_rate": 1.1739116956530839e-05, |
|
"loss": 1.389, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.5718849840255591, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 1.1690150324271345e-05, |
|
"loss": 1.7149, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.5729499467518637, |
|
"grad_norm": 12.875, |
|
"learning_rate": 1.1641220741006331e-05, |
|
"loss": 1.951, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.5740149094781682, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.1592328754431911e-05, |
|
"loss": 1.2455, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.5750798722044729, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 1.1543474911823391e-05, |
|
"loss": 1.3843, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5761448349307774, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 1.1494659760029085e-05, |
|
"loss": 1.5953, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.577209797657082, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 1.1445883845464229e-05, |
|
"loss": 1.5503, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.5782747603833865, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 1.1397147714104853e-05, |
|
"loss": 2.1053, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.5793397231096912, |
|
"grad_norm": 9.375, |
|
"learning_rate": 1.134845191148168e-05, |
|
"loss": 1.7168, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.5804046858359957, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 1.1299796982673988e-05, |
|
"loss": 1.7581, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.5814696485623003, |
|
"grad_norm": 8.875, |
|
"learning_rate": 1.1251183472303562e-05, |
|
"loss": 1.5051, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.582534611288605, |
|
"grad_norm": 9.0, |
|
"learning_rate": 1.120261192452854e-05, |
|
"loss": 1.747, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.5835995740149095, |
|
"grad_norm": 8.25, |
|
"learning_rate": 1.1154082883037371e-05, |
|
"loss": 1.3058, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.5846645367412141, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 1.1105596891042699e-05, |
|
"loss": 1.606, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.5857294994675186, |
|
"grad_norm": 10.3125, |
|
"learning_rate": 1.1057154491275281e-05, |
|
"loss": 1.5261, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5867944621938233, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 1.1008756225977936e-05, |
|
"loss": 1.6968, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.5878594249201278, |
|
"grad_norm": 9.5, |
|
"learning_rate": 1.0960402636899457e-05, |
|
"loss": 1.1682, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.5889243876464324, |
|
"grad_norm": 10.125, |
|
"learning_rate": 1.091209426528855e-05, |
|
"loss": 1.745, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.5899893503727369, |
|
"grad_norm": 13.875, |
|
"learning_rate": 1.0863831651887768e-05, |
|
"loss": 1.3198, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.5910543130990416, |
|
"grad_norm": 12.5625, |
|
"learning_rate": 1.081561533692749e-05, |
|
"loss": 1.7697, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.5921192758253461, |
|
"grad_norm": 18.5, |
|
"learning_rate": 1.0767445860119822e-05, |
|
"loss": 1.9429, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.5931842385516507, |
|
"grad_norm": 10.125, |
|
"learning_rate": 1.0719323760652612e-05, |
|
"loss": 1.7703, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.5942492012779552, |
|
"grad_norm": 7.78125, |
|
"learning_rate": 1.0671249577183364e-05, |
|
"loss": 1.782, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.5953141640042598, |
|
"grad_norm": 10.3125, |
|
"learning_rate": 1.062322384783325e-05, |
|
"loss": 1.6762, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.5963791267305645, |
|
"grad_norm": 9.625, |
|
"learning_rate": 1.0575247110181048e-05, |
|
"loss": 1.4526, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.597444089456869, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 1.0527319901257161e-05, |
|
"loss": 1.9204, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.5985090521831736, |
|
"grad_norm": 9.125, |
|
"learning_rate": 1.047944275753759e-05, |
|
"loss": 0.9891, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.5995740149094781, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 1.0431616214937911e-05, |
|
"loss": 1.7877, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.6006389776357828, |
|
"grad_norm": 9.125, |
|
"learning_rate": 1.038384080880732e-05, |
|
"loss": 1.9058, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.6017039403620873, |
|
"grad_norm": 12.6875, |
|
"learning_rate": 1.033611707392258e-05, |
|
"loss": 1.8118, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.6027689030883919, |
|
"grad_norm": 10.25, |
|
"learning_rate": 1.0288445544482105e-05, |
|
"loss": 1.3733, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.6038338658146964, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 1.0240826754099914e-05, |
|
"loss": 1.5815, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.6048988285410011, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.0193261235799713e-05, |
|
"loss": 1.7796, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.6059637912673056, |
|
"grad_norm": 18.375, |
|
"learning_rate": 1.0145749522008881e-05, |
|
"loss": 1.5525, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.6070287539936102, |
|
"grad_norm": 10.5625, |
|
"learning_rate": 1.009829214455255e-05, |
|
"loss": 1.6614, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6080937167199149, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 1.0050889634647629e-05, |
|
"loss": 1.5261, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.6091586794462194, |
|
"grad_norm": 11.875, |
|
"learning_rate": 1.0003542522896859e-05, |
|
"loss": 1.3385, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.610223642172524, |
|
"grad_norm": 10.0, |
|
"learning_rate": 9.956251339282895e-06, |
|
"loss": 1.8288, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.6112886048988285, |
|
"grad_norm": 13.125, |
|
"learning_rate": 9.909016613162334e-06, |
|
"loss": 2.1504, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.6123535676251332, |
|
"grad_norm": 10.4375, |
|
"learning_rate": 9.861838873259835e-06, |
|
"loss": 1.5278, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.6134185303514377, |
|
"grad_norm": 10.25, |
|
"learning_rate": 9.814718647662158e-06, |
|
"loss": 1.5204, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.6144834930777423, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 9.767656463812292e-06, |
|
"loss": 1.3771, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.6155484558040468, |
|
"grad_norm": 9.625, |
|
"learning_rate": 9.720652848503519e-06, |
|
"loss": 1.5326, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.6166134185303515, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 9.673708327873535e-06, |
|
"loss": 2.0136, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.617678381256656, |
|
"grad_norm": 9.875, |
|
"learning_rate": 9.62682342739856e-06, |
|
"loss": 1.7406, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6187433439829606, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 9.57999867188745e-06, |
|
"loss": 1.8908, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.6198083067092651, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 9.533234585475814e-06, |
|
"loss": 1.565, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.6208732694355698, |
|
"grad_norm": 16.0, |
|
"learning_rate": 9.486531691620182e-06, |
|
"loss": 1.4218, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.6219382321618744, |
|
"grad_norm": 10.375, |
|
"learning_rate": 9.439890513092092e-06, |
|
"loss": 1.4015, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.6230031948881789, |
|
"grad_norm": 9.75, |
|
"learning_rate": 9.393311571972293e-06, |
|
"loss": 1.678, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.6240681576144835, |
|
"grad_norm": 10.25, |
|
"learning_rate": 9.34679538964487e-06, |
|
"loss": 1.8353, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.625133120340788, |
|
"grad_norm": 11.5, |
|
"learning_rate": 9.300342486791401e-06, |
|
"loss": 1.9182, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.6261980830670927, |
|
"grad_norm": 11.375, |
|
"learning_rate": 9.253953383385158e-06, |
|
"loss": 1.0814, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.6272630457933972, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 9.207628598685253e-06, |
|
"loss": 1.3614, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.6283280085197018, |
|
"grad_norm": 11.9375, |
|
"learning_rate": 9.161368651230862e-06, |
|
"loss": 1.6114, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6293929712460063, |
|
"grad_norm": 11.875, |
|
"learning_rate": 9.115174058835386e-06, |
|
"loss": 1.4484, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.630457933972311, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 9.069045338580684e-06, |
|
"loss": 1.6065, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.6315228966986155, |
|
"grad_norm": 9.125, |
|
"learning_rate": 9.02298300681125e-06, |
|
"loss": 1.6245, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.6325878594249201, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 8.976987579128486e-06, |
|
"loss": 1.2913, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.6336528221512248, |
|
"grad_norm": 9.625, |
|
"learning_rate": 8.931059570384864e-06, |
|
"loss": 1.6383, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.6347177848775293, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 8.88519949467823e-06, |
|
"loss": 1.5182, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.6357827476038339, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 8.839407865345999e-06, |
|
"loss": 1.2956, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.6368477103301384, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 8.79368519495943e-06, |
|
"loss": 2.0363, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.6379126730564431, |
|
"grad_norm": 8.5, |
|
"learning_rate": 8.748031995317887e-06, |
|
"loss": 1.6898, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.6389776357827476, |
|
"grad_norm": 10.5625, |
|
"learning_rate": 8.702448777443115e-06, |
|
"loss": 1.6978, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6400425985090522, |
|
"grad_norm": 10.125, |
|
"learning_rate": 8.656936051573505e-06, |
|
"loss": 1.6635, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.6411075612353567, |
|
"grad_norm": 11.625, |
|
"learning_rate": 8.611494327158398e-06, |
|
"loss": 1.2385, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.6421725239616614, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 8.56612411285238e-06, |
|
"loss": 1.6086, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.6432374866879659, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 8.520825916509557e-06, |
|
"loss": 1.5494, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.6443024494142705, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 8.475600245177951e-06, |
|
"loss": 1.6019, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.645367412140575, |
|
"grad_norm": 9.0, |
|
"learning_rate": 8.430447605093707e-06, |
|
"loss": 1.9869, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.6464323748668797, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 8.385368501675551e-06, |
|
"loss": 2.1738, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.6474973375931843, |
|
"grad_norm": 8.875, |
|
"learning_rate": 8.340363439519021e-06, |
|
"loss": 1.7293, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.6485623003194888, |
|
"grad_norm": 9.125, |
|
"learning_rate": 8.295432922390905e-06, |
|
"loss": 1.8153, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.6496272630457934, |
|
"grad_norm": 11.0, |
|
"learning_rate": 8.250577453223561e-06, |
|
"loss": 1.2735, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.650692225772098, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 8.205797534109265e-06, |
|
"loss": 1.5776, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.6517571884984026, |
|
"grad_norm": 12.6875, |
|
"learning_rate": 8.161093666294664e-06, |
|
"loss": 1.2921, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.6528221512247071, |
|
"grad_norm": 8.875, |
|
"learning_rate": 8.116466350175079e-06, |
|
"loss": 1.8095, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.6538871139510117, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 8.071916085288981e-06, |
|
"loss": 1.9529, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.6549520766773163, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 8.027443370312326e-06, |
|
"loss": 1.6288, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.6560170394036209, |
|
"grad_norm": 11.375, |
|
"learning_rate": 7.983048703053055e-06, |
|
"loss": 1.9875, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.6570820021299254, |
|
"grad_norm": 10.875, |
|
"learning_rate": 7.938732580445422e-06, |
|
"loss": 1.6334, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.65814696485623, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 7.894495498544551e-06, |
|
"loss": 1.6786, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.6592119275825347, |
|
"grad_norm": 9.625, |
|
"learning_rate": 7.850337952520763e-06, |
|
"loss": 1.6683, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.6602768903088392, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 7.806260436654116e-06, |
|
"loss": 2.3544, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6613418530351438, |
|
"grad_norm": 10.5625, |
|
"learning_rate": 7.762263444328856e-06, |
|
"loss": 1.3748, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.6624068157614483, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 7.71834746802785e-06, |
|
"loss": 1.307, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.663471778487753, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 7.674512999327149e-06, |
|
"loss": 1.7112, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.6645367412140575, |
|
"grad_norm": 9.25, |
|
"learning_rate": 7.630760528890403e-06, |
|
"loss": 1.7411, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.6656017039403621, |
|
"grad_norm": 8.625, |
|
"learning_rate": 7.587090546463447e-06, |
|
"loss": 1.5324, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 7.5435035408687504e-06, |
|
"loss": 1.5691, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.6677316293929713, |
|
"grad_norm": 9.75, |
|
"learning_rate": 7.500000000000004e-06, |
|
"loss": 1.6352, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.6687965921192758, |
|
"grad_norm": 9.125, |
|
"learning_rate": 7.456580410816604e-06, |
|
"loss": 1.5628, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.6698615548455804, |
|
"grad_norm": 10.125, |
|
"learning_rate": 7.41324525933825e-06, |
|
"loss": 1.5523, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.670926517571885, |
|
"grad_norm": 11.0, |
|
"learning_rate": 7.3699950306394715e-06, |
|
"loss": 1.6238, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6719914802981896, |
|
"grad_norm": 11.0, |
|
"learning_rate": 7.3268302088442125e-06, |
|
"loss": 1.9907, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.6730564430244942, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 7.283751277120427e-06, |
|
"loss": 1.7915, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.6741214057507987, |
|
"grad_norm": 10.0, |
|
"learning_rate": 7.2407587176746146e-06, |
|
"loss": 1.6269, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.6751863684771033, |
|
"grad_norm": 10.75, |
|
"learning_rate": 7.197853011746506e-06, |
|
"loss": 1.2853, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.6762513312034079, |
|
"grad_norm": 12.5, |
|
"learning_rate": 7.1550346396035975e-06, |
|
"loss": 1.6906, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.6773162939297125, |
|
"grad_norm": 9.625, |
|
"learning_rate": 7.112304080535827e-06, |
|
"loss": 1.7357, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.678381256656017, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 7.069661812850188e-06, |
|
"loss": 1.548, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.6794462193823216, |
|
"grad_norm": 12.375, |
|
"learning_rate": 7.027108313865379e-06, |
|
"loss": 1.9071, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.6805111821086262, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 6.984644059906461e-06, |
|
"loss": 1.6673, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.6815761448349308, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 6.942269526299527e-06, |
|
"loss": 1.4935, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6826411075612353, |
|
"grad_norm": 12.375, |
|
"learning_rate": 6.899985187366376e-06, |
|
"loss": 2.1448, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.6837060702875399, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 6.857791516419212e-06, |
|
"loss": 1.418, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.6847710330138446, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 6.815688985755341e-06, |
|
"loss": 1.5875, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.6858359957401491, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 6.773678066651881e-06, |
|
"loss": 1.8208, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.6869009584664537, |
|
"grad_norm": 12.1875, |
|
"learning_rate": 6.731759229360494e-06, |
|
"loss": 1.4882, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.6879659211927582, |
|
"grad_norm": 11.9375, |
|
"learning_rate": 6.6899329431021215e-06, |
|
"loss": 1.7406, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.6890308839190629, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 6.648199676061724e-06, |
|
"loss": 1.717, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.6900958466453674, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 6.606559895383051e-06, |
|
"loss": 1.633, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.691160809371672, |
|
"grad_norm": 10.5, |
|
"learning_rate": 6.5650140671634e-06, |
|
"loss": 1.0872, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.6922257720979765, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 6.523562656448417e-06, |
|
"loss": 1.5847, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6932907348242812, |
|
"grad_norm": 9.75, |
|
"learning_rate": 6.4822061272268696e-06, |
|
"loss": 0.9841, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.6943556975505857, |
|
"grad_norm": 8.125, |
|
"learning_rate": 6.440944942425469e-06, |
|
"loss": 1.7512, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.6954206602768903, |
|
"grad_norm": 12.125, |
|
"learning_rate": 6.399779563903683e-06, |
|
"loss": 1.3146, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.6964856230031949, |
|
"grad_norm": 9.375, |
|
"learning_rate": 6.358710452448566e-06, |
|
"loss": 1.6926, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.6975505857294995, |
|
"grad_norm": 8.375, |
|
"learning_rate": 6.317738067769599e-06, |
|
"loss": 1.4995, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.6986155484558041, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 6.2768628684935496e-06, |
|
"loss": 1.5856, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.6996805111821086, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 6.236085312159335e-06, |
|
"loss": 1.4653, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.7007454739084132, |
|
"grad_norm": 11.125, |
|
"learning_rate": 6.195405855212896e-06, |
|
"loss": 1.8539, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.7018104366347178, |
|
"grad_norm": 10.4375, |
|
"learning_rate": 6.154824953002098e-06, |
|
"loss": 1.4134, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.7028753993610224, |
|
"grad_norm": 8.625, |
|
"learning_rate": 6.114343059771625e-06, |
|
"loss": 1.6557, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7039403620873269, |
|
"grad_norm": 10.125, |
|
"learning_rate": 6.073960628657896e-06, |
|
"loss": 1.4981, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.7050053248136315, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 6.033678111684001e-06, |
|
"loss": 1.9017, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.7060702875399361, |
|
"grad_norm": 10.0, |
|
"learning_rate": 5.9934959597546315e-06, |
|
"loss": 1.7662, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.7071352502662407, |
|
"grad_norm": 8.875, |
|
"learning_rate": 5.953414622651037e-06, |
|
"loss": 1.5215, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.7082002129925452, |
|
"grad_norm": 11.0, |
|
"learning_rate": 5.913434549025989e-06, |
|
"loss": 1.6729, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.7092651757188498, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 5.873556186398771e-06, |
|
"loss": 1.7875, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.7103301384451545, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 5.833779981150133e-06, |
|
"loss": 1.8491, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.711395101171459, |
|
"grad_norm": 10.375, |
|
"learning_rate": 5.7941063785173535e-06, |
|
"loss": 1.6454, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.7124600638977636, |
|
"grad_norm": 12.0, |
|
"learning_rate": 5.754535822589197e-06, |
|
"loss": 1.5219, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.7135250266240681, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 5.715068756300985e-06, |
|
"loss": 1.9882, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7145899893503728, |
|
"grad_norm": 11.0, |
|
"learning_rate": 5.675705621429611e-06, |
|
"loss": 1.5543, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.7156549520766773, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 5.636446858588611e-06, |
|
"loss": 1.4343, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.7167199148029819, |
|
"grad_norm": 8.875, |
|
"learning_rate": 5.597292907223229e-06, |
|
"loss": 1.7156, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.7177848775292864, |
|
"grad_norm": 10.375, |
|
"learning_rate": 5.55824420560548e-06, |
|
"loss": 1.8069, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.7188498402555911, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 5.51930119082929e-06, |
|
"loss": 2.1811, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.7199148029818956, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 5.480464298805539e-06, |
|
"loss": 1.5952, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.7209797657082002, |
|
"grad_norm": 11.75, |
|
"learning_rate": 5.441733964257246e-06, |
|
"loss": 1.7754, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.7220447284345048, |
|
"grad_norm": 8.625, |
|
"learning_rate": 5.403110620714647e-06, |
|
"loss": 1.4991, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.7231096911608094, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 5.3645947005103874e-06, |
|
"loss": 1.5927, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.724174653887114, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 5.326186634774654e-06, |
|
"loss": 1.3676, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.7252396166134185, |
|
"grad_norm": 13.4375, |
|
"learning_rate": 5.287886853430362e-06, |
|
"loss": 1.5596, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.7263045793397231, |
|
"grad_norm": 9.25, |
|
"learning_rate": 5.249695785188338e-06, |
|
"loss": 1.8652, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.7273695420660277, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 5.21161385754251e-06, |
|
"loss": 1.7813, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.7284345047923323, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 5.173641496765163e-06, |
|
"loss": 1.2195, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.7294994675186368, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 5.135779127902103e-06, |
|
"loss": 1.5703, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.7305644302449414, |
|
"grad_norm": 10.25, |
|
"learning_rate": 5.098027174767972e-06, |
|
"loss": 1.623, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.731629392971246, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 5.0603860599414324e-06, |
|
"loss": 1.9181, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.7326943556975506, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 5.022856204760504e-06, |
|
"loss": 1.7919, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.7337593184238551, |
|
"grad_norm": 9.375, |
|
"learning_rate": 4.98543802931778e-06, |
|
"loss": 1.8822, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.7348242811501597, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 4.948131952455802e-06, |
|
"loss": 1.8503, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.7358892438764644, |
|
"grad_norm": 10.25, |
|
"learning_rate": 4.910938391762287e-06, |
|
"loss": 1.9597, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.7369542066027689, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 4.873857763565523e-06, |
|
"loss": 1.5532, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.7380191693290735, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 4.8368904829296816e-06, |
|
"loss": 1.6786, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.739084132055378, |
|
"grad_norm": 12.0, |
|
"learning_rate": 4.800036963650147e-06, |
|
"loss": 1.7628, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.7401490947816827, |
|
"grad_norm": 13.0625, |
|
"learning_rate": 4.7632976182489475e-06, |
|
"loss": 1.1223, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.7412140575079872, |
|
"grad_norm": 12.1875, |
|
"learning_rate": 4.726672857970059e-06, |
|
"loss": 1.4561, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.7422790202342918, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 4.690163092774878e-06, |
|
"loss": 1.4758, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.7433439829605963, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 4.65376873133757e-06, |
|
"loss": 1.442, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.744408945686901, |
|
"grad_norm": 10.1875, |
|
"learning_rate": 4.617490181040536e-06, |
|
"loss": 1.5644, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.7454739084132055, |
|
"grad_norm": 12.4375, |
|
"learning_rate": 4.581327847969832e-06, |
|
"loss": 1.5819, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7465388711395101, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 4.545282136910635e-06, |
|
"loss": 1.7544, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.7476038338658147, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 4.509353451342704e-06, |
|
"loss": 1.518, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.7486687965921193, |
|
"grad_norm": 11.625, |
|
"learning_rate": 4.4735421934358625e-06, |
|
"loss": 1.843, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.7497337593184239, |
|
"grad_norm": 10.5, |
|
"learning_rate": 4.437848764045515e-06, |
|
"loss": 2.0874, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.7507987220447284, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 4.402273562708119e-06, |
|
"loss": 1.7729, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.751863684771033, |
|
"grad_norm": 10.875, |
|
"learning_rate": 4.366816987636777e-06, |
|
"loss": 2.045, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.7529286474973376, |
|
"grad_norm": 10.25, |
|
"learning_rate": 4.3314794357167e-06, |
|
"loss": 1.1957, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.7539936102236422, |
|
"grad_norm": 11.75, |
|
"learning_rate": 4.2962613025008365e-06, |
|
"loss": 1.3515, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.7550585729499467, |
|
"grad_norm": 9.25, |
|
"learning_rate": 4.2611629822054035e-06, |
|
"loss": 1.5657, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.7561235356762513, |
|
"grad_norm": 8.125, |
|
"learning_rate": 4.226184867705487e-06, |
|
"loss": 1.4376, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.7571884984025559, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.1913273505306385e-06, |
|
"loss": 1.6607, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.7582534611288605, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 4.156590820860506e-06, |
|
"loss": 1.5126, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.759318423855165, |
|
"grad_norm": 11.5, |
|
"learning_rate": 4.121975667520446e-06, |
|
"loss": 1.7676, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.7603833865814696, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 4.087482277977188e-06, |
|
"loss": 1.7903, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.7614483493077743, |
|
"grad_norm": 11.625, |
|
"learning_rate": 4.0531110383344906e-06, |
|
"loss": 1.6722, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.7625133120340788, |
|
"grad_norm": 10.125, |
|
"learning_rate": 4.018862333328819e-06, |
|
"loss": 1.6057, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.7635782747603834, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 3.984736546325043e-06, |
|
"loss": 1.7429, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.7646432374866879, |
|
"grad_norm": 12.875, |
|
"learning_rate": 3.9507340593121385e-06, |
|
"loss": 1.6328, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.7657082002129926, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 3.916855252898917e-06, |
|
"loss": 1.5328, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.7667731629392971, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 3.883100506309763e-06, |
|
"loss": 1.5779, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.7678381256656017, |
|
"grad_norm": 10.875, |
|
"learning_rate": 3.849470197380397e-06, |
|
"loss": 1.3155, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.7689030883919062, |
|
"grad_norm": 8.875, |
|
"learning_rate": 3.815964702553632e-06, |
|
"loss": 1.9236, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.7699680511182109, |
|
"grad_norm": 9.75, |
|
"learning_rate": 3.7825843968751665e-06, |
|
"loss": 1.295, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.7710330138445154, |
|
"grad_norm": 9.125, |
|
"learning_rate": 3.749329653989393e-06, |
|
"loss": 1.4935, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.77209797657082, |
|
"grad_norm": 12.0, |
|
"learning_rate": 3.7162008461352055e-06, |
|
"loss": 1.1767, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.7731629392971247, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 3.6831983441418366e-06, |
|
"loss": 2.067, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.7742279020234292, |
|
"grad_norm": 10.5625, |
|
"learning_rate": 3.650322517424708e-06, |
|
"loss": 1.853, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.7752928647497338, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 3.6175737339812968e-06, |
|
"loss": 1.5211, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.7763578274760383, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 3.584952360387009e-06, |
|
"loss": 1.5024, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.777422790202343, |
|
"grad_norm": 10.25, |
|
"learning_rate": 3.5524587617910844e-06, |
|
"loss": 1.3889, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.7784877529286475, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 3.520093301912505e-06, |
|
"loss": 1.665, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.7795527156549521, |
|
"grad_norm": 9.125, |
|
"learning_rate": 3.4878563430359246e-06, |
|
"loss": 1.5885, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.7806176783812566, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 3.4557482460076145e-06, |
|
"loss": 1.747, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.7816826411075612, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 3.4237693702314215e-06, |
|
"loss": 1.6977, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.7827476038338658, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 3.3919200736647476e-06, |
|
"loss": 2.0445, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.7838125665601704, |
|
"grad_norm": 8.5, |
|
"learning_rate": 3.3602007128145485e-06, |
|
"loss": 1.1119, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.784877529286475, |
|
"grad_norm": 7.90625, |
|
"learning_rate": 3.328611642733316e-06, |
|
"loss": 1.547, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.7859424920127795, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 3.297153217015155e-06, |
|
"loss": 1.6143, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.7870074547390842, |
|
"grad_norm": 14.0625, |
|
"learning_rate": 3.265825787791774e-06, |
|
"loss": 1.0104, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.7880724174653887, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 3.234629705728571e-06, |
|
"loss": 1.8247, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.7891373801916933, |
|
"grad_norm": 8.5, |
|
"learning_rate": 3.203565320020701e-06, |
|
"loss": 1.6653, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.7902023429179978, |
|
"grad_norm": 8.625, |
|
"learning_rate": 3.1726329783891688e-06, |
|
"loss": 1.3578, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.7912673056443025, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 3.1418330270769375e-06, |
|
"loss": 1.3851, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.792332268370607, |
|
"grad_norm": 11.75, |
|
"learning_rate": 3.1111658108450465e-06, |
|
"loss": 1.4159, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.7933972310969116, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 3.080631672968769e-06, |
|
"loss": 1.7524, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.7944621938232161, |
|
"grad_norm": 12.8125, |
|
"learning_rate": 3.050230955233733e-06, |
|
"loss": 1.5065, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.7955271565495208, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 3.019963997932157e-06, |
|
"loss": 1.6124, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.7965921192758253, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 2.9898311398589674e-06, |
|
"loss": 1.5844, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.7976570820021299, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 2.959832718308077e-06, |
|
"loss": 1.0975, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.7987220447284346, |
|
"grad_norm": 11.8125, |
|
"learning_rate": 2.929969069068539e-06, |
|
"loss": 1.8824, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.7997870074547391, |
|
"grad_norm": 11.375, |
|
"learning_rate": 2.900240526420861e-06, |
|
"loss": 1.56, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.8008519701810437, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 2.8706474231332064e-06, |
|
"loss": 1.9439, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.8019169329073482, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 2.8411900904576916e-06, |
|
"loss": 1.373, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.8029818956336529, |
|
"grad_norm": 12.6875, |
|
"learning_rate": 2.811868858126686e-06, |
|
"loss": 1.4297, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.8040468583599574, |
|
"grad_norm": 11.875, |
|
"learning_rate": 2.7826840543490932e-06, |
|
"loss": 1.7362, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.805111821086262, |
|
"grad_norm": 9.25, |
|
"learning_rate": 2.753636005806725e-06, |
|
"loss": 1.8195, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.8061767838125665, |
|
"grad_norm": 12.5, |
|
"learning_rate": 2.7247250376505823e-06, |
|
"loss": 1.6419, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.8072417465388712, |
|
"grad_norm": 15.625, |
|
"learning_rate": 2.69595147349728e-06, |
|
"loss": 1.4818, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.8083067092651757, |
|
"grad_norm": 12.8125, |
|
"learning_rate": 2.667315635425366e-06, |
|
"loss": 1.4912, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.8093716719914803, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 2.6388178439717696e-06, |
|
"loss": 1.8378, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.8104366347177849, |
|
"grad_norm": 11.0, |
|
"learning_rate": 2.610458418128158e-06, |
|
"loss": 0.8448, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.8115015974440895, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 2.5822376753374215e-06, |
|
"loss": 1.8725, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.8125665601703941, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 2.554155931490085e-06, |
|
"loss": 1.6764, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.8136315228966986, |
|
"grad_norm": 17.0, |
|
"learning_rate": 2.526213500920766e-06, |
|
"loss": 1.4876, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.8146964856230032, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 2.498410696404698e-06, |
|
"loss": 1.7144, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.8157614483493077, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 2.47074782915417e-06, |
|
"loss": 1.6642, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.8168264110756124, |
|
"grad_norm": 8.875, |
|
"learning_rate": 2.443225208815111e-06, |
|
"loss": 1.9021, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.8178913738019169, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 2.4158431434635525e-06, |
|
"loss": 1.4165, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.8189563365282215, |
|
"grad_norm": 12.375, |
|
"learning_rate": 2.388601939602246e-06, |
|
"loss": 0.9623, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.820021299254526, |
|
"grad_norm": 10.625, |
|
"learning_rate": 2.3615019021571798e-06, |
|
"loss": 1.8855, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.8210862619808307, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 2.3345433344741984e-06, |
|
"loss": 1.9843, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.8221512247071352, |
|
"grad_norm": 11.125, |
|
"learning_rate": 2.3077265383155937e-06, |
|
"loss": 1.8292, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.8232161874334398, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 2.281051813856732e-06, |
|
"loss": 1.7709, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.8242811501597445, |
|
"grad_norm": 11.4375, |
|
"learning_rate": 2.2545194596826867e-06, |
|
"loss": 1.6551, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.825346112886049, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 2.2281297727849042e-06, |
|
"loss": 2.0816, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.8264110756123536, |
|
"grad_norm": 10.375, |
|
"learning_rate": 2.201883048557885e-06, |
|
"loss": 1.7194, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.8274760383386581, |
|
"grad_norm": 11.375, |
|
"learning_rate": 2.175779580795848e-06, |
|
"loss": 1.5737, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.8285410010649628, |
|
"grad_norm": 11.875, |
|
"learning_rate": 2.1498196616894867e-06, |
|
"loss": 1.7461, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.8296059637912673, |
|
"grad_norm": 8.5, |
|
"learning_rate": 2.1240035818226546e-06, |
|
"loss": 0.9217, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.8306709265175719, |
|
"grad_norm": 12.6875, |
|
"learning_rate": 2.09833163016914e-06, |
|
"loss": 1.483, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.8317358892438764, |
|
"grad_norm": 14.8125, |
|
"learning_rate": 2.0728040940894277e-06, |
|
"loss": 1.5439, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.832800851970181, |
|
"grad_norm": 9.375, |
|
"learning_rate": 2.047421259327472e-06, |
|
"loss": 1.7316, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.8338658146964856, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 2.0221834100075086e-06, |
|
"loss": 0.6799, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.8349307774227902, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 1.99709082863087e-06, |
|
"loss": 1.2157, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.8359957401490948, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 1.9721437960728183e-06, |
|
"loss": 1.783, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.8370607028753994, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 1.9473425915794108e-06, |
|
"loss": 2.1057, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.838125665601704, |
|
"grad_norm": 7.9375, |
|
"learning_rate": 1.922687492764379e-06, |
|
"loss": 1.8039, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.8391906283280085, |
|
"grad_norm": 7.90625, |
|
"learning_rate": 1.8981787756059933e-06, |
|
"loss": 1.6254, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.8402555910543131, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 1.8738167144440026e-06, |
|
"loss": 1.3783, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.8413205537806177, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 1.8496015819765548e-06, |
|
"loss": 1.6897, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.8423855165069223, |
|
"grad_norm": 11.0, |
|
"learning_rate": 1.8255336492571394e-06, |
|
"loss": 1.3614, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.8434504792332268, |
|
"grad_norm": 7.78125, |
|
"learning_rate": 1.8016131856915608e-06, |
|
"loss": 1.8157, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.8445154419595314, |
|
"grad_norm": 14.75, |
|
"learning_rate": 1.7778404590349135e-06, |
|
"loss": 1.3761, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.845580404685836, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 1.754215735388595e-06, |
|
"loss": 1.9235, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.8466453674121406, |
|
"grad_norm": 20.875, |
|
"learning_rate": 1.7307392791973204e-06, |
|
"loss": 1.0706, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.8477103301384451, |
|
"grad_norm": 14.9375, |
|
"learning_rate": 1.7074113532461644e-06, |
|
"loss": 1.6008, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.8487752928647497, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 1.6842322186576208e-06, |
|
"loss": 1.3219, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.8498402555910544, |
|
"grad_norm": 12.125, |
|
"learning_rate": 1.6612021348886775e-06, |
|
"loss": 1.831, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.8509052183173589, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 1.6383213597279146e-06, |
|
"loss": 1.5059, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.8519701810436635, |
|
"grad_norm": 9.625, |
|
"learning_rate": 1.615590149292618e-06, |
|
"loss": 1.371, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.853035143769968, |
|
"grad_norm": 10.25, |
|
"learning_rate": 1.5930087580259089e-06, |
|
"loss": 1.5541, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.8541001064962727, |
|
"grad_norm": 9.875, |
|
"learning_rate": 1.5705774386939027e-06, |
|
"loss": 1.5982, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.8551650692225772, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 1.5482964423828738e-06, |
|
"loss": 1.4818, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.8562300319488818, |
|
"grad_norm": 12.1875, |
|
"learning_rate": 1.5261660184964488e-06, |
|
"loss": 1.4705, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.8572949946751863, |
|
"grad_norm": 13.5625, |
|
"learning_rate": 1.50418641475281e-06, |
|
"loss": 1.1884, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.858359957401491, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 1.4823578771819308e-06, |
|
"loss": 1.7592, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.8594249201277955, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 1.4606806501228098e-06, |
|
"loss": 1.8618, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.8604898828541001, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 1.439154976220753e-06, |
|
"loss": 1.872, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.8615548455804047, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.417781096424629e-06, |
|
"loss": 1.7073, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.8626198083067093, |
|
"grad_norm": 15.625, |
|
"learning_rate": 1.3965592499842133e-06, |
|
"loss": 1.5956, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.8636847710330139, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 1.3754896744474704e-06, |
|
"loss": 1.6528, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.8647497337593184, |
|
"grad_norm": 9.0, |
|
"learning_rate": 1.3545726056579199e-06, |
|
"loss": 1.6145, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.865814696485623, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 1.3338082777519822e-06, |
|
"loss": 2.4227, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.8668796592119276, |
|
"grad_norm": 14.4375, |
|
"learning_rate": 1.31319692315637e-06, |
|
"loss": 1.1167, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.8679446219382322, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 1.2927387725854761e-06, |
|
"loss": 1.7869, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.8690095846645367, |
|
"grad_norm": 9.5, |
|
"learning_rate": 1.2724340550387963e-06, |
|
"loss": 1.538, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.8700745473908413, |
|
"grad_norm": 9.0, |
|
"learning_rate": 1.2522829977983691e-06, |
|
"loss": 1.8242, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.8711395101171459, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 1.2322858264262133e-06, |
|
"loss": 1.3644, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.8722044728434505, |
|
"grad_norm": 10.125, |
|
"learning_rate": 1.2124427647618392e-06, |
|
"loss": 1.755, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.873269435569755, |
|
"grad_norm": 10.5625, |
|
"learning_rate": 1.1927540349196986e-06, |
|
"loss": 1.7762, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.8743343982960596, |
|
"grad_norm": 12.0, |
|
"learning_rate": 1.173219857286742e-06, |
|
"loss": 1.2137, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.8753993610223643, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 1.1538404505199102e-06, |
|
"loss": 1.9446, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.8764643237486688, |
|
"grad_norm": 9.0, |
|
"learning_rate": 1.1346160315437282e-06, |
|
"loss": 1.9131, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.8775292864749734, |
|
"grad_norm": 8.75, |
|
"learning_rate": 1.1155468155478387e-06, |
|
"loss": 1.6933, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.8785942492012779, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 1.096633015984621e-06, |
|
"loss": 1.6843, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.8796592119275826, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 1.0778748445667907e-06, |
|
"loss": 1.0095, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.8807241746538871, |
|
"grad_norm": 9.625, |
|
"learning_rate": 1.0592725112650204e-06, |
|
"loss": 1.5605, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.8817891373801917, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 1.040826224305616e-06, |
|
"loss": 1.4877, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.8828541001064962, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 1.022536190168153e-06, |
|
"loss": 1.7958, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.8839190628328009, |
|
"grad_norm": 9.875, |
|
"learning_rate": 1.0044026135832018e-06, |
|
"loss": 1.2038, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.8849840255591054, |
|
"grad_norm": 13.0, |
|
"learning_rate": 9.864256975299912e-07, |
|
"loss": 1.5369, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.88604898828541, |
|
"grad_norm": 10.5, |
|
"learning_rate": 9.686056432341872e-07, |
|
"loss": 1.8342, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.8871139510117146, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 9.509426501655921e-07, |
|
"loss": 1.683, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.8881789137380192, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 9.334369160359463e-07, |
|
"loss": 1.6419, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.8892438764643238, |
|
"grad_norm": 8.875, |
|
"learning_rate": 9.16088636796708e-07, |
|
"loss": 1.6393, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.8903088391906283, |
|
"grad_norm": 15.0, |
|
"learning_rate": 8.988980066368357e-07, |
|
"loss": 1.4805, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.8913738019169329, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 8.818652179806591e-07, |
|
"loss": 1.5142, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.8924387646432375, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 8.649904614856746e-07, |
|
"loss": 1.1254, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.8935037273695421, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 8.482739260404604e-07, |
|
"loss": 1.5179, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.8945686900958466, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 8.317157987625146e-07, |
|
"loss": 1.254, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.8956336528221512, |
|
"grad_norm": 12.125, |
|
"learning_rate": 8.153162649962054e-07, |
|
"loss": 1.655, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.8966986155484558, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 7.99075508310661e-07, |
|
"loss": 1.5181, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.8977635782747604, |
|
"grad_norm": 10.4375, |
|
"learning_rate": 7.829937104977347e-07, |
|
"loss": 1.6125, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.898828541001065, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 7.670710515699647e-07, |
|
"loss": 1.4513, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.8998935037273695, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 7.513077097585558e-07, |
|
"loss": 1.2881, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.9009584664536742, |
|
"grad_norm": 10.75, |
|
"learning_rate": 7.357038615113959e-07, |
|
"loss": 1.9436, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.9020234291799787, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 7.202596814910561e-07, |
|
"loss": 1.6863, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.9030883919062833, |
|
"grad_norm": 11.0, |
|
"learning_rate": 7.049753425728723e-07, |
|
"loss": 1.4833, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.9041533546325878, |
|
"grad_norm": 10.3125, |
|
"learning_rate": 6.89851015842971e-07, |
|
"loss": 1.5175, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.9052183173588925, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 6.74886870596389e-07, |
|
"loss": 1.6937, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.906283280085197, |
|
"grad_norm": 8.5, |
|
"learning_rate": 6.600830743351482e-07, |
|
"loss": 1.3861, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.9073482428115016, |
|
"grad_norm": 9.75, |
|
"learning_rate": 6.454397927664035e-07, |
|
"loss": 1.7092, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.9084132055378061, |
|
"grad_norm": 12.25, |
|
"learning_rate": 6.309571898005784e-07, |
|
"loss": 1.4632, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.9094781682641108, |
|
"grad_norm": 11.0, |
|
"learning_rate": 6.166354275495284e-07, |
|
"loss": 1.6058, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.9105431309904153, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 6.02474666324731e-07, |
|
"loss": 1.6195, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.9116080937167199, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 5.884750646354903e-07, |
|
"loss": 1.9317, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.9126730564430245, |
|
"grad_norm": 9.75, |
|
"learning_rate": 5.746367791871582e-07, |
|
"loss": 1.5125, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.9137380191693291, |
|
"grad_norm": 11.0, |
|
"learning_rate": 5.609599648793878e-07, |
|
"loss": 1.8878, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.9148029818956337, |
|
"grad_norm": 8.875, |
|
"learning_rate": 5.474447748043931e-07, |
|
"loss": 1.4361, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.9158679446219382, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 5.340913602452385e-07, |
|
"loss": 1.5905, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.9169329073482428, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 5.208998706741469e-07, |
|
"loss": 1.8177, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.9179978700745474, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 5.078704537508194e-07, |
|
"loss": 1.1538, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.919062832800852, |
|
"grad_norm": 9.625, |
|
"learning_rate": 4.950032553207934e-07, |
|
"loss": 0.8619, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.9201277955271565, |
|
"grad_norm": 10.5, |
|
"learning_rate": 4.822984194138003e-07, |
|
"loss": 1.3963, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.9211927582534611, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 4.6975608824215866e-07, |
|
"loss": 2.0253, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.9222577209797657, |
|
"grad_norm": 11.125, |
|
"learning_rate": 4.5737640219917885e-07, |
|
"loss": 1.6444, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.9233226837060703, |
|
"grad_norm": 15.4375, |
|
"learning_rate": 4.451594998575975e-07, |
|
"loss": 1.637, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.9243876464323749, |
|
"grad_norm": 11.25, |
|
"learning_rate": 4.331055179680188e-07, |
|
"loss": 1.6452, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.9254526091586794, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 4.212145914573906e-07, |
|
"loss": 1.1187, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.9265175718849841, |
|
"grad_norm": 9.5, |
|
"learning_rate": 4.0948685342748595e-07, |
|
"loss": 1.5924, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.9275825346112886, |
|
"grad_norm": 9.125, |
|
"learning_rate": 3.9792243515342387e-07, |
|
"loss": 1.5951, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.9286474973375932, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 3.865214660821892e-07, |
|
"loss": 1.6178, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.9297124600638977, |
|
"grad_norm": 12.25, |
|
"learning_rate": 3.7528407383119355e-07, |
|
"loss": 1.6325, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.9307774227902024, |
|
"grad_norm": 12.625, |
|
"learning_rate": 3.642103841868383e-07, |
|
"loss": 1.4013, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.9318423855165069, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 3.533005211031104e-07, |
|
"loss": 1.7131, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.9329073482428115, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 3.4255460670019723e-07, |
|
"loss": 1.6834, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.933972310969116, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 3.3197276126311404e-07, |
|
"loss": 1.8832, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.9350372736954207, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 3.2155510324036354e-07, |
|
"loss": 1.5813, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.9361022364217252, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.1130174924260345e-07, |
|
"loss": 1.7615, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.9371671991480298, |
|
"grad_norm": 9.75, |
|
"learning_rate": 3.012128140413495e-07, |
|
"loss": 1.9889, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.9382321618743344, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 2.9128841056767943e-07, |
|
"loss": 1.8032, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.939297124600639, |
|
"grad_norm": 9.25, |
|
"learning_rate": 2.815286499109826e-07, |
|
"loss": 1.287, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.9403620873269436, |
|
"grad_norm": 11.875, |
|
"learning_rate": 2.719336413177076e-07, |
|
"loss": 1.4744, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.9414270500532481, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 2.6250349219013813e-07, |
|
"loss": 1.9576, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.9424920127795527, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 2.5323830808519575e-07, |
|
"loss": 1.7786, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.9435569755058573, |
|
"grad_norm": 10.3125, |
|
"learning_rate": 2.4413819271325576e-07, |
|
"loss": 1.5379, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.9446219382321619, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 2.3520324793698977e-07, |
|
"loss": 1.546, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.9456869009584664, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 2.2643357377022166e-07, |
|
"loss": 1.8081, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.946751863684771, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 2.1782926837680518e-07, |
|
"loss": 1.6002, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.9478168264110756, |
|
"grad_norm": 10.5, |
|
"learning_rate": 2.09390428069533e-07, |
|
"loss": 1.2529, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.9488817891373802, |
|
"grad_norm": 10.0, |
|
"learning_rate": 2.0111714730905783e-07, |
|
"loss": 1.8403, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.9499467518636848, |
|
"grad_norm": 12.1875, |
|
"learning_rate": 1.9300951870282136e-07, |
|
"loss": 2.0224, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.9510117145899893, |
|
"grad_norm": 12.8125, |
|
"learning_rate": 1.850676330040385e-07, |
|
"loss": 2.1045, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.952076677316294, |
|
"grad_norm": 8.875, |
|
"learning_rate": 1.7729157911066994e-07, |
|
"loss": 1.7303, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.9531416400425985, |
|
"grad_norm": 10.3125, |
|
"learning_rate": 1.6968144406442288e-07, |
|
"loss": 1.1697, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.9542066027689031, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 1.6223731304978838e-07, |
|
"loss": 1.6273, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.9552715654952076, |
|
"grad_norm": 13.3125, |
|
"learning_rate": 1.549592693930757e-07, |
|
"loss": 1.2158, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.9563365282215123, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 1.4784739456149442e-07, |
|
"loss": 1.3284, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.9574014909478168, |
|
"grad_norm": 14.8125, |
|
"learning_rate": 1.4090176816222211e-07, |
|
"loss": 1.3294, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.9584664536741214, |
|
"grad_norm": 9.875, |
|
"learning_rate": 1.3412246794153481e-07, |
|
"loss": 1.8441, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9595314164004259, |
|
"grad_norm": 12.25, |
|
"learning_rate": 1.2750956978392124e-07, |
|
"loss": 1.4858, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.9605963791267306, |
|
"grad_norm": 10.125, |
|
"learning_rate": 1.2106314771124171e-07, |
|
"loss": 1.1116, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.9616613418530351, |
|
"grad_norm": 12.25, |
|
"learning_rate": 1.1478327388189547e-07, |
|
"loss": 1.6537, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.9627263045793397, |
|
"grad_norm": 9.5, |
|
"learning_rate": 1.0867001859001801e-07, |
|
"loss": 2.0677, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.9637912673056444, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 1.0272345026468177e-07, |
|
"loss": 1.1597, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.9648562300319489, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 9.694363546914664e-08, |
|
"loss": 1.4244, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.9659211927582535, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 9.133063890010729e-08, |
|
"loss": 1.0121, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.966986155484558, |
|
"grad_norm": 8.125, |
|
"learning_rate": 8.588452338696206e-08, |
|
"loss": 1.5185, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.9680511182108626, |
|
"grad_norm": 9.75, |
|
"learning_rate": 8.060534989112688e-08, |
|
"loss": 1.5174, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.9691160809371672, |
|
"grad_norm": 11.3125, |
|
"learning_rate": 7.549317750533246e-08, |
|
"loss": 1.413, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.9701810436634718, |
|
"grad_norm": 8.875, |
|
"learning_rate": 7.054806345297815e-08, |
|
"loss": 1.8943, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.9712460063897763, |
|
"grad_norm": 9.625, |
|
"learning_rate": 6.577006308748579e-08, |
|
"loss": 1.5478, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.972310969116081, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 6.115922989167855e-08, |
|
"loss": 1.7652, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.9733759318423855, |
|
"grad_norm": 9.625, |
|
"learning_rate": 5.6715615477188064e-08, |
|
"loss": 1.5531, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.9744408945686901, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 5.243926958386658e-08, |
|
"loss": 1.4704, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.9755058572949947, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 4.833024007924236e-08, |
|
"loss": 1.9338, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.9765708200212992, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 4.438857295797516e-08, |
|
"loss": 1.3931, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.9776357827476039, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 4.0614312341346604e-08, |
|
"loss": 1.7876, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.9787007454739084, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 3.7007500476757274e-08, |
|
"loss": 1.7527, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.979765708200213, |
|
"grad_norm": 10.125, |
|
"learning_rate": 3.356817773727039e-08, |
|
"loss": 1.3077, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.9808306709265175, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 3.02963826211422e-08, |
|
"loss": 1.8316, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.9818956336528222, |
|
"grad_norm": 14.375, |
|
"learning_rate": 2.7192151751400662e-08, |
|
"loss": 1.0442, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.9829605963791267, |
|
"grad_norm": 13.375, |
|
"learning_rate": 2.4255519875434062e-08, |
|
"loss": 1.6917, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.9840255591054313, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 2.1486519864604703e-08, |
|
"loss": 1.544, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.9850905218317358, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 1.8885182713870853e-08, |
|
"loss": 1.7003, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.9861554845580405, |
|
"grad_norm": 13.125, |
|
"learning_rate": 1.6451537541453677e-08, |
|
"loss": 1.3597, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.987220447284345, |
|
"grad_norm": 10.25, |
|
"learning_rate": 1.4185611588500847e-08, |
|
"loss": 1.5915, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.9882854100106496, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 1.2087430218786776e-08, |
|
"loss": 1.6792, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.9893503727369543, |
|
"grad_norm": 10.0, |
|
"learning_rate": 1.0157016918426188e-08, |
|
"loss": 1.3929, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.9904153354632588, |
|
"grad_norm": 8.5, |
|
"learning_rate": 8.39439329561098e-09, |
|
"loss": 1.5281, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.9914802981895634, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 6.799579080372098e-09, |
|
"loss": 1.5907, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.9925452609158679, |
|
"grad_norm": 9.0, |
|
"learning_rate": 5.372592124354703e-09, |
|
"loss": 1.7476, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.9936102236421726, |
|
"grad_norm": 10.375, |
|
"learning_rate": 4.113448400621667e-09, |
|
"loss": 1.5377, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.9946751863684771, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 3.0221620034687203e-09, |
|
"loss": 1.26, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.9957401490947817, |
|
"grad_norm": 9.625, |
|
"learning_rate": 2.0987451482762376e-09, |
|
"loss": 1.2999, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.9968051118210862, |
|
"grad_norm": 9.9375, |
|
"learning_rate": 1.3432081713626865e-09, |
|
"loss": 1.7814, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.9978700745473909, |
|
"grad_norm": 11.125, |
|
"learning_rate": 7.555595298747165e-10, |
|
"loss": 1.4365, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.9989350372736954, |
|
"grad_norm": 13.875, |
|
"learning_rate": 3.3580580169223494e-10, |
|
"loss": 1.4393, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 37.5, |
|
"learning_rate": 8.395168535180187e-11, |
|
"loss": 1.3917, |
|
"step": 939 |
|
}, |
|
{
"epoch": 1.0,
"step": 939,
"total_flos": 1.9193025942808166e+17,
"train_loss": 1.628197498976613,
"train_runtime": 2231.2795,
"train_samples_per_second": 6.728,
"train_steps_per_second": 0.421
}
],
"logging_steps": 1.0,
"max_steps": 939,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.9193025942808166e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}
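
This file appears to be the trainer_state.json that transformers.Trainer serializes at the end of a run, so each per-step record in log_history carries epoch, grad_norm, learning_rate, loss, and step. As a minimal sketch (not part of the state file itself) of how such a log can be inspected, the snippet below loads it and plots the loss curve next to the learning-rate schedule; the file path and the use of matplotlib are assumptions for illustration only.

```python
import json

import matplotlib.pyplot as plt

# Load the serialized trainer state (the path is an assumption for this sketch).
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step records; the end-of-run summary entry has no "loss" key.
records = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in records]
losses = [entry["loss"] for entry in records]
lrs = [entry["learning_rate"] for entry in records]

# Two stacked panels: raw training loss on top, learning-rate schedule below.
fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True)
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_yscale("log")  # the recorded schedule spans roughly 3e-05 down to 8e-11
ax_lr.set_xlabel("step")
fig.tight_layout()
plt.show()
```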