|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.928, |
|
"eval_steps": 500, |
|
"global_step": 310, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 2.3387899124656393, |
|
"learning_rate": 0.0, |
|
"loss": 1.1321, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 2.2739175729395282, |
|
"learning_rate": 3.2258064516129035e-07, |
|
"loss": 0.9759, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 3.2333006230979437, |
|
"learning_rate": 6.451612903225807e-07, |
|
"loss": 1.0141, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 2.350357276823565, |
|
"learning_rate": 9.67741935483871e-07, |
|
"loss": 0.9947, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.1839271581694755, |
|
"learning_rate": 1.2903225806451614e-06, |
|
"loss": 1.023, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 2.153599307977408, |
|
"learning_rate": 1.6129032258064516e-06, |
|
"loss": 0.9609, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 2.1555129140427067, |
|
"learning_rate": 1.935483870967742e-06, |
|
"loss": 0.9622, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 2.008278114649717, |
|
"learning_rate": 2.2580645161290324e-06, |
|
"loss": 0.9711, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 1.9242182990110015, |
|
"learning_rate": 2.580645161290323e-06, |
|
"loss": 0.9673, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.750252408219067, |
|
"learning_rate": 2.903225806451613e-06, |
|
"loss": 0.9438, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 1.6939070396617073, |
|
"learning_rate": 3.225806451612903e-06, |
|
"loss": 0.8756, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 1.2834939133740737, |
|
"learning_rate": 3.548387096774194e-06, |
|
"loss": 0.9823, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 1.4367519479529964, |
|
"learning_rate": 3.870967741935484e-06, |
|
"loss": 0.9596, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 1.3144994907230427, |
|
"learning_rate": 4.193548387096774e-06, |
|
"loss": 1.0521, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.0689628463958285, |
|
"learning_rate": 4.516129032258065e-06, |
|
"loss": 0.938, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 1.2021712620582392, |
|
"learning_rate": 4.838709677419355e-06, |
|
"loss": 1.0955, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 1.2539951248917753, |
|
"learning_rate": 5.161290322580646e-06, |
|
"loss": 0.9417, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 1.4053233581800462, |
|
"learning_rate": 5.483870967741935e-06, |
|
"loss": 0.9956, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 1.0278616263126648, |
|
"learning_rate": 5.806451612903226e-06, |
|
"loss": 0.7439, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.256128877382818, |
|
"learning_rate": 6.129032258064517e-06, |
|
"loss": 0.9542, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.9487549369343321, |
|
"learning_rate": 6.451612903225806e-06, |
|
"loss": 0.9816, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 1.0891896971338342, |
|
"learning_rate": 6.774193548387097e-06, |
|
"loss": 1.0704, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 1.1660258370107575, |
|
"learning_rate": 7.096774193548388e-06, |
|
"loss": 0.8931, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.9415137340919071, |
|
"learning_rate": 7.4193548387096784e-06, |
|
"loss": 0.8683, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.0798354285638785, |
|
"learning_rate": 7.741935483870968e-06, |
|
"loss": 1.0478, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.8096142802637235, |
|
"learning_rate": 8.064516129032258e-06, |
|
"loss": 0.8466, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.7447999799600428, |
|
"learning_rate": 8.387096774193549e-06, |
|
"loss": 0.7486, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.8621800122661538, |
|
"learning_rate": 8.70967741935484e-06, |
|
"loss": 0.9581, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.7940321530754582, |
|
"learning_rate": 9.03225806451613e-06, |
|
"loss": 0.8867, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.8496750615573763, |
|
"learning_rate": 9.35483870967742e-06, |
|
"loss": 0.9179, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.8798601594137285, |
|
"learning_rate": 9.67741935483871e-06, |
|
"loss": 0.8834, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.8818390794513048, |
|
"learning_rate": 1e-05, |
|
"loss": 0.8952, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 0.8322934845468171, |
|
"learning_rate": 9.999683023724021e-06, |
|
"loss": 1.029, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.7007834456821566, |
|
"learning_rate": 9.998732135085665e-06, |
|
"loss": 0.8438, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6137020406246253, |
|
"learning_rate": 9.99714745464859e-06, |
|
"loss": 0.8435, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.7231796555332799, |
|
"learning_rate": 9.994929183335237e-06, |
|
"loss": 0.9646, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 0.6180598082755802, |
|
"learning_rate": 9.992077602401358e-06, |
|
"loss": 0.9616, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.6567886708960547, |
|
"learning_rate": 9.988593073400354e-06, |
|
"loss": 0.8522, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.7454067800699115, |
|
"learning_rate": 9.984476038137437e-06, |
|
"loss": 0.9052, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6714316518124167, |
|
"learning_rate": 9.979727018613607e-06, |
|
"loss": 0.8103, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.6229146249540587, |
|
"learning_rate": 9.974346616959476e-06, |
|
"loss": 0.8401, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.6448840489157996, |
|
"learning_rate": 9.968335515358916e-06, |
|
"loss": 0.9144, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.528060268374605, |
|
"learning_rate": 9.961694475962562e-06, |
|
"loss": 0.8348, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.5902102953939741, |
|
"learning_rate": 9.954424340791195e-06, |
|
"loss": 0.7721, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5526442876969153, |
|
"learning_rate": 9.94652603162896e-06, |
|
"loss": 0.8438, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.5331236922634145, |
|
"learning_rate": 9.938000549906509e-06, |
|
"loss": 0.8023, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.6339240906385807, |
|
"learning_rate": 9.92884897657402e-06, |
|
"loss": 0.894, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.5283804877111584, |
|
"learning_rate": 9.919072471964146e-06, |
|
"loss": 0.9069, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 0.5170169653511614, |
|
"learning_rate": 9.908672275644898e-06, |
|
"loss": 0.8051, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.4763529065504662, |
|
"learning_rate": 9.897649706262474e-06, |
|
"loss": 0.8301, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.4890239660294139, |
|
"learning_rate": 9.88600616137407e-06, |
|
"loss": 0.8629, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.37392633165922323, |
|
"learning_rate": 9.873743117270691e-06, |
|
"loss": 0.8343, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.5097372006252004, |
|
"learning_rate": 9.860862128789954e-06, |
|
"loss": 0.9025, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.5331140780823508, |
|
"learning_rate": 9.847364829118963e-06, |
|
"loss": 0.7832, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5371583880849503, |
|
"learning_rate": 9.833252929587231e-06, |
|
"loss": 0.7241, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.45993890737280607, |
|
"learning_rate": 9.818528219449705e-06, |
|
"loss": 0.8964, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 0.4715847632418153, |
|
"learning_rate": 9.803192565659898e-06, |
|
"loss": 0.8623, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.4942244137822645, |
|
"learning_rate": 9.78724791263318e-06, |
|
"loss": 0.7657, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 0.45536932211394177, |
|
"learning_rate": 9.770696282000245e-06, |
|
"loss": 0.8766, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.4891485391099054, |
|
"learning_rate": 9.753539772350792e-06, |
|
"loss": 0.8621, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.45887131278496746, |
|
"learning_rate": 9.735780558967434e-06, |
|
"loss": 0.7718, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.5002555111850944, |
|
"learning_rate": 9.717420893549902e-06, |
|
"loss": 0.8746, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5002555111850944, |
|
"learning_rate": 9.698463103929542e-06, |
|
"loss": 0.7591, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.016, |
|
"grad_norm": 0.7921009597496743, |
|
"learning_rate": 9.67890959377418e-06, |
|
"loss": 0.8141, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.032, |
|
"grad_norm": 0.5263404719063407, |
|
"learning_rate": 9.658762842283343e-06, |
|
"loss": 0.8, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.048, |
|
"grad_norm": 0.6027486092631968, |
|
"learning_rate": 9.638025403873939e-06, |
|
"loss": 0.8869, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.064, |
|
"grad_norm": 0.5040803182249076, |
|
"learning_rate": 9.616699907856368e-06, |
|
"loss": 0.7434, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.4794307353380867, |
|
"learning_rate": 9.594789058101154e-06, |
|
"loss": 0.7162, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.096, |
|
"grad_norm": 0.519708365022811, |
|
"learning_rate": 9.57229563269612e-06, |
|
"loss": 0.9109, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.112, |
|
"grad_norm": 0.5405956328678668, |
|
"learning_rate": 9.549222483594154e-06, |
|
"loss": 0.7813, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.1280000000000001, |
|
"grad_norm": 0.5713243651929747, |
|
"learning_rate": 9.525572536251608e-06, |
|
"loss": 0.7776, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.144, |
|
"grad_norm": 0.7593604265555369, |
|
"learning_rate": 9.501348789257373e-06, |
|
"loss": 0.7817, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.5104218019933199, |
|
"learning_rate": 9.476554313952697e-06, |
|
"loss": 0.7754, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.176, |
|
"grad_norm": 0.4357678519578492, |
|
"learning_rate": 9.451192254041759e-06, |
|
"loss": 0.7828, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.192, |
|
"grad_norm": 0.48992867265594076, |
|
"learning_rate": 9.425265825193077e-06, |
|
"loss": 0.7823, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.208, |
|
"grad_norm": 0.4744625173124652, |
|
"learning_rate": 9.398778314631801e-06, |
|
"loss": 0.7303, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.224, |
|
"grad_norm": 0.5598627796833753, |
|
"learning_rate": 9.371733080722911e-06, |
|
"loss": 0.7444, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.4312343972662455, |
|
"learning_rate": 9.34413355254542e-06, |
|
"loss": 0.836, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.256, |
|
"grad_norm": 0.624677329344369, |
|
"learning_rate": 9.31598322945759e-06, |
|
"loss": 0.8577, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.272, |
|
"grad_norm": 0.49859023374542194, |
|
"learning_rate": 9.287285680653254e-06, |
|
"loss": 0.7683, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.288, |
|
"grad_norm": 0.6015424998788388, |
|
"learning_rate": 9.258044544709276e-06, |
|
"loss": 0.8227, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.304, |
|
"grad_norm": 0.44003554877608686, |
|
"learning_rate": 9.228263529124199e-06, |
|
"loss": 0.7863, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.5609641970743867, |
|
"learning_rate": 9.197946409848196e-06, |
|
"loss": 0.8774, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.336, |
|
"grad_norm": 0.43143646632622484, |
|
"learning_rate": 9.167097030804289e-06, |
|
"loss": 0.8157, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.3519999999999999, |
|
"grad_norm": 0.43142249232413526, |
|
"learning_rate": 9.135719303400995e-06, |
|
"loss": 0.7508, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.3679999999999999, |
|
"grad_norm": 0.4657223032122722, |
|
"learning_rate": 9.103817206036383e-06, |
|
"loss": 0.6509, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.384, |
|
"grad_norm": 0.48523334387177774, |
|
"learning_rate": 9.071394783593664e-06, |
|
"loss": 0.8359, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.46496551627210725, |
|
"learning_rate": 9.038456146928325e-06, |
|
"loss": 0.864, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.416, |
|
"grad_norm": 0.4440780678769445, |
|
"learning_rate": 9.005005472346923e-06, |
|
"loss": 0.7902, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.432, |
|
"grad_norm": 0.561826912899574, |
|
"learning_rate": 8.971047001077561e-06, |
|
"loss": 0.8076, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.448, |
|
"grad_norm": 0.4900358752234758, |
|
"learning_rate": 8.936585038732143e-06, |
|
"loss": 0.6602, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.464, |
|
"grad_norm": 0.4606489965356049, |
|
"learning_rate": 8.90162395476046e-06, |
|
"loss": 0.8151, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.5359817129085576, |
|
"learning_rate": 8.866168181896198e-06, |
|
"loss": 0.8108, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.496, |
|
"grad_norm": 0.4099923668319849, |
|
"learning_rate": 8.83022221559489e-06, |
|
"loss": 0.7473, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.512, |
|
"grad_norm": 0.46662939848509505, |
|
"learning_rate": 8.793790613463956e-06, |
|
"loss": 0.8871, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.528, |
|
"grad_norm": 0.6601471838069068, |
|
"learning_rate": 8.756877994684818e-06, |
|
"loss": 0.8559, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.544, |
|
"grad_norm": 0.45387042686918283, |
|
"learning_rate": 8.719489039427256e-06, |
|
"loss": 0.7211, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.39658772389509284, |
|
"learning_rate": 8.681628488255986e-06, |
|
"loss": 0.8225, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.576, |
|
"grad_norm": 0.48808248239123686, |
|
"learning_rate": 8.643301141529619e-06, |
|
"loss": 0.7259, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.592, |
|
"grad_norm": 0.5005912376865987, |
|
"learning_rate": 8.604511858792006e-06, |
|
"loss": 0.8113, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.608, |
|
"grad_norm": 0.39380103997630306, |
|
"learning_rate": 8.565265558156101e-06, |
|
"loss": 0.782, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.624, |
|
"grad_norm": 0.5511984663976129, |
|
"learning_rate": 8.525567215680397e-06, |
|
"loss": 0.8738, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 0.4456656598175722, |
|
"learning_rate": 8.485421864737997e-06, |
|
"loss": 0.7922, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.6560000000000001, |
|
"grad_norm": 0.4908529752157155, |
|
"learning_rate": 8.444834595378434e-06, |
|
"loss": 0.8487, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.6720000000000002, |
|
"grad_norm": 0.4428491434503844, |
|
"learning_rate": 8.403810553682307e-06, |
|
"loss": 0.8335, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.688, |
|
"grad_norm": 0.5044508101327926, |
|
"learning_rate": 8.362354941108803e-06, |
|
"loss": 0.7921, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.704, |
|
"grad_norm": 0.5179814826203967, |
|
"learning_rate": 8.320473013836197e-06, |
|
"loss": 0.8403, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.5470226099168956, |
|
"learning_rate": 8.278170082095422e-06, |
|
"loss": 0.8999, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.736, |
|
"grad_norm": 0.41030649187861784, |
|
"learning_rate": 8.23545150949679e-06, |
|
"loss": 0.8033, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.752, |
|
"grad_norm": 0.5622265643432103, |
|
"learning_rate": 8.192322712349917e-06, |
|
"loss": 0.7425, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.768, |
|
"grad_norm": 0.5223132208572823, |
|
"learning_rate": 8.148789158977012e-06, |
|
"loss": 0.7316, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.784, |
|
"grad_norm": 0.475126333442523, |
|
"learning_rate": 8.104856369019525e-06, |
|
"loss": 0.7683, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.4948572562329079, |
|
"learning_rate": 8.060529912738316e-06, |
|
"loss": 0.7647, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.8159999999999998, |
|
"grad_norm": 0.531807251361054, |
|
"learning_rate": 8.0158154103074e-06, |
|
"loss": 0.7611, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.8319999999999999, |
|
"grad_norm": 0.47670080587998737, |
|
"learning_rate": 7.970718531101365e-06, |
|
"loss": 0.8092, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.8479999999999999, |
|
"grad_norm": 0.4114578657588441, |
|
"learning_rate": 7.925244992976538e-06, |
|
"loss": 0.8859, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.8639999999999999, |
|
"grad_norm": 0.5115548644684491, |
|
"learning_rate": 7.879400561546033e-06, |
|
"loss": 0.8891, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.5853821031130237, |
|
"learning_rate": 7.833191049448706e-06, |
|
"loss": 0.6497, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.896, |
|
"grad_norm": 0.3798635295552685, |
|
"learning_rate": 7.786622315612182e-06, |
|
"loss": 0.8151, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.912, |
|
"grad_norm": 0.4971866098065484, |
|
"learning_rate": 7.739700264509993e-06, |
|
"loss": 0.7809, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.928, |
|
"grad_norm": 0.4980674292423549, |
|
"learning_rate": 7.692430845412946e-06, |
|
"loss": 0.7679, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.944, |
|
"grad_norm": 0.4324290073091459, |
|
"learning_rate": 7.644820051634813e-06, |
|
"loss": 0.8501, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.47176687932737493, |
|
"learning_rate": 7.596873919772438e-06, |
|
"loss": 0.8027, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.976, |
|
"grad_norm": 0.43463683245782003, |
|
"learning_rate": 7.548598528940354e-06, |
|
"loss": 0.7492, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.992, |
|
"grad_norm": 0.49151296507554265, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.7126, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.7056512813119306, |
|
"learning_rate": 7.451084494783668e-06, |
|
"loss": 0.9934, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 2.016, |
|
"grad_norm": 0.8222112850756275, |
|
"learning_rate": 7.401858215313228e-06, |
|
"loss": 0.7107, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 2.032, |
|
"grad_norm": 0.578857909730416, |
|
"learning_rate": 7.352327403013779e-06, |
|
"loss": 0.7626, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 2.048, |
|
"grad_norm": 0.6947157428046444, |
|
"learning_rate": 7.302498337922293e-06, |
|
"loss": 0.8212, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 2.064, |
|
"grad_norm": 1.3653434931121922, |
|
"learning_rate": 7.2523773378913655e-06, |
|
"loss": 0.7474, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.8976124611811842, |
|
"learning_rate": 7.201970757788172e-06, |
|
"loss": 0.6376, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 2.096, |
|
"grad_norm": 0.5152408942741, |
|
"learning_rate": 7.151284988688731e-06, |
|
"loss": 0.6641, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.112, |
|
"grad_norm": 0.6830633595485539, |
|
"learning_rate": 7.100326457067576e-06, |
|
"loss": 0.7996, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 2.128, |
|
"grad_norm": 0.5496861285450714, |
|
"learning_rate": 7.049101623982938e-06, |
|
"loss": 0.7581, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.144, |
|
"grad_norm": 0.6403401425933127, |
|
"learning_rate": 6.9976169842575526e-06, |
|
"loss": 0.7457, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.44381400078067595, |
|
"learning_rate": 6.945879065655164e-06, |
|
"loss": 0.6706, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 2.176, |
|
"grad_norm": 0.607564640506437, |
|
"learning_rate": 6.893894428052881e-06, |
|
"loss": 0.7532, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 2.192, |
|
"grad_norm": 0.5370083889151855, |
|
"learning_rate": 6.841669662609437e-06, |
|
"loss": 0.723, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 2.208, |
|
"grad_norm": 0.503787140696792, |
|
"learning_rate": 6.789211390929497e-06, |
|
"loss": 0.6432, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 2.224, |
|
"grad_norm": 0.48811253497835444, |
|
"learning_rate": 6.736526264224101e-06, |
|
"loss": 0.7416, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.5393544740145144, |
|
"learning_rate": 6.6836209624673575e-06, |
|
"loss": 0.6972, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 2.2560000000000002, |
|
"grad_norm": 0.5146295742117404, |
|
"learning_rate": 6.6305021935494755e-06, |
|
"loss": 0.7735, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 2.2720000000000002, |
|
"grad_norm": 0.5354676350111098, |
|
"learning_rate": 6.5771766924262795e-06, |
|
"loss": 0.7254, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 2.288, |
|
"grad_norm": 0.48957997280536275, |
|
"learning_rate": 6.523651220265269e-06, |
|
"loss": 0.6449, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.304, |
|
"grad_norm": 0.6260723291086323, |
|
"learning_rate": 6.469932563588386e-06, |
|
"loss": 0.6024, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.6277559902277917, |
|
"learning_rate": 6.41602753341152e-06, |
|
"loss": 0.7118, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 2.336, |
|
"grad_norm": 0.5654085188913539, |
|
"learning_rate": 6.361942964380967e-06, |
|
"loss": 0.7289, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 2.352, |
|
"grad_norm": 0.5511085996838134, |
|
"learning_rate": 6.307685713906835e-06, |
|
"loss": 0.7078, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 2.368, |
|
"grad_norm": 0.48643046392503236, |
|
"learning_rate": 6.2532626612936035e-06, |
|
"loss": 0.6913, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 2.384, |
|
"grad_norm": 0.4251201514104161, |
|
"learning_rate": 6.1986807068678926e-06, |
|
"loss": 0.8129, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.4957551218147004, |
|
"learning_rate": 6.143946771103561e-06, |
|
"loss": 0.7541, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 2.416, |
|
"grad_norm": 0.5337634201089703, |
|
"learning_rate": 6.089067793744258e-06, |
|
"loss": 0.7421, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 2.432, |
|
"grad_norm": 0.45135282751579137, |
|
"learning_rate": 6.034050732923538e-06, |
|
"loss": 0.7623, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 2.448, |
|
"grad_norm": 0.40984798370772846, |
|
"learning_rate": 5.978902564282616e-06, |
|
"loss": 0.8119, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 2.464, |
|
"grad_norm": 0.42530164890628464, |
|
"learning_rate": 5.923630280085948e-06, |
|
"loss": 0.7985, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.390559143940863, |
|
"learning_rate": 5.8682408883346535e-06, |
|
"loss": 0.6778, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 2.496, |
|
"grad_norm": 0.45694185479460997, |
|
"learning_rate": 5.8127414118779825e-06, |
|
"loss": 0.7597, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 2.512, |
|
"grad_norm": 0.4952633583203359, |
|
"learning_rate": 5.757138887522884e-06, |
|
"loss": 0.7168, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 2.528, |
|
"grad_norm": 0.4342497218363157, |
|
"learning_rate": 5.701440365141799e-06, |
|
"loss": 0.6564, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 2.544, |
|
"grad_norm": 0.4718029081715289, |
|
"learning_rate": 5.645652906778808e-06, |
|
"loss": 0.66, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.5236548730626042, |
|
"learning_rate": 5.5897835857542315e-06, |
|
"loss": 0.7189, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 2.576, |
|
"grad_norm": 0.4217271822666547, |
|
"learning_rate": 5.533839485767795e-06, |
|
"loss": 0.6501, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.592, |
|
"grad_norm": 0.51236732912632, |
|
"learning_rate": 5.477827700000492e-06, |
|
"loss": 0.5564, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 2.608, |
|
"grad_norm": 0.5794935429954644, |
|
"learning_rate": 5.421755330215223e-06, |
|
"loss": 0.7193, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 2.624, |
|
"grad_norm": 0.43579809758377336, |
|
"learning_rate": 5.365629485856381e-06, |
|
"loss": 0.782, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.4273541248432265, |
|
"learning_rate": 5.30945728314841e-06, |
|
"loss": 0.6904, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 2.656, |
|
"grad_norm": 0.5127404572609707, |
|
"learning_rate": 5.253245844193564e-06, |
|
"loss": 0.7756, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 2.672, |
|
"grad_norm": 0.4502193208238627, |
|
"learning_rate": 5.197002296068878e-06, |
|
"loss": 0.7199, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 2.6879999999999997, |
|
"grad_norm": 0.6522139082928684, |
|
"learning_rate": 5.140733769922525e-06, |
|
"loss": 0.7523, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 2.7039999999999997, |
|
"grad_norm": 0.4604865675922319, |
|
"learning_rate": 5.084447400069656e-06, |
|
"loss": 0.658, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 0.47189630919959746, |
|
"learning_rate": 5.0281503230878304e-06, |
|
"loss": 0.7168, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 2.7359999999999998, |
|
"grad_norm": 0.45853515734923483, |
|
"learning_rate": 4.971849676912172e-06, |
|
"loss": 0.59, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 2.752, |
|
"grad_norm": 0.6617371663600603, |
|
"learning_rate": 4.915552599930345e-06, |
|
"loss": 0.7154, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 2.768, |
|
"grad_norm": 0.4581535675670496, |
|
"learning_rate": 4.859266230077474e-06, |
|
"loss": 0.8279, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 2.784, |
|
"grad_norm": 0.44995006651439057, |
|
"learning_rate": 4.802997703931124e-06, |
|
"loss": 0.7029, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.6247080166668746, |
|
"learning_rate": 4.746754155806437e-06, |
|
"loss": 0.6663, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 2.816, |
|
"grad_norm": 0.43343619400050193, |
|
"learning_rate": 4.6905427168515914e-06, |
|
"loss": 0.7364, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 2.832, |
|
"grad_norm": 0.6592346345644005, |
|
"learning_rate": 4.63437051414362e-06, |
|
"loss": 0.7337, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 2.848, |
|
"grad_norm": 0.464552230592377, |
|
"learning_rate": 4.5782446697847775e-06, |
|
"loss": 0.7164, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 2.864, |
|
"grad_norm": 0.47084386707640963, |
|
"learning_rate": 4.52217229999951e-06, |
|
"loss": 0.7538, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.49465152130409273, |
|
"learning_rate": 4.466160514232206e-06, |
|
"loss": 0.6193, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 2.896, |
|
"grad_norm": 0.7463104607291, |
|
"learning_rate": 4.410216414245771e-06, |
|
"loss": 0.6887, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 2.912, |
|
"grad_norm": 0.5311673012762305, |
|
"learning_rate": 4.354347093221194e-06, |
|
"loss": 0.8275, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 2.928, |
|
"grad_norm": 0.4765968349930382, |
|
"learning_rate": 4.298559634858202e-06, |
|
"loss": 0.709, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 2.944, |
|
"grad_norm": 0.4041073627021218, |
|
"learning_rate": 4.2428611124771184e-06, |
|
"loss": 0.7103, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.5090780248152671, |
|
"learning_rate": 4.187258588122019e-06, |
|
"loss": 0.7976, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 2.976, |
|
"grad_norm": 0.6082158511853922, |
|
"learning_rate": 4.131759111665349e-06, |
|
"loss": 0.7694, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 2.992, |
|
"grad_norm": 0.5727874160786263, |
|
"learning_rate": 4.076369719914055e-06, |
|
"loss": 0.7123, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.5727874160786263, |
|
"learning_rate": 4.021097435717386e-06, |
|
"loss": 0.5299, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 3.016, |
|
"grad_norm": 0.786848104570295, |
|
"learning_rate": 3.965949267076465e-06, |
|
"loss": 0.6761, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.032, |
|
"grad_norm": 0.6468253157472471, |
|
"learning_rate": 3.910932206255742e-06, |
|
"loss": 0.681, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 3.048, |
|
"grad_norm": 0.5371976935352293, |
|
"learning_rate": 3.856053228896442e-06, |
|
"loss": 0.7382, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 3.064, |
|
"grad_norm": 0.67361754038798, |
|
"learning_rate": 3.8013192931321095e-06, |
|
"loss": 0.6811, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.5906235125913445, |
|
"learning_rate": 3.7467373387063973e-06, |
|
"loss": 0.6517, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 3.096, |
|
"grad_norm": 1.1573348876057628, |
|
"learning_rate": 3.692314286093167e-06, |
|
"loss": 0.5953, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 3.112, |
|
"grad_norm": 0.8839756051783417, |
|
"learning_rate": 3.6380570356190346e-06, |
|
"loss": 0.6948, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 3.128, |
|
"grad_norm": 0.478868590948113, |
|
"learning_rate": 3.58397246658848e-06, |
|
"loss": 0.7461, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 3.144, |
|
"grad_norm": 1.084603868797167, |
|
"learning_rate": 3.5300674364116173e-06, |
|
"loss": 0.765, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.6440739819259357, |
|
"learning_rate": 3.476348779734732e-06, |
|
"loss": 0.6823, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 3.176, |
|
"grad_norm": 0.6488776395666153, |
|
"learning_rate": 3.4228233075737225e-06, |
|
"loss": 0.6315, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.192, |
|
"grad_norm": 0.48544942050692214, |
|
"learning_rate": 3.3694978064505258e-06, |
|
"loss": 0.6785, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 3.208, |
|
"grad_norm": 0.5813475942014552, |
|
"learning_rate": 3.316379037532644e-06, |
|
"loss": 0.6103, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 3.224, |
|
"grad_norm": 0.5475003564476933, |
|
"learning_rate": 3.2634737357758994e-06, |
|
"loss": 0.6907, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.6514850828217877, |
|
"learning_rate": 3.2107886090705035e-06, |
|
"loss": 0.7155, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 3.2560000000000002, |
|
"grad_norm": 0.8216743112809076, |
|
"learning_rate": 3.158330337390565e-06, |
|
"loss": 0.5908, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 3.2720000000000002, |
|
"grad_norm": 0.6175170431371285, |
|
"learning_rate": 3.10610557194712e-06, |
|
"loss": 0.7275, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 3.288, |
|
"grad_norm": 0.49918856301616266, |
|
"learning_rate": 3.0541209343448373e-06, |
|
"loss": 0.6363, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 3.304, |
|
"grad_norm": 0.4505859996624395, |
|
"learning_rate": 3.0023830157424504e-06, |
|
"loss": 0.5068, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 0.6047043450034242, |
|
"learning_rate": 2.950898376017064e-06, |
|
"loss": 0.6684, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 3.336, |
|
"grad_norm": 0.7050300424064798, |
|
"learning_rate": 2.8996735429324256e-06, |
|
"loss": 0.5437, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.352, |
|
"grad_norm": 0.6046509312257183, |
|
"learning_rate": 2.848715011311271e-06, |
|
"loss": 0.6065, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 3.368, |
|
"grad_norm": 0.5489312138581334, |
|
"learning_rate": 2.7980292422118282e-06, |
|
"loss": 0.6238, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 3.384, |
|
"grad_norm": 0.5345507410785075, |
|
"learning_rate": 2.7476226621086354e-06, |
|
"loss": 0.6665, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.44663549941791464, |
|
"learning_rate": 2.697501662077707e-06, |
|
"loss": 0.6989, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 3.416, |
|
"grad_norm": 0.4997402304914766, |
|
"learning_rate": 2.6476725969862227e-06, |
|
"loss": 0.7103, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 3.432, |
|
"grad_norm": 0.5248320920728966, |
|
"learning_rate": 2.5981417846867753e-06, |
|
"loss": 0.708, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 3.448, |
|
"grad_norm": 0.47361876881842346, |
|
"learning_rate": 2.548915505216333e-06, |
|
"loss": 0.6161, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 3.464, |
|
"grad_norm": 0.38951773809344253, |
|
"learning_rate": 2.5000000000000015e-06, |
|
"loss": 0.6242, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.4621052178631838, |
|
"learning_rate": 2.4514014710596467e-06, |
|
"loss": 0.6869, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 3.496, |
|
"grad_norm": 0.6218871062337231, |
|
"learning_rate": 2.4031260802275623e-06, |
|
"loss": 0.6285, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.512, |
|
"grad_norm": 0.5837366459679408, |
|
"learning_rate": 2.3551799483651894e-06, |
|
"loss": 0.601, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 3.528, |
|
"grad_norm": 0.5638704457348409, |
|
"learning_rate": 2.307569154587056e-06, |
|
"loss": 0.6197, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 3.544, |
|
"grad_norm": 0.48163508407083816, |
|
"learning_rate": 2.2602997354900075e-06, |
|
"loss": 0.5835, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.42941733644651287, |
|
"learning_rate": 2.2133776843878185e-06, |
|
"loss": 0.6948, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 3.576, |
|
"grad_norm": 0.5079953380381145, |
|
"learning_rate": 2.166808950551296e-06, |
|
"loss": 0.5979, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 3.592, |
|
"grad_norm": 0.44804407509524075, |
|
"learning_rate": 2.120599438453968e-06, |
|
"loss": 0.6914, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 3.608, |
|
"grad_norm": 0.43408421753223764, |
|
"learning_rate": 2.074755007023461e-06, |
|
"loss": 0.6357, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 3.624, |
|
"grad_norm": 0.5135247400323784, |
|
"learning_rate": 2.0292814688986375e-06, |
|
"loss": 0.657, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.414982645568696, |
|
"learning_rate": 1.9841845896926022e-06, |
|
"loss": 0.6087, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 3.656, |
|
"grad_norm": 0.6317365143952755, |
|
"learning_rate": 1.9394700872616856e-06, |
|
"loss": 0.6895, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.672, |
|
"grad_norm": 0.42574406798807973, |
|
"learning_rate": 1.8951436309804766e-06, |
|
"loss": 0.6307, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 3.6879999999999997, |
|
"grad_norm": 0.4066800096346742, |
|
"learning_rate": 1.8512108410229878e-06, |
|
"loss": 0.6796, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 3.7039999999999997, |
|
"grad_norm": 0.4312374117560947, |
|
"learning_rate": 1.8076772876500831e-06, |
|
"loss": 0.5983, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 3.7199999999999998, |
|
"grad_norm": 0.4963904945917347, |
|
"learning_rate": 1.7645484905032129e-06, |
|
"loss": 0.6899, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 3.7359999999999998, |
|
"grad_norm": 0.8069474385968237, |
|
"learning_rate": 1.7218299179045789e-06, |
|
"loss": 0.6829, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 3.752, |
|
"grad_norm": 0.4408325662467717, |
|
"learning_rate": 1.6795269861638041e-06, |
|
"loss": 0.6838, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 3.768, |
|
"grad_norm": 0.39257406551923874, |
|
"learning_rate": 1.6376450588911985e-06, |
|
"loss": 0.621, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 3.784, |
|
"grad_norm": 0.5021408990976908, |
|
"learning_rate": 1.5961894463176942e-06, |
|
"loss": 0.5381, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 1.0161511106208339, |
|
"learning_rate": 1.555165404621567e-06, |
|
"loss": 0.6631, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 3.816, |
|
"grad_norm": 0.5589883713476835, |
|
"learning_rate": 1.5145781352620054e-06, |
|
"loss": 0.7263, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.832, |
|
"grad_norm": 0.5170039801729964, |
|
"learning_rate": 1.4744327843196043e-06, |
|
"loss": 0.6526, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 3.848, |
|
"grad_norm": 0.4365703947702624, |
|
"learning_rate": 1.434734441843899e-06, |
|
"loss": 0.624, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 3.864, |
|
"grad_norm": 0.45343775193001395, |
|
"learning_rate": 1.3954881412079945e-06, |
|
"loss": 0.6207, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.5987276534862411, |
|
"learning_rate": 1.3566988584703817e-06, |
|
"loss": 0.6082, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 3.896, |
|
"grad_norm": 0.4189228526905657, |
|
"learning_rate": 1.3183715117440143e-06, |
|
"loss": 0.7211, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 3.912, |
|
"grad_norm": 0.4560749716198238, |
|
"learning_rate": 1.280510960572745e-06, |
|
"loss": 0.8071, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 3.928, |
|
"grad_norm": 0.43909838579399224, |
|
"learning_rate": 1.2431220053151832e-06, |
|
"loss": 0.5696, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 3.944, |
|
"grad_norm": 0.3884372689129327, |
|
"learning_rate": 1.2062093865360458e-06, |
|
"loss": 0.6917, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.4782590203306387, |
|
"learning_rate": 1.1697777844051105e-06, |
|
"loss": 0.6519, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 3.976, |
|
"grad_norm": 0.47207710861498847, |
|
"learning_rate": 1.1338318181038037e-06, |
|
"loss": 0.6184, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.992, |
|
"grad_norm": 0.599636336477758, |
|
"learning_rate": 1.0983760452395415e-06, |
|
"loss": 0.5992, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.7417980348609519, |
|
"learning_rate": 1.063414961267859e-06, |
|
"loss": 0.6355, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 4.016, |
|
"grad_norm": 0.7141823488630563, |
|
"learning_rate": 1.02895299892244e-06, |
|
"loss": 0.6503, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 4.032, |
|
"grad_norm": 0.435159174788827, |
|
"learning_rate": 9.949945276530782e-07, |
|
"loss": 0.7162, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 4.048, |
|
"grad_norm": 0.36049991204427784, |
|
"learning_rate": 9.615438530716753e-07, |
|
"loss": 0.6724, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 4.064, |
|
"grad_norm": 0.5381648210317546, |
|
"learning_rate": 9.286052164063369e-07, |
|
"loss": 0.7197, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.5781835965920862, |
|
"learning_rate": 8.961827939636198e-07, |
|
"loss": 0.5765, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 4.096, |
|
"grad_norm": 0.5139404342600378, |
|
"learning_rate": 8.64280696599008e-07, |
|
"loss": 0.6287, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 4.112, |
|
"grad_norm": 0.5518389677460305, |
|
"learning_rate": 8.329029691957124e-07, |
|
"loss": 0.6191, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 4.128, |
|
"grad_norm": 0.43662936915773237, |
|
"learning_rate": 8.02053590151805e-07, |
|
"loss": 0.6479, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.144, |
|
"grad_norm": 0.46532145231036826, |
|
"learning_rate": 7.717364708758024e-07, |
|
"loss": 0.6143, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 0.5445645397074081, |
|
"learning_rate": 7.41955455290726e-07, |
|
"loss": 0.6516, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 4.176, |
|
"grad_norm": 0.5440155226788446, |
|
"learning_rate": 7.127143193467445e-07, |
|
"loss": 0.5751, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 4.192, |
|
"grad_norm": 0.5155530465031002, |
|
"learning_rate": 6.840167705424106e-07, |
|
"loss": 0.692, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 4.208, |
|
"grad_norm": 0.5031049864266178, |
|
"learning_rate": 6.558664474545817e-07, |
|
"loss": 0.6563, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 4.224, |
|
"grad_norm": 0.4762521621576414, |
|
"learning_rate": 6.282669192770896e-07, |
|
"loss": 0.6349, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 0.4287400326563321, |
|
"learning_rate": 6.012216853682001e-07, |
|
"loss": 0.6119, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 4.256, |
|
"grad_norm": 0.5036260301077607, |
|
"learning_rate": 5.747341748069229e-07, |
|
"loss": 0.6086, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 4.272, |
|
"grad_norm": 0.47816388259567105, |
|
"learning_rate": 5.488077459582425e-07, |
|
"loss": 0.6872, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 4.288, |
|
"grad_norm": 0.5642935756781936, |
|
"learning_rate": 5.234456860473042e-07, |
|
"loss": 0.5685, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.304, |
|
"grad_norm": 0.5190425314054581, |
|
"learning_rate": 4.986512107426283e-07, |
|
"loss": 0.5682, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 0.4991277601303304, |
|
"learning_rate": 4.7442746374839363e-07, |
|
"loss": 0.5999, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 4.336, |
|
"grad_norm": 0.513792964016802, |
|
"learning_rate": 4.50777516405847e-07, |
|
"loss": 0.5073, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 4.352, |
|
"grad_norm": 0.49945898158203306, |
|
"learning_rate": 4.2770436730388166e-07, |
|
"loss": 0.6389, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 4.368, |
|
"grad_norm": 0.4160620660991658, |
|
"learning_rate": 4.05210941898847e-07, |
|
"loss": 0.5893, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 4.384, |
|
"grad_norm": 0.7266111521201372, |
|
"learning_rate": 3.8330009214363197e-07, |
|
"loss": 0.6772, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.42017543128804824, |
|
"learning_rate": 3.619745961260623e-07, |
|
"loss": 0.5987, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 4.416, |
|
"grad_norm": 0.4256599373285864, |
|
"learning_rate": 3.4123715771665786e-07, |
|
"loss": 0.6398, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 4.432, |
|
"grad_norm": 0.47920082328651303, |
|
"learning_rate": 3.2109040622582186e-07, |
|
"loss": 0.5561, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 4.448, |
|
"grad_norm": 0.4446279572298486, |
|
"learning_rate": 3.015368960704584e-07, |
|
"loss": 0.5226, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.464, |
|
"grad_norm": 0.4128809611697924, |
|
"learning_rate": 2.8257910645009935e-07, |
|
"loss": 0.5459, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 0.5164938599454552, |
|
"learning_rate": 2.6421944103256657e-07, |
|
"loss": 0.7039, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 4.496, |
|
"grad_norm": 0.39001141065666656, |
|
"learning_rate": 2.4646022764920843e-07, |
|
"loss": 0.6533, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 4.5120000000000005, |
|
"grad_norm": 0.4572069001734452, |
|
"learning_rate": 2.2930371799975593e-07, |
|
"loss": 0.6683, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 4.5280000000000005, |
|
"grad_norm": 0.4484745544743389, |
|
"learning_rate": 2.1275208736682262e-07, |
|
"loss": 0.7176, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 4.5440000000000005, |
|
"grad_norm": 0.5157717675452108, |
|
"learning_rate": 1.9680743434010385e-07, |
|
"loss": 0.5022, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 4.5600000000000005, |
|
"grad_norm": 0.48446115116950594, |
|
"learning_rate": 1.814717805502958e-07, |
|
"loss": 0.5703, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 4.576, |
|
"grad_norm": 0.4082228397127477, |
|
"learning_rate": 1.667470704127694e-07, |
|
"loss": 0.6104, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 4.592, |
|
"grad_norm": 0.4080020398560308, |
|
"learning_rate": 1.5263517088103862e-07, |
|
"loss": 0.6606, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 4.608, |
|
"grad_norm": 0.42656755273331415, |
|
"learning_rate": 1.3913787121004717e-07, |
|
"loss": 0.5138, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.624, |
|
"grad_norm": 0.43365881833884873, |
|
"learning_rate": 1.2625688272930925e-07, |
|
"loss": 0.647, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 0.5334922284291049, |
|
"learning_rate": 1.1399383862592928e-07, |
|
"loss": 0.5862, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 4.656, |
|
"grad_norm": 0.4475821043001788, |
|
"learning_rate": 1.0235029373752758e-07, |
|
"loss": 0.5856, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 4.672, |
|
"grad_norm": 0.5095817714162325, |
|
"learning_rate": 9.132772435510362e-08, |
|
"loss": 0.6776, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 4.688, |
|
"grad_norm": 0.4401821249490004, |
|
"learning_rate": 8.092752803585513e-08, |
|
"loss": 0.6395, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 4.704, |
|
"grad_norm": 0.5139254893593113, |
|
"learning_rate": 7.115102342598101e-08, |
|
"loss": 0.6758, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.45890269186686167, |
|
"learning_rate": 6.199945009349173e-08, |
|
"loss": 0.6181, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 4.736, |
|
"grad_norm": 0.5704736020159095, |
|
"learning_rate": 5.3473968371040575e-08, |
|
"loss": 0.6903, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 4.752, |
|
"grad_norm": 0.4163575886575153, |
|
"learning_rate": 4.55756592088058e-08, |
|
"loss": 0.6921, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 4.768, |
|
"grad_norm": 0.4666414134754862, |
|
"learning_rate": 3.8305524037438035e-08, |
|
"loss": 0.5428, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.784, |
|
"grad_norm": 0.425223735477641, |
|
"learning_rate": 3.166448464108629e-08, |
|
"loss": 0.6095, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 0.44012934287274635, |
|
"learning_rate": 2.5653383040524228e-08, |
|
"loss": 0.6132, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 4.816, |
|
"grad_norm": 0.4498748526968263, |
|
"learning_rate": 2.0272981386393332e-08, |
|
"loss": 0.6117, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 4.832, |
|
"grad_norm": 0.5516353902390095, |
|
"learning_rate": 1.552396186256411e-08, |
|
"loss": 0.6204, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 4.848, |
|
"grad_norm": 0.5348366426470775, |
|
"learning_rate": 1.1406926599646373e-08, |
|
"loss": 0.5491, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 4.864, |
|
"grad_norm": 0.4365913854403371, |
|
"learning_rate": 7.922397598642551e-09, |
|
"loss": 0.5755, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 0.4508181509045664, |
|
"learning_rate": 5.0708166647628345e-09, |
|
"loss": 0.6805, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 4.896, |
|
"grad_norm": 0.4459155047488747, |
|
"learning_rate": 2.8525453514099966e-09, |
|
"loss": 0.7079, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 4.912, |
|
"grad_norm": 0.4498756560834822, |
|
"learning_rate": 1.2678649143349485e-09, |
|
"loss": 0.6142, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 4.928, |
|
"grad_norm": 0.4011359054769027, |
|
"learning_rate": 3.1697627597970794e-10, |
|
"loss": 0.7004, |
|
"step": 310 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 310, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 62448509779968.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|