|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 189, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 2.3387899124656393, |
|
"learning_rate": 0.0, |
|
"loss": 1.1321, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 2.2739175729395282, |
|
"learning_rate": 3.2258064516129035e-07, |
|
"loss": 0.9759, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 3.2333006230979437, |
|
"learning_rate": 6.451612903225807e-07, |
|
"loss": 1.0141, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 2.350357276823565, |
|
"learning_rate": 9.67741935483871e-07, |
|
"loss": 0.9947, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.1839271581694755, |
|
"learning_rate": 1.2903225806451614e-06, |
|
"loss": 1.023, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 2.153599307977408, |
|
"learning_rate": 1.6129032258064516e-06, |
|
"loss": 0.9609, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 2.1555129140427067, |
|
"learning_rate": 1.935483870967742e-06, |
|
"loss": 0.9622, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 2.008278114649717, |
|
"learning_rate": 2.2580645161290324e-06, |
|
"loss": 0.9711, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 1.9242182990110015, |
|
"learning_rate": 2.580645161290323e-06, |
|
"loss": 0.9673, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.750252408219067, |
|
"learning_rate": 2.903225806451613e-06, |
|
"loss": 0.9438, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 1.6939070396617073, |
|
"learning_rate": 3.225806451612903e-06, |
|
"loss": 0.8756, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 1.2834939133740737, |
|
"learning_rate": 3.548387096774194e-06, |
|
"loss": 0.9823, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 1.4367519479529964, |
|
"learning_rate": 3.870967741935484e-06, |
|
"loss": 0.9596, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 1.3144994907230427, |
|
"learning_rate": 4.193548387096774e-06, |
|
"loss": 1.0521, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.0689628463958285, |
|
"learning_rate": 4.516129032258065e-06, |
|
"loss": 0.938, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 1.2021712620582392, |
|
"learning_rate": 4.838709677419355e-06, |
|
"loss": 1.0955, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 1.2539951248917753, |
|
"learning_rate": 5.161290322580646e-06, |
|
"loss": 0.9417, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 1.4053233581800462, |
|
"learning_rate": 5.483870967741935e-06, |
|
"loss": 0.9956, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 1.0278616263126648, |
|
"learning_rate": 5.806451612903226e-06, |
|
"loss": 0.7439, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.256128877382818, |
|
"learning_rate": 6.129032258064517e-06, |
|
"loss": 0.9542, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.9487549369343321, |
|
"learning_rate": 6.451612903225806e-06, |
|
"loss": 0.9816, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 1.0891896971338342, |
|
"learning_rate": 6.774193548387097e-06, |
|
"loss": 1.0704, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 1.1660258370107575, |
|
"learning_rate": 7.096774193548388e-06, |
|
"loss": 0.8931, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.9415137340919071, |
|
"learning_rate": 7.4193548387096784e-06, |
|
"loss": 0.8683, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.0798354285638785, |
|
"learning_rate": 7.741935483870968e-06, |
|
"loss": 1.0478, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.8096142802637235, |
|
"learning_rate": 8.064516129032258e-06, |
|
"loss": 0.8466, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.7447999799600428, |
|
"learning_rate": 8.387096774193549e-06, |
|
"loss": 0.7486, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.8621800122661538, |
|
"learning_rate": 8.70967741935484e-06, |
|
"loss": 0.9581, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.7940321530754582, |
|
"learning_rate": 9.03225806451613e-06, |
|
"loss": 0.8867, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.8496750615573763, |
|
"learning_rate": 9.35483870967742e-06, |
|
"loss": 0.9179, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.8798601594137285, |
|
"learning_rate": 9.67741935483871e-06, |
|
"loss": 0.8834, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.8818390794513048, |
|
"learning_rate": 1e-05, |
|
"loss": 0.8952, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 0.8322934845468171, |
|
"learning_rate": 9.999683023724021e-06, |
|
"loss": 1.029, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.7007834456821566, |
|
"learning_rate": 9.998732135085665e-06, |
|
"loss": 0.8438, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6137020406246253, |
|
"learning_rate": 9.99714745464859e-06, |
|
"loss": 0.8435, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.7231796555332799, |
|
"learning_rate": 9.994929183335237e-06, |
|
"loss": 0.9646, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 0.6180598082755802, |
|
"learning_rate": 9.992077602401358e-06, |
|
"loss": 0.9616, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.6567886708960547, |
|
"learning_rate": 9.988593073400354e-06, |
|
"loss": 0.8522, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.7454067800699115, |
|
"learning_rate": 9.984476038137437e-06, |
|
"loss": 0.9052, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6714316518124167, |
|
"learning_rate": 9.979727018613607e-06, |
|
"loss": 0.8103, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.6229146249540587, |
|
"learning_rate": 9.974346616959476e-06, |
|
"loss": 0.8401, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.6448840489157996, |
|
"learning_rate": 9.968335515358916e-06, |
|
"loss": 0.9144, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.528060268374605, |
|
"learning_rate": 9.961694475962562e-06, |
|
"loss": 0.8348, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.5902102953939741, |
|
"learning_rate": 9.954424340791195e-06, |
|
"loss": 0.7721, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5526442876969153, |
|
"learning_rate": 9.94652603162896e-06, |
|
"loss": 0.8438, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.5331236922634145, |
|
"learning_rate": 9.938000549906509e-06, |
|
"loss": 0.8023, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.6339240906385807, |
|
"learning_rate": 9.92884897657402e-06, |
|
"loss": 0.894, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.5283804877111584, |
|
"learning_rate": 9.919072471964146e-06, |
|
"loss": 0.9069, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 0.5170169653511614, |
|
"learning_rate": 9.908672275644898e-06, |
|
"loss": 0.8051, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.4763529065504662, |
|
"learning_rate": 9.897649706262474e-06, |
|
"loss": 0.8301, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.4890239660294139, |
|
"learning_rate": 9.88600616137407e-06, |
|
"loss": 0.8629, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.37392633165922323, |
|
"learning_rate": 9.873743117270691e-06, |
|
"loss": 0.8343, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.5097372006252004, |
|
"learning_rate": 9.860862128789954e-06, |
|
"loss": 0.9025, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.5331140780823508, |
|
"learning_rate": 9.847364829118963e-06, |
|
"loss": 0.7832, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5371583880849503, |
|
"learning_rate": 9.833252929587231e-06, |
|
"loss": 0.7241, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.45993890737280607, |
|
"learning_rate": 9.818528219449705e-06, |
|
"loss": 0.8964, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 0.4715847632418153, |
|
"learning_rate": 9.803192565659898e-06, |
|
"loss": 0.8623, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.4942244137822645, |
|
"learning_rate": 9.78724791263318e-06, |
|
"loss": 0.7657, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 0.45536932211394177, |
|
"learning_rate": 9.770696282000245e-06, |
|
"loss": 0.8766, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.4891485391099054, |
|
"learning_rate": 9.753539772350792e-06, |
|
"loss": 0.8621, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.45887131278496746, |
|
"learning_rate": 9.735780558967434e-06, |
|
"loss": 0.7718, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.5002555111850944, |
|
"learning_rate": 9.717420893549902e-06, |
|
"loss": 0.8746, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5002555111850944, |
|
"learning_rate": 9.698463103929542e-06, |
|
"loss": 0.7591, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.016, |
|
"grad_norm": 0.7921009597496743, |
|
"learning_rate": 9.67890959377418e-06, |
|
"loss": 0.8141, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.032, |
|
"grad_norm": 0.5263404719063407, |
|
"learning_rate": 9.658762842283343e-06, |
|
"loss": 0.8, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.048, |
|
"grad_norm": 0.6027486092631968, |
|
"learning_rate": 9.638025403873939e-06, |
|
"loss": 0.8869, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.064, |
|
"grad_norm": 0.5040803182249076, |
|
"learning_rate": 9.616699907856368e-06, |
|
"loss": 0.7434, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.4794307353380867, |
|
"learning_rate": 9.594789058101154e-06, |
|
"loss": 0.7162, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.096, |
|
"grad_norm": 0.519708365022811, |
|
"learning_rate": 9.57229563269612e-06, |
|
"loss": 0.9109, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.112, |
|
"grad_norm": 0.5405956328678668, |
|
"learning_rate": 9.549222483594154e-06, |
|
"loss": 0.7813, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.1280000000000001, |
|
"grad_norm": 0.5713243651929747, |
|
"learning_rate": 9.525572536251608e-06, |
|
"loss": 0.7776, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.144, |
|
"grad_norm": 0.7593604265555369, |
|
"learning_rate": 9.501348789257373e-06, |
|
"loss": 0.7817, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.5104218019933199, |
|
"learning_rate": 9.476554313952697e-06, |
|
"loss": 0.7754, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.176, |
|
"grad_norm": 0.4357678519578492, |
|
"learning_rate": 9.451192254041759e-06, |
|
"loss": 0.7828, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.192, |
|
"grad_norm": 0.48992867265594076, |
|
"learning_rate": 9.425265825193077e-06, |
|
"loss": 0.7823, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.208, |
|
"grad_norm": 0.4744625173124652, |
|
"learning_rate": 9.398778314631801e-06, |
|
"loss": 0.7303, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.224, |
|
"grad_norm": 0.5598627796833753, |
|
"learning_rate": 9.371733080722911e-06, |
|
"loss": 0.7444, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.4312343972662455, |
|
"learning_rate": 9.34413355254542e-06, |
|
"loss": 0.836, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.256, |
|
"grad_norm": 0.624677329344369, |
|
"learning_rate": 9.31598322945759e-06, |
|
"loss": 0.8577, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.272, |
|
"grad_norm": 0.49859023374542194, |
|
"learning_rate": 9.287285680653254e-06, |
|
"loss": 0.7683, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.288, |
|
"grad_norm": 0.6015424998788388, |
|
"learning_rate": 9.258044544709276e-06, |
|
"loss": 0.8227, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.304, |
|
"grad_norm": 0.44003554877608686, |
|
"learning_rate": 9.228263529124199e-06, |
|
"loss": 0.7863, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.5609641970743867, |
|
"learning_rate": 9.197946409848196e-06, |
|
"loss": 0.8774, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.336, |
|
"grad_norm": 0.43143646632622484, |
|
"learning_rate": 9.167097030804289e-06, |
|
"loss": 0.8157, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.3519999999999999, |
|
"grad_norm": 0.43142249232413526, |
|
"learning_rate": 9.135719303400995e-06, |
|
"loss": 0.7508, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.3679999999999999, |
|
"grad_norm": 0.4657223032122722, |
|
"learning_rate": 9.103817206036383e-06, |
|
"loss": 0.6509, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.384, |
|
"grad_norm": 0.48523334387177774, |
|
"learning_rate": 9.071394783593664e-06, |
|
"loss": 0.8359, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.46496551627210725, |
|
"learning_rate": 9.038456146928325e-06, |
|
"loss": 0.864, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.416, |
|
"grad_norm": 0.4440780678769445, |
|
"learning_rate": 9.005005472346923e-06, |
|
"loss": 0.7902, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.432, |
|
"grad_norm": 0.561826912899574, |
|
"learning_rate": 8.971047001077561e-06, |
|
"loss": 0.8076, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.448, |
|
"grad_norm": 0.4900358752234758, |
|
"learning_rate": 8.936585038732143e-06, |
|
"loss": 0.6602, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.464, |
|
"grad_norm": 0.4606489965356049, |
|
"learning_rate": 8.90162395476046e-06, |
|
"loss": 0.8151, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.5359817129085576, |
|
"learning_rate": 8.866168181896198e-06, |
|
"loss": 0.8108, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.496, |
|
"grad_norm": 0.4099923668319849, |
|
"learning_rate": 8.83022221559489e-06, |
|
"loss": 0.7473, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.512, |
|
"grad_norm": 0.46662939848509505, |
|
"learning_rate": 8.793790613463956e-06, |
|
"loss": 0.8871, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.528, |
|
"grad_norm": 0.6601471838069068, |
|
"learning_rate": 8.756877994684818e-06, |
|
"loss": 0.8559, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.544, |
|
"grad_norm": 0.45387042686918283, |
|
"learning_rate": 8.719489039427256e-06, |
|
"loss": 0.7211, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.39658772389509284, |
|
"learning_rate": 8.681628488255986e-06, |
|
"loss": 0.8225, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.576, |
|
"grad_norm": 0.48808248239123686, |
|
"learning_rate": 8.643301141529619e-06, |
|
"loss": 0.7259, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.592, |
|
"grad_norm": 0.5005912376865987, |
|
"learning_rate": 8.604511858792006e-06, |
|
"loss": 0.8113, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.608, |
|
"grad_norm": 0.39380103997630306, |
|
"learning_rate": 8.565265558156101e-06, |
|
"loss": 0.782, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.624, |
|
"grad_norm": 0.5511984663976129, |
|
"learning_rate": 8.525567215680397e-06, |
|
"loss": 0.8738, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 0.4456656598175722, |
|
"learning_rate": 8.485421864737997e-06, |
|
"loss": 0.7922, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.6560000000000001, |
|
"grad_norm": 0.4908529752157155, |
|
"learning_rate": 8.444834595378434e-06, |
|
"loss": 0.8487, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.6720000000000002, |
|
"grad_norm": 0.4428491434503844, |
|
"learning_rate": 8.403810553682307e-06, |
|
"loss": 0.8335, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.688, |
|
"grad_norm": 0.5044508101327926, |
|
"learning_rate": 8.362354941108803e-06, |
|
"loss": 0.7921, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.704, |
|
"grad_norm": 0.5179814826203967, |
|
"learning_rate": 8.320473013836197e-06, |
|
"loss": 0.8403, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.5470226099168956, |
|
"learning_rate": 8.278170082095422e-06, |
|
"loss": 0.8999, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.736, |
|
"grad_norm": 0.41030649187861784, |
|
"learning_rate": 8.23545150949679e-06, |
|
"loss": 0.8033, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.752, |
|
"grad_norm": 0.5622265643432103, |
|
"learning_rate": 8.192322712349917e-06, |
|
"loss": 0.7425, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.768, |
|
"grad_norm": 0.5223132208572823, |
|
"learning_rate": 8.148789158977012e-06, |
|
"loss": 0.7316, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.784, |
|
"grad_norm": 0.475126333442523, |
|
"learning_rate": 8.104856369019525e-06, |
|
"loss": 0.7683, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.4948572562329079, |
|
"learning_rate": 8.060529912738316e-06, |
|
"loss": 0.7647, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.8159999999999998, |
|
"grad_norm": 0.531807251361054, |
|
"learning_rate": 8.0158154103074e-06, |
|
"loss": 0.7611, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.8319999999999999, |
|
"grad_norm": 0.47670080587998737, |
|
"learning_rate": 7.970718531101365e-06, |
|
"loss": 0.8092, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.8479999999999999, |
|
"grad_norm": 0.4114578657588441, |
|
"learning_rate": 7.925244992976538e-06, |
|
"loss": 0.8859, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.8639999999999999, |
|
"grad_norm": 0.5115548644684491, |
|
"learning_rate": 7.879400561546033e-06, |
|
"loss": 0.8891, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.5853821031130237, |
|
"learning_rate": 7.833191049448706e-06, |
|
"loss": 0.6497, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.896, |
|
"grad_norm": 0.3798635295552685, |
|
"learning_rate": 7.786622315612182e-06, |
|
"loss": 0.8151, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.912, |
|
"grad_norm": 0.4971866098065484, |
|
"learning_rate": 7.739700264509993e-06, |
|
"loss": 0.7809, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.928, |
|
"grad_norm": 0.4980674292423549, |
|
"learning_rate": 7.692430845412946e-06, |
|
"loss": 0.7679, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.944, |
|
"grad_norm": 0.4324290073091459, |
|
"learning_rate": 7.644820051634813e-06, |
|
"loss": 0.8501, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.47176687932737493, |
|
"learning_rate": 7.596873919772438e-06, |
|
"loss": 0.8027, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.976, |
|
"grad_norm": 0.43463683245782003, |
|
"learning_rate": 7.548598528940354e-06, |
|
"loss": 0.7492, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.992, |
|
"grad_norm": 0.49151296507554265, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.7126, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.7056512813119306, |
|
"learning_rate": 7.451084494783668e-06, |
|
"loss": 0.9934, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 2.016, |
|
"grad_norm": 0.8222112850756275, |
|
"learning_rate": 7.401858215313228e-06, |
|
"loss": 0.7107, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 2.032, |
|
"grad_norm": 0.578857909730416, |
|
"learning_rate": 7.352327403013779e-06, |
|
"loss": 0.7626, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 2.048, |
|
"grad_norm": 0.6947157428046444, |
|
"learning_rate": 7.302498337922293e-06, |
|
"loss": 0.8212, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 2.064, |
|
"grad_norm": 1.3653434931121922, |
|
"learning_rate": 7.2523773378913655e-06, |
|
"loss": 0.7474, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.8976124611811842, |
|
"learning_rate": 7.201970757788172e-06, |
|
"loss": 0.6376, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 2.096, |
|
"grad_norm": 0.5152408942741, |
|
"learning_rate": 7.151284988688731e-06, |
|
"loss": 0.6641, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.112, |
|
"grad_norm": 0.6830633595485539, |
|
"learning_rate": 7.100326457067576e-06, |
|
"loss": 0.7996, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 2.128, |
|
"grad_norm": 0.5496861285450714, |
|
"learning_rate": 7.049101623982938e-06, |
|
"loss": 0.7581, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.144, |
|
"grad_norm": 0.6403401425933127, |
|
"learning_rate": 6.9976169842575526e-06, |
|
"loss": 0.7457, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.44381400078067595, |
|
"learning_rate": 6.945879065655164e-06, |
|
"loss": 0.6706, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 2.176, |
|
"grad_norm": 0.607564640506437, |
|
"learning_rate": 6.893894428052881e-06, |
|
"loss": 0.7532, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 2.192, |
|
"grad_norm": 0.5370083889151855, |
|
"learning_rate": 6.841669662609437e-06, |
|
"loss": 0.723, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 2.208, |
|
"grad_norm": 0.503787140696792, |
|
"learning_rate": 6.789211390929497e-06, |
|
"loss": 0.6432, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 2.224, |
|
"grad_norm": 0.48811253497835444, |
|
"learning_rate": 6.736526264224101e-06, |
|
"loss": 0.7416, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.5393544740145144, |
|
"learning_rate": 6.6836209624673575e-06, |
|
"loss": 0.6972, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 2.2560000000000002, |
|
"grad_norm": 0.5146295742117404, |
|
"learning_rate": 6.6305021935494755e-06, |
|
"loss": 0.7735, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 2.2720000000000002, |
|
"grad_norm": 0.5354676350111098, |
|
"learning_rate": 6.5771766924262795e-06, |
|
"loss": 0.7254, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 2.288, |
|
"grad_norm": 0.48957997280536275, |
|
"learning_rate": 6.523651220265269e-06, |
|
"loss": 0.6449, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.304, |
|
"grad_norm": 0.6260723291086323, |
|
"learning_rate": 6.469932563588386e-06, |
|
"loss": 0.6024, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.6277559902277917, |
|
"learning_rate": 6.41602753341152e-06, |
|
"loss": 0.7118, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 2.336, |
|
"grad_norm": 0.5654085188913539, |
|
"learning_rate": 6.361942964380967e-06, |
|
"loss": 0.7289, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 2.352, |
|
"grad_norm": 0.5511085996838134, |
|
"learning_rate": 6.307685713906835e-06, |
|
"loss": 0.7078, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 2.368, |
|
"grad_norm": 0.48643046392503236, |
|
"learning_rate": 6.2532626612936035e-06, |
|
"loss": 0.6913, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 2.384, |
|
"grad_norm": 0.4251201514104161, |
|
"learning_rate": 6.1986807068678926e-06, |
|
"loss": 0.8129, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.4957551218147004, |
|
"learning_rate": 6.143946771103561e-06, |
|
"loss": 0.7541, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 2.416, |
|
"grad_norm": 0.5337634201089703, |
|
"learning_rate": 6.089067793744258e-06, |
|
"loss": 0.7421, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 2.432, |
|
"grad_norm": 0.45135282751579137, |
|
"learning_rate": 6.034050732923538e-06, |
|
"loss": 0.7623, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 2.448, |
|
"grad_norm": 0.40984798370772846, |
|
"learning_rate": 5.978902564282616e-06, |
|
"loss": 0.8119, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 2.464, |
|
"grad_norm": 0.42530164890628464, |
|
"learning_rate": 5.923630280085948e-06, |
|
"loss": 0.7985, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.390559143940863, |
|
"learning_rate": 5.8682408883346535e-06, |
|
"loss": 0.6778, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 2.496, |
|
"grad_norm": 0.45694185479460997, |
|
"learning_rate": 5.8127414118779825e-06, |
|
"loss": 0.7597, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 2.512, |
|
"grad_norm": 0.4952633583203359, |
|
"learning_rate": 5.757138887522884e-06, |
|
"loss": 0.7168, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 2.528, |
|
"grad_norm": 0.4342497218363157, |
|
"learning_rate": 5.701440365141799e-06, |
|
"loss": 0.6564, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 2.544, |
|
"grad_norm": 0.4718029081715289, |
|
"learning_rate": 5.645652906778808e-06, |
|
"loss": 0.66, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.5236548730626042, |
|
"learning_rate": 5.5897835857542315e-06, |
|
"loss": 0.7189, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 2.576, |
|
"grad_norm": 0.4217271822666547, |
|
"learning_rate": 5.533839485767795e-06, |
|
"loss": 0.6501, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.592, |
|
"grad_norm": 0.51236732912632, |
|
"learning_rate": 5.477827700000492e-06, |
|
"loss": 0.5564, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 2.608, |
|
"grad_norm": 0.5794935429954644, |
|
"learning_rate": 5.421755330215223e-06, |
|
"loss": 0.7193, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 2.624, |
|
"grad_norm": 0.43579809758377336, |
|
"learning_rate": 5.365629485856381e-06, |
|
"loss": 0.782, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.4273541248432265, |
|
"learning_rate": 5.30945728314841e-06, |
|
"loss": 0.6904, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 2.656, |
|
"grad_norm": 0.5127404572609707, |
|
"learning_rate": 5.253245844193564e-06, |
|
"loss": 0.7756, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 2.672, |
|
"grad_norm": 0.4502193208238627, |
|
"learning_rate": 5.197002296068878e-06, |
|
"loss": 0.7199, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 2.6879999999999997, |
|
"grad_norm": 0.6522139082928684, |
|
"learning_rate": 5.140733769922525e-06, |
|
"loss": 0.7523, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 2.7039999999999997, |
|
"grad_norm": 0.4604865675922319, |
|
"learning_rate": 5.084447400069656e-06, |
|
"loss": 0.658, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 0.47189630919959746, |
|
"learning_rate": 5.0281503230878304e-06, |
|
"loss": 0.7168, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 2.7359999999999998, |
|
"grad_norm": 0.45853515734923483, |
|
"learning_rate": 4.971849676912172e-06, |
|
"loss": 0.59, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 2.752, |
|
"grad_norm": 0.6617371663600603, |
|
"learning_rate": 4.915552599930345e-06, |
|
"loss": 0.7154, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 2.768, |
|
"grad_norm": 0.4581535675670496, |
|
"learning_rate": 4.859266230077474e-06, |
|
"loss": 0.8279, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 2.784, |
|
"grad_norm": 0.44995006651439057, |
|
"learning_rate": 4.802997703931124e-06, |
|
"loss": 0.7029, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.6247080166668746, |
|
"learning_rate": 4.746754155806437e-06, |
|
"loss": 0.6663, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 2.816, |
|
"grad_norm": 0.43343619400050193, |
|
"learning_rate": 4.6905427168515914e-06, |
|
"loss": 0.7364, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 2.832, |
|
"grad_norm": 0.6592346345644005, |
|
"learning_rate": 4.63437051414362e-06, |
|
"loss": 0.7337, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 2.848, |
|
"grad_norm": 0.464552230592377, |
|
"learning_rate": 4.5782446697847775e-06, |
|
"loss": 0.7164, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 2.864, |
|
"grad_norm": 0.47084386707640963, |
|
"learning_rate": 4.52217229999951e-06, |
|
"loss": 0.7538, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.49465152130409273, |
|
"learning_rate": 4.466160514232206e-06, |
|
"loss": 0.6193, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 2.896, |
|
"grad_norm": 0.7463104607291, |
|
"learning_rate": 4.410216414245771e-06, |
|
"loss": 0.6887, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 2.912, |
|
"grad_norm": 0.5311673012762305, |
|
"learning_rate": 4.354347093221194e-06, |
|
"loss": 0.8275, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 2.928, |
|
"grad_norm": 0.4765968349930382, |
|
"learning_rate": 4.298559634858202e-06, |
|
"loss": 0.709, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 2.944, |
|
"grad_norm": 0.4041073627021218, |
|
"learning_rate": 4.2428611124771184e-06, |
|
"loss": 0.7103, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.5090780248152671, |
|
"learning_rate": 4.187258588122019e-06, |
|
"loss": 0.7976, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 2.976, |
|
"grad_norm": 0.6082158511853922, |
|
"learning_rate": 4.131759111665349e-06, |
|
"loss": 0.7694, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 2.992, |
|
"grad_norm": 0.5727874160786263, |
|
"learning_rate": 4.076369719914055e-06, |
|
"loss": 0.7123, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.5727874160786263, |
|
"learning_rate": 4.021097435717386e-06, |
|
"loss": 0.5299, |
|
"step": 189 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 310, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 37970102255616.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|