|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 170, |
|
"global_step": 170, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0058823529411764705, |
|
"grad_norm": 0.740064799785614, |
|
"learning_rate": 1e-05, |
|
"loss": 2.4395, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.011764705882352941, |
|
"grad_norm": 0.7219232320785522, |
|
"learning_rate": 9.941176470588236e-06, |
|
"loss": 2.3902, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01764705882352941, |
|
"grad_norm": 0.77315753698349, |
|
"learning_rate": 9.882352941176472e-06, |
|
"loss": 2.4516, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.023529411764705882, |
|
"grad_norm": 0.7578166127204895, |
|
"learning_rate": 9.823529411764706e-06, |
|
"loss": 2.4148, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.029411764705882353, |
|
"grad_norm": 0.7205833196640015, |
|
"learning_rate": 9.764705882352942e-06, |
|
"loss": 2.3372, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03529411764705882, |
|
"grad_norm": 0.7160035967826843, |
|
"learning_rate": 9.705882352941177e-06, |
|
"loss": 2.2849, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.041176470588235294, |
|
"grad_norm": 0.8269237875938416, |
|
"learning_rate": 9.647058823529412e-06, |
|
"loss": 2.3719, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.047058823529411764, |
|
"grad_norm": 0.7316713333129883, |
|
"learning_rate": 9.588235294117649e-06, |
|
"loss": 2.2175, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.052941176470588235, |
|
"grad_norm": 0.7852907776832581, |
|
"learning_rate": 9.529411764705882e-06, |
|
"loss": 2.2489, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.058823529411764705, |
|
"grad_norm": 0.7100040316581726, |
|
"learning_rate": 9.470588235294119e-06, |
|
"loss": 2.1828, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06470588235294118, |
|
"grad_norm": 0.6905198693275452, |
|
"learning_rate": 9.411764705882354e-06, |
|
"loss": 2.1709, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07058823529411765, |
|
"grad_norm": 0.6189457774162292, |
|
"learning_rate": 9.352941176470589e-06, |
|
"loss": 2.1152, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.07647058823529412, |
|
"grad_norm": 0.5859349370002747, |
|
"learning_rate": 9.294117647058824e-06, |
|
"loss": 2.0362, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.08235294117647059, |
|
"grad_norm": 0.6242568492889404, |
|
"learning_rate": 9.23529411764706e-06, |
|
"loss": 2.0808, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.08823529411764706, |
|
"grad_norm": 0.6139904856681824, |
|
"learning_rate": 9.176470588235294e-06, |
|
"loss": 2.017, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.09411764705882353, |
|
"grad_norm": 0.6155012249946594, |
|
"learning_rate": 9.11764705882353e-06, |
|
"loss": 2.0315, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6213613152503967, |
|
"learning_rate": 9.058823529411765e-06, |
|
"loss": 1.9902, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.10588235294117647, |
|
"grad_norm": 0.584740936756134, |
|
"learning_rate": 9e-06, |
|
"loss": 1.9679, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.11176470588235295, |
|
"grad_norm": 0.5694301128387451, |
|
"learning_rate": 8.941176470588237e-06, |
|
"loss": 1.9416, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.11764705882352941, |
|
"grad_norm": 0.5494748950004578, |
|
"learning_rate": 8.88235294117647e-06, |
|
"loss": 1.9129, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12352941176470589, |
|
"grad_norm": 0.5430072546005249, |
|
"learning_rate": 8.823529411764707e-06, |
|
"loss": 1.89, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.12941176470588237, |
|
"grad_norm": 0.5303496718406677, |
|
"learning_rate": 8.764705882352942e-06, |
|
"loss": 1.8751, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.13529411764705881, |
|
"grad_norm": 0.5339208841323853, |
|
"learning_rate": 8.705882352941177e-06, |
|
"loss": 1.8598, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.1411764705882353, |
|
"grad_norm": 0.5348221659660339, |
|
"learning_rate": 8.647058823529413e-06, |
|
"loss": 1.8426, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.14705882352941177, |
|
"grad_norm": 0.4850575923919678, |
|
"learning_rate": 8.588235294117647e-06, |
|
"loss": 1.8126, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.15294117647058825, |
|
"grad_norm": 0.5005661845207214, |
|
"learning_rate": 8.529411764705883e-06, |
|
"loss": 1.8054, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.1588235294117647, |
|
"grad_norm": 0.47416189312934875, |
|
"learning_rate": 8.470588235294118e-06, |
|
"loss": 1.7775, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.16470588235294117, |
|
"grad_norm": 0.49917134642601013, |
|
"learning_rate": 8.411764705882353e-06, |
|
"loss": 1.7834, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.17058823529411765, |
|
"grad_norm": 0.4690726101398468, |
|
"learning_rate": 8.35294117647059e-06, |
|
"loss": 1.769, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.17647058823529413, |
|
"grad_norm": 0.4899074137210846, |
|
"learning_rate": 8.294117647058825e-06, |
|
"loss": 1.7534, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.18235294117647058, |
|
"grad_norm": 0.4322926104068756, |
|
"learning_rate": 8.23529411764706e-06, |
|
"loss": 1.7127, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.18823529411764706, |
|
"grad_norm": 0.4963333010673523, |
|
"learning_rate": 8.176470588235295e-06, |
|
"loss": 1.7316, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.19411764705882353, |
|
"grad_norm": 0.4416678547859192, |
|
"learning_rate": 8.11764705882353e-06, |
|
"loss": 1.6911, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.44732019305229187, |
|
"learning_rate": 8.058823529411766e-06, |
|
"loss": 1.6832, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.20588235294117646, |
|
"grad_norm": 0.4325319528579712, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.6849, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.21176470588235294, |
|
"grad_norm": 0.4243956506252289, |
|
"learning_rate": 7.941176470588236e-06, |
|
"loss": 1.6471, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.21764705882352942, |
|
"grad_norm": 0.41187071800231934, |
|
"learning_rate": 7.882352941176471e-06, |
|
"loss": 1.654, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2235294117647059, |
|
"grad_norm": 0.40401241183280945, |
|
"learning_rate": 7.823529411764706e-06, |
|
"loss": 1.644, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.22941176470588234, |
|
"grad_norm": 0.4079605042934418, |
|
"learning_rate": 7.764705882352941e-06, |
|
"loss": 1.6209, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.23529411764705882, |
|
"grad_norm": 0.37295785546302795, |
|
"learning_rate": 7.705882352941178e-06, |
|
"loss": 1.6111, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2411764705882353, |
|
"grad_norm": 0.37890729308128357, |
|
"learning_rate": 7.647058823529411e-06, |
|
"loss": 1.6122, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.24705882352941178, |
|
"grad_norm": 0.3897000849246979, |
|
"learning_rate": 7.588235294117648e-06, |
|
"loss": 1.594, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.2529411764705882, |
|
"grad_norm": 0.37150734663009644, |
|
"learning_rate": 7.529411764705883e-06, |
|
"loss": 1.5683, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.25882352941176473, |
|
"grad_norm": 0.3686462342739105, |
|
"learning_rate": 7.4705882352941185e-06, |
|
"loss": 1.5578, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.2647058823529412, |
|
"grad_norm": 0.3615223467350006, |
|
"learning_rate": 7.4117647058823535e-06, |
|
"loss": 1.5553, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.27058823529411763, |
|
"grad_norm": 0.341239333152771, |
|
"learning_rate": 7.352941176470589e-06, |
|
"loss": 1.5504, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.27647058823529413, |
|
"grad_norm": 0.32972443103790283, |
|
"learning_rate": 7.294117647058823e-06, |
|
"loss": 1.5523, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.2823529411764706, |
|
"grad_norm": 0.3313795328140259, |
|
"learning_rate": 7.235294117647059e-06, |
|
"loss": 1.5367, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.28823529411764703, |
|
"grad_norm": 0.3319094479084015, |
|
"learning_rate": 7.176470588235295e-06, |
|
"loss": 1.5233, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.29411764705882354, |
|
"grad_norm": 0.3231871426105499, |
|
"learning_rate": 7.11764705882353e-06, |
|
"loss": 1.5064, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.3074081838130951, |
|
"learning_rate": 7.058823529411766e-06, |
|
"loss": 1.4804, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.3058823529411765, |
|
"grad_norm": 0.329453706741333, |
|
"learning_rate": 7e-06, |
|
"loss": 1.5033, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.31176470588235294, |
|
"grad_norm": 0.3119613826274872, |
|
"learning_rate": 6.941176470588236e-06, |
|
"loss": 1.4898, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3176470588235294, |
|
"grad_norm": 0.31654036045074463, |
|
"learning_rate": 6.8823529411764715e-06, |
|
"loss": 1.4599, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3235294117647059, |
|
"grad_norm": 0.29753053188323975, |
|
"learning_rate": 6.8235294117647065e-06, |
|
"loss": 1.4625, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.32941176470588235, |
|
"grad_norm": 0.30820533633232117, |
|
"learning_rate": 6.764705882352942e-06, |
|
"loss": 1.4759, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.3352941176470588, |
|
"grad_norm": 0.29135259985923767, |
|
"learning_rate": 6.705882352941176e-06, |
|
"loss": 1.4699, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.3411764705882353, |
|
"grad_norm": 0.2927163243293762, |
|
"learning_rate": 6.647058823529412e-06, |
|
"loss": 1.4428, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.34705882352941175, |
|
"grad_norm": 0.3006676137447357, |
|
"learning_rate": 6.588235294117647e-06, |
|
"loss": 1.451, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.35294117647058826, |
|
"grad_norm": 0.29078030586242676, |
|
"learning_rate": 6.529411764705883e-06, |
|
"loss": 1.4352, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3588235294117647, |
|
"grad_norm": 0.28280261158943176, |
|
"learning_rate": 6.470588235294119e-06, |
|
"loss": 1.4295, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.36470588235294116, |
|
"grad_norm": 0.3001053035259247, |
|
"learning_rate": 6.411764705882354e-06, |
|
"loss": 1.4375, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.37058823529411766, |
|
"grad_norm": 0.28294065594673157, |
|
"learning_rate": 6.352941176470589e-06, |
|
"loss": 1.4144, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.3764705882352941, |
|
"grad_norm": 0.2832286059856415, |
|
"learning_rate": 6.294117647058824e-06, |
|
"loss": 1.4207, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.38235294117647056, |
|
"grad_norm": 0.2754327952861786, |
|
"learning_rate": 6.2352941176470595e-06, |
|
"loss": 1.4362, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.38823529411764707, |
|
"grad_norm": 0.28400981426239014, |
|
"learning_rate": 6.176470588235295e-06, |
|
"loss": 1.382, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.3941176470588235, |
|
"grad_norm": 0.2783932387828827, |
|
"learning_rate": 6.11764705882353e-06, |
|
"loss": 1.4018, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.270181268453598, |
|
"learning_rate": 6.058823529411765e-06, |
|
"loss": 1.4002, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.40588235294117647, |
|
"grad_norm": 0.28010931611061096, |
|
"learning_rate": 6e-06, |
|
"loss": 1.3927, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.4117647058823529, |
|
"grad_norm": 0.28210070729255676, |
|
"learning_rate": 5.941176470588236e-06, |
|
"loss": 1.3775, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4176470588235294, |
|
"grad_norm": 0.26174265146255493, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 1.3791, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.4235294117647059, |
|
"grad_norm": 0.2730426788330078, |
|
"learning_rate": 5.823529411764707e-06, |
|
"loss": 1.3865, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.4294117647058823, |
|
"grad_norm": 0.25816625356674194, |
|
"learning_rate": 5.764705882352941e-06, |
|
"loss": 1.357, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.43529411764705883, |
|
"grad_norm": 0.25862398743629456, |
|
"learning_rate": 5.705882352941177e-06, |
|
"loss": 1.3597, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4411764705882353, |
|
"grad_norm": 0.2514458894729614, |
|
"learning_rate": 5.6470588235294125e-06, |
|
"loss": 1.3971, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4470588235294118, |
|
"grad_norm": 0.2639279067516327, |
|
"learning_rate": 5.588235294117647e-06, |
|
"loss": 1.3693, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.45294117647058824, |
|
"grad_norm": 0.26090630888938904, |
|
"learning_rate": 5.529411764705883e-06, |
|
"loss": 1.3681, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.4588235294117647, |
|
"grad_norm": 0.2618473470211029, |
|
"learning_rate": 5.470588235294119e-06, |
|
"loss": 1.3568, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.4647058823529412, |
|
"grad_norm": 0.25189754366874695, |
|
"learning_rate": 5.411764705882353e-06, |
|
"loss": 1.3628, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 0.2481844574213028, |
|
"learning_rate": 5.352941176470589e-06, |
|
"loss": 1.3382, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.4764705882352941, |
|
"grad_norm": 0.24728593230247498, |
|
"learning_rate": 5.294117647058824e-06, |
|
"loss": 1.3288, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.4823529411764706, |
|
"grad_norm": 0.25381624698638916, |
|
"learning_rate": 5.23529411764706e-06, |
|
"loss": 1.3215, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.48823529411764705, |
|
"grad_norm": 0.2516557276248932, |
|
"learning_rate": 5.176470588235295e-06, |
|
"loss": 1.3264, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.49411764705882355, |
|
"grad_norm": 0.24683943390846252, |
|
"learning_rate": 5.11764705882353e-06, |
|
"loss": 1.3244, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.24650059640407562, |
|
"learning_rate": 5.058823529411765e-06, |
|
"loss": 1.3259, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5058823529411764, |
|
"grad_norm": 0.2529411017894745, |
|
"learning_rate": 5e-06, |
|
"loss": 1.3313, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5117647058823529, |
|
"grad_norm": 0.2540332078933716, |
|
"learning_rate": 4.941176470588236e-06, |
|
"loss": 1.33, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.5176470588235295, |
|
"grad_norm": 0.25214681029319763, |
|
"learning_rate": 4.882352941176471e-06, |
|
"loss": 1.2992, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5235294117647059, |
|
"grad_norm": 0.27215129137039185, |
|
"learning_rate": 4.823529411764706e-06, |
|
"loss": 1.3119, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5294117647058824, |
|
"grad_norm": 0.2611463665962219, |
|
"learning_rate": 4.764705882352941e-06, |
|
"loss": 1.3265, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5352941176470588, |
|
"grad_norm": 0.2502508759498596, |
|
"learning_rate": 4.705882352941177e-06, |
|
"loss": 1.2926, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5411764705882353, |
|
"grad_norm": 0.26345929503440857, |
|
"learning_rate": 4.647058823529412e-06, |
|
"loss": 1.2975, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.5470588235294118, |
|
"grad_norm": 0.2609890401363373, |
|
"learning_rate": 4.588235294117647e-06, |
|
"loss": 1.2921, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.5529411764705883, |
|
"grad_norm": 0.2622078061103821, |
|
"learning_rate": 4.529411764705883e-06, |
|
"loss": 1.3016, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.5588235294117647, |
|
"grad_norm": 0.2562355101108551, |
|
"learning_rate": 4.4705882352941184e-06, |
|
"loss": 1.2908, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.5647058823529412, |
|
"grad_norm": 0.25484997034072876, |
|
"learning_rate": 4.411764705882353e-06, |
|
"loss": 1.3199, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.5705882352941176, |
|
"grad_norm": 0.25862494111061096, |
|
"learning_rate": 4.352941176470588e-06, |
|
"loss": 1.2855, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.5764705882352941, |
|
"grad_norm": 0.27047714591026306, |
|
"learning_rate": 4.294117647058823e-06, |
|
"loss": 1.3165, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.5823529411764706, |
|
"grad_norm": 0.2632170021533966, |
|
"learning_rate": 4.235294117647059e-06, |
|
"loss": 1.2912, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.25326305627822876, |
|
"learning_rate": 4.176470588235295e-06, |
|
"loss": 1.3053, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5941176470588235, |
|
"grad_norm": 0.26147395372390747, |
|
"learning_rate": 4.11764705882353e-06, |
|
"loss": 1.2973, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.26799634099006653, |
|
"learning_rate": 4.058823529411765e-06, |
|
"loss": 1.2794, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6058823529411764, |
|
"grad_norm": 0.2632071077823639, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.2867, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.611764705882353, |
|
"grad_norm": 0.27080872654914856, |
|
"learning_rate": 3.941176470588236e-06, |
|
"loss": 1.277, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.6176470588235294, |
|
"grad_norm": 0.2697356939315796, |
|
"learning_rate": 3.882352941176471e-06, |
|
"loss": 1.2697, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6235294117647059, |
|
"grad_norm": 0.27979159355163574, |
|
"learning_rate": 3.8235294117647055e-06, |
|
"loss": 1.2746, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6294117647058823, |
|
"grad_norm": 0.2690213620662689, |
|
"learning_rate": 3.7647058823529414e-06, |
|
"loss": 1.2734, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.6352941176470588, |
|
"grad_norm": 0.27870768308639526, |
|
"learning_rate": 3.7058823529411767e-06, |
|
"loss": 1.2707, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.6411764705882353, |
|
"grad_norm": 0.29579660296440125, |
|
"learning_rate": 3.6470588235294117e-06, |
|
"loss": 1.2616, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.6470588235294118, |
|
"grad_norm": 0.2851077914237976, |
|
"learning_rate": 3.5882352941176475e-06, |
|
"loss": 1.2591, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6529411764705882, |
|
"grad_norm": 0.307041198015213, |
|
"learning_rate": 3.529411764705883e-06, |
|
"loss": 1.2522, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.6588235294117647, |
|
"grad_norm": 0.29607197642326355, |
|
"learning_rate": 3.470588235294118e-06, |
|
"loss": 1.2831, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.6647058823529411, |
|
"grad_norm": 0.29029569029808044, |
|
"learning_rate": 3.4117647058823532e-06, |
|
"loss": 1.2539, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.6705882352941176, |
|
"grad_norm": 0.28268927335739136, |
|
"learning_rate": 3.352941176470588e-06, |
|
"loss": 1.2652, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.6764705882352942, |
|
"grad_norm": 0.28747496008872986, |
|
"learning_rate": 3.2941176470588236e-06, |
|
"loss": 1.2394, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.6823529411764706, |
|
"grad_norm": 0.2939983904361725, |
|
"learning_rate": 3.2352941176470594e-06, |
|
"loss": 1.2639, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.6882352941176471, |
|
"grad_norm": 0.2975703179836273, |
|
"learning_rate": 3.1764705882352943e-06, |
|
"loss": 1.2762, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.6941176470588235, |
|
"grad_norm": 0.2900603413581848, |
|
"learning_rate": 3.1176470588235297e-06, |
|
"loss": 1.2623, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.2925064265727997, |
|
"learning_rate": 3.058823529411765e-06, |
|
"loss": 1.27, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7058823529411765, |
|
"grad_norm": 0.2913402318954468, |
|
"learning_rate": 3e-06, |
|
"loss": 1.2558, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.711764705882353, |
|
"grad_norm": 0.3211301863193512, |
|
"learning_rate": 2.9411764705882355e-06, |
|
"loss": 1.2397, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7176470588235294, |
|
"grad_norm": 0.3004200756549835, |
|
"learning_rate": 2.8823529411764704e-06, |
|
"loss": 1.2627, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7235294117647059, |
|
"grad_norm": 0.3165768086910248, |
|
"learning_rate": 2.8235294117647062e-06, |
|
"loss": 1.2388, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.7294117647058823, |
|
"grad_norm": 0.29654860496520996, |
|
"learning_rate": 2.7647058823529416e-06, |
|
"loss": 1.2332, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.7352941176470589, |
|
"grad_norm": 0.3117150068283081, |
|
"learning_rate": 2.7058823529411766e-06, |
|
"loss": 1.2588, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7411764705882353, |
|
"grad_norm": 0.33643701672554016, |
|
"learning_rate": 2.647058823529412e-06, |
|
"loss": 1.2289, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.7470588235294118, |
|
"grad_norm": 0.3130914270877838, |
|
"learning_rate": 2.5882352941176473e-06, |
|
"loss": 1.263, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.7529411764705882, |
|
"grad_norm": 0.3396664559841156, |
|
"learning_rate": 2.5294117647058823e-06, |
|
"loss": 1.2592, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.7588235294117647, |
|
"grad_norm": 0.30291828513145447, |
|
"learning_rate": 2.470588235294118e-06, |
|
"loss": 1.2577, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.7647058823529411, |
|
"grad_norm": 0.32175707817077637, |
|
"learning_rate": 2.411764705882353e-06, |
|
"loss": 1.247, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7705882352941177, |
|
"grad_norm": 0.346138596534729, |
|
"learning_rate": 2.3529411764705885e-06, |
|
"loss": 1.2476, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.7764705882352941, |
|
"grad_norm": 0.3127652406692505, |
|
"learning_rate": 2.2941176470588234e-06, |
|
"loss": 1.2392, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.7823529411764706, |
|
"grad_norm": 0.349590927362442, |
|
"learning_rate": 2.2352941176470592e-06, |
|
"loss": 1.2377, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.788235294117647, |
|
"grad_norm": 0.3107239305973053, |
|
"learning_rate": 2.176470588235294e-06, |
|
"loss": 1.239, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.7941176470588235, |
|
"grad_norm": 0.33791080117225647, |
|
"learning_rate": 2.1176470588235296e-06, |
|
"loss": 1.246, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.3314568102359772, |
|
"learning_rate": 2.058823529411765e-06, |
|
"loss": 1.2402, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8058823529411765, |
|
"grad_norm": 0.3275523781776428, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.2348, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8117647058823529, |
|
"grad_norm": 0.33062854409217834, |
|
"learning_rate": 1.9411764705882353e-06, |
|
"loss": 1.2427, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.8176470588235294, |
|
"grad_norm": 0.35148942470550537, |
|
"learning_rate": 1.8823529411764707e-06, |
|
"loss": 1.2261, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.8235294117647058, |
|
"grad_norm": 0.3389197289943695, |
|
"learning_rate": 1.8235294117647058e-06, |
|
"loss": 1.2362, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8294117647058824, |
|
"grad_norm": 0.3360951244831085, |
|
"learning_rate": 1.7647058823529414e-06, |
|
"loss": 1.2302, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.8352941176470589, |
|
"grad_norm": 0.34131404757499695, |
|
"learning_rate": 1.7058823529411766e-06, |
|
"loss": 1.2266, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.8411764705882353, |
|
"grad_norm": 0.328914076089859, |
|
"learning_rate": 1.6470588235294118e-06, |
|
"loss": 1.2308, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.8470588235294118, |
|
"grad_norm": 0.34804269671440125, |
|
"learning_rate": 1.5882352941176472e-06, |
|
"loss": 1.2212, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.8529411764705882, |
|
"grad_norm": 0.35386762022972107, |
|
"learning_rate": 1.5294117647058826e-06, |
|
"loss": 1.229, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.8588235294117647, |
|
"grad_norm": 0.33942756056785583, |
|
"learning_rate": 1.4705882352941177e-06, |
|
"loss": 1.2434, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.8647058823529412, |
|
"grad_norm": 0.32963618636131287, |
|
"learning_rate": 1.4117647058823531e-06, |
|
"loss": 1.2385, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.8705882352941177, |
|
"grad_norm": 0.3417942225933075, |
|
"learning_rate": 1.3529411764705883e-06, |
|
"loss": 1.2242, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.8764705882352941, |
|
"grad_norm": 0.33753451704978943, |
|
"learning_rate": 1.2941176470588237e-06, |
|
"loss": 1.2234, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.8823529411764706, |
|
"grad_norm": 0.3514120876789093, |
|
"learning_rate": 1.235294117647059e-06, |
|
"loss": 1.2241, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.888235294117647, |
|
"grad_norm": 0.35951969027519226, |
|
"learning_rate": 1.1764705882352942e-06, |
|
"loss": 1.2347, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.8941176470588236, |
|
"grad_norm": 0.3717687726020813, |
|
"learning_rate": 1.1176470588235296e-06, |
|
"loss": 1.2224, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.3542497754096985, |
|
"learning_rate": 1.0588235294117648e-06, |
|
"loss": 1.2236, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.9058823529411765, |
|
"grad_norm": 0.3436025083065033, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.2294, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.9117647058823529, |
|
"grad_norm": 0.37331488728523254, |
|
"learning_rate": 9.411764705882353e-07, |
|
"loss": 1.2046, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.9176470588235294, |
|
"grad_norm": 0.34907183051109314, |
|
"learning_rate": 8.823529411764707e-07, |
|
"loss": 1.2213, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.9235294117647059, |
|
"grad_norm": 0.36500322818756104, |
|
"learning_rate": 8.235294117647059e-07, |
|
"loss": 1.2091, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.9294117647058824, |
|
"grad_norm": 0.38440433144569397, |
|
"learning_rate": 7.647058823529413e-07, |
|
"loss": 1.2249, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.9352941176470588, |
|
"grad_norm": 0.3387817144393921, |
|
"learning_rate": 7.058823529411766e-07, |
|
"loss": 1.2154, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 0.34928762912750244, |
|
"learning_rate": 6.470588235294118e-07, |
|
"loss": 1.2227, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9470588235294117, |
|
"grad_norm": 0.36257097125053406, |
|
"learning_rate": 5.882352941176471e-07, |
|
"loss": 1.2211, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.9529411764705882, |
|
"grad_norm": 0.3723115026950836, |
|
"learning_rate": 5.294117647058824e-07, |
|
"loss": 1.2283, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.9588235294117647, |
|
"grad_norm": 0.3423607349395752, |
|
"learning_rate": 4.7058823529411767e-07, |
|
"loss": 1.2295, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.9647058823529412, |
|
"grad_norm": 0.3787173628807068, |
|
"learning_rate": 4.1176470588235295e-07, |
|
"loss": 1.2201, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.9705882352941176, |
|
"grad_norm": 0.36642688512802124, |
|
"learning_rate": 3.529411764705883e-07, |
|
"loss": 1.2313, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.9764705882352941, |
|
"grad_norm": 0.3594622313976288, |
|
"learning_rate": 2.9411764705882356e-07, |
|
"loss": 1.2128, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.9823529411764705, |
|
"grad_norm": 0.3701726496219635, |
|
"learning_rate": 2.3529411764705883e-07, |
|
"loss": 1.2324, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.9882352941176471, |
|
"grad_norm": 0.34158623218536377, |
|
"learning_rate": 1.7647058823529414e-07, |
|
"loss": 1.2364, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.9941176470588236, |
|
"grad_norm": 0.3631001114845276, |
|
"learning_rate": 1.1764705882352942e-07, |
|
"loss": 1.2191, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.40616530179977417, |
|
"learning_rate": 5.882352941176471e-08, |
|
"loss": 1.2029, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.2080979347229004, |
|
"eval_runtime": 4.1103, |
|
"eval_samples_per_second": 4.379, |
|
"eval_steps_per_second": 0.73, |
|
"step": 170 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 170, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.7061227320088986e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|