|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 10, |
|
"global_step": 522, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005747126436781609, |
|
"grad_norm": 0.10123365372419357, |
|
"learning_rate": 0.0, |
|
"loss": 0.9918, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.011494252873563218, |
|
"grad_norm": 0.09671098738908768, |
|
"learning_rate": 3.7735849056603773e-06, |
|
"loss": 0.9604, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.017241379310344827, |
|
"grad_norm": 0.0981190875172615, |
|
"learning_rate": 7.547169811320755e-06, |
|
"loss": 0.9868, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.022988505747126436, |
|
"grad_norm": 0.10396745055913925, |
|
"learning_rate": 1.1320754716981132e-05, |
|
"loss": 0.962, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.028735632183908046, |
|
"grad_norm": 0.0982985869050026, |
|
"learning_rate": 1.509433962264151e-05, |
|
"loss": 0.9684, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.034482758620689655, |
|
"grad_norm": 0.10332155227661133, |
|
"learning_rate": 1.8867924528301888e-05, |
|
"loss": 0.9442, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.040229885057471264, |
|
"grad_norm": 0.1124059334397316, |
|
"learning_rate": 2.2641509433962265e-05, |
|
"loss": 0.9382, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.04597701149425287, |
|
"grad_norm": 0.12120208889245987, |
|
"learning_rate": 2.641509433962264e-05, |
|
"loss": 0.9416, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.05172413793103448, |
|
"grad_norm": 0.12729395925998688, |
|
"learning_rate": 3.018867924528302e-05, |
|
"loss": 0.9356, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.05747126436781609, |
|
"grad_norm": 0.13560789823532104, |
|
"learning_rate": 3.39622641509434e-05, |
|
"loss": 0.9293, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05747126436781609, |
|
"eval_loss": 1.0470749139785767, |
|
"eval_runtime": 412.2553, |
|
"eval_samples_per_second": 24.009, |
|
"eval_steps_per_second": 0.376, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06321839080459771, |
|
"grad_norm": 0.1474100798368454, |
|
"learning_rate": 3.7735849056603776e-05, |
|
"loss": 0.9533, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.06896551724137931, |
|
"grad_norm": 0.16510824859142303, |
|
"learning_rate": 4.150943396226415e-05, |
|
"loss": 0.9206, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.07471264367816093, |
|
"grad_norm": 0.17097796499729156, |
|
"learning_rate": 4.528301886792453e-05, |
|
"loss": 0.8921, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.08045977011494253, |
|
"grad_norm": 0.17923878133296967, |
|
"learning_rate": 4.9056603773584906e-05, |
|
"loss": 0.8861, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.08620689655172414, |
|
"grad_norm": 0.18173959851264954, |
|
"learning_rate": 5.283018867924528e-05, |
|
"loss": 0.8904, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.09195402298850575, |
|
"grad_norm": 0.17235629260540009, |
|
"learning_rate": 5.660377358490566e-05, |
|
"loss": 0.8424, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.09770114942528736, |
|
"grad_norm": 0.16792210936546326, |
|
"learning_rate": 6.037735849056604e-05, |
|
"loss": 0.8395, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.10344827586206896, |
|
"grad_norm": 0.14939646422863007, |
|
"learning_rate": 6.415094339622641e-05, |
|
"loss": 0.8203, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.10919540229885058, |
|
"grad_norm": 0.14632105827331543, |
|
"learning_rate": 6.79245283018868e-05, |
|
"loss": 0.8464, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.11494252873563218, |
|
"grad_norm": 0.14770475029945374, |
|
"learning_rate": 7.169811320754717e-05, |
|
"loss": 0.8085, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.11494252873563218, |
|
"eval_loss": 0.8244547247886658, |
|
"eval_runtime": 404.4489, |
|
"eval_samples_per_second": 24.473, |
|
"eval_steps_per_second": 0.383, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1206896551724138, |
|
"grad_norm": 0.1725720465183258, |
|
"learning_rate": 7.547169811320755e-05, |
|
"loss": 0.8219, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.12643678160919541, |
|
"grad_norm": 0.1685618907213211, |
|
"learning_rate": 7.924528301886794e-05, |
|
"loss": 0.8148, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.13218390804597702, |
|
"grad_norm": 0.1653290092945099, |
|
"learning_rate": 8.30188679245283e-05, |
|
"loss": 0.7846, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.13793103448275862, |
|
"grad_norm": 0.16122524440288544, |
|
"learning_rate": 8.679245283018869e-05, |
|
"loss": 0.7903, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.14367816091954022, |
|
"grad_norm": 0.12793505191802979, |
|
"learning_rate": 9.056603773584906e-05, |
|
"loss": 0.7741, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.14942528735632185, |
|
"grad_norm": 0.10620377957820892, |
|
"learning_rate": 9.433962264150944e-05, |
|
"loss": 0.7308, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.15517241379310345, |
|
"grad_norm": 0.10993366688489914, |
|
"learning_rate": 9.811320754716981e-05, |
|
"loss": 0.7559, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.16091954022988506, |
|
"grad_norm": 0.11916384100914001, |
|
"learning_rate": 0.0001018867924528302, |
|
"loss": 0.7622, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 0.13500399887561798, |
|
"learning_rate": 0.00010566037735849057, |
|
"loss": 0.7436, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.1724137931034483, |
|
"grad_norm": 0.12777844071388245, |
|
"learning_rate": 0.00010943396226415095, |
|
"loss": 0.7547, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1724137931034483, |
|
"eval_loss": 0.7580565214157104, |
|
"eval_runtime": 404.708, |
|
"eval_samples_per_second": 24.457, |
|
"eval_steps_per_second": 0.383, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1781609195402299, |
|
"grad_norm": 0.11721828579902649, |
|
"learning_rate": 0.00011320754716981132, |
|
"loss": 0.7337, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.1839080459770115, |
|
"grad_norm": 0.08667382597923279, |
|
"learning_rate": 0.0001169811320754717, |
|
"loss": 0.7538, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.1896551724137931, |
|
"grad_norm": 0.06665026396512985, |
|
"learning_rate": 0.00012075471698113207, |
|
"loss": 0.7186, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.19540229885057472, |
|
"grad_norm": 0.04627465456724167, |
|
"learning_rate": 0.00012452830188679244, |
|
"loss": 0.7719, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.20114942528735633, |
|
"grad_norm": 0.04290887340903282, |
|
"learning_rate": 0.00012830188679245283, |
|
"loss": 0.752, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.20689655172413793, |
|
"grad_norm": 0.056834809482097626, |
|
"learning_rate": 0.0001320754716981132, |
|
"loss": 0.7429, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.21264367816091953, |
|
"grad_norm": 0.062055498361587524, |
|
"learning_rate": 0.0001358490566037736, |
|
"loss": 0.7208, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.21839080459770116, |
|
"grad_norm": 0.070551298558712, |
|
"learning_rate": 0.00013962264150943395, |
|
"loss": 0.7651, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.22413793103448276, |
|
"grad_norm": 0.07514140754938126, |
|
"learning_rate": 0.00014339622641509434, |
|
"loss": 0.7456, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.22988505747126436, |
|
"grad_norm": 0.06458627432584763, |
|
"learning_rate": 0.00014716981132075472, |
|
"loss": 0.7289, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.22988505747126436, |
|
"eval_loss": 0.7386028170585632, |
|
"eval_runtime": 407.409, |
|
"eval_samples_per_second": 24.295, |
|
"eval_steps_per_second": 0.38, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.23563218390804597, |
|
"grad_norm": 0.056490588933229446, |
|
"learning_rate": 0.0001509433962264151, |
|
"loss": 0.7503, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.2413793103448276, |
|
"grad_norm": 0.036972932517528534, |
|
"learning_rate": 0.0001547169811320755, |
|
"loss": 0.7392, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.2471264367816092, |
|
"grad_norm": 0.038239240646362305, |
|
"learning_rate": 0.00015849056603773587, |
|
"loss": 0.7206, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.25287356321839083, |
|
"grad_norm": 0.033113010227680206, |
|
"learning_rate": 0.00016226415094339625, |
|
"loss": 0.7198, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.25862068965517243, |
|
"grad_norm": 0.03197947517037392, |
|
"learning_rate": 0.0001660377358490566, |
|
"loss": 0.7393, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.26436781609195403, |
|
"grad_norm": 0.03696918115019798, |
|
"learning_rate": 0.000169811320754717, |
|
"loss": 0.7576, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.27011494252873564, |
|
"grad_norm": 0.04209383204579353, |
|
"learning_rate": 0.00017358490566037738, |
|
"loss": 0.7157, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 0.035038772970438004, |
|
"learning_rate": 0.00017735849056603776, |
|
"loss": 0.7256, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.28160919540229884, |
|
"grad_norm": 0.03674735128879547, |
|
"learning_rate": 0.00018113207547169812, |
|
"loss": 0.7295, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.28735632183908044, |
|
"grad_norm": 0.046050041913986206, |
|
"learning_rate": 0.0001849056603773585, |
|
"loss": 0.6965, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.28735632183908044, |
|
"eval_loss": 0.724204421043396, |
|
"eval_runtime": 405.0004, |
|
"eval_samples_per_second": 24.439, |
|
"eval_steps_per_second": 0.383, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.29310344827586204, |
|
"grad_norm": 0.036520447582006454, |
|
"learning_rate": 0.00018867924528301889, |
|
"loss": 0.7273, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.2988505747126437, |
|
"grad_norm": 0.03720232844352722, |
|
"learning_rate": 0.00019245283018867927, |
|
"loss": 0.7084, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3045977011494253, |
|
"grad_norm": 0.03159736469388008, |
|
"learning_rate": 0.00019622641509433963, |
|
"loss": 0.7485, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3103448275862069, |
|
"grad_norm": 0.03695262596011162, |
|
"learning_rate": 0.0002, |
|
"loss": 0.745, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3160919540229885, |
|
"grad_norm": 0.041795678436756134, |
|
"learning_rate": 0.00019999775651876987, |
|
"loss": 0.7165, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3218390804597701, |
|
"grad_norm": 0.03494727239012718, |
|
"learning_rate": 0.00019999102617574365, |
|
"loss": 0.7499, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.3275862068965517, |
|
"grad_norm": 0.033885981887578964, |
|
"learning_rate": 0.00019997980927290927, |
|
"loss": 0.7118, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.03606434166431427, |
|
"learning_rate": 0.00019996410631356498, |
|
"loss": 0.6945, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.3390804597701149, |
|
"grad_norm": 0.04015219211578369, |
|
"learning_rate": 0.00019994391800229666, |
|
"loss": 0.6982, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"grad_norm": 0.0380714014172554, |
|
"learning_rate": 0.00019991924524494627, |
|
"loss": 0.6848, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"eval_loss": 0.7109408378601074, |
|
"eval_runtime": 404.9798, |
|
"eval_samples_per_second": 24.441, |
|
"eval_steps_per_second": 0.383, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3505747126436782, |
|
"grad_norm": 0.04110811650753021, |
|
"learning_rate": 0.00019989008914857116, |
|
"loss": 0.6899, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.3563218390804598, |
|
"grad_norm": 0.03853503614664078, |
|
"learning_rate": 0.0001998564510213944, |
|
"loss": 0.7094, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.3620689655172414, |
|
"grad_norm": 0.0391794852912426, |
|
"learning_rate": 0.00019981833237274618, |
|
"loss": 0.6975, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.367816091954023, |
|
"grad_norm": 0.03894927725195885, |
|
"learning_rate": 0.00019977573491299598, |
|
"loss": 0.714, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.3735632183908046, |
|
"grad_norm": 0.04239923506975174, |
|
"learning_rate": 0.00019972866055347572, |
|
"loss": 0.7339, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.3793103448275862, |
|
"grad_norm": 0.03982697054743767, |
|
"learning_rate": 0.0001996771114063943, |
|
"loss": 0.6821, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.3850574712643678, |
|
"grad_norm": 0.04431302100419998, |
|
"learning_rate": 0.00019962108978474263, |
|
"loss": 0.7273, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.39080459770114945, |
|
"grad_norm": 0.043787937611341476, |
|
"learning_rate": 0.00019956059820218982, |
|
"loss": 0.6984, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.39655172413793105, |
|
"grad_norm": 0.054389603435993195, |
|
"learning_rate": 0.00019949563937297045, |
|
"loss": 0.6778, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.40229885057471265, |
|
"grad_norm": 0.041256386786699295, |
|
"learning_rate": 0.00019942621621176282, |
|
"loss": 0.693, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.40229885057471265, |
|
"eval_loss": 0.7021871209144592, |
|
"eval_runtime": 406.6755, |
|
"eval_samples_per_second": 24.339, |
|
"eval_steps_per_second": 0.381, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.40804597701149425, |
|
"grad_norm": 0.05022790655493736, |
|
"learning_rate": 0.0001993523318335581, |
|
"loss": 0.6967, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.41379310344827586, |
|
"grad_norm": 0.06086933612823486, |
|
"learning_rate": 0.00019927398955352061, |
|
"loss": 0.7279, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.41954022988505746, |
|
"grad_norm": 0.04689742252230644, |
|
"learning_rate": 0.00019919119288683908, |
|
"loss": 0.6792, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.42528735632183906, |
|
"grad_norm": 0.04852883517742157, |
|
"learning_rate": 0.00019910394554856876, |
|
"loss": 0.701, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.43103448275862066, |
|
"grad_norm": 0.06196567416191101, |
|
"learning_rate": 0.0001990122514534651, |
|
"loss": 0.6805, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4367816091954023, |
|
"grad_norm": 0.047033004462718964, |
|
"learning_rate": 0.00019891611471580764, |
|
"loss": 0.7058, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.4425287356321839, |
|
"grad_norm": 0.047392234206199646, |
|
"learning_rate": 0.00019881553964921572, |
|
"loss": 0.6861, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.4482758620689655, |
|
"grad_norm": 0.054070815443992615, |
|
"learning_rate": 0.00019871053076645488, |
|
"loss": 0.6969, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.4540229885057471, |
|
"grad_norm": 0.055412329733371735, |
|
"learning_rate": 0.00019860109277923418, |
|
"loss": 0.7001, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.45977011494252873, |
|
"grad_norm": 0.05274376645684242, |
|
"learning_rate": 0.00019848723059799506, |
|
"loss": 0.7101, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.45977011494252873, |
|
"eval_loss": 0.694656252861023, |
|
"eval_runtime": 410.9173, |
|
"eval_samples_per_second": 24.088, |
|
"eval_steps_per_second": 0.377, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.46551724137931033, |
|
"grad_norm": 0.05915577709674835, |
|
"learning_rate": 0.00019836894933169088, |
|
"loss": 0.6836, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.47126436781609193, |
|
"grad_norm": 0.051574286073446274, |
|
"learning_rate": 0.0001982462542875576, |
|
"loss": 0.7181, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.47701149425287354, |
|
"grad_norm": 0.050167519599199295, |
|
"learning_rate": 0.00019811915097087587, |
|
"loss": 0.6645, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.4827586206896552, |
|
"grad_norm": 0.06501943618059158, |
|
"learning_rate": 0.00019798764508472373, |
|
"loss": 0.6891, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.4885057471264368, |
|
"grad_norm": 0.05396122857928276, |
|
"learning_rate": 0.00019785174252972092, |
|
"loss": 0.6842, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.4942528735632184, |
|
"grad_norm": 0.051826637238264084, |
|
"learning_rate": 0.0001977114494037641, |
|
"loss": 0.7047, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.05442539602518082, |
|
"learning_rate": 0.00019756677200175315, |
|
"loss": 0.7261, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.5057471264367817, |
|
"grad_norm": 0.05559674650430679, |
|
"learning_rate": 0.0001974177168153088, |
|
"loss": 0.6699, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5114942528735632, |
|
"grad_norm": 0.058047693222761154, |
|
"learning_rate": 0.0001972642905324813, |
|
"loss": 0.6831, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5172413793103449, |
|
"grad_norm": 0.051893047988414764, |
|
"learning_rate": 0.0001971065000374504, |
|
"loss": 0.7293, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5172413793103449, |
|
"eval_loss": 0.6888386607170105, |
|
"eval_runtime": 405.4362, |
|
"eval_samples_per_second": 24.413, |
|
"eval_steps_per_second": 0.382, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5229885057471264, |
|
"grad_norm": 0.051870737224817276, |
|
"learning_rate": 0.0001969443524102163, |
|
"loss": 0.6945, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5287356321839081, |
|
"grad_norm": 0.04907568544149399, |
|
"learning_rate": 0.0001967778549262822, |
|
"loss": 0.6985, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.5344827586206896, |
|
"grad_norm": 0.05802120640873909, |
|
"learning_rate": 0.00019660701505632772, |
|
"loss": 0.6911, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.5402298850574713, |
|
"grad_norm": 0.06809733808040619, |
|
"learning_rate": 0.0001964318404658737, |
|
"loss": 0.6815, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.5459770114942529, |
|
"grad_norm": 0.05489501729607582, |
|
"learning_rate": 0.00019625233901493822, |
|
"loss": 0.6664, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.5517241379310345, |
|
"grad_norm": 0.0648936778306961, |
|
"learning_rate": 0.000196068518757684, |
|
"loss": 0.6689, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.5574712643678161, |
|
"grad_norm": 0.054548367857933044, |
|
"learning_rate": 0.00019588038794205703, |
|
"loss": 0.6695, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.5632183908045977, |
|
"grad_norm": 0.0626642182469368, |
|
"learning_rate": 0.00019568795500941635, |
|
"loss": 0.7062, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.5689655172413793, |
|
"grad_norm": 0.0539688840508461, |
|
"learning_rate": 0.00019549122859415538, |
|
"loss": 0.6891, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.5747126436781609, |
|
"grad_norm": 0.05761811137199402, |
|
"learning_rate": 0.00019529021752331453, |
|
"loss": 0.6852, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5747126436781609, |
|
"eval_loss": 0.6821601986885071, |
|
"eval_runtime": 404.287, |
|
"eval_samples_per_second": 24.483, |
|
"eval_steps_per_second": 0.383, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5804597701149425, |
|
"grad_norm": 0.054896607995033264, |
|
"learning_rate": 0.00019508493081618513, |
|
"loss": 0.6785, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.5862068965517241, |
|
"grad_norm": 0.06048964709043503, |
|
"learning_rate": 0.00019487537768390464, |
|
"loss": 0.6724, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.5919540229885057, |
|
"grad_norm": 0.06828396022319794, |
|
"learning_rate": 0.00019466156752904343, |
|
"loss": 0.7117, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.5977011494252874, |
|
"grad_norm": 0.06610234081745148, |
|
"learning_rate": 0.0001944435099451829, |
|
"loss": 0.6982, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.603448275862069, |
|
"grad_norm": 0.06762486696243286, |
|
"learning_rate": 0.00019422121471648497, |
|
"loss": 0.6768, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6091954022988506, |
|
"grad_norm": 0.05772867798805237, |
|
"learning_rate": 0.0001939946918172531, |
|
"loss": 0.7073, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6149425287356322, |
|
"grad_norm": 0.11993183940649033, |
|
"learning_rate": 0.00019376395141148476, |
|
"loss": 0.6831, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.6206896551724138, |
|
"grad_norm": 0.08105713874101639, |
|
"learning_rate": 0.00019352900385241536, |
|
"loss": 0.6857, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.6264367816091954, |
|
"grad_norm": 0.06035466492176056, |
|
"learning_rate": 0.0001932898596820536, |
|
"loss": 0.672, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.632183908045977, |
|
"grad_norm": 0.09288731962442398, |
|
"learning_rate": 0.0001930465296307087, |
|
"loss": 0.7033, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.632183908045977, |
|
"eval_loss": 0.677044153213501, |
|
"eval_runtime": 405.2323, |
|
"eval_samples_per_second": 24.425, |
|
"eval_steps_per_second": 0.382, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6379310344827587, |
|
"grad_norm": 0.06630638986825943, |
|
"learning_rate": 0.00019279902461650866, |
|
"loss": 0.6831, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.6436781609195402, |
|
"grad_norm": 0.05605092644691467, |
|
"learning_rate": 0.00019254735574491058, |
|
"loss": 0.6654, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.6494252873563219, |
|
"grad_norm": 0.07270795851945877, |
|
"learning_rate": 0.00019229153430820232, |
|
"loss": 0.6744, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.6551724137931034, |
|
"grad_norm": 0.06772006303071976, |
|
"learning_rate": 0.0001920315717849956, |
|
"loss": 0.6833, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.6609195402298851, |
|
"grad_norm": 0.06296226382255554, |
|
"learning_rate": 0.0001917674798397113, |
|
"loss": 0.677, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.06553810834884644, |
|
"learning_rate": 0.00019149927032205587, |
|
"loss": 0.6828, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.6724137931034483, |
|
"grad_norm": 0.057245928794145584, |
|
"learning_rate": 0.00019122695526648968, |
|
"loss": 0.6858, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.6781609195402298, |
|
"grad_norm": 0.06503669917583466, |
|
"learning_rate": 0.00019095054689168705, |
|
"loss": 0.6591, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.6839080459770115, |
|
"grad_norm": 0.05912588909268379, |
|
"learning_rate": 0.00019067005759998797, |
|
"loss": 0.6669, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 0.06517963111400604, |
|
"learning_rate": 0.0001903854999768417, |
|
"loss": 0.6815, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"eval_loss": 0.6735538244247437, |
|
"eval_runtime": 405.8319, |
|
"eval_samples_per_second": 24.389, |
|
"eval_steps_per_second": 0.382, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6954022988505747, |
|
"grad_norm": 0.06089121848344803, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 0.67, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7011494252873564, |
|
"grad_norm": 0.05764375999569893, |
|
"learning_rate": 0.00018980423099015402, |
|
"loss": 0.6733, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7068965517241379, |
|
"grad_norm": 0.06278955936431885, |
|
"learning_rate": 0.00018950754570793384, |
|
"loss": 0.6702, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.7126436781609196, |
|
"grad_norm": 0.06360521912574768, |
|
"learning_rate": 0.00018920684425573865, |
|
"loss": 0.6619, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.7183908045977011, |
|
"grad_norm": 0.0599365159869194, |
|
"learning_rate": 0.00018890214012592975, |
|
"loss": 0.6851, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7241379310344828, |
|
"grad_norm": 0.061885766685009, |
|
"learning_rate": 0.000188593446990467, |
|
"loss": 0.6346, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.7298850574712644, |
|
"grad_norm": 0.061761509627103806, |
|
"learning_rate": 0.00018828077870029552, |
|
"loss": 0.6834, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.735632183908046, |
|
"grad_norm": 0.075982965528965, |
|
"learning_rate": 0.00018796414928472417, |
|
"loss": 0.6279, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.7413793103448276, |
|
"grad_norm": 0.05802853778004646, |
|
"learning_rate": 0.0001876435729507959, |
|
"loss": 0.6348, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.7471264367816092, |
|
"grad_norm": 0.06642711162567139, |
|
"learning_rate": 0.0001873190640826505, |
|
"loss": 0.679, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7471264367816092, |
|
"eval_loss": 0.6707044243812561, |
|
"eval_runtime": 407.4212, |
|
"eval_samples_per_second": 24.294, |
|
"eval_steps_per_second": 0.38, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7528735632183908, |
|
"grad_norm": 0.06452522426843643, |
|
"learning_rate": 0.00018699063724087904, |
|
"loss": 0.6423, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.7586206896551724, |
|
"grad_norm": 0.05988775193691254, |
|
"learning_rate": 0.00018665830716187065, |
|
"loss": 0.6654, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.764367816091954, |
|
"grad_norm": 0.059349820017814636, |
|
"learning_rate": 0.0001863220887571512, |
|
"loss": 0.6866, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.7701149425287356, |
|
"grad_norm": 0.06473397463560104, |
|
"learning_rate": 0.0001859819971127143, |
|
"loss": 0.7014, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.7758620689655172, |
|
"grad_norm": 0.06945810467004776, |
|
"learning_rate": 0.00018563804748834438, |
|
"loss": 0.6769, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.7816091954022989, |
|
"grad_norm": 0.06217830255627632, |
|
"learning_rate": 0.000185290255316932, |
|
"loss": 0.6821, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.7873563218390804, |
|
"grad_norm": 0.07021711021661758, |
|
"learning_rate": 0.00018493863620378122, |
|
"loss": 0.6614, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.7931034482758621, |
|
"grad_norm": 0.0640297532081604, |
|
"learning_rate": 0.00018458320592590975, |
|
"loss": 0.6699, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.7988505747126436, |
|
"grad_norm": 0.0640842542052269, |
|
"learning_rate": 0.00018422398043134067, |
|
"loss": 0.6795, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.8045977011494253, |
|
"grad_norm": 0.07371507585048676, |
|
"learning_rate": 0.00018386097583838714, |
|
"loss": 0.6571, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8045977011494253, |
|
"eval_loss": 0.6682229042053223, |
|
"eval_runtime": 404.8694, |
|
"eval_samples_per_second": 24.447, |
|
"eval_steps_per_second": 0.383, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8103448275862069, |
|
"grad_norm": 0.06185011938214302, |
|
"learning_rate": 0.00018349420843492888, |
|
"loss": 0.6524, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.8160919540229885, |
|
"grad_norm": 0.08427827060222626, |
|
"learning_rate": 0.00018312369467768166, |
|
"loss": 0.6685, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.8218390804597702, |
|
"grad_norm": 0.06529568880796432, |
|
"learning_rate": 0.0001827494511914587, |
|
"loss": 0.659, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 0.07357680797576904, |
|
"learning_rate": 0.0001823714947684247, |
|
"loss": 0.6792, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.09026575833559036, |
|
"learning_rate": 0.00018198984236734246, |
|
"loss": 0.6954, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.8390804597701149, |
|
"grad_norm": 0.06157710403203964, |
|
"learning_rate": 0.000181604511112812, |
|
"loss": 0.6527, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.8448275862068966, |
|
"grad_norm": 0.08122924715280533, |
|
"learning_rate": 0.000181215518294502, |
|
"loss": 0.6571, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.8505747126436781, |
|
"grad_norm": 0.05926045402884483, |
|
"learning_rate": 0.00018082288136637422, |
|
"loss": 0.6773, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.8563218390804598, |
|
"grad_norm": 0.07869191467761993, |
|
"learning_rate": 0.00018042661794590023, |
|
"loss": 0.7066, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.8620689655172413, |
|
"grad_norm": 0.07564139366149902, |
|
"learning_rate": 0.00018002674581327094, |
|
"loss": 0.6491, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8620689655172413, |
|
"eval_loss": 0.6660047769546509, |
|
"eval_runtime": 406.5581, |
|
"eval_samples_per_second": 24.346, |
|
"eval_steps_per_second": 0.381, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.867816091954023, |
|
"grad_norm": 0.05749671533703804, |
|
"learning_rate": 0.00017962328291059888, |
|
"loss": 0.7081, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.8735632183908046, |
|
"grad_norm": 0.08154609054327011, |
|
"learning_rate": 0.00017921624734111292, |
|
"loss": 0.6622, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.8793103448275862, |
|
"grad_norm": 0.08773736655712128, |
|
"learning_rate": 0.0001788056573683464, |
|
"loss": 0.6393, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.8850574712643678, |
|
"grad_norm": 0.06756340712308884, |
|
"learning_rate": 0.00017839153141531718, |
|
"loss": 0.6384, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.8908045977011494, |
|
"grad_norm": 0.08763930201530457, |
|
"learning_rate": 0.00017797388806370132, |
|
"loss": 0.6512, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.896551724137931, |
|
"grad_norm": 0.0647486001253128, |
|
"learning_rate": 0.00017755274605299923, |
|
"loss": 0.6502, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.9022988505747126, |
|
"grad_norm": 0.11679747700691223, |
|
"learning_rate": 0.00017712812427969485, |
|
"loss": 0.6666, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.9080459770114943, |
|
"grad_norm": 0.06472433358430862, |
|
"learning_rate": 0.00017670004179640774, |
|
"loss": 0.6495, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.9137931034482759, |
|
"grad_norm": 0.09902803599834442, |
|
"learning_rate": 0.0001762685178110382, |
|
"loss": 0.6747, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.9195402298850575, |
|
"grad_norm": 0.06362438946962357, |
|
"learning_rate": 0.0001758335716859055, |
|
"loss": 0.7015, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9195402298850575, |
|
"eval_loss": 0.663636326789856, |
|
"eval_runtime": 404.5915, |
|
"eval_samples_per_second": 24.464, |
|
"eval_steps_per_second": 0.383, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9252873563218391, |
|
"grad_norm": 0.07304941862821579, |
|
"learning_rate": 0.00017539522293687898, |
|
"loss": 0.6825, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.9310344827586207, |
|
"grad_norm": 0.08923015743494034, |
|
"learning_rate": 0.00017495349123250242, |
|
"loss": 0.674, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.9367816091954023, |
|
"grad_norm": 0.062135376036167145, |
|
"learning_rate": 0.00017450839639311162, |
|
"loss": 0.6477, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.9425287356321839, |
|
"grad_norm": 0.1098598912358284, |
|
"learning_rate": 0.00017405995838994494, |
|
"loss": 0.6742, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.9482758620689655, |
|
"grad_norm": 0.06947540491819382, |
|
"learning_rate": 0.00017360819734424715, |
|
"loss": 0.6509, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.9540229885057471, |
|
"grad_norm": 0.11134368181228638, |
|
"learning_rate": 0.0001731531335263669, |
|
"loss": 0.6602, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.9597701149425287, |
|
"grad_norm": 0.06717904657125473, |
|
"learning_rate": 0.00017269478735484683, |
|
"loss": 0.6697, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.9655172413793104, |
|
"grad_norm": 0.06737629324197769, |
|
"learning_rate": 0.00017223317939550753, |
|
"loss": 0.6636, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.9712643678160919, |
|
"grad_norm": 0.08558724075555801, |
|
"learning_rate": 0.00017176833036052495, |
|
"loss": 0.6733, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.9770114942528736, |
|
"grad_norm": 0.07127804309129715, |
|
"learning_rate": 0.0001713002611075007, |
|
"loss": 0.6523, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9770114942528736, |
|
"eval_loss": 0.6618800759315491, |
|
"eval_runtime": 411.375, |
|
"eval_samples_per_second": 24.061, |
|
"eval_steps_per_second": 0.377, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9827586206896551, |
|
"grad_norm": 0.08060283958911896, |
|
"learning_rate": 0.0001708289926385265, |
|
"loss": 0.658, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.9885057471264368, |
|
"grad_norm": 0.06496579200029373, |
|
"learning_rate": 0.0001703545460992416, |
|
"loss": 0.6697, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.9942528735632183, |
|
"grad_norm": 0.0646037757396698, |
|
"learning_rate": 0.00016987694277788417, |
|
"loss": 0.6231, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.08516079187393188, |
|
"learning_rate": 0.0001693962041043359, |
|
"loss": 0.6374, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.0057471264367817, |
|
"grad_norm": 0.06554190069437027, |
|
"learning_rate": 0.00016891235164916065, |
|
"loss": 0.6271, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.0114942528735633, |
|
"grad_norm": 0.06361629068851471, |
|
"learning_rate": 0.00016842540712263637, |
|
"loss": 0.649, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.0172413793103448, |
|
"grad_norm": 0.0814083069562912, |
|
"learning_rate": 0.00016793539237378128, |
|
"loss": 0.654, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.0229885057471264, |
|
"grad_norm": 0.06498701125383377, |
|
"learning_rate": 0.00016744232938937308, |
|
"loss": 0.6313, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.028735632183908, |
|
"grad_norm": 0.11292543262243271, |
|
"learning_rate": 0.0001669462402929629, |
|
"loss": 0.6803, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.0344827586206897, |
|
"grad_norm": 0.0661187544465065, |
|
"learning_rate": 0.00016644714734388217, |
|
"loss": 0.6672, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.0344827586206897, |
|
"eval_loss": 0.6602174043655396, |
|
"eval_runtime": 410.2914, |
|
"eval_samples_per_second": 24.124, |
|
"eval_steps_per_second": 0.378, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.0402298850574712, |
|
"grad_norm": 0.08441785722970963, |
|
"learning_rate": 0.00016594507293624425, |
|
"loss": 0.6257, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.0459770114942528, |
|
"grad_norm": 0.09075969457626343, |
|
"learning_rate": 0.00016544003959793925, |
|
"loss": 0.641, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.0517241379310345, |
|
"grad_norm": 0.07677901536226273, |
|
"learning_rate": 0.00016493206998962354, |
|
"loss": 0.6351, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.0574712643678161, |
|
"grad_norm": 0.09646302461624146, |
|
"learning_rate": 0.0001644211869037027, |
|
"loss": 0.6635, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.0632183908045978, |
|
"grad_norm": 0.06928115338087082, |
|
"learning_rate": 0.00016390741326330907, |
|
"loss": 0.6458, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.0689655172413792, |
|
"grad_norm": 0.1076992079615593, |
|
"learning_rate": 0.00016339077212127294, |
|
"loss": 0.6209, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.0747126436781609, |
|
"grad_norm": 0.08489565551280975, |
|
"learning_rate": 0.0001628712866590885, |
|
"loss": 0.6336, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.0804597701149425, |
|
"grad_norm": 0.11920158565044403, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 0.6496, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.0862068965517242, |
|
"grad_norm": 0.07987701892852783, |
|
"learning_rate": 0.00016182387613732291, |
|
"loss": 0.668, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.0919540229885056, |
|
"grad_norm": 0.1095438227057457, |
|
"learning_rate": 0.00016129599807465875, |
|
"loss": 0.6862, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.0919540229885056, |
|
"eval_loss": 0.6588147282600403, |
|
"eval_runtime": 406.5115, |
|
"eval_samples_per_second": 24.349, |
|
"eval_steps_per_second": 0.381, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.0977011494252873, |
|
"grad_norm": 0.08076825737953186, |
|
"learning_rate": 0.0001607653696835713, |
|
"loss": 0.6367, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.103448275862069, |
|
"grad_norm": 0.09829648584127426, |
|
"learning_rate": 0.00016023201477315731, |
|
"loss": 0.6391, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.1091954022988506, |
|
"grad_norm": 0.09008080512285233, |
|
"learning_rate": 0.0001596959572748514, |
|
"loss": 0.6462, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.1149425287356323, |
|
"grad_norm": 0.07725552469491959, |
|
"learning_rate": 0.00015915722124135227, |
|
"loss": 0.6356, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.1206896551724137, |
|
"grad_norm": 0.08215273171663284, |
|
"learning_rate": 0.00015861583084554349, |
|
"loss": 0.6557, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.1264367816091954, |
|
"grad_norm": 0.07044622302055359, |
|
"learning_rate": 0.0001580718103794089, |
|
"loss": 0.6401, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.132183908045977, |
|
"grad_norm": 0.06852877885103226, |
|
"learning_rate": 0.00015752518425294257, |
|
"loss": 0.6641, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.1379310344827587, |
|
"grad_norm": 0.07775932550430298, |
|
"learning_rate": 0.00015697597699305366, |
|
"loss": 0.6689, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.1436781609195403, |
|
"grad_norm": 0.07384389638900757, |
|
"learning_rate": 0.00015642421324246568, |
|
"loss": 0.663, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.1494252873563218, |
|
"grad_norm": 0.074593685567379, |
|
"learning_rate": 0.00015586991775861102, |
|
"loss": 0.6755, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.1494252873563218, |
|
"eval_loss": 0.6577329635620117, |
|
"eval_runtime": 406.5534, |
|
"eval_samples_per_second": 24.346, |
|
"eval_steps_per_second": 0.381, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.1551724137931034, |
|
"grad_norm": 0.07201389968395233, |
|
"learning_rate": 0.00015531311541251995, |
|
"loss": 0.62, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.160919540229885, |
|
"grad_norm": 0.07052464783191681, |
|
"learning_rate": 0.00015475383118770472, |
|
"loss": 0.6456, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.1666666666666667, |
|
"grad_norm": 0.07045558094978333, |
|
"learning_rate": 0.00015419209017903852, |
|
"loss": 0.6421, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.1724137931034484, |
|
"grad_norm": 0.0870729386806488, |
|
"learning_rate": 0.0001536279175916296, |
|
"loss": 0.6342, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.1781609195402298, |
|
"grad_norm": 0.0703926831483841, |
|
"learning_rate": 0.0001530613387396901, |
|
"loss": 0.6533, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.1839080459770115, |
|
"grad_norm": 0.07181324064731598, |
|
"learning_rate": 0.0001524923790454004, |
|
"loss": 0.6511, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.1896551724137931, |
|
"grad_norm": 0.07455940544605255, |
|
"learning_rate": 0.00015192106403776848, |
|
"loss": 0.6363, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.1954022988505748, |
|
"grad_norm": 0.08370154350996017, |
|
"learning_rate": 0.0001513474193514842, |
|
"loss": 0.6517, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.2011494252873562, |
|
"grad_norm": 0.08015818893909454, |
|
"learning_rate": 0.00015077147072576933, |
|
"loss": 0.6264, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.206896551724138, |
|
"grad_norm": 0.093206986784935, |
|
"learning_rate": 0.00015019324400322243, |
|
"loss": 0.6279, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.206896551724138, |
|
"eval_loss": 0.6562607884407043, |
|
"eval_runtime": 407.9222, |
|
"eval_samples_per_second": 24.264, |
|
"eval_steps_per_second": 0.38, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.2126436781609196, |
|
"grad_norm": 0.07707002758979797, |
|
"learning_rate": 0.00014961276512865954, |
|
"loss": 0.6726, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.2183908045977012, |
|
"grad_norm": 0.08275868743658066, |
|
"learning_rate": 0.00014903006014794983, |
|
"loss": 0.6493, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.2241379310344827, |
|
"grad_norm": 0.11222587525844574, |
|
"learning_rate": 0.00014844515520684703, |
|
"loss": 0.6367, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.2298850574712643, |
|
"grad_norm": 0.09210342168807983, |
|
"learning_rate": 0.00014785807654981627, |
|
"loss": 0.6734, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.235632183908046, |
|
"grad_norm": 0.08821109682321548, |
|
"learning_rate": 0.00014726885051885653, |
|
"loss": 0.6354, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.2413793103448276, |
|
"grad_norm": 0.12253956496715546, |
|
"learning_rate": 0.0001466775035523186, |
|
"loss": 0.6412, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.2471264367816093, |
|
"grad_norm": 0.08476684242486954, |
|
"learning_rate": 0.00014608406218371894, |
|
"loss": 0.6635, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.2528735632183907, |
|
"grad_norm": 0.08554086089134216, |
|
"learning_rate": 0.00014548855304054886, |
|
"loss": 0.6403, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.2586206896551724, |
|
"grad_norm": 0.10986476391553879, |
|
"learning_rate": 0.00014489100284308017, |
|
"loss": 0.6253, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.264367816091954, |
|
"grad_norm": 0.09221742302179337, |
|
"learning_rate": 0.00014429143840316585, |
|
"loss": 0.6622, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.264367816091954, |
|
"eval_loss": 0.6551185250282288, |
|
"eval_runtime": 408.2025, |
|
"eval_samples_per_second": 24.248, |
|
"eval_steps_per_second": 0.38, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.2701149425287357, |
|
"grad_norm": 0.08050013333559036, |
|
"learning_rate": 0.00014368988662303732, |
|
"loss": 0.6226, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.2758620689655173, |
|
"grad_norm": 0.16257594525814056, |
|
"learning_rate": 0.00014308637449409706, |
|
"loss": 0.6661, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.2816091954022988, |
|
"grad_norm": 0.07793809473514557, |
|
"learning_rate": 0.00014248092909570774, |
|
"loss": 0.6243, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.2873563218390804, |
|
"grad_norm": 0.0975632593035698, |
|
"learning_rate": 0.00014187357759397714, |
|
"loss": 0.6348, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.293103448275862, |
|
"grad_norm": 0.07041144371032715, |
|
"learning_rate": 0.00014126434724053913, |
|
"loss": 0.6386, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.2988505747126438, |
|
"grad_norm": 0.12080610543489456, |
|
"learning_rate": 0.00014065326537133094, |
|
"loss": 0.6276, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.3045977011494254, |
|
"grad_norm": 0.09340126812458038, |
|
"learning_rate": 0.0001400403594053667, |
|
"loss": 0.6431, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.3103448275862069, |
|
"grad_norm": 0.09178619831800461, |
|
"learning_rate": 0.00013942565684350698, |
|
"loss": 0.6457, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.3160919540229885, |
|
"grad_norm": 0.134804829955101, |
|
"learning_rate": 0.00013880918526722497, |
|
"loss": 0.6247, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.3218390804597702, |
|
"grad_norm": 0.07517404854297638, |
|
"learning_rate": 0.00013819097233736888, |
|
"loss": 0.6329, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.3218390804597702, |
|
"eval_loss": 0.6541800498962402, |
|
"eval_runtime": 404.9523, |
|
"eval_samples_per_second": 24.442, |
|
"eval_steps_per_second": 0.383, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.3275862068965516, |
|
"grad_norm": 0.1385478675365448, |
|
"learning_rate": 0.00013757104579292082, |
|
"loss": 0.6697, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.08156240731477737, |
|
"learning_rate": 0.00013694943344975212, |
|
"loss": 0.6279, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.339080459770115, |
|
"grad_norm": 0.10937108844518661, |
|
"learning_rate": 0.00013632616319937522, |
|
"loss": 0.6487, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.3448275862068966, |
|
"grad_norm": 0.12300366908311844, |
|
"learning_rate": 0.00013570126300769232, |
|
"loss": 0.6456, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.3505747126436782, |
|
"grad_norm": 0.07707128673791885, |
|
"learning_rate": 0.0001350747609137404, |
|
"loss": 0.6302, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.3563218390804597, |
|
"grad_norm": 0.0954674631357193, |
|
"learning_rate": 0.0001344466850284333, |
|
"loss": 0.6184, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.3620689655172413, |
|
"grad_norm": 0.10317125916481018, |
|
"learning_rate": 0.00013381706353330014, |
|
"loss": 0.6618, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.367816091954023, |
|
"grad_norm": 0.08765599131584167, |
|
"learning_rate": 0.0001331859246792211, |
|
"loss": 0.6191, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.3735632183908046, |
|
"grad_norm": 0.10305018723011017, |
|
"learning_rate": 0.0001325532967851596, |
|
"loss": 0.6397, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"grad_norm": 0.08769567310810089, |
|
"learning_rate": 0.00013191920823689177, |
|
"loss": 0.6559, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"eval_loss": 0.6528159379959106, |
|
"eval_runtime": 407.607, |
|
"eval_samples_per_second": 24.283, |
|
"eval_steps_per_second": 0.38, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.3850574712643677, |
|
"grad_norm": 0.09783841669559479, |
|
"learning_rate": 0.00013128368748573273, |
|
"loss": 0.6736, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.3908045977011494, |
|
"grad_norm": 0.08165410906076431, |
|
"learning_rate": 0.00013064676304726, |
|
"loss": 0.6467, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.396551724137931, |
|
"grad_norm": 0.10928885638713837, |
|
"learning_rate": 0.0001300084635000341, |
|
"loss": 0.6956, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.4022988505747127, |
|
"grad_norm": 0.09388460218906403, |
|
"learning_rate": 0.000129368817484316, |
|
"loss": 0.6474, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.4080459770114944, |
|
"grad_norm": 0.08257792145013809, |
|
"learning_rate": 0.0001287278537007824, |
|
"loss": 0.6301, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.4137931034482758, |
|
"grad_norm": 0.07570406794548035, |
|
"learning_rate": 0.00012808560090923758, |
|
"loss": 0.6238, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.4195402298850575, |
|
"grad_norm": 0.097509004175663, |
|
"learning_rate": 0.00012744208792732324, |
|
"loss": 0.6383, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.4252873563218391, |
|
"grad_norm": 0.07778667658567429, |
|
"learning_rate": 0.00012679734362922528, |
|
"loss": 0.642, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.4310344827586206, |
|
"grad_norm": 0.08389262855052948, |
|
"learning_rate": 0.00012615139694437835, |
|
"loss": 0.6152, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.4367816091954024, |
|
"grad_norm": 0.08290071040391922, |
|
"learning_rate": 0.00012550427685616765, |
|
"loss": 0.6389, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.4367816091954024, |
|
"eval_loss": 0.6516815423965454, |
|
"eval_runtime": 411.2719, |
|
"eval_samples_per_second": 24.067, |
|
"eval_steps_per_second": 0.377, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.4425287356321839, |
|
"grad_norm": 0.08134254068136215, |
|
"learning_rate": 0.00012485601240062869, |
|
"loss": 0.6365, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.4482758620689655, |
|
"grad_norm": 0.11836981773376465, |
|
"learning_rate": 0.00012420663266514417, |
|
"loss": 0.6345, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.4540229885057472, |
|
"grad_norm": 0.07629366219043732, |
|
"learning_rate": 0.0001235561667871391, |
|
"loss": 0.6365, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.4597701149425286, |
|
"grad_norm": 0.09142953902482986, |
|
"learning_rate": 0.0001229046439527732, |
|
"loss": 0.6316, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.4655172413793103, |
|
"grad_norm": 0.12063657492399216, |
|
"learning_rate": 0.00012225209339563145, |
|
"loss": 0.6221, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.471264367816092, |
|
"grad_norm": 0.07524894177913666, |
|
"learning_rate": 0.00012159854439541245, |
|
"loss": 0.6485, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.4770114942528736, |
|
"grad_norm": 0.08384133875370026, |
|
"learning_rate": 0.00012094402627661447, |
|
"loss": 0.6607, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.4827586206896552, |
|
"grad_norm": 0.08039575815200806, |
|
"learning_rate": 0.00012028856840721974, |
|
"loss": 0.6764, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.4885057471264367, |
|
"grad_norm": 0.09115740656852722, |
|
"learning_rate": 0.00011963220019737691, |
|
"loss": 0.6587, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.4942528735632183, |
|
"grad_norm": 0.08291927725076675, |
|
"learning_rate": 0.00011897495109808107, |
|
"loss": 0.6476, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.4942528735632183, |
|
"eval_loss": 0.6506026983261108, |
|
"eval_runtime": 407.6949, |
|
"eval_samples_per_second": 24.278, |
|
"eval_steps_per_second": 0.38, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.09679999202489853, |
|
"learning_rate": 0.00011831685059985262, |
|
"loss": 0.6378, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.5057471264367817, |
|
"grad_norm": 0.07858405262231827, |
|
"learning_rate": 0.00011765792823141384, |
|
"loss": 0.6679, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.5114942528735633, |
|
"grad_norm": 0.07274090498685837, |
|
"learning_rate": 0.00011699821355836409, |
|
"loss": 0.6199, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.5172413793103448, |
|
"grad_norm": 0.11862179636955261, |
|
"learning_rate": 0.00011633773618185302, |
|
"loss": 0.6369, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.5229885057471264, |
|
"grad_norm": 0.08915189653635025, |
|
"learning_rate": 0.00011567652573725262, |
|
"loss": 0.6248, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.528735632183908, |
|
"grad_norm": 0.12184260040521622, |
|
"learning_rate": 0.00011501461189282733, |
|
"loss": 0.645, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.5344827586206895, |
|
"grad_norm": 0.09939936548471451, |
|
"learning_rate": 0.00011435202434840287, |
|
"loss": 0.6382, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.5402298850574714, |
|
"grad_norm": 0.07167995721101761, |
|
"learning_rate": 0.0001136887928340336, |
|
"loss": 0.6064, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.5459770114942528, |
|
"grad_norm": 0.09978017210960388, |
|
"learning_rate": 0.00011302494710866857, |
|
"loss": 0.6467, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.5517241379310345, |
|
"grad_norm": 0.09598653763532639, |
|
"learning_rate": 0.00011236051695881633, |
|
"loss": 0.6412, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.5517241379310345, |
|
"eval_loss": 0.6497076749801636, |
|
"eval_runtime": 407.5672, |
|
"eval_samples_per_second": 24.286, |
|
"eval_steps_per_second": 0.38, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.5574712643678161, |
|
"grad_norm": 0.08118661493062973, |
|
"learning_rate": 0.00011169553219720828, |
|
"loss": 0.6659, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.5632183908045976, |
|
"grad_norm": 0.11158329248428345, |
|
"learning_rate": 0.00011103002266146096, |
|
"loss": 0.6578, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.5689655172413794, |
|
"grad_norm": 0.12230509519577026, |
|
"learning_rate": 0.0001103640182127375, |
|
"loss": 0.6187, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.5747126436781609, |
|
"grad_norm": 0.07973505556583405, |
|
"learning_rate": 0.00010969754873440743, |
|
"loss": 0.6507, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.5804597701149425, |
|
"grad_norm": 0.07436943054199219, |
|
"learning_rate": 0.00010903064413070612, |
|
"loss": 0.6381, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.5862068965517242, |
|
"grad_norm": 0.0804380401968956, |
|
"learning_rate": 0.00010836333432539272, |
|
"loss": 0.6302, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.5919540229885056, |
|
"grad_norm": 0.07640023529529572, |
|
"learning_rate": 0.00010769564926040769, |
|
"loss": 0.618, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.5977011494252875, |
|
"grad_norm": 0.0787947028875351, |
|
"learning_rate": 0.0001070276188945293, |
|
"loss": 0.6308, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.603448275862069, |
|
"grad_norm": 0.08764500916004181, |
|
"learning_rate": 0.00010635927320202928, |
|
"loss": 0.6316, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.6091954022988506, |
|
"grad_norm": 0.07885821908712387, |
|
"learning_rate": 0.00010569064217132791, |
|
"loss": 0.6232, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.6091954022988506, |
|
"eval_loss": 0.6484516859054565, |
|
"eval_runtime": 406.5349, |
|
"eval_samples_per_second": 24.347, |
|
"eval_steps_per_second": 0.381, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.6149425287356323, |
|
"grad_norm": 0.08910427987575531, |
|
"learning_rate": 0.00010502175580364857, |
|
"loss": 0.6207, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.6206896551724137, |
|
"grad_norm": 0.08195802569389343, |
|
"learning_rate": 0.00010435264411167148, |
|
"loss": 0.6604, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.6264367816091954, |
|
"grad_norm": 0.09276524186134338, |
|
"learning_rate": 0.0001036833371181871, |
|
"loss": 0.6444, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.632183908045977, |
|
"grad_norm": 0.07577691972255707, |
|
"learning_rate": 0.00010301386485474889, |
|
"loss": 0.6439, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.6379310344827587, |
|
"grad_norm": 0.07871613651514053, |
|
"learning_rate": 0.00010234425736032607, |
|
"loss": 0.639, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.6436781609195403, |
|
"grad_norm": 0.07570876181125641, |
|
"learning_rate": 0.00010167454467995549, |
|
"loss": 0.6056, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.6494252873563218, |
|
"grad_norm": 0.09836837649345398, |
|
"learning_rate": 0.00010100475686339379, |
|
"loss": 0.6341, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.6551724137931034, |
|
"grad_norm": 0.08796896785497665, |
|
"learning_rate": 0.00010033492396376878, |
|
"loss": 0.6193, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.660919540229885, |
|
"grad_norm": 0.07815764099359512, |
|
"learning_rate": 9.966507603623125e-05, |
|
"loss": 0.6227, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.13016292452812195, |
|
"learning_rate": 9.899524313660624e-05, |
|
"loss": 0.6243, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"eval_loss": 0.6477526426315308, |
|
"eval_runtime": 405.0855, |
|
"eval_samples_per_second": 24.434, |
|
"eval_steps_per_second": 0.383, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.6724137931034484, |
|
"grad_norm": 0.09747885912656784, |
|
"learning_rate": 9.832545532004454e-05, |
|
"loss": 0.6328, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.6781609195402298, |
|
"grad_norm": 0.10131366550922394, |
|
"learning_rate": 9.765574263967396e-05, |
|
"loss": 0.6212, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.6839080459770115, |
|
"grad_norm": 0.1203976571559906, |
|
"learning_rate": 9.698613514525116e-05, |
|
"loss": 0.6563, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.6896551724137931, |
|
"grad_norm": 0.07119957357645035, |
|
"learning_rate": 9.631666288181293e-05, |
|
"loss": 0.6278, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.6954022988505746, |
|
"grad_norm": 0.11370845884084702, |
|
"learning_rate": 9.564735588832856e-05, |
|
"loss": 0.6376, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.7011494252873565, |
|
"grad_norm": 0.07851264625787735, |
|
"learning_rate": 9.497824419635144e-05, |
|
"loss": 0.6149, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.706896551724138, |
|
"grad_norm": 0.0818655788898468, |
|
"learning_rate": 9.430935782867212e-05, |
|
"loss": 0.6048, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.7126436781609196, |
|
"grad_norm": 0.07335007190704346, |
|
"learning_rate": 9.364072679797073e-05, |
|
"loss": 0.6292, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.7183908045977012, |
|
"grad_norm": 0.07759315520524979, |
|
"learning_rate": 9.297238110547074e-05, |
|
"loss": 0.6464, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"grad_norm": 0.0833640992641449, |
|
"learning_rate": 9.230435073959232e-05, |
|
"loss": 0.6467, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"eval_loss": 0.6469475030899048, |
|
"eval_runtime": 408.9385, |
|
"eval_samples_per_second": 24.204, |
|
"eval_steps_per_second": 0.379, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.7298850574712645, |
|
"grad_norm": 0.08030898869037628, |
|
"learning_rate": 9.163666567460733e-05, |
|
"loss": 0.6268, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.735632183908046, |
|
"grad_norm": 0.08017026633024216, |
|
"learning_rate": 9.096935586929392e-05, |
|
"loss": 0.6367, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.7413793103448276, |
|
"grad_norm": 0.07945988327264786, |
|
"learning_rate": 9.030245126559262e-05, |
|
"loss": 0.6318, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.7471264367816093, |
|
"grad_norm": 0.09426795691251755, |
|
"learning_rate": 8.963598178726254e-05, |
|
"loss": 0.6399, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.7528735632183907, |
|
"grad_norm": 0.08182523399591446, |
|
"learning_rate": 8.896997733853903e-05, |
|
"loss": 0.6203, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.7586206896551724, |
|
"grad_norm": 0.07778620719909668, |
|
"learning_rate": 8.830446780279176e-05, |
|
"loss": 0.6816, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.764367816091954, |
|
"grad_norm": 0.11482707411050797, |
|
"learning_rate": 8.763948304118368e-05, |
|
"loss": 0.6442, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.7701149425287355, |
|
"grad_norm": 0.07546856999397278, |
|
"learning_rate": 8.697505289133145e-05, |
|
"loss": 0.6445, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.7758620689655173, |
|
"grad_norm": 0.11665278673171997, |
|
"learning_rate": 8.631120716596641e-05, |
|
"loss": 0.6374, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.7816091954022988, |
|
"grad_norm": 0.1181105300784111, |
|
"learning_rate": 8.564797565159714e-05, |
|
"loss": 0.6146, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.7816091954022988, |
|
"eval_loss": 0.6459708213806152, |
|
"eval_runtime": 405.0602, |
|
"eval_samples_per_second": 24.436, |
|
"eval_steps_per_second": 0.383, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.7873563218390804, |
|
"grad_norm": 0.07805997133255005, |
|
"learning_rate": 8.498538810717267e-05, |
|
"loss": 0.6679, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.793103448275862, |
|
"grad_norm": 0.08421120047569275, |
|
"learning_rate": 8.432347426274739e-05, |
|
"loss": 0.642, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.7988505747126435, |
|
"grad_norm": 0.10425391793251038, |
|
"learning_rate": 8.366226381814697e-05, |
|
"loss": 0.6354, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.8045977011494254, |
|
"grad_norm": 0.08861584216356277, |
|
"learning_rate": 8.300178644163594e-05, |
|
"loss": 0.6397, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.8103448275862069, |
|
"grad_norm": 0.08726219832897186, |
|
"learning_rate": 8.234207176858614e-05, |
|
"loss": 0.6474, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.8160919540229885, |
|
"grad_norm": 0.12218604981899261, |
|
"learning_rate": 8.16831494001474e-05, |
|
"loss": 0.6459, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.8218390804597702, |
|
"grad_norm": 0.08113615214824677, |
|
"learning_rate": 8.102504890191892e-05, |
|
"loss": 0.6114, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.8275862068965516, |
|
"grad_norm": 0.08763635903596878, |
|
"learning_rate": 8.036779980262311e-05, |
|
"loss": 0.6602, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.8333333333333335, |
|
"grad_norm": 0.1053246557712555, |
|
"learning_rate": 7.971143159278026e-05, |
|
"loss": 0.6182, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.839080459770115, |
|
"grad_norm": 0.09522312134504318, |
|
"learning_rate": 7.905597372338558e-05, |
|
"loss": 0.6386, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.839080459770115, |
|
"eval_loss": 0.6449984908103943, |
|
"eval_runtime": 405.9165, |
|
"eval_samples_per_second": 24.384, |
|
"eval_steps_per_second": 0.382, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.8448275862068966, |
|
"grad_norm": 0.09493348747491837, |
|
"learning_rate": 7.840145560458756e-05, |
|
"loss": 0.6522, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.8505747126436782, |
|
"grad_norm": 0.10554379224777222, |
|
"learning_rate": 7.774790660436858e-05, |
|
"loss": 0.6401, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.8563218390804597, |
|
"grad_norm": 0.09237196296453476, |
|
"learning_rate": 7.709535604722684e-05, |
|
"loss": 0.6315, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.8620689655172413, |
|
"grad_norm": 0.07175464183092117, |
|
"learning_rate": 7.644383321286094e-05, |
|
"loss": 0.6559, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.867816091954023, |
|
"grad_norm": 0.08578918129205704, |
|
"learning_rate": 7.579336733485584e-05, |
|
"loss": 0.6297, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.8735632183908046, |
|
"grad_norm": 0.14390091598033905, |
|
"learning_rate": 7.514398759937135e-05, |
|
"loss": 0.6155, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.8793103448275863, |
|
"grad_norm": 0.07774030417203903, |
|
"learning_rate": 7.449572314383237e-05, |
|
"loss": 0.6551, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.8850574712643677, |
|
"grad_norm": 0.07927459478378296, |
|
"learning_rate": 7.384860305562172e-05, |
|
"loss": 0.6312, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.8908045977011494, |
|
"grad_norm": 0.11287631094455719, |
|
"learning_rate": 7.320265637077473e-05, |
|
"loss": 0.66, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.896551724137931, |
|
"grad_norm": 0.09955232590436935, |
|
"learning_rate": 7.255791207267679e-05, |
|
"loss": 0.6456, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.896551724137931, |
|
"eval_loss": 0.6442980766296387, |
|
"eval_runtime": 404.2901, |
|
"eval_samples_per_second": 24.482, |
|
"eval_steps_per_second": 0.383, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.9022988505747125, |
|
"grad_norm": 0.07881880551576614, |
|
"learning_rate": 7.191439909076243e-05, |
|
"loss": 0.6398, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.9080459770114944, |
|
"grad_norm": 0.15244217216968536, |
|
"learning_rate": 7.127214629921765e-05, |
|
"loss": 0.6614, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.9137931034482758, |
|
"grad_norm": 0.07337264716625214, |
|
"learning_rate": 7.0631182515684e-05, |
|
"loss": 0.6294, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.9195402298850575, |
|
"grad_norm": 0.07102935016155243, |
|
"learning_rate": 6.999153649996595e-05, |
|
"loss": 0.6237, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.9252873563218391, |
|
"grad_norm": 0.09349462389945984, |
|
"learning_rate": 6.935323695274002e-05, |
|
"loss": 0.6051, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.9310344827586206, |
|
"grad_norm": 0.0851803794503212, |
|
"learning_rate": 6.871631251426728e-05, |
|
"loss": 0.6548, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.9367816091954024, |
|
"grad_norm": 0.08571562170982361, |
|
"learning_rate": 6.808079176310827e-05, |
|
"loss": 0.6136, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.9425287356321839, |
|
"grad_norm": 0.0772768035531044, |
|
"learning_rate": 6.744670321484043e-05, |
|
"loss": 0.6668, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.9482758620689655, |
|
"grad_norm": 0.08812547475099564, |
|
"learning_rate": 6.681407532077895e-05, |
|
"loss": 0.6427, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.9540229885057472, |
|
"grad_norm": 0.09011583775281906, |
|
"learning_rate": 6.618293646669986e-05, |
|
"loss": 0.6402, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.9540229885057472, |
|
"eval_loss": 0.6436823606491089, |
|
"eval_runtime": 413.0204, |
|
"eval_samples_per_second": 23.965, |
|
"eval_steps_per_second": 0.375, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.9597701149425286, |
|
"grad_norm": 0.08234158158302307, |
|
"learning_rate": 6.555331497156672e-05, |
|
"loss": 0.6362, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.9655172413793105, |
|
"grad_norm": 0.0780014768242836, |
|
"learning_rate": 6.492523908625959e-05, |
|
"loss": 0.6454, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.971264367816092, |
|
"grad_norm": 0.08458276093006134, |
|
"learning_rate": 6.42987369923077e-05, |
|
"loss": 0.6587, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.9770114942528736, |
|
"grad_norm": 0.11979149281978607, |
|
"learning_rate": 6.367383680062478e-05, |
|
"loss": 0.6369, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.9827586206896552, |
|
"grad_norm": 0.08782167732715607, |
|
"learning_rate": 6.30505665502479e-05, |
|
"loss": 0.6382, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.9885057471264367, |
|
"grad_norm": 0.07542918622493744, |
|
"learning_rate": 6.242895420707917e-05, |
|
"loss": 0.6238, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.9942528735632183, |
|
"grad_norm": 0.09390002489089966, |
|
"learning_rate": 6.180902766263113e-05, |
|
"loss": 0.632, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.10154885053634644, |
|
"learning_rate": 6.119081473277501e-05, |
|
"loss": 0.6078, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.0057471264367814, |
|
"grad_norm": 0.09035320580005646, |
|
"learning_rate": 6.057434315649304e-05, |
|
"loss": 0.6331, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.0114942528735633, |
|
"grad_norm": 0.1151895746588707, |
|
"learning_rate": 5.99596405946333e-05, |
|
"loss": 0.6455, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.0114942528735633, |
|
"eval_loss": 0.6433547139167786, |
|
"eval_runtime": 409.0063, |
|
"eval_samples_per_second": 24.2, |
|
"eval_steps_per_second": 0.379, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.0172413793103448, |
|
"grad_norm": 0.10666079819202423, |
|
"learning_rate": 5.9346734628669065e-05, |
|
"loss": 0.6473, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.0229885057471266, |
|
"grad_norm": 0.09095422178506851, |
|
"learning_rate": 5.873565275946088e-05, |
|
"loss": 0.6335, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.028735632183908, |
|
"grad_norm": 0.09256957471370697, |
|
"learning_rate": 5.8126422406022885e-05, |
|
"loss": 0.5969, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.0344827586206895, |
|
"grad_norm": 0.1397576928138733, |
|
"learning_rate": 5.7519070904292247e-05, |
|
"loss": 0.5919, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.0402298850574714, |
|
"grad_norm": 0.0867573469877243, |
|
"learning_rate": 5.691362550590297e-05, |
|
"loss": 0.5909, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.045977011494253, |
|
"grad_norm": 0.07953327894210815, |
|
"learning_rate": 5.631011337696271e-05, |
|
"loss": 0.5959, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.0517241379310347, |
|
"grad_norm": 0.09324570745229721, |
|
"learning_rate": 5.570856159683418e-05, |
|
"loss": 0.6216, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.057471264367816, |
|
"grad_norm": 0.10510014742612839, |
|
"learning_rate": 5.510899715691984e-05, |
|
"loss": 0.6172, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.0632183908045976, |
|
"grad_norm": 0.08669542521238327, |
|
"learning_rate": 5.451144695945116e-05, |
|
"loss": 0.5931, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.0689655172413794, |
|
"grad_norm": 0.09054102748632431, |
|
"learning_rate": 5.3915937816281095e-05, |
|
"loss": 0.5888, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.0689655172413794, |
|
"eval_loss": 0.643742024898529, |
|
"eval_runtime": 404.2471, |
|
"eval_samples_per_second": 24.485, |
|
"eval_steps_per_second": 0.383, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.074712643678161, |
|
"grad_norm": 0.11839323490858078, |
|
"learning_rate": 5.3322496447681414e-05, |
|
"loss": 0.6093, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.0804597701149423, |
|
"grad_norm": 0.1050933375954628, |
|
"learning_rate": 5.273114948114346e-05, |
|
"loss": 0.6247, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.086206896551724, |
|
"grad_norm": 0.09781333059072495, |
|
"learning_rate": 5.214192345018374e-05, |
|
"loss": 0.6274, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.0919540229885056, |
|
"grad_norm": 0.09329628199338913, |
|
"learning_rate": 5.1554844793153e-05, |
|
"loss": 0.6243, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.0977011494252875, |
|
"grad_norm": 0.08716364949941635, |
|
"learning_rate": 5.096993985205023e-05, |
|
"loss": 0.6149, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.103448275862069, |
|
"grad_norm": 0.09969545155763626, |
|
"learning_rate": 5.0387234871340486e-05, |
|
"loss": 0.635, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.1091954022988504, |
|
"grad_norm": 0.10841623693704605, |
|
"learning_rate": 4.980675599677757e-05, |
|
"loss": 0.6544, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.1149425287356323, |
|
"grad_norm": 0.07902085781097412, |
|
"learning_rate": 4.9228529274230695e-05, |
|
"loss": 0.6144, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.1206896551724137, |
|
"grad_norm": 0.11440268158912659, |
|
"learning_rate": 4.865258064851579e-05, |
|
"loss": 0.6217, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.1264367816091956, |
|
"grad_norm": 0.09594007581472397, |
|
"learning_rate": 4.807893596223152e-05, |
|
"loss": 0.6267, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.1264367816091956, |
|
"eval_loss": 0.6434890031814575, |
|
"eval_runtime": 404.1508, |
|
"eval_samples_per_second": 24.491, |
|
"eval_steps_per_second": 0.384, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.132183908045977, |
|
"grad_norm": 0.09025128185749054, |
|
"learning_rate": 4.75076209545996e-05, |
|
"loss": 0.6122, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.1379310344827585, |
|
"grad_norm": 0.09677668660879135, |
|
"learning_rate": 4.693866126030995e-05, |
|
"loss": 0.6339, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.1436781609195403, |
|
"grad_norm": 0.08178266882896423, |
|
"learning_rate": 4.637208240837042e-05, |
|
"loss": 0.6392, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.1494252873563218, |
|
"grad_norm": 0.10616466403007507, |
|
"learning_rate": 4.5807909820961494e-05, |
|
"loss": 0.6207, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.1551724137931036, |
|
"grad_norm": 0.08333076536655426, |
|
"learning_rate": 4.5246168812295286e-05, |
|
"loss": 0.6148, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.160919540229885, |
|
"grad_norm": 0.1016552671790123, |
|
"learning_rate": 4.468688458748006e-05, |
|
"loss": 0.6306, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.1666666666666665, |
|
"grad_norm": 0.08546506613492966, |
|
"learning_rate": 4.413008224138897e-05, |
|
"loss": 0.606, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.1724137931034484, |
|
"grad_norm": 0.08369904011487961, |
|
"learning_rate": 4.357578675753432e-05, |
|
"loss": 0.6007, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.17816091954023, |
|
"grad_norm": 0.08523935824632645, |
|
"learning_rate": 4.302402300694636e-05, |
|
"loss": 0.5884, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.1839080459770113, |
|
"grad_norm": 0.0944519191980362, |
|
"learning_rate": 4.247481574705744e-05, |
|
"loss": 0.6292, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.1839080459770113, |
|
"eval_loss": 0.6433520913124084, |
|
"eval_runtime": 404.2218, |
|
"eval_samples_per_second": 24.487, |
|
"eval_steps_per_second": 0.383, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.189655172413793, |
|
"grad_norm": 0.11311980336904526, |
|
"learning_rate": 4.1928189620591116e-05, |
|
"loss": 0.6103, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.1954022988505746, |
|
"grad_norm": 0.08662451803684235, |
|
"learning_rate": 4.138416915445655e-05, |
|
"loss": 0.5852, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.2011494252873565, |
|
"grad_norm": 0.09417479485273361, |
|
"learning_rate": 4.084277875864776e-05, |
|
"loss": 0.6467, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.206896551724138, |
|
"grad_norm": 0.09818896651268005, |
|
"learning_rate": 4.030404272514864e-05, |
|
"loss": 0.6112, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.2126436781609193, |
|
"grad_norm": 0.08806431293487549, |
|
"learning_rate": 3.9767985226842696e-05, |
|
"loss": 0.5822, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.218390804597701, |
|
"grad_norm": 0.0837361216545105, |
|
"learning_rate": 3.923463031642872e-05, |
|
"loss": 0.6137, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.2241379310344827, |
|
"grad_norm": 0.10712449252605438, |
|
"learning_rate": 3.870400192534128e-05, |
|
"loss": 0.602, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.2298850574712645, |
|
"grad_norm": 0.11590448766946793, |
|
"learning_rate": 3.81761238626771e-05, |
|
"loss": 0.6215, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.235632183908046, |
|
"grad_norm": 0.08264652639627457, |
|
"learning_rate": 3.7651019814126654e-05, |
|
"loss": 0.6002, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.2413793103448274, |
|
"grad_norm": 0.08986306935548782, |
|
"learning_rate": 3.7128713340911535e-05, |
|
"loss": 0.6058, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.2413793103448274, |
|
"eval_loss": 0.6431533098220825, |
|
"eval_runtime": 419.2567, |
|
"eval_samples_per_second": 23.608, |
|
"eval_steps_per_second": 0.37, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.2471264367816093, |
|
"grad_norm": 0.3949902057647705, |
|
"learning_rate": 3.660922787872706e-05, |
|
"loss": 0.643, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.2528735632183907, |
|
"grad_norm": 0.09183293581008911, |
|
"learning_rate": 3.609258673669097e-05, |
|
"loss": 0.5931, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.2586206896551726, |
|
"grad_norm": 0.0786626785993576, |
|
"learning_rate": 3.557881309629729e-05, |
|
"loss": 0.5795, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.264367816091954, |
|
"grad_norm": 0.08318330347537994, |
|
"learning_rate": 3.5067930010376484e-05, |
|
"loss": 0.6173, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.2701149425287355, |
|
"grad_norm": 0.09149078279733658, |
|
"learning_rate": 3.455996040206076e-05, |
|
"loss": 0.6238, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.2758620689655173, |
|
"grad_norm": 0.09578599780797958, |
|
"learning_rate": 3.4054927063755796e-05, |
|
"loss": 0.6264, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.281609195402299, |
|
"grad_norm": 0.08735264092683792, |
|
"learning_rate": 3.355285265611784e-05, |
|
"loss": 0.6269, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.2873563218390807, |
|
"grad_norm": 0.0886816754937172, |
|
"learning_rate": 3.305375970703711e-05, |
|
"loss": 0.6043, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.293103448275862, |
|
"grad_norm": 0.07559609413146973, |
|
"learning_rate": 3.2557670610626925e-05, |
|
"loss": 0.6416, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.2988505747126435, |
|
"grad_norm": 0.11379113793373108, |
|
"learning_rate": 3.206460762621873e-05, |
|
"loss": 0.6221, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.2988505747126435, |
|
"eval_loss": 0.6427375078201294, |
|
"eval_runtime": 405.8229, |
|
"eval_samples_per_second": 24.39, |
|
"eval_steps_per_second": 0.382, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.3045977011494254, |
|
"grad_norm": 0.08930199593305588, |
|
"learning_rate": 3.157459287736362e-05, |
|
"loss": 0.599, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.310344827586207, |
|
"grad_norm": 0.11189960688352585, |
|
"learning_rate": 3.108764835083938e-05, |
|
"loss": 0.6243, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.3160919540229887, |
|
"grad_norm": 0.0793476328253746, |
|
"learning_rate": 3.0603795895664124e-05, |
|
"loss": 0.615, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.32183908045977, |
|
"grad_norm": 0.0860418751835823, |
|
"learning_rate": 3.0123057222115836e-05, |
|
"loss": 0.5968, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.3275862068965516, |
|
"grad_norm": 0.08753317594528198, |
|
"learning_rate": 2.964545390075841e-05, |
|
"loss": 0.6192, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.3333333333333335, |
|
"grad_norm": 0.09598301351070404, |
|
"learning_rate": 2.9171007361473514e-05, |
|
"loss": 0.6237, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.339080459770115, |
|
"grad_norm": 0.10627751052379608, |
|
"learning_rate": 2.8699738892499328e-05, |
|
"loss": 0.6123, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.344827586206897, |
|
"grad_norm": 0.08839675039052963, |
|
"learning_rate": 2.8231669639475067e-05, |
|
"loss": 0.6123, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.3505747126436782, |
|
"grad_norm": 0.08533503860235214, |
|
"learning_rate": 2.776682060449247e-05, |
|
"loss": 0.6251, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.3563218390804597, |
|
"grad_norm": 0.10517686605453491, |
|
"learning_rate": 2.7305212645153212e-05, |
|
"loss": 0.6254, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.3563218390804597, |
|
"eval_loss": 0.6428195238113403, |
|
"eval_runtime": 404.1758, |
|
"eval_samples_per_second": 24.489, |
|
"eval_steps_per_second": 0.383, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.3620689655172415, |
|
"grad_norm": 0.10578128695487976, |
|
"learning_rate": 2.6846866473633125e-05, |
|
"loss": 0.6216, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.367816091954023, |
|
"grad_norm": 0.10083532333374023, |
|
"learning_rate": 2.6391802655752853e-05, |
|
"loss": 0.6052, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.3735632183908044, |
|
"grad_norm": 0.08413968980312347, |
|
"learning_rate": 2.594004161005511e-05, |
|
"loss": 0.6007, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.3793103448275863, |
|
"grad_norm": 0.08840201050043106, |
|
"learning_rate": 2.549160360688838e-05, |
|
"loss": 0.5876, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.3850574712643677, |
|
"grad_norm": 0.09680577367544174, |
|
"learning_rate": 2.50465087674976e-05, |
|
"loss": 0.6183, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.3908045977011496, |
|
"grad_norm": 0.09196774661540985, |
|
"learning_rate": 2.4604777063121033e-05, |
|
"loss": 0.613, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.396551724137931, |
|
"grad_norm": 0.0849708616733551, |
|
"learning_rate": 2.4166428314094514e-05, |
|
"loss": 0.6443, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.4022988505747125, |
|
"grad_norm": 0.09316956251859665, |
|
"learning_rate": 2.3731482188961818e-05, |
|
"loss": 0.6062, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.4080459770114944, |
|
"grad_norm": 0.08482903987169266, |
|
"learning_rate": 2.32999582035923e-05, |
|
"loss": 0.6099, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.413793103448276, |
|
"grad_norm": 0.08352029323577881, |
|
"learning_rate": 2.287187572030516e-05, |
|
"loss": 0.6178, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.413793103448276, |
|
"eval_loss": 0.6422638297080994, |
|
"eval_runtime": 404.4609, |
|
"eval_samples_per_second": 24.472, |
|
"eval_steps_per_second": 0.383, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.4195402298850572, |
|
"grad_norm": 0.09856913238763809, |
|
"learning_rate": 2.244725394700079e-05, |
|
"loss": 0.6166, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.425287356321839, |
|
"grad_norm": 0.10127527266740799, |
|
"learning_rate": 2.202611193629869e-05, |
|
"loss": 0.6195, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.4310344827586206, |
|
"grad_norm": 0.09415800124406815, |
|
"learning_rate": 2.160846858468285e-05, |
|
"loss": 0.6157, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.4367816091954024, |
|
"grad_norm": 0.08563528954982758, |
|
"learning_rate": 2.1194342631653607e-05, |
|
"loss": 0.6212, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.442528735632184, |
|
"grad_norm": 0.0861605629324913, |
|
"learning_rate": 2.0783752658887066e-05, |
|
"loss": 0.6095, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.4482758620689653, |
|
"grad_norm": 0.1125798374414444, |
|
"learning_rate": 2.0376717089401164e-05, |
|
"loss": 0.606, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.454022988505747, |
|
"grad_norm": 0.09633134305477142, |
|
"learning_rate": 1.9973254186729086e-05, |
|
"loss": 0.6109, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.4597701149425286, |
|
"grad_norm": 0.08123010396957397, |
|
"learning_rate": 1.9573382054099786e-05, |
|
"loss": 0.5896, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.4655172413793105, |
|
"grad_norm": 0.08620712906122208, |
|
"learning_rate": 1.9177118633625814e-05, |
|
"loss": 0.6022, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.471264367816092, |
|
"grad_norm": 0.08710537105798721, |
|
"learning_rate": 1.8784481705498015e-05, |
|
"loss": 0.6161, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.471264367816092, |
|
"eval_loss": 0.642048180103302, |
|
"eval_runtime": 405.7821, |
|
"eval_samples_per_second": 24.392, |
|
"eval_steps_per_second": 0.382, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.4770114942528734, |
|
"grad_norm": 0.08711250126361847, |
|
"learning_rate": 1.8395488887188005e-05, |
|
"loss": 0.581, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.4827586206896552, |
|
"grad_norm": 0.08405685424804688, |
|
"learning_rate": 1.8010157632657543e-05, |
|
"loss": 0.6149, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.4885057471264367, |
|
"grad_norm": 0.08080325275659561, |
|
"learning_rate": 1.762850523157532e-05, |
|
"loss": 0.6264, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.4942528735632186, |
|
"grad_norm": 0.09836191684007645, |
|
"learning_rate": 1.7250548808541322e-05, |
|
"loss": 0.6055, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.10626177489757538, |
|
"learning_rate": 1.687630532231833e-05, |
|
"loss": 0.5907, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.5057471264367814, |
|
"grad_norm": 0.08308445662260056, |
|
"learning_rate": 1.6505791565071138e-05, |
|
"loss": 0.6189, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.5114942528735633, |
|
"grad_norm": 0.10249936580657959, |
|
"learning_rate": 1.613902416161288e-05, |
|
"loss": 0.6084, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.5172413793103448, |
|
"grad_norm": 0.08516431599855423, |
|
"learning_rate": 1.5776019568659338e-05, |
|
"loss": 0.624, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.5229885057471266, |
|
"grad_norm": 0.08852159231901169, |
|
"learning_rate": 1.5416794074090258e-05, |
|
"loss": 0.6374, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.528735632183908, |
|
"grad_norm": 0.09616044908761978, |
|
"learning_rate": 1.5061363796218785e-05, |
|
"loss": 0.634, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.528735632183908, |
|
"eval_loss": 0.6419377326965332, |
|
"eval_runtime": 416.5131, |
|
"eval_samples_per_second": 23.764, |
|
"eval_steps_per_second": 0.372, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.5344827586206895, |
|
"grad_norm": 0.1012992411851883, |
|
"learning_rate": 1.4709744683068039e-05, |
|
"loss": 0.6443, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.5402298850574714, |
|
"grad_norm": 0.102021224796772, |
|
"learning_rate": 1.4361952511655618e-05, |
|
"loss": 0.6111, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.545977011494253, |
|
"grad_norm": 0.08464264124631882, |
|
"learning_rate": 1.4018002887285687e-05, |
|
"loss": 0.6007, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.5517241379310347, |
|
"grad_norm": 0.0829034224152565, |
|
"learning_rate": 1.3677911242848806e-05, |
|
"loss": 0.6083, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.557471264367816, |
|
"grad_norm": 0.08752921968698502, |
|
"learning_rate": 1.334169283812936e-05, |
|
"loss": 0.6227, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.5632183908045976, |
|
"grad_norm": 0.080236054956913, |
|
"learning_rate": 1.300936275912098e-05, |
|
"loss": 0.6212, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.5689655172413794, |
|
"grad_norm": 0.08524277061223984, |
|
"learning_rate": 1.2680935917349523e-05, |
|
"loss": 0.5915, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.574712643678161, |
|
"grad_norm": 0.09109287708997726, |
|
"learning_rate": 1.2356427049204122e-05, |
|
"loss": 0.5972, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.5804597701149428, |
|
"grad_norm": 0.11969230324029922, |
|
"learning_rate": 1.2035850715275865e-05, |
|
"loss": 0.6358, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.586206896551724, |
|
"grad_norm": 0.08512509614229202, |
|
"learning_rate": 1.1719221299704497e-05, |
|
"loss": 0.6241, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.586206896551724, |
|
"eval_loss": 0.641758382320404, |
|
"eval_runtime": 404.7765, |
|
"eval_samples_per_second": 24.453, |
|
"eval_steps_per_second": 0.383, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.5919540229885056, |
|
"grad_norm": 0.08563876152038574, |
|
"learning_rate": 1.1406553009533027e-05, |
|
"loss": 0.6027, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.5977011494252875, |
|
"grad_norm": 0.07882750034332275, |
|
"learning_rate": 1.1097859874070294e-05, |
|
"loss": 0.6226, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.603448275862069, |
|
"grad_norm": 0.08562333881855011, |
|
"learning_rate": 1.0793155744261351e-05, |
|
"loss": 0.6145, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.609195402298851, |
|
"grad_norm": 0.08439898490905762, |
|
"learning_rate": 1.0492454292066178e-05, |
|
"loss": 0.6131, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.6149425287356323, |
|
"grad_norm": 0.09046713262796402, |
|
"learning_rate": 1.019576900984599e-05, |
|
"loss": 0.6312, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.6206896551724137, |
|
"grad_norm": 0.1001957505941391, |
|
"learning_rate": 9.903113209758096e-06, |
|
"loss": 0.6167, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.626436781609195, |
|
"grad_norm": 0.08048044890165329, |
|
"learning_rate": 9.614500023158336e-06, |
|
"loss": 0.5969, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.632183908045977, |
|
"grad_norm": 0.07949711382389069, |
|
"learning_rate": 9.32994240001206e-06, |
|
"loss": 0.6324, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.637931034482759, |
|
"grad_norm": 0.0978640615940094, |
|
"learning_rate": 9.049453108312966e-06, |
|
"loss": 0.5779, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.6436781609195403, |
|
"grad_norm": 0.08483273535966873, |
|
"learning_rate": 8.773044733510338e-06, |
|
"loss": 0.6084, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.6436781609195403, |
|
"eval_loss": 0.6415662169456482, |
|
"eval_runtime": 404.188, |
|
"eval_samples_per_second": 24.489, |
|
"eval_steps_per_second": 0.383, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.6494252873563218, |
|
"grad_norm": 0.08597224205732346, |
|
"learning_rate": 8.50072967794413e-06, |
|
"loss": 0.5962, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.655172413793103, |
|
"grad_norm": 0.08336161822080612, |
|
"learning_rate": 8.232520160288704e-06, |
|
"loss": 0.6276, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.660919540229885, |
|
"grad_norm": 0.08224053680896759, |
|
"learning_rate": 7.96842821500442e-06, |
|
"loss": 0.6047, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.08457629382610321, |
|
"learning_rate": 7.708465691797717e-06, |
|
"loss": 0.6006, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 2.6724137931034484, |
|
"grad_norm": 0.09363652020692825, |
|
"learning_rate": 7.452644255089425e-06, |
|
"loss": 0.6261, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.67816091954023, |
|
"grad_norm": 0.08728937804698944, |
|
"learning_rate": 7.20097538349136e-06, |
|
"loss": 0.6146, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 2.6839080459770113, |
|
"grad_norm": 0.08341008424758911, |
|
"learning_rate": 6.953470369291348e-06, |
|
"loss": 0.6237, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 2.689655172413793, |
|
"grad_norm": 0.08936601877212524, |
|
"learning_rate": 6.710140317946423e-06, |
|
"loss": 0.643, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 2.6954022988505746, |
|
"grad_norm": 0.09783781319856644, |
|
"learning_rate": 6.470996147584685e-06, |
|
"loss": 0.5764, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 2.7011494252873565, |
|
"grad_norm": 0.08959370106458664, |
|
"learning_rate": 6.236048588515242e-06, |
|
"loss": 0.6264, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.7011494252873565, |
|
"eval_loss": 0.6414589881896973, |
|
"eval_runtime": 405.1776, |
|
"eval_samples_per_second": 24.429, |
|
"eval_steps_per_second": 0.383, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.706896551724138, |
|
"grad_norm": 0.08131396770477295, |
|
"learning_rate": 6.0053081827469045e-06, |
|
"loss": 0.6455, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 2.7126436781609193, |
|
"grad_norm": 0.08353292942047119, |
|
"learning_rate": 5.778785283515053e-06, |
|
"loss": 0.6254, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 2.718390804597701, |
|
"grad_norm": 0.0802810862660408, |
|
"learning_rate": 5.556490054817132e-06, |
|
"loss": 0.6284, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 2.7241379310344827, |
|
"grad_norm": 0.08118069916963577, |
|
"learning_rate": 5.338432470956589e-06, |
|
"loss": 0.6092, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 2.7298850574712645, |
|
"grad_norm": 0.08621113002300262, |
|
"learning_rate": 5.1246223160953845e-06, |
|
"loss": 0.6489, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.735632183908046, |
|
"grad_norm": 0.08560863137245178, |
|
"learning_rate": 4.91506918381488e-06, |
|
"loss": 0.6154, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 2.7413793103448274, |
|
"grad_norm": 0.081720270216465, |
|
"learning_rate": 4.7097824766854756e-06, |
|
"loss": 0.6232, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 2.7471264367816093, |
|
"grad_norm": 0.08384092152118683, |
|
"learning_rate": 4.508771405844636e-06, |
|
"loss": 0.6209, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 2.7528735632183907, |
|
"grad_norm": 0.08142372965812683, |
|
"learning_rate": 4.312044990583675e-06, |
|
"loss": 0.6298, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"grad_norm": 0.0810447633266449, |
|
"learning_rate": 4.119612057942978e-06, |
|
"loss": 0.608, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"eval_loss": 0.6413341164588928, |
|
"eval_runtime": 410.6577, |
|
"eval_samples_per_second": 24.103, |
|
"eval_steps_per_second": 0.377, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.764367816091954, |
|
"grad_norm": 0.08321461826562881, |
|
"learning_rate": 3.931481242315993e-06, |
|
"loss": 0.6426, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 2.7701149425287355, |
|
"grad_norm": 0.0784662514925003, |
|
"learning_rate": 3.747660985061785e-06, |
|
"loss": 0.6126, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 2.7758620689655173, |
|
"grad_norm": 0.09238499402999878, |
|
"learning_rate": 3.568159534126314e-06, |
|
"loss": 0.5786, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 2.781609195402299, |
|
"grad_norm": 0.08142554014921188, |
|
"learning_rate": 3.3929849436722728e-06, |
|
"loss": 0.6341, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 2.7873563218390807, |
|
"grad_norm": 0.08540128916501999, |
|
"learning_rate": 3.2221450737178083e-06, |
|
"loss": 0.6062, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.793103448275862, |
|
"grad_norm": 0.08547057211399078, |
|
"learning_rate": 3.0556475897837166e-06, |
|
"loss": 0.5974, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 2.7988505747126435, |
|
"grad_norm": 0.1007808968424797, |
|
"learning_rate": 2.8934999625496282e-06, |
|
"loss": 0.6157, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 2.8045977011494254, |
|
"grad_norm": 0.08533742278814316, |
|
"learning_rate": 2.735709467518699e-06, |
|
"loss": 0.625, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 2.810344827586207, |
|
"grad_norm": 0.08325877785682678, |
|
"learning_rate": 2.5822831846912033e-06, |
|
"loss": 0.5991, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 2.8160919540229887, |
|
"grad_norm": 0.08522289991378784, |
|
"learning_rate": 2.4332279982468453e-06, |
|
"loss": 0.6039, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.8160919540229887, |
|
"eval_loss": 0.6412601470947266, |
|
"eval_runtime": 405.8893, |
|
"eval_samples_per_second": 24.386, |
|
"eval_steps_per_second": 0.382, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.82183908045977, |
|
"grad_norm": 0.08191868662834167, |
|
"learning_rate": 2.2885505962359054e-06, |
|
"loss": 0.5907, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 2.8275862068965516, |
|
"grad_norm": 0.08263259381055832, |
|
"learning_rate": 2.1482574702790803e-06, |
|
"loss": 0.615, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 2.8333333333333335, |
|
"grad_norm": 0.08231104165315628, |
|
"learning_rate": 2.0123549152762823e-06, |
|
"loss": 0.6334, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 2.839080459770115, |
|
"grad_norm": 0.08760181069374084, |
|
"learning_rate": 1.8808490291241432e-06, |
|
"loss": 0.6186, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 2.844827586206897, |
|
"grad_norm": 0.07865423709154129, |
|
"learning_rate": 1.7537457124423895e-06, |
|
"loss": 0.6324, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.8505747126436782, |
|
"grad_norm": 0.08259916305541992, |
|
"learning_rate": 1.631050668309131e-06, |
|
"loss": 0.6406, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 2.8563218390804597, |
|
"grad_norm": 0.08283340930938721, |
|
"learning_rate": 1.5127694020049432e-06, |
|
"loss": 0.6253, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 2.862068965517241, |
|
"grad_norm": 0.0877593606710434, |
|
"learning_rate": 1.3989072207658328e-06, |
|
"loss": 0.6158, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 2.867816091954023, |
|
"grad_norm": 0.08183769136667252, |
|
"learning_rate": 1.2894692335451375e-06, |
|
"loss": 0.6091, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 2.873563218390805, |
|
"grad_norm": 0.08991672843694687, |
|
"learning_rate": 1.1844603507842668e-06, |
|
"loss": 0.6445, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.873563218390805, |
|
"eval_loss": 0.641264796257019, |
|
"eval_runtime": 405.0206, |
|
"eval_samples_per_second": 24.438, |
|
"eval_steps_per_second": 0.383, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.8793103448275863, |
|
"grad_norm": 0.07864410430192947, |
|
"learning_rate": 1.083885284192354e-06, |
|
"loss": 0.6061, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 2.8850574712643677, |
|
"grad_norm": 0.08581390231847763, |
|
"learning_rate": 9.877485465349058e-07, |
|
"loss": 0.5868, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 2.890804597701149, |
|
"grad_norm": 0.0851605013012886, |
|
"learning_rate": 8.960544514312275e-07, |
|
"loss": 0.6219, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 2.896551724137931, |
|
"grad_norm": 0.08249209821224213, |
|
"learning_rate": 8.088071131609587e-07, |
|
"loss": 0.6043, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 2.9022988505747125, |
|
"grad_norm": 0.07978759706020355, |
|
"learning_rate": 7.26010446479397e-07, |
|
"loss": 0.622, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.9080459770114944, |
|
"grad_norm": 0.08593284338712692, |
|
"learning_rate": 6.476681664419171e-07, |
|
"loss": 0.5829, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 2.913793103448276, |
|
"grad_norm": 0.08696813136339188, |
|
"learning_rate": 5.737837882371922e-07, |
|
"loss": 0.6162, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 2.9195402298850572, |
|
"grad_norm": 0.08092223852872849, |
|
"learning_rate": 5.043606270295654e-07, |
|
"loss": 0.604, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 2.925287356321839, |
|
"grad_norm": 0.08109483867883682, |
|
"learning_rate": 4.3940179781019055e-07, |
|
"loss": 0.6291, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 2.9310344827586206, |
|
"grad_norm": 0.07932683825492859, |
|
"learning_rate": 3.789102152573665e-07, |
|
"loss": 0.6249, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.9310344827586206, |
|
"eval_loss": 0.6412717700004578, |
|
"eval_runtime": 404.544, |
|
"eval_samples_per_second": 24.467, |
|
"eval_steps_per_second": 0.383, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.9367816091954024, |
|
"grad_norm": 0.08453306555747986, |
|
"learning_rate": 3.228885936056858e-07, |
|
"loss": 0.6103, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 2.942528735632184, |
|
"grad_norm": 0.08546116948127747, |
|
"learning_rate": 2.713394465242991e-07, |
|
"loss": 0.5975, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 2.9482758620689653, |
|
"grad_norm": 0.08308890461921692, |
|
"learning_rate": 2.242650870040497e-07, |
|
"loss": 0.6018, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 2.954022988505747, |
|
"grad_norm": 0.08195459842681885, |
|
"learning_rate": 1.8166762725381203e-07, |
|
"loss": 0.5967, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 2.9597701149425286, |
|
"grad_norm": 0.0842004343867302, |
|
"learning_rate": 1.4354897860558992e-07, |
|
"loss": 0.6012, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.9655172413793105, |
|
"grad_norm": 0.08774745464324951, |
|
"learning_rate": 1.0991085142886271e-07, |
|
"loss": 0.5863, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 2.971264367816092, |
|
"grad_norm": 0.09357178956270218, |
|
"learning_rate": 8.075475505373575e-08, |
|
"loss": 0.6229, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 2.9770114942528734, |
|
"grad_norm": 0.09469768404960632, |
|
"learning_rate": 5.608199770334999e-08, |
|
"loss": 0.6239, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 2.9827586206896552, |
|
"grad_norm": 0.09067104756832123, |
|
"learning_rate": 3.5893686435029e-08, |
|
"loss": 0.6178, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 2.9885057471264367, |
|
"grad_norm": 0.09194349497556686, |
|
"learning_rate": 2.019072709074088e-08, |
|
"loss": 0.6006, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.9885057471264367, |
|
"eval_loss": 0.6412657499313354, |
|
"eval_runtime": 407.3782, |
|
"eval_samples_per_second": 24.297, |
|
"eval_steps_per_second": 0.38, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.9942528735632186, |
|
"grad_norm": 0.09253750741481781, |
|
"learning_rate": 8.973824256364171e-09, |
|
"loss": 0.6094, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.0888516753911972, |
|
"learning_rate": 2.2434812301352913e-09, |
|
"loss": 0.6062, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 522, |
|
"total_flos": 5.0940118456365744e+19, |
|
"train_loss": 0.6584736234383565, |
|
"train_runtime": 47270.2985, |
|
"train_samples_per_second": 5.653, |
|
"train_steps_per_second": 0.011 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 522, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.0940118456365744e+19, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|