Qwen3-32B-alpaca-th-52k-dolly-th-15k-wangchan-instruct-seed-4201
/
checkpoint-500
/trainer_state.json
{ | |
"best_global_step": null, | |
"best_metric": null, | |
"best_model_checkpoint": null, | |
"epoch": 2.873563218390805, | |
"eval_steps": 10, | |
"global_step": 500, | |
"is_hyper_param_search": false, | |
"is_local_process_zero": true, | |
"is_world_process_zero": true, | |
"log_history": [ | |
{ | |
"epoch": 0.005747126436781609, | |
"grad_norm": 0.10123365372419357, | |
"learning_rate": 0.0, | |
"loss": 0.9918, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.011494252873563218, | |
"grad_norm": 0.09671098738908768, | |
"learning_rate": 3.7735849056603773e-06, | |
"loss": 0.9604, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.017241379310344827, | |
"grad_norm": 0.0981190875172615, | |
"learning_rate": 7.547169811320755e-06, | |
"loss": 0.9868, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.022988505747126436, | |
"grad_norm": 0.10396745055913925, | |
"learning_rate": 1.1320754716981132e-05, | |
"loss": 0.962, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.028735632183908046, | |
"grad_norm": 0.0982985869050026, | |
"learning_rate": 1.509433962264151e-05, | |
"loss": 0.9684, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.034482758620689655, | |
"grad_norm": 0.10332155227661133, | |
"learning_rate": 1.8867924528301888e-05, | |
"loss": 0.9442, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.040229885057471264, | |
"grad_norm": 0.1124059334397316, | |
"learning_rate": 2.2641509433962265e-05, | |
"loss": 0.9382, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.04597701149425287, | |
"grad_norm": 0.12120208889245987, | |
"learning_rate": 2.641509433962264e-05, | |
"loss": 0.9416, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.05172413793103448, | |
"grad_norm": 0.12729395925998688, | |
"learning_rate": 3.018867924528302e-05, | |
"loss": 0.9356, | |
"step": 9 | |
}, | |
{ | |
"epoch": 0.05747126436781609, | |
"grad_norm": 0.13560789823532104, | |
"learning_rate": 3.39622641509434e-05, | |
"loss": 0.9293, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.05747126436781609, | |
"eval_loss": 1.0470749139785767, | |
"eval_runtime": 412.2553, | |
"eval_samples_per_second": 24.009, | |
"eval_steps_per_second": 0.376, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.06321839080459771, | |
"grad_norm": 0.1474100798368454, | |
"learning_rate": 3.7735849056603776e-05, | |
"loss": 0.9533, | |
"step": 11 | |
}, | |
{ | |
"epoch": 0.06896551724137931, | |
"grad_norm": 0.16510824859142303, | |
"learning_rate": 4.150943396226415e-05, | |
"loss": 0.9206, | |
"step": 12 | |
}, | |
{ | |
"epoch": 0.07471264367816093, | |
"grad_norm": 0.17097796499729156, | |
"learning_rate": 4.528301886792453e-05, | |
"loss": 0.8921, | |
"step": 13 | |
}, | |
{ | |
"epoch": 0.08045977011494253, | |
"grad_norm": 0.17923878133296967, | |
"learning_rate": 4.9056603773584906e-05, | |
"loss": 0.8861, | |
"step": 14 | |
}, | |
{ | |
"epoch": 0.08620689655172414, | |
"grad_norm": 0.18173959851264954, | |
"learning_rate": 5.283018867924528e-05, | |
"loss": 0.8904, | |
"step": 15 | |
}, | |
{ | |
"epoch": 0.09195402298850575, | |
"grad_norm": 0.17235629260540009, | |
"learning_rate": 5.660377358490566e-05, | |
"loss": 0.8424, | |
"step": 16 | |
}, | |
{ | |
"epoch": 0.09770114942528736, | |
"grad_norm": 0.16792210936546326, | |
"learning_rate": 6.037735849056604e-05, | |
"loss": 0.8395, | |
"step": 17 | |
}, | |
{ | |
"epoch": 0.10344827586206896, | |
"grad_norm": 0.14939646422863007, | |
"learning_rate": 6.415094339622641e-05, | |
"loss": 0.8203, | |
"step": 18 | |
}, | |
{ | |
"epoch": 0.10919540229885058, | |
"grad_norm": 0.14632105827331543, | |
"learning_rate": 6.79245283018868e-05, | |
"loss": 0.8464, | |
"step": 19 | |
}, | |
{ | |
"epoch": 0.11494252873563218, | |
"grad_norm": 0.14770475029945374, | |
"learning_rate": 7.169811320754717e-05, | |
"loss": 0.8085, | |
"step": 20 | |
}, | |
{ | |
"epoch": 0.11494252873563218, | |
"eval_loss": 0.8244547247886658, | |
"eval_runtime": 404.4489, | |
"eval_samples_per_second": 24.473, | |
"eval_steps_per_second": 0.383, | |
"step": 20 | |
}, | |
{ | |
"epoch": 0.1206896551724138, | |
"grad_norm": 0.1725720465183258, | |
"learning_rate": 7.547169811320755e-05, | |
"loss": 0.8219, | |
"step": 21 | |
}, | |
{ | |
"epoch": 0.12643678160919541, | |
"grad_norm": 0.1685618907213211, | |
"learning_rate": 7.924528301886794e-05, | |
"loss": 0.8148, | |
"step": 22 | |
}, | |
{ | |
"epoch": 0.13218390804597702, | |
"grad_norm": 0.1653290092945099, | |
"learning_rate": 8.30188679245283e-05, | |
"loss": 0.7846, | |
"step": 23 | |
}, | |
{ | |
"epoch": 0.13793103448275862, | |
"grad_norm": 0.16122524440288544, | |
"learning_rate": 8.679245283018869e-05, | |
"loss": 0.7903, | |
"step": 24 | |
}, | |
{ | |
"epoch": 0.14367816091954022, | |
"grad_norm": 0.12793505191802979, | |
"learning_rate": 9.056603773584906e-05, | |
"loss": 0.7741, | |
"step": 25 | |
}, | |
{ | |
"epoch": 0.14942528735632185, | |
"grad_norm": 0.10620377957820892, | |
"learning_rate": 9.433962264150944e-05, | |
"loss": 0.7308, | |
"step": 26 | |
}, | |
{ | |
"epoch": 0.15517241379310345, | |
"grad_norm": 0.10993366688489914, | |
"learning_rate": 9.811320754716981e-05, | |
"loss": 0.7559, | |
"step": 27 | |
}, | |
{ | |
"epoch": 0.16091954022988506, | |
"grad_norm": 0.11916384100914001, | |
"learning_rate": 0.0001018867924528302, | |
"loss": 0.7622, | |
"step": 28 | |
}, | |
{ | |
"epoch": 0.16666666666666666, | |
"grad_norm": 0.13500399887561798, | |
"learning_rate": 0.00010566037735849057, | |
"loss": 0.7436, | |
"step": 29 | |
}, | |
{ | |
"epoch": 0.1724137931034483, | |
"grad_norm": 0.12777844071388245, | |
"learning_rate": 0.00010943396226415095, | |
"loss": 0.7547, | |
"step": 30 | |
}, | |
{ | |
"epoch": 0.1724137931034483, | |
"eval_loss": 0.7580565214157104, | |
"eval_runtime": 404.708, | |
"eval_samples_per_second": 24.457, | |
"eval_steps_per_second": 0.383, | |
"step": 30 | |
}, | |
{ | |
"epoch": 0.1781609195402299, | |
"grad_norm": 0.11721828579902649, | |
"learning_rate": 0.00011320754716981132, | |
"loss": 0.7337, | |
"step": 31 | |
}, | |
{ | |
"epoch": 0.1839080459770115, | |
"grad_norm": 0.08667382597923279, | |
"learning_rate": 0.0001169811320754717, | |
"loss": 0.7538, | |
"step": 32 | |
}, | |
{ | |
"epoch": 0.1896551724137931, | |
"grad_norm": 0.06665026396512985, | |
"learning_rate": 0.00012075471698113207, | |
"loss": 0.7186, | |
"step": 33 | |
}, | |
{ | |
"epoch": 0.19540229885057472, | |
"grad_norm": 0.04627465456724167, | |
"learning_rate": 0.00012452830188679244, | |
"loss": 0.7719, | |
"step": 34 | |
}, | |
{ | |
"epoch": 0.20114942528735633, | |
"grad_norm": 0.04290887340903282, | |
"learning_rate": 0.00012830188679245283, | |
"loss": 0.752, | |
"step": 35 | |
}, | |
{ | |
"epoch": 0.20689655172413793, | |
"grad_norm": 0.056834809482097626, | |
"learning_rate": 0.0001320754716981132, | |
"loss": 0.7429, | |
"step": 36 | |
}, | |
{ | |
"epoch": 0.21264367816091953, | |
"grad_norm": 0.062055498361587524, | |
"learning_rate": 0.0001358490566037736, | |
"loss": 0.7208, | |
"step": 37 | |
}, | |
{ | |
"epoch": 0.21839080459770116, | |
"grad_norm": 0.070551298558712, | |
"learning_rate": 0.00013962264150943395, | |
"loss": 0.7651, | |
"step": 38 | |
}, | |
{ | |
"epoch": 0.22413793103448276, | |
"grad_norm": 0.07514140754938126, | |
"learning_rate": 0.00014339622641509434, | |
"loss": 0.7456, | |
"step": 39 | |
}, | |
{ | |
"epoch": 0.22988505747126436, | |
"grad_norm": 0.06458627432584763, | |
"learning_rate": 0.00014716981132075472, | |
"loss": 0.7289, | |
"step": 40 | |
}, | |
{ | |
"epoch": 0.22988505747126436, | |
"eval_loss": 0.7386028170585632, | |
"eval_runtime": 407.409, | |
"eval_samples_per_second": 24.295, | |
"eval_steps_per_second": 0.38, | |
"step": 40 | |
}, | |
{ | |
"epoch": 0.23563218390804597, | |
"grad_norm": 0.056490588933229446, | |
"learning_rate": 0.0001509433962264151, | |
"loss": 0.7503, | |
"step": 41 | |
}, | |
{ | |
"epoch": 0.2413793103448276, | |
"grad_norm": 0.036972932517528534, | |
"learning_rate": 0.0001547169811320755, | |
"loss": 0.7392, | |
"step": 42 | |
}, | |
{ | |
"epoch": 0.2471264367816092, | |
"grad_norm": 0.038239240646362305, | |
"learning_rate": 0.00015849056603773587, | |
"loss": 0.7206, | |
"step": 43 | |
}, | |
{ | |
"epoch": 0.25287356321839083, | |
"grad_norm": 0.033113010227680206, | |
"learning_rate": 0.00016226415094339625, | |
"loss": 0.7198, | |
"step": 44 | |
}, | |
{ | |
"epoch": 0.25862068965517243, | |
"grad_norm": 0.03197947517037392, | |
"learning_rate": 0.0001660377358490566, | |
"loss": 0.7393, | |
"step": 45 | |
}, | |
{ | |
"epoch": 0.26436781609195403, | |
"grad_norm": 0.03696918115019798, | |
"learning_rate": 0.000169811320754717, | |
"loss": 0.7576, | |
"step": 46 | |
}, | |
{ | |
"epoch": 0.27011494252873564, | |
"grad_norm": 0.04209383204579353, | |
"learning_rate": 0.00017358490566037738, | |
"loss": 0.7157, | |
"step": 47 | |
}, | |
{ | |
"epoch": 0.27586206896551724, | |
"grad_norm": 0.035038772970438004, | |
"learning_rate": 0.00017735849056603776, | |
"loss": 0.7256, | |
"step": 48 | |
}, | |
{ | |
"epoch": 0.28160919540229884, | |
"grad_norm": 0.03674735128879547, | |
"learning_rate": 0.00018113207547169812, | |
"loss": 0.7295, | |
"step": 49 | |
}, | |
{ | |
"epoch": 0.28735632183908044, | |
"grad_norm": 0.046050041913986206, | |
"learning_rate": 0.0001849056603773585, | |
"loss": 0.6965, | |
"step": 50 | |
}, | |
{ | |
"epoch": 0.28735632183908044, | |
"eval_loss": 0.724204421043396, | |
"eval_runtime": 405.0004, | |
"eval_samples_per_second": 24.439, | |
"eval_steps_per_second": 0.383, | |
"step": 50 | |
}, | |
{ | |
"epoch": 0.29310344827586204, | |
"grad_norm": 0.036520447582006454, | |
"learning_rate": 0.00018867924528301889, | |
"loss": 0.7273, | |
"step": 51 | |
}, | |
{ | |
"epoch": 0.2988505747126437, | |
"grad_norm": 0.03720232844352722, | |
"learning_rate": 0.00019245283018867927, | |
"loss": 0.7084, | |
"step": 52 | |
}, | |
{ | |
"epoch": 0.3045977011494253, | |
"grad_norm": 0.03159736469388008, | |
"learning_rate": 0.00019622641509433963, | |
"loss": 0.7485, | |
"step": 53 | |
}, | |
{ | |
"epoch": 0.3103448275862069, | |
"grad_norm": 0.03695262596011162, | |
"learning_rate": 0.0002, | |
"loss": 0.745, | |
"step": 54 | |
}, | |
{ | |
"epoch": 0.3160919540229885, | |
"grad_norm": 0.041795678436756134, | |
"learning_rate": 0.00019999775651876987, | |
"loss": 0.7165, | |
"step": 55 | |
}, | |
{ | |
"epoch": 0.3218390804597701, | |
"grad_norm": 0.03494727239012718, | |
"learning_rate": 0.00019999102617574365, | |
"loss": 0.7499, | |
"step": 56 | |
}, | |
{ | |
"epoch": 0.3275862068965517, | |
"grad_norm": 0.033885981887578964, | |
"learning_rate": 0.00019997980927290927, | |
"loss": 0.7118, | |
"step": 57 | |
}, | |
{ | |
"epoch": 0.3333333333333333, | |
"grad_norm": 0.03606434166431427, | |
"learning_rate": 0.00019996410631356498, | |
"loss": 0.6945, | |
"step": 58 | |
}, | |
{ | |
"epoch": 0.3390804597701149, | |
"grad_norm": 0.04015219211578369, | |
"learning_rate": 0.00019994391800229666, | |
"loss": 0.6982, | |
"step": 59 | |
}, | |
{ | |
"epoch": 0.3448275862068966, | |
"grad_norm": 0.0380714014172554, | |
"learning_rate": 0.00019991924524494627, | |
"loss": 0.6848, | |
"step": 60 | |
}, | |
{ | |
"epoch": 0.3448275862068966, | |
"eval_loss": 0.7109408378601074, | |
"eval_runtime": 404.9798, | |
"eval_samples_per_second": 24.441, | |
"eval_steps_per_second": 0.383, | |
"step": 60 | |
}, | |
{ | |
"epoch": 0.3505747126436782, | |
"grad_norm": 0.04110811650753021, | |
"learning_rate": 0.00019989008914857116, | |
"loss": 0.6899, | |
"step": 61 | |
}, | |
{ | |
"epoch": 0.3563218390804598, | |
"grad_norm": 0.03853503614664078, | |
"learning_rate": 0.0001998564510213944, | |
"loss": 0.7094, | |
"step": 62 | |
}, | |
{ | |
"epoch": 0.3620689655172414, | |
"grad_norm": 0.0391794852912426, | |
"learning_rate": 0.00019981833237274618, | |
"loss": 0.6975, | |
"step": 63 | |
}, | |
{ | |
"epoch": 0.367816091954023, | |
"grad_norm": 0.03894927725195885, | |
"learning_rate": 0.00019977573491299598, | |
"loss": 0.714, | |
"step": 64 | |
}, | |
{ | |
"epoch": 0.3735632183908046, | |
"grad_norm": 0.04239923506975174, | |
"learning_rate": 0.00019972866055347572, | |
"loss": 0.7339, | |
"step": 65 | |
}, | |
{ | |
"epoch": 0.3793103448275862, | |
"grad_norm": 0.03982697054743767, | |
"learning_rate": 0.0001996771114063943, | |
"loss": 0.6821, | |
"step": 66 | |
}, | |
{ | |
"epoch": 0.3850574712643678, | |
"grad_norm": 0.04431302100419998, | |
"learning_rate": 0.00019962108978474263, | |
"loss": 0.7273, | |
"step": 67 | |
}, | |
{ | |
"epoch": 0.39080459770114945, | |
"grad_norm": 0.043787937611341476, | |
"learning_rate": 0.00019956059820218982, | |
"loss": 0.6984, | |
"step": 68 | |
}, | |
{ | |
"epoch": 0.39655172413793105, | |
"grad_norm": 0.054389603435993195, | |
"learning_rate": 0.00019949563937297045, | |
"loss": 0.6778, | |
"step": 69 | |
}, | |
{ | |
"epoch": 0.40229885057471265, | |
"grad_norm": 0.041256386786699295, | |
"learning_rate": 0.00019942621621176282, | |
"loss": 0.693, | |
"step": 70 | |
}, | |
{ | |
"epoch": 0.40229885057471265, | |
"eval_loss": 0.7021871209144592, | |
"eval_runtime": 406.6755, | |
"eval_samples_per_second": 24.339, | |
"eval_steps_per_second": 0.381, | |
"step": 70 | |
}, | |
{ | |
"epoch": 0.40804597701149425, | |
"grad_norm": 0.05022790655493736, | |
"learning_rate": 0.0001993523318335581, | |
"loss": 0.6967, | |
"step": 71 | |
}, | |
{ | |
"epoch": 0.41379310344827586, | |
"grad_norm": 0.06086933612823486, | |
"learning_rate": 0.00019927398955352061, | |
"loss": 0.7279, | |
"step": 72 | |
}, | |
{ | |
"epoch": 0.41954022988505746, | |
"grad_norm": 0.04689742252230644, | |
"learning_rate": 0.00019919119288683908, | |
"loss": 0.6792, | |
"step": 73 | |
}, | |
{ | |
"epoch": 0.42528735632183906, | |
"grad_norm": 0.04852883517742157, | |
"learning_rate": 0.00019910394554856876, | |
"loss": 0.701, | |
"step": 74 | |
}, | |
{ | |
"epoch": 0.43103448275862066, | |
"grad_norm": 0.06196567416191101, | |
"learning_rate": 0.0001990122514534651, | |
"loss": 0.6805, | |
"step": 75 | |
}, | |
{ | |
"epoch": 0.4367816091954023, | |
"grad_norm": 0.047033004462718964, | |
"learning_rate": 0.00019891611471580764, | |
"loss": 0.7058, | |
"step": 76 | |
}, | |
{ | |
"epoch": 0.4425287356321839, | |
"grad_norm": 0.047392234206199646, | |
"learning_rate": 0.00019881553964921572, | |
"loss": 0.6861, | |
"step": 77 | |
}, | |
{ | |
"epoch": 0.4482758620689655, | |
"grad_norm": 0.054070815443992615, | |
"learning_rate": 0.00019871053076645488, | |
"loss": 0.6969, | |
"step": 78 | |
}, | |
{ | |
"epoch": 0.4540229885057471, | |
"grad_norm": 0.055412329733371735, | |
"learning_rate": 0.00019860109277923418, | |
"loss": 0.7001, | |
"step": 79 | |
}, | |
{ | |
"epoch": 0.45977011494252873, | |
"grad_norm": 0.05274376645684242, | |
"learning_rate": 0.00019848723059799506, | |
"loss": 0.7101, | |
"step": 80 | |
}, | |
{ | |
"epoch": 0.45977011494252873, | |
"eval_loss": 0.694656252861023, | |
"eval_runtime": 410.9173, | |
"eval_samples_per_second": 24.088, | |
"eval_steps_per_second": 0.377, | |
"step": 80 | |
}, | |
{ | |
"epoch": 0.46551724137931033, | |
"grad_norm": 0.05915577709674835, | |
"learning_rate": 0.00019836894933169088, | |
"loss": 0.6836, | |
"step": 81 | |
}, | |
{ | |
"epoch": 0.47126436781609193, | |
"grad_norm": 0.051574286073446274, | |
"learning_rate": 0.0001982462542875576, | |
"loss": 0.7181, | |
"step": 82 | |
}, | |
{ | |
"epoch": 0.47701149425287354, | |
"grad_norm": 0.050167519599199295, | |
"learning_rate": 0.00019811915097087587, | |
"loss": 0.6645, | |
"step": 83 | |
}, | |
{ | |
"epoch": 0.4827586206896552, | |
"grad_norm": 0.06501943618059158, | |
"learning_rate": 0.00019798764508472373, | |
"loss": 0.6891, | |
"step": 84 | |
}, | |
{ | |
"epoch": 0.4885057471264368, | |
"grad_norm": 0.05396122857928276, | |
"learning_rate": 0.00019785174252972092, | |
"loss": 0.6842, | |
"step": 85 | |
}, | |
{ | |
"epoch": 0.4942528735632184, | |
"grad_norm": 0.051826637238264084, | |
"learning_rate": 0.0001977114494037641, | |
"loss": 0.7047, | |
"step": 86 | |
}, | |
{ | |
"epoch": 0.5, | |
"grad_norm": 0.05442539602518082, | |
"learning_rate": 0.00019756677200175315, | |
"loss": 0.7261, | |
"step": 87 | |
}, | |
{ | |
"epoch": 0.5057471264367817, | |
"grad_norm": 0.05559674650430679, | |
"learning_rate": 0.0001974177168153088, | |
"loss": 0.6699, | |
"step": 88 | |
}, | |
{ | |
"epoch": 0.5114942528735632, | |
"grad_norm": 0.058047693222761154, | |
"learning_rate": 0.0001972642905324813, | |
"loss": 0.6831, | |
"step": 89 | |
}, | |
{ | |
"epoch": 0.5172413793103449, | |
"grad_norm": 0.051893047988414764, | |
"learning_rate": 0.0001971065000374504, | |
"loss": 0.7293, | |
"step": 90 | |
}, | |
{ | |
"epoch": 0.5172413793103449, | |
"eval_loss": 0.6888386607170105, | |
"eval_runtime": 405.4362, | |
"eval_samples_per_second": 24.413, | |
"eval_steps_per_second": 0.382, | |
"step": 90 | |
}, | |
{ | |
"epoch": 0.5229885057471264, | |
"grad_norm": 0.051870737224817276, | |
"learning_rate": 0.0001969443524102163, | |
"loss": 0.6945, | |
"step": 91 | |
}, | |
{ | |
"epoch": 0.5287356321839081, | |
"grad_norm": 0.04907568544149399, | |
"learning_rate": 0.0001967778549262822, | |
"loss": 0.6985, | |
"step": 92 | |
}, | |
{ | |
"epoch": 0.5344827586206896, | |
"grad_norm": 0.05802120640873909, | |
"learning_rate": 0.00019660701505632772, | |
"loss": 0.6911, | |
"step": 93 | |
}, | |
{ | |
"epoch": 0.5402298850574713, | |
"grad_norm": 0.06809733808040619, | |
"learning_rate": 0.0001964318404658737, | |
"loss": 0.6815, | |
"step": 94 | |
}, | |
{ | |
"epoch": 0.5459770114942529, | |
"grad_norm": 0.05489501729607582, | |
"learning_rate": 0.00019625233901493822, | |
"loss": 0.6664, | |
"step": 95 | |
}, | |
{ | |
"epoch": 0.5517241379310345, | |
"grad_norm": 0.0648936778306961, | |
"learning_rate": 0.000196068518757684, | |
"loss": 0.6689, | |
"step": 96 | |
}, | |
{ | |
"epoch": 0.5574712643678161, | |
"grad_norm": 0.054548367857933044, | |
"learning_rate": 0.00019588038794205703, | |
"loss": 0.6695, | |
"step": 97 | |
}, | |
{ | |
"epoch": 0.5632183908045977, | |
"grad_norm": 0.0626642182469368, | |
"learning_rate": 0.00019568795500941635, | |
"loss": 0.7062, | |
"step": 98 | |
}, | |
{ | |
"epoch": 0.5689655172413793, | |
"grad_norm": 0.0539688840508461, | |
"learning_rate": 0.00019549122859415538, | |
"loss": 0.6891, | |
"step": 99 | |
}, | |
{ | |
"epoch": 0.5747126436781609, | |
"grad_norm": 0.05761811137199402, | |
"learning_rate": 0.00019529021752331453, | |
"loss": 0.6852, | |
"step": 100 | |
}, | |
{ | |
"epoch": 0.5747126436781609, | |
"eval_loss": 0.6821601986885071, | |
"eval_runtime": 404.287, | |
"eval_samples_per_second": 24.483, | |
"eval_steps_per_second": 0.383, | |
"step": 100 | |
}, | |
{ | |
"epoch": 0.5804597701149425, | |
"grad_norm": 0.054896607995033264, | |
"learning_rate": 0.00019508493081618513, | |
"loss": 0.6785, | |
"step": 101 | |
}, | |
{ | |
"epoch": 0.5862068965517241, | |
"grad_norm": 0.06048964709043503, | |
"learning_rate": 0.00019487537768390464, | |
"loss": 0.6724, | |
"step": 102 | |
}, | |
{ | |
"epoch": 0.5919540229885057, | |
"grad_norm": 0.06828396022319794, | |
"learning_rate": 0.00019466156752904343, | |
"loss": 0.7117, | |
"step": 103 | |
}, | |
{ | |
"epoch": 0.5977011494252874, | |
"grad_norm": 0.06610234081745148, | |
"learning_rate": 0.0001944435099451829, | |
"loss": 0.6982, | |
"step": 104 | |
}, | |
{ | |
"epoch": 0.603448275862069, | |
"grad_norm": 0.06762486696243286, | |
"learning_rate": 0.00019422121471648497, | |
"loss": 0.6768, | |
"step": 105 | |
}, | |
{ | |
"epoch": 0.6091954022988506, | |
"grad_norm": 0.05772867798805237, | |
"learning_rate": 0.0001939946918172531, | |
"loss": 0.7073, | |
"step": 106 | |
}, | |
{ | |
"epoch": 0.6149425287356322, | |
"grad_norm": 0.11993183940649033, | |
"learning_rate": 0.00019376395141148476, | |
"loss": 0.6831, | |
"step": 107 | |
}, | |
{ | |
"epoch": 0.6206896551724138, | |
"grad_norm": 0.08105713874101639, | |
"learning_rate": 0.00019352900385241536, | |
"loss": 0.6857, | |
"step": 108 | |
}, | |
{ | |
"epoch": 0.6264367816091954, | |
"grad_norm": 0.06035466492176056, | |
"learning_rate": 0.0001932898596820536, | |
"loss": 0.672, | |
"step": 109 | |
}, | |
{ | |
"epoch": 0.632183908045977, | |
"grad_norm": 0.09288731962442398, | |
"learning_rate": 0.0001930465296307087, | |
"loss": 0.7033, | |
"step": 110 | |
}, | |
{ | |
"epoch": 0.632183908045977, | |
"eval_loss": 0.677044153213501, | |
"eval_runtime": 405.2323, | |
"eval_samples_per_second": 24.425, | |
"eval_steps_per_second": 0.382, | |
"step": 110 | |
}, | |
{ | |
"epoch": 0.6379310344827587, | |
"grad_norm": 0.06630638986825943, | |
"learning_rate": 0.00019279902461650866, | |
"loss": 0.6831, | |
"step": 111 | |
}, | |
{ | |
"epoch": 0.6436781609195402, | |
"grad_norm": 0.05605092644691467, | |
"learning_rate": 0.00019254735574491058, | |
"loss": 0.6654, | |
"step": 112 | |
}, | |
{ | |
"epoch": 0.6494252873563219, | |
"grad_norm": 0.07270795851945877, | |
"learning_rate": 0.00019229153430820232, | |
"loss": 0.6744, | |
"step": 113 | |
}, | |
{ | |
"epoch": 0.6551724137931034, | |
"grad_norm": 0.06772006303071976, | |
"learning_rate": 0.0001920315717849956, | |
"loss": 0.6833, | |
"step": 114 | |
}, | |
{ | |
"epoch": 0.6609195402298851, | |
"grad_norm": 0.06296226382255554, | |
"learning_rate": 0.0001917674798397113, | |
"loss": 0.677, | |
"step": 115 | |
}, | |
{ | |
"epoch": 0.6666666666666666, | |
"grad_norm": 0.06553810834884644, | |
"learning_rate": 0.00019149927032205587, | |
"loss": 0.6828, | |
"step": 116 | |
}, | |
{ | |
"epoch": 0.6724137931034483, | |
"grad_norm": 0.057245928794145584, | |
"learning_rate": 0.00019122695526648968, | |
"loss": 0.6858, | |
"step": 117 | |
}, | |
{ | |
"epoch": 0.6781609195402298, | |
"grad_norm": 0.06503669917583466, | |
"learning_rate": 0.00019095054689168705, | |
"loss": 0.6591, | |
"step": 118 | |
}, | |
{ | |
"epoch": 0.6839080459770115, | |
"grad_norm": 0.05912588909268379, | |
"learning_rate": 0.00019067005759998797, | |
"loss": 0.6669, | |
"step": 119 | |
}, | |
{ | |
"epoch": 0.6896551724137931, | |
"grad_norm": 0.06517963111400604, | |
"learning_rate": 0.0001903854999768417, | |
"loss": 0.6815, | |
"step": 120 | |
}, | |
{ | |
"epoch": 0.6896551724137931, | |
"eval_loss": 0.6735538244247437, | |
"eval_runtime": 405.8319, | |
"eval_samples_per_second": 24.389, | |
"eval_steps_per_second": 0.382, | |
"step": 120 | |
}, | |
{ | |
"epoch": 0.6954022988505747, | |
"grad_norm": 0.06089121848344803, | |
"learning_rate": 0.0001900968867902419, | |
"loss": 0.67, | |
"step": 121 | |
}, | |
{ | |
"epoch": 0.7011494252873564, | |
"grad_norm": 0.05764375999569893, | |
"learning_rate": 0.00018980423099015402, | |
"loss": 0.6733, | |
"step": 122 | |
}, | |
{ | |
"epoch": 0.7068965517241379, | |
"grad_norm": 0.06278955936431885, | |
"learning_rate": 0.00018950754570793384, | |
"loss": 0.6702, | |
"step": 123 | |
}, | |
{ | |
"epoch": 0.7126436781609196, | |
"grad_norm": 0.06360521912574768, | |
"learning_rate": 0.00018920684425573865, | |
"loss": 0.6619, | |
"step": 124 | |
}, | |
{ | |
"epoch": 0.7183908045977011, | |
"grad_norm": 0.0599365159869194, | |
"learning_rate": 0.00018890214012592975, | |
"loss": 0.6851, | |
"step": 125 | |
}, | |
{ | |
"epoch": 0.7241379310344828, | |
"grad_norm": 0.061885766685009, | |
"learning_rate": 0.000188593446990467, | |
"loss": 0.6346, | |
"step": 126 | |
}, | |
{ | |
"epoch": 0.7298850574712644, | |
"grad_norm": 0.061761509627103806, | |
"learning_rate": 0.00018828077870029552, | |
"loss": 0.6834, | |
"step": 127 | |
}, | |
{ | |
"epoch": 0.735632183908046, | |
"grad_norm": 0.075982965528965, | |
"learning_rate": 0.00018796414928472417, | |
"loss": 0.6279, | |
"step": 128 | |
}, | |
{ | |
"epoch": 0.7413793103448276, | |
"grad_norm": 0.05802853778004646, | |
"learning_rate": 0.0001876435729507959, | |
"loss": 0.6348, | |
"step": 129 | |
}, | |
{ | |
"epoch": 0.7471264367816092, | |
"grad_norm": 0.06642711162567139, | |
"learning_rate": 0.0001873190640826505, | |
"loss": 0.679, | |
"step": 130 | |
}, | |
{ | |
"epoch": 0.7471264367816092, | |
"eval_loss": 0.6707044243812561, | |
"eval_runtime": 407.4212, | |
"eval_samples_per_second": 24.294, | |
"eval_steps_per_second": 0.38, | |
"step": 130 | |
}, | |
{ | |
"epoch": 0.7528735632183908, | |
"grad_norm": 0.06452522426843643, | |
"learning_rate": 0.00018699063724087904, | |
"loss": 0.6423, | |
"step": 131 | |
}, | |
{ | |
"epoch": 0.7586206896551724, | |
"grad_norm": 0.05988775193691254, | |
"learning_rate": 0.00018665830716187065, | |
"loss": 0.6654, | |
"step": 132 | |
}, | |
{ | |
"epoch": 0.764367816091954, | |
"grad_norm": 0.059349820017814636, | |
"learning_rate": 0.0001863220887571512, | |
"loss": 0.6866, | |
"step": 133 | |
}, | |
{ | |
"epoch": 0.7701149425287356, | |
"grad_norm": 0.06473397463560104, | |
"learning_rate": 0.0001859819971127143, | |
"loss": 0.7014, | |
"step": 134 | |
}, | |
{ | |
"epoch": 0.7758620689655172, | |
"grad_norm": 0.06945810467004776, | |
"learning_rate": 0.00018563804748834438, | |
"loss": 0.6769, | |
"step": 135 | |
}, | |
{ | |
"epoch": 0.7816091954022989, | |
"grad_norm": 0.06217830255627632, | |
"learning_rate": 0.000185290255316932, | |
"loss": 0.6821, | |
"step": 136 | |
}, | |
{ | |
"epoch": 0.7873563218390804, | |
"grad_norm": 0.07021711021661758, | |
"learning_rate": 0.00018493863620378122, | |
"loss": 0.6614, | |
"step": 137 | |
}, | |
{ | |
"epoch": 0.7931034482758621, | |
"grad_norm": 0.0640297532081604, | |
"learning_rate": 0.00018458320592590975, | |
"loss": 0.6699, | |
"step": 138 | |
}, | |
{ | |
"epoch": 0.7988505747126436, | |
"grad_norm": 0.0640842542052269, | |
"learning_rate": 0.00018422398043134067, | |
"loss": 0.6795, | |
"step": 139 | |
}, | |
{ | |
"epoch": 0.8045977011494253, | |
"grad_norm": 0.07371507585048676, | |
"learning_rate": 0.00018386097583838714, | |
"loss": 0.6571, | |
"step": 140 | |
}, | |
{ | |
"epoch": 0.8045977011494253, | |
"eval_loss": 0.6682229042053223, | |
"eval_runtime": 404.8694, | |
"eval_samples_per_second": 24.447, | |
"eval_steps_per_second": 0.383, | |
"step": 140 | |
}, | |
{ | |
"epoch": 0.8103448275862069, | |
"grad_norm": 0.06185011938214302, | |
"learning_rate": 0.00018349420843492888, | |
"loss": 0.6524, | |
"step": 141 | |
}, | |
{ | |
"epoch": 0.8160919540229885, | |
"grad_norm": 0.08427827060222626, | |
"learning_rate": 0.00018312369467768166, | |
"loss": 0.6685, | |
"step": 142 | |
}, | |
{ | |
"epoch": 0.8218390804597702, | |
"grad_norm": 0.06529568880796432, | |
"learning_rate": 0.0001827494511914587, | |
"loss": 0.659, | |
"step": 143 | |
}, | |
{ | |
"epoch": 0.8275862068965517, | |
"grad_norm": 0.07357680797576904, | |
"learning_rate": 0.0001823714947684247, | |
"loss": 0.6792, | |
"step": 144 | |
}, | |
{ | |
"epoch": 0.8333333333333334, | |
"grad_norm": 0.09026575833559036, | |
"learning_rate": 0.00018198984236734246, | |
"loss": 0.6954, | |
"step": 145 | |
}, | |
{ | |
"epoch": 0.8390804597701149, | |
"grad_norm": 0.06157710403203964, | |
"learning_rate": 0.000181604511112812, | |
"loss": 0.6527, | |
"step": 146 | |
}, | |
{ | |
"epoch": 0.8448275862068966, | |
"grad_norm": 0.08122924715280533, | |
"learning_rate": 0.000181215518294502, | |
"loss": 0.6571, | |
"step": 147 | |
}, | |
{ | |
"epoch": 0.8505747126436781, | |
"grad_norm": 0.05926045402884483, | |
"learning_rate": 0.00018082288136637422, | |
"loss": 0.6773, | |
"step": 148 | |
}, | |
{ | |
"epoch": 0.8563218390804598, | |
"grad_norm": 0.07869191467761993, | |
"learning_rate": 0.00018042661794590023, | |
"loss": 0.7066, | |
"step": 149 | |
}, | |
{ | |
"epoch": 0.8620689655172413, | |
"grad_norm": 0.07564139366149902, | |
"learning_rate": 0.00018002674581327094, | |
"loss": 0.6491, | |
"step": 150 | |
}, | |
{ | |
"epoch": 0.8620689655172413, | |
"eval_loss": 0.6660047769546509, | |
"eval_runtime": 406.5581, | |
"eval_samples_per_second": 24.346, | |
"eval_steps_per_second": 0.381, | |
"step": 150 | |
}, | |
{ | |
"epoch": 0.867816091954023, | |
"grad_norm": 0.05749671533703804, | |
"learning_rate": 0.00017962328291059888, | |
"loss": 0.7081, | |
"step": 151 | |
}, | |
{ | |
"epoch": 0.8735632183908046, | |
"grad_norm": 0.08154609054327011, | |
"learning_rate": 0.00017921624734111292, | |
"loss": 0.6622, | |
"step": 152 | |
}, | |
{ | |
"epoch": 0.8793103448275862, | |
"grad_norm": 0.08773736655712128, | |
"learning_rate": 0.0001788056573683464, | |
"loss": 0.6393, | |
"step": 153 | |
}, | |
{ | |
"epoch": 0.8850574712643678, | |
"grad_norm": 0.06756340712308884, | |
"learning_rate": 0.00017839153141531718, | |
"loss": 0.6384, | |
"step": 154 | |
}, | |
{ | |
"epoch": 0.8908045977011494, | |
"grad_norm": 0.08763930201530457, | |
"learning_rate": 0.00017797388806370132, | |
"loss": 0.6512, | |
"step": 155 | |
}, | |
{ | |
"epoch": 0.896551724137931, | |
"grad_norm": 0.0647486001253128, | |
"learning_rate": 0.00017755274605299923, | |
"loss": 0.6502, | |
"step": 156 | |
}, | |
{ | |
"epoch": 0.9022988505747126, | |
"grad_norm": 0.11679747700691223, | |
"learning_rate": 0.00017712812427969485, | |
"loss": 0.6666, | |
"step": 157 | |
}, | |
{ | |
"epoch": 0.9080459770114943, | |
"grad_norm": 0.06472433358430862, | |
"learning_rate": 0.00017670004179640774, | |
"loss": 0.6495, | |
"step": 158 | |
}, | |
{ | |
"epoch": 0.9137931034482759, | |
"grad_norm": 0.09902803599834442, | |
"learning_rate": 0.0001762685178110382, | |
"loss": 0.6747, | |
"step": 159 | |
}, | |
{ | |
"epoch": 0.9195402298850575, | |
"grad_norm": 0.06362438946962357, | |
"learning_rate": 0.0001758335716859055, | |
"loss": 0.7015, | |
"step": 160 | |
}, | |
{ | |
"epoch": 0.9195402298850575, | |
"eval_loss": 0.663636326789856, | |
"eval_runtime": 404.5915, | |
"eval_samples_per_second": 24.464, | |
"eval_steps_per_second": 0.383, | |
"step": 160 | |
}, | |
{ | |
"epoch": 0.9252873563218391, | |
"grad_norm": 0.07304941862821579, | |
"learning_rate": 0.00017539522293687898, | |
"loss": 0.6825, | |
"step": 161 | |
}, | |
{ | |
"epoch": 0.9310344827586207, | |
"grad_norm": 0.08923015743494034, | |
"learning_rate": 0.00017495349123250242, | |
"loss": 0.674, | |
"step": 162 | |
}, | |
{ | |
"epoch": 0.9367816091954023, | |
"grad_norm": 0.062135376036167145, | |
"learning_rate": 0.00017450839639311162, | |
"loss": 0.6477, | |
"step": 163 | |
}, | |
{ | |
"epoch": 0.9425287356321839, | |
"grad_norm": 0.1098598912358284, | |
"learning_rate": 0.00017405995838994494, | |
"loss": 0.6742, | |
"step": 164 | |
}, | |
{ | |
"epoch": 0.9482758620689655, | |
"grad_norm": 0.06947540491819382, | |
"learning_rate": 0.00017360819734424715, | |
"loss": 0.6509, | |
"step": 165 | |
}, | |
{ | |
"epoch": 0.9540229885057471, | |
"grad_norm": 0.11134368181228638, | |
"learning_rate": 0.0001731531335263669, | |
"loss": 0.6602, | |
"step": 166 | |
}, | |
{ | |
"epoch": 0.9597701149425287, | |
"grad_norm": 0.06717904657125473, | |
"learning_rate": 0.00017269478735484683, | |
"loss": 0.6697, | |
"step": 167 | |
}, | |
{ | |
"epoch": 0.9655172413793104, | |
"grad_norm": 0.06737629324197769, | |
"learning_rate": 0.00017223317939550753, | |
"loss": 0.6636, | |
"step": 168 | |
}, | |
{ | |
"epoch": 0.9712643678160919, | |
"grad_norm": 0.08558724075555801, | |
"learning_rate": 0.00017176833036052495, | |
"loss": 0.6733, | |
"step": 169 | |
}, | |
{ | |
"epoch": 0.9770114942528736, | |
"grad_norm": 0.07127804309129715, | |
"learning_rate": 0.0001713002611075007, | |
"loss": 0.6523, | |
"step": 170 | |
}, | |
{ | |
"epoch": 0.9770114942528736, | |
"eval_loss": 0.6618800759315491, | |
"eval_runtime": 411.375, | |
"eval_samples_per_second": 24.061, | |
"eval_steps_per_second": 0.377, | |
"step": 170 | |
}, | |
{ | |
"epoch": 0.9827586206896551, | |
"grad_norm": 0.08060283958911896, | |
"learning_rate": 0.0001708289926385265, | |
"loss": 0.658, | |
"step": 171 | |
}, | |
{ | |
"epoch": 0.9885057471264368, | |
"grad_norm": 0.06496579200029373, | |
"learning_rate": 0.0001703545460992416, | |
"loss": 0.6697, | |
"step": 172 | |
}, | |
{ | |
"epoch": 0.9942528735632183, | |
"grad_norm": 0.0646037757396698, | |
"learning_rate": 0.00016987694277788417, | |
"loss": 0.6231, | |
"step": 173 | |
}, | |
{ | |
"epoch": 1.0, | |
"grad_norm": 0.08516079187393188, | |
"learning_rate": 0.0001693962041043359, | |
"loss": 0.6374, | |
"step": 174 | |
}, | |
{ | |
"epoch": 1.0057471264367817, | |
"grad_norm": 0.06554190069437027, | |
"learning_rate": 0.00016891235164916065, | |
"loss": 0.6271, | |
"step": 175 | |
}, | |
{ | |
"epoch": 1.0114942528735633, | |
"grad_norm": 0.06361629068851471, | |
"learning_rate": 0.00016842540712263637, | |
"loss": 0.649, | |
"step": 176 | |
}, | |
{ | |
"epoch": 1.0172413793103448, | |
"grad_norm": 0.0814083069562912, | |
"learning_rate": 0.00016793539237378128, | |
"loss": 0.654, | |
"step": 177 | |
}, | |
{ | |
"epoch": 1.0229885057471264, | |
"grad_norm": 0.06498701125383377, | |
"learning_rate": 0.00016744232938937308, | |
"loss": 0.6313, | |
"step": 178 | |
}, | |
{ | |
"epoch": 1.028735632183908, | |
"grad_norm": 0.11292543262243271, | |
"learning_rate": 0.0001669462402929629, | |
"loss": 0.6803, | |
"step": 179 | |
}, | |
{ | |
"epoch": 1.0344827586206897, | |
"grad_norm": 0.0661187544465065, | |
"learning_rate": 0.00016644714734388217, | |
"loss": 0.6672, | |
"step": 180 | |
}, | |
{ | |
"epoch": 1.0344827586206897, | |
"eval_loss": 0.6602174043655396, | |
"eval_runtime": 410.2914, | |
"eval_samples_per_second": 24.124, | |
"eval_steps_per_second": 0.378, | |
"step": 180 | |
}, | |
{ | |
"epoch": 1.0402298850574712, | |
"grad_norm": 0.08441785722970963, | |
"learning_rate": 0.00016594507293624425, | |
"loss": 0.6257, | |
"step": 181 | |
}, | |
{ | |
"epoch": 1.0459770114942528, | |
"grad_norm": 0.09075969457626343, | |
"learning_rate": 0.00016544003959793925, | |
"loss": 0.641, | |
"step": 182 | |
}, | |
{ | |
"epoch": 1.0517241379310345, | |
"grad_norm": 0.07677901536226273, | |
"learning_rate": 0.00016493206998962354, | |
"loss": 0.6351, | |
"step": 183 | |
}, | |
{ | |
"epoch": 1.0574712643678161, | |
"grad_norm": 0.09646302461624146, | |
"learning_rate": 0.0001644211869037027, | |
"loss": 0.6635, | |
"step": 184 | |
}, | |
{ | |
"epoch": 1.0632183908045978, | |
"grad_norm": 0.06928115338087082, | |
"learning_rate": 0.00016390741326330907, | |
"loss": 0.6458, | |
"step": 185 | |
}, | |
{ | |
"epoch": 1.0689655172413792, | |
"grad_norm": 0.1076992079615593, | |
"learning_rate": 0.00016339077212127294, | |
"loss": 0.6209, | |
"step": 186 | |
}, | |
{ | |
"epoch": 1.0747126436781609, | |
"grad_norm": 0.08489565551280975, | |
"learning_rate": 0.0001628712866590885, | |
"loss": 0.6336, | |
"step": 187 | |
}, | |
{ | |
"epoch": 1.0804597701149425, | |
"grad_norm": 0.11920158565044403, | |
"learning_rate": 0.00016234898018587337, | |
"loss": 0.6496, | |
"step": 188 | |
}, | |
{ | |
"epoch": 1.0862068965517242, | |
"grad_norm": 0.07987701892852783, | |
"learning_rate": 0.00016182387613732291, | |
"loss": 0.668, | |
"step": 189 | |
}, | |
{ | |
"epoch": 1.0919540229885056, | |
"grad_norm": 0.1095438227057457, | |
"learning_rate": 0.00016129599807465875, | |
"loss": 0.6862, | |
"step": 190 | |
}, | |
{ | |
"epoch": 1.0919540229885056, | |
"eval_loss": 0.6588147282600403, | |
"eval_runtime": 406.5115, | |
"eval_samples_per_second": 24.349, | |
"eval_steps_per_second": 0.381, | |
"step": 190 | |
}, | |
{ | |
"epoch": 1.0977011494252873, | |
"grad_norm": 0.08076825737953186, | |
"learning_rate": 0.0001607653696835713, | |
"loss": 0.6367, | |
"step": 191 | |
}, | |
{ | |
"epoch": 1.103448275862069, | |
"grad_norm": 0.09829648584127426, | |
"learning_rate": 0.00016023201477315731, | |
"loss": 0.6391, | |
"step": 192 | |
}, | |
{ | |
"epoch": 1.1091954022988506, | |
"grad_norm": 0.09008080512285233, | |
"learning_rate": 0.0001596959572748514, | |
"loss": 0.6462, | |
"step": 193 | |
}, | |
{ | |
"epoch": 1.1149425287356323, | |
"grad_norm": 0.07725552469491959, | |
"learning_rate": 0.00015915722124135227, | |
"loss": 0.6356, | |
"step": 194 | |
}, | |
{ | |
"epoch": 1.1206896551724137, | |
"grad_norm": 0.08215273171663284, | |
"learning_rate": 0.00015861583084554349, | |
"loss": 0.6557, | |
"step": 195 | |
}, | |
{ | |
"epoch": 1.1264367816091954, | |
"grad_norm": 0.07044622302055359, | |
"learning_rate": 0.0001580718103794089, | |
"loss": 0.6401, | |
"step": 196 | |
}, | |
{ | |
"epoch": 1.132183908045977, | |
"grad_norm": 0.06852877885103226, | |
"learning_rate": 0.00015752518425294257, | |
"loss": 0.6641, | |
"step": 197 | |
}, | |
{ | |
"epoch": 1.1379310344827587, | |
"grad_norm": 0.07775932550430298, | |
"learning_rate": 0.00015697597699305366, | |
"loss": 0.6689, | |
"step": 198 | |
}, | |
{ | |
"epoch": 1.1436781609195403, | |
"grad_norm": 0.07384389638900757, | |
"learning_rate": 0.00015642421324246568, | |
"loss": 0.663, | |
"step": 199 | |
}, | |
{ | |
"epoch": 1.1494252873563218, | |
"grad_norm": 0.074593685567379, | |
"learning_rate": 0.00015586991775861102, | |
"loss": 0.6755, | |
"step": 200 | |
}, | |
{ | |
"epoch": 1.1494252873563218, | |
"eval_loss": 0.6577329635620117, | |
"eval_runtime": 406.5534, | |
"eval_samples_per_second": 24.346, | |
"eval_steps_per_second": 0.381, | |
"step": 200 | |
}, | |
{ | |
"epoch": 1.1551724137931034, | |
"grad_norm": 0.07201389968395233, | |
"learning_rate": 0.00015531311541251995, | |
"loss": 0.62, | |
"step": 201 | |
}, | |
{ | |
"epoch": 1.160919540229885, | |
"grad_norm": 0.07052464783191681, | |
"learning_rate": 0.00015475383118770472, | |
"loss": 0.6456, | |
"step": 202 | |
}, | |
{ | |
"epoch": 1.1666666666666667, | |
"grad_norm": 0.07045558094978333, | |
"learning_rate": 0.00015419209017903852, | |
"loss": 0.6421, | |
"step": 203 | |
}, | |
{ | |
"epoch": 1.1724137931034484, | |
"grad_norm": 0.0870729386806488, | |
"learning_rate": 0.0001536279175916296, | |
"loss": 0.6342, | |
"step": 204 | |
}, | |
{ | |
"epoch": 1.1781609195402298, | |
"grad_norm": 0.0703926831483841, | |
"learning_rate": 0.0001530613387396901, | |
"loss": 0.6533, | |
"step": 205 | |
}, | |
{ | |
"epoch": 1.1839080459770115, | |
"grad_norm": 0.07181324064731598, | |
"learning_rate": 0.0001524923790454004, | |
"loss": 0.6511, | |
"step": 206 | |
}, | |
{ | |
"epoch": 1.1896551724137931, | |
"grad_norm": 0.07455940544605255, | |
"learning_rate": 0.00015192106403776848, | |
"loss": 0.6363, | |
"step": 207 | |
}, | |
{ | |
"epoch": 1.1954022988505748, | |
"grad_norm": 0.08370154350996017, | |
"learning_rate": 0.0001513474193514842, | |
"loss": 0.6517, | |
"step": 208 | |
}, | |
{ | |
"epoch": 1.2011494252873562, | |
"grad_norm": 0.08015818893909454, | |
"learning_rate": 0.00015077147072576933, | |
"loss": 0.6264, | |
"step": 209 | |
}, | |
{ | |
"epoch": 1.206896551724138, | |
"grad_norm": 0.093206986784935, | |
"learning_rate": 0.00015019324400322243, | |
"loss": 0.6279, | |
"step": 210 | |
}, | |
{ | |
"epoch": 1.206896551724138, | |
"eval_loss": 0.6562607884407043, | |
"eval_runtime": 407.9222, | |
"eval_samples_per_second": 24.264, | |
"eval_steps_per_second": 0.38, | |
"step": 210 | |
}, | |
{ | |
"epoch": 1.2126436781609196, | |
"grad_norm": 0.07707002758979797, | |
"learning_rate": 0.00014961276512865954, | |
"loss": 0.6726, | |
"step": 211 | |
}, | |
{ | |
"epoch": 1.2183908045977012, | |
"grad_norm": 0.08275868743658066, | |
"learning_rate": 0.00014903006014794983, | |
"loss": 0.6493, | |
"step": 212 | |
}, | |
{ | |
"epoch": 1.2241379310344827, | |
"grad_norm": 0.11222587525844574, | |
"learning_rate": 0.00014844515520684703, | |
"loss": 0.6367, | |
"step": 213 | |
}, | |
{ | |
"epoch": 1.2298850574712643, | |
"grad_norm": 0.09210342168807983, | |
"learning_rate": 0.00014785807654981627, | |
"loss": 0.6734, | |
"step": 214 | |
}, | |
{ | |
"epoch": 1.235632183908046, | |
"grad_norm": 0.08821109682321548, | |
"learning_rate": 0.00014726885051885653, | |
"loss": 0.6354, | |
"step": 215 | |
}, | |
{ | |
"epoch": 1.2413793103448276, | |
"grad_norm": 0.12253956496715546, | |
"learning_rate": 0.0001466775035523186, | |
"loss": 0.6412, | |
"step": 216 | |
}, | |
{ | |
"epoch": 1.2471264367816093, | |
"grad_norm": 0.08476684242486954, | |
"learning_rate": 0.00014608406218371894, | |
"loss": 0.6635, | |
"step": 217 | |
}, | |
{ | |
"epoch": 1.2528735632183907, | |
"grad_norm": 0.08554086089134216, | |
"learning_rate": 0.00014548855304054886, | |
"loss": 0.6403, | |
"step": 218 | |
}, | |
{ | |
"epoch": 1.2586206896551724, | |
"grad_norm": 0.10986476391553879, | |
"learning_rate": 0.00014489100284308017, | |
"loss": 0.6253, | |
"step": 219 | |
}, | |
{ | |
"epoch": 1.264367816091954, | |
"grad_norm": 0.09221742302179337, | |
"learning_rate": 0.00014429143840316585, | |
"loss": 0.6622, | |
"step": 220 | |
}, | |
{ | |
"epoch": 1.264367816091954, | |
"eval_loss": 0.6551185250282288, | |
"eval_runtime": 408.2025, | |
"eval_samples_per_second": 24.248, | |
"eval_steps_per_second": 0.38, | |
"step": 220 | |
}, | |
{ | |
"epoch": 1.2701149425287357, | |
"grad_norm": 0.08050013333559036, | |
"learning_rate": 0.00014368988662303732, | |
"loss": 0.6226, | |
"step": 221 | |
}, | |
{ | |
"epoch": 1.2758620689655173, | |
"grad_norm": 0.16257594525814056, | |
"learning_rate": 0.00014308637449409706, | |
"loss": 0.6661, | |
"step": 222 | |
}, | |
{ | |
"epoch": 1.2816091954022988, | |
"grad_norm": 0.07793809473514557, | |
"learning_rate": 0.00014248092909570774, | |
"loss": 0.6243, | |
"step": 223 | |
}, | |
{ | |
"epoch": 1.2873563218390804, | |
"grad_norm": 0.0975632593035698, | |
"learning_rate": 0.00014187357759397714, | |
"loss": 0.6348, | |
"step": 224 | |
}, | |
{ | |
"epoch": 1.293103448275862, | |
"grad_norm": 0.07041144371032715, | |
"learning_rate": 0.00014126434724053913, | |
"loss": 0.6386, | |
"step": 225 | |
}, | |
{ | |
"epoch": 1.2988505747126438, | |
"grad_norm": 0.12080610543489456, | |
"learning_rate": 0.00014065326537133094, | |
"loss": 0.6276, | |
"step": 226 | |
}, | |
{ | |
"epoch": 1.3045977011494254, | |
"grad_norm": 0.09340126812458038, | |
"learning_rate": 0.0001400403594053667, | |
"loss": 0.6431, | |
"step": 227 | |
}, | |
{ | |
"epoch": 1.3103448275862069, | |
"grad_norm": 0.09178619831800461, | |
"learning_rate": 0.00013942565684350698, | |
"loss": 0.6457, | |
"step": 228 | |
}, | |
{ | |
"epoch": 1.3160919540229885, | |
"grad_norm": 0.134804829955101, | |
"learning_rate": 0.00013880918526722497, | |
"loss": 0.6247, | |
"step": 229 | |
}, | |
{ | |
"epoch": 1.3218390804597702, | |
"grad_norm": 0.07517404854297638, | |
"learning_rate": 0.00013819097233736888, | |
"loss": 0.6329, | |
"step": 230 | |
}, | |
{ | |
"epoch": 1.3218390804597702, | |
"eval_loss": 0.6541800498962402, | |
"eval_runtime": 404.9523, | |
"eval_samples_per_second": 24.442, | |
"eval_steps_per_second": 0.383, | |
"step": 230 | |
}, | |
{ | |
"epoch": 1.3275862068965516, | |
"grad_norm": 0.1385478675365448, | |
"learning_rate": 0.00013757104579292082, | |
"loss": 0.6697, | |
"step": 231 | |
}, | |
{ | |
"epoch": 1.3333333333333333, | |
"grad_norm": 0.08156240731477737, | |
"learning_rate": 0.00013694943344975212, | |
"loss": 0.6279, | |
"step": 232 | |
}, | |
{ | |
"epoch": 1.339080459770115, | |
"grad_norm": 0.10937108844518661, | |
"learning_rate": 0.00013632616319937522, | |
"loss": 0.6487, | |
"step": 233 | |
}, | |
{ | |
"epoch": 1.3448275862068966, | |
"grad_norm": 0.12300366908311844, | |
"learning_rate": 0.00013570126300769232, | |
"loss": 0.6456, | |
"step": 234 | |
}, | |
{ | |
"epoch": 1.3505747126436782, | |
"grad_norm": 0.07707128673791885, | |
"learning_rate": 0.0001350747609137404, | |
"loss": 0.6302, | |
"step": 235 | |
}, | |
{ | |
"epoch": 1.3563218390804597, | |
"grad_norm": 0.0954674631357193, | |
"learning_rate": 0.0001344466850284333, | |
"loss": 0.6184, | |
"step": 236 | |
}, | |
{ | |
"epoch": 1.3620689655172413, | |
"grad_norm": 0.10317125916481018, | |
"learning_rate": 0.00013381706353330014, | |
"loss": 0.6618, | |
"step": 237 | |
}, | |
{ | |
"epoch": 1.367816091954023, | |
"grad_norm": 0.08765599131584167, | |
"learning_rate": 0.0001331859246792211, | |
"loss": 0.6191, | |
"step": 238 | |
}, | |
{ | |
"epoch": 1.3735632183908046, | |
"grad_norm": 0.10305018723011017, | |
"learning_rate": 0.0001325532967851596, | |
"loss": 0.6397, | |
"step": 239 | |
}, | |
{ | |
"epoch": 1.3793103448275863, | |
"grad_norm": 0.08769567310810089, | |
"learning_rate": 0.00013191920823689177, | |
"loss": 0.6559, | |
"step": 240 | |
}, | |
{ | |
"epoch": 1.3793103448275863, | |
"eval_loss": 0.6528159379959106, | |
"eval_runtime": 407.607, | |
"eval_samples_per_second": 24.283, | |
"eval_steps_per_second": 0.38, | |
"step": 240 | |
}, | |
{ | |
"epoch": 1.3850574712643677, | |
"grad_norm": 0.09783841669559479, | |
"learning_rate": 0.00013128368748573273, | |
"loss": 0.6736, | |
"step": 241 | |
}, | |
{ | |
"epoch": 1.3908045977011494, | |
"grad_norm": 0.08165410906076431, | |
"learning_rate": 0.00013064676304726, | |
"loss": 0.6467, | |
"step": 242 | |
}, | |
{ | |
"epoch": 1.396551724137931, | |
"grad_norm": 0.10928885638713837, | |
"learning_rate": 0.0001300084635000341, | |
"loss": 0.6956, | |
"step": 243 | |
}, | |
{ | |
"epoch": 1.4022988505747127, | |
"grad_norm": 0.09388460218906403, | |
"learning_rate": 0.000129368817484316, | |
"loss": 0.6474, | |
"step": 244 | |
}, | |
{ | |
"epoch": 1.4080459770114944, | |
"grad_norm": 0.08257792145013809, | |
"learning_rate": 0.0001287278537007824, | |
"loss": 0.6301, | |
"step": 245 | |
}, | |
{ | |
"epoch": 1.4137931034482758, | |
"grad_norm": 0.07570406794548035, | |
"learning_rate": 0.00012808560090923758, | |
"loss": 0.6238, | |
"step": 246 | |
}, | |
{ | |
"epoch": 1.4195402298850575, | |
"grad_norm": 0.097509004175663, | |
"learning_rate": 0.00012744208792732324, | |
"loss": 0.6383, | |
"step": 247 | |
}, | |
{ | |
"epoch": 1.4252873563218391, | |
"grad_norm": 0.07778667658567429, | |
"learning_rate": 0.00012679734362922528, | |
"loss": 0.642, | |
"step": 248 | |
}, | |
{ | |
"epoch": 1.4310344827586206, | |
"grad_norm": 0.08389262855052948, | |
"learning_rate": 0.00012615139694437835, | |
"loss": 0.6152, | |
"step": 249 | |
}, | |
{ | |
"epoch": 1.4367816091954024, | |
"grad_norm": 0.08290071040391922, | |
"learning_rate": 0.00012550427685616765, | |
"loss": 0.6389, | |
"step": 250 | |
}, | |
{ | |
"epoch": 1.4367816091954024, | |
"eval_loss": 0.6516815423965454, | |
"eval_runtime": 411.2719, | |
"eval_samples_per_second": 24.067, | |
"eval_steps_per_second": 0.377, | |
"step": 250 | |
}, | |
{ | |
"epoch": 1.4425287356321839, | |
"grad_norm": 0.08134254068136215, | |
"learning_rate": 0.00012485601240062869, | |
"loss": 0.6365, | |
"step": 251 | |
}, | |
{ | |
"epoch": 1.4482758620689655, | |
"grad_norm": 0.11836981773376465, | |
"learning_rate": 0.00012420663266514417, | |
"loss": 0.6345, | |
"step": 252 | |
}, | |
{ | |
"epoch": 1.4540229885057472, | |
"grad_norm": 0.07629366219043732, | |
"learning_rate": 0.0001235561667871391, | |
"loss": 0.6365, | |
"step": 253 | |
}, | |
{ | |
"epoch": 1.4597701149425286, | |
"grad_norm": 0.09142953902482986, | |
"learning_rate": 0.0001229046439527732, | |
"loss": 0.6316, | |
"step": 254 | |
}, | |
{ | |
"epoch": 1.4655172413793103, | |
"grad_norm": 0.12063657492399216, | |
"learning_rate": 0.00012225209339563145, | |
"loss": 0.6221, | |
"step": 255 | |
}, | |
{ | |
"epoch": 1.471264367816092, | |
"grad_norm": 0.07524894177913666, | |
"learning_rate": 0.00012159854439541245, | |
"loss": 0.6485, | |
"step": 256 | |
}, | |
{ | |
"epoch": 1.4770114942528736, | |
"grad_norm": 0.08384133875370026, | |
"learning_rate": 0.00012094402627661447, | |
"loss": 0.6607, | |
"step": 257 | |
}, | |
{ | |
"epoch": 1.4827586206896552, | |
"grad_norm": 0.08039575815200806, | |
"learning_rate": 0.00012028856840721974, | |
"loss": 0.6764, | |
"step": 258 | |
}, | |
{ | |
"epoch": 1.4885057471264367, | |
"grad_norm": 0.09115740656852722, | |
"learning_rate": 0.00011963220019737691, | |
"loss": 0.6587, | |
"step": 259 | |
}, | |
{ | |
"epoch": 1.4942528735632183, | |
"grad_norm": 0.08291927725076675, | |
"learning_rate": 0.00011897495109808107, | |
"loss": 0.6476, | |
"step": 260 | |
}, | |
{ | |
"epoch": 1.4942528735632183, | |
"eval_loss": 0.6506026983261108, | |
"eval_runtime": 407.6949, | |
"eval_samples_per_second": 24.278, | |
"eval_steps_per_second": 0.38, | |
"step": 260 | |
}, | |
{ | |
"epoch": 1.5, | |
"grad_norm": 0.09679999202489853, | |
"learning_rate": 0.00011831685059985262, | |
"loss": 0.6378, | |
"step": 261 | |
}, | |
{ | |
"epoch": 1.5057471264367817, | |
"grad_norm": 0.07858405262231827, | |
"learning_rate": 0.00011765792823141384, | |
"loss": 0.6679, | |
"step": 262 | |
}, | |
{ | |
"epoch": 1.5114942528735633, | |
"grad_norm": 0.07274090498685837, | |
"learning_rate": 0.00011699821355836409, | |
"loss": 0.6199, | |
"step": 263 | |
}, | |
{ | |
"epoch": 1.5172413793103448, | |
"grad_norm": 0.11862179636955261, | |
"learning_rate": 0.00011633773618185302, | |
"loss": 0.6369, | |
"step": 264 | |
}, | |
{ | |
"epoch": 1.5229885057471264, | |
"grad_norm": 0.08915189653635025, | |
"learning_rate": 0.00011567652573725262, | |
"loss": 0.6248, | |
"step": 265 | |
}, | |
{ | |
"epoch": 1.528735632183908, | |
"grad_norm": 0.12184260040521622, | |
"learning_rate": 0.00011501461189282733, | |
"loss": 0.645, | |
"step": 266 | |
}, | |
{ | |
"epoch": 1.5344827586206895, | |
"grad_norm": 0.09939936548471451, | |
"learning_rate": 0.00011435202434840287, | |
"loss": 0.6382, | |
"step": 267 | |
}, | |
{ | |
"epoch": 1.5402298850574714, | |
"grad_norm": 0.07167995721101761, | |
"learning_rate": 0.0001136887928340336, | |
"loss": 0.6064, | |
"step": 268 | |
}, | |
{ | |
"epoch": 1.5459770114942528, | |
"grad_norm": 0.09978017210960388, | |
"learning_rate": 0.00011302494710866857, | |
"loss": 0.6467, | |
"step": 269 | |
}, | |
{ | |
"epoch": 1.5517241379310345, | |
"grad_norm": 0.09598653763532639, | |
"learning_rate": 0.00011236051695881633, | |
"loss": 0.6412, | |
"step": 270 | |
}, | |
{ | |
"epoch": 1.5517241379310345, | |
"eval_loss": 0.6497076749801636, | |
"eval_runtime": 407.5672, | |
"eval_samples_per_second": 24.286, | |
"eval_steps_per_second": 0.38, | |
"step": 270 | |
}, | |
{ | |
"epoch": 1.5574712643678161, | |
"grad_norm": 0.08118661493062973, | |
"learning_rate": 0.00011169553219720828, | |
"loss": 0.6659, | |
"step": 271 | |
}, | |
{ | |
"epoch": 1.5632183908045976, | |
"grad_norm": 0.11158329248428345, | |
"learning_rate": 0.00011103002266146096, | |
"loss": 0.6578, | |
"step": 272 | |
}, | |
{ | |
"epoch": 1.5689655172413794, | |
"grad_norm": 0.12230509519577026, | |
"learning_rate": 0.0001103640182127375, | |
"loss": 0.6187, | |
"step": 273 | |
}, | |
{ | |
"epoch": 1.5747126436781609, | |
"grad_norm": 0.07973505556583405, | |
"learning_rate": 0.00010969754873440743, | |
"loss": 0.6507, | |
"step": 274 | |
}, | |
{ | |
"epoch": 1.5804597701149425, | |
"grad_norm": 0.07436943054199219, | |
"learning_rate": 0.00010903064413070612, | |
"loss": 0.6381, | |
"step": 275 | |
}, | |
{ | |
"epoch": 1.5862068965517242, | |
"grad_norm": 0.0804380401968956, | |
"learning_rate": 0.00010836333432539272, | |
"loss": 0.6302, | |
"step": 276 | |
}, | |
{ | |
"epoch": 1.5919540229885056, | |
"grad_norm": 0.07640023529529572, | |
"learning_rate": 0.00010769564926040769, | |
"loss": 0.618, | |
"step": 277 | |
}, | |
{ | |
"epoch": 1.5977011494252875, | |
"grad_norm": 0.0787947028875351, | |
"learning_rate": 0.0001070276188945293, | |
"loss": 0.6308, | |
"step": 278 | |
}, | |
{ | |
"epoch": 1.603448275862069, | |
"grad_norm": 0.08764500916004181, | |
"learning_rate": 0.00010635927320202928, | |
"loss": 0.6316, | |
"step": 279 | |
}, | |
{ | |
"epoch": 1.6091954022988506, | |
"grad_norm": 0.07885821908712387, | |
"learning_rate": 0.00010569064217132791, | |
"loss": 0.6232, | |
"step": 280 | |
}, | |
{ | |
"epoch": 1.6091954022988506, | |
"eval_loss": 0.6484516859054565, | |
"eval_runtime": 406.5349, | |
"eval_samples_per_second": 24.347, | |
"eval_steps_per_second": 0.381, | |
"step": 280 | |
}, | |
{ | |
"epoch": 1.6149425287356323, | |
"grad_norm": 0.08910427987575531, | |
"learning_rate": 0.00010502175580364857, | |
"loss": 0.6207, | |
"step": 281 | |
}, | |
{ | |
"epoch": 1.6206896551724137, | |
"grad_norm": 0.08195802569389343, | |
"learning_rate": 0.00010435264411167148, | |
"loss": 0.6604, | |
"step": 282 | |
}, | |
{ | |
"epoch": 1.6264367816091954, | |
"grad_norm": 0.09276524186134338, | |
"learning_rate": 0.0001036833371181871, | |
"loss": 0.6444, | |
"step": 283 | |
}, | |
{ | |
"epoch": 1.632183908045977, | |
"grad_norm": 0.07577691972255707, | |
"learning_rate": 0.00010301386485474889, | |
"loss": 0.6439, | |
"step": 284 | |
}, | |
{ | |
"epoch": 1.6379310344827587, | |
"grad_norm": 0.07871613651514053, | |
"learning_rate": 0.00010234425736032607, | |
"loss": 0.639, | |
"step": 285 | |
}, | |
{ | |
"epoch": 1.6436781609195403, | |
"grad_norm": 0.07570876181125641, | |
"learning_rate": 0.00010167454467995549, | |
"loss": 0.6056, | |
"step": 286 | |
}, | |
{ | |
"epoch": 1.6494252873563218, | |
"grad_norm": 0.09836837649345398, | |
"learning_rate": 0.00010100475686339379, | |
"loss": 0.6341, | |
"step": 287 | |
}, | |
{ | |
"epoch": 1.6551724137931034, | |
"grad_norm": 0.08796896785497665, | |
"learning_rate": 0.00010033492396376878, | |
"loss": 0.6193, | |
"step": 288 | |
}, | |
{ | |
"epoch": 1.660919540229885, | |
"grad_norm": 0.07815764099359512, | |
"learning_rate": 9.966507603623125e-05, | |
"loss": 0.6227, | |
"step": 289 | |
}, | |
{ | |
"epoch": 1.6666666666666665, | |
"grad_norm": 0.13016292452812195, | |
"learning_rate": 9.899524313660624e-05, | |
"loss": 0.6243, | |
"step": 290 | |
}, | |
{ | |
"epoch": 1.6666666666666665, | |
"eval_loss": 0.6477526426315308, | |
"eval_runtime": 405.0855, | |
"eval_samples_per_second": 24.434, | |
"eval_steps_per_second": 0.383, | |
"step": 290 | |
}, | |
{ | |
"epoch": 1.6724137931034484, | |
"grad_norm": 0.09747885912656784, | |
"learning_rate": 9.832545532004454e-05, | |
"loss": 0.6328, | |
"step": 291 | |
}, | |
{ | |
"epoch": 1.6781609195402298, | |
"grad_norm": 0.10131366550922394, | |
"learning_rate": 9.765574263967396e-05, | |
"loss": 0.6212, | |
"step": 292 | |
}, | |
{ | |
"epoch": 1.6839080459770115, | |
"grad_norm": 0.1203976571559906, | |
"learning_rate": 9.698613514525116e-05, | |
"loss": 0.6563, | |
"step": 293 | |
}, | |
{ | |
"epoch": 1.6896551724137931, | |
"grad_norm": 0.07119957357645035, | |
"learning_rate": 9.631666288181293e-05, | |
"loss": 0.6278, | |
"step": 294 | |
}, | |
{ | |
"epoch": 1.6954022988505746, | |
"grad_norm": 0.11370845884084702, | |
"learning_rate": 9.564735588832856e-05, | |
"loss": 0.6376, | |
"step": 295 | |
}, | |
{ | |
"epoch": 1.7011494252873565, | |
"grad_norm": 0.07851264625787735, | |
"learning_rate": 9.497824419635144e-05, | |
"loss": 0.6149, | |
"step": 296 | |
}, | |
{ | |
"epoch": 1.706896551724138, | |
"grad_norm": 0.0818655788898468, | |
"learning_rate": 9.430935782867212e-05, | |
"loss": 0.6048, | |
"step": 297 | |
}, | |
{ | |
"epoch": 1.7126436781609196, | |
"grad_norm": 0.07335007190704346, | |
"learning_rate": 9.364072679797073e-05, | |
"loss": 0.6292, | |
"step": 298 | |
}, | |
{ | |
"epoch": 1.7183908045977012, | |
"grad_norm": 0.07759315520524979, | |
"learning_rate": 9.297238110547074e-05, | |
"loss": 0.6464, | |
"step": 299 | |
}, | |
{ | |
"epoch": 1.7241379310344827, | |
"grad_norm": 0.0833640992641449, | |
"learning_rate": 9.230435073959232e-05, | |
"loss": 0.6467, | |
"step": 300 | |
}, | |
{ | |
"epoch": 1.7241379310344827, | |
"eval_loss": 0.6469475030899048, | |
"eval_runtime": 408.9385, | |
"eval_samples_per_second": 24.204, | |
"eval_steps_per_second": 0.379, | |
"step": 300 | |
}, | |
{ | |
"epoch": 1.7298850574712645, | |
"grad_norm": 0.08030898869037628, | |
"learning_rate": 9.163666567460733e-05, | |
"loss": 0.6268, | |
"step": 301 | |
}, | |
{ | |
"epoch": 1.735632183908046, | |
"grad_norm": 0.08017026633024216, | |
"learning_rate": 9.096935586929392e-05, | |
"loss": 0.6367, | |
"step": 302 | |
}, | |
{ | |
"epoch": 1.7413793103448276, | |
"grad_norm": 0.07945988327264786, | |
"learning_rate": 9.030245126559262e-05, | |
"loss": 0.6318, | |
"step": 303 | |
}, | |
{ | |
"epoch": 1.7471264367816093, | |
"grad_norm": 0.09426795691251755, | |
"learning_rate": 8.963598178726254e-05, | |
"loss": 0.6399, | |
"step": 304 | |
}, | |
{ | |
"epoch": 1.7528735632183907, | |
"grad_norm": 0.08182523399591446, | |
"learning_rate": 8.896997733853903e-05, | |
"loss": 0.6203, | |
"step": 305 | |
}, | |
{ | |
"epoch": 1.7586206896551724, | |
"grad_norm": 0.07778620719909668, | |
"learning_rate": 8.830446780279176e-05, | |
"loss": 0.6816, | |
"step": 306 | |
}, | |
{ | |
"epoch": 1.764367816091954, | |
"grad_norm": 0.11482707411050797, | |
"learning_rate": 8.763948304118368e-05, | |
"loss": 0.6442, | |
"step": 307 | |
}, | |
{ | |
"epoch": 1.7701149425287355, | |
"grad_norm": 0.07546856999397278, | |
"learning_rate": 8.697505289133145e-05, | |
"loss": 0.6445, | |
"step": 308 | |
}, | |
{ | |
"epoch": 1.7758620689655173, | |
"grad_norm": 0.11665278673171997, | |
"learning_rate": 8.631120716596641e-05, | |
"loss": 0.6374, | |
"step": 309 | |
}, | |
{ | |
"epoch": 1.7816091954022988, | |
"grad_norm": 0.1181105300784111, | |
"learning_rate": 8.564797565159714e-05, | |
"loss": 0.6146, | |
"step": 310 | |
}, | |
{ | |
"epoch": 1.7816091954022988, | |
"eval_loss": 0.6459708213806152, | |
"eval_runtime": 405.0602, | |
"eval_samples_per_second": 24.436, | |
"eval_steps_per_second": 0.383, | |
"step": 310 | |
}, | |
{ | |
"epoch": 1.7873563218390804, | |
"grad_norm": 0.07805997133255005, | |
"learning_rate": 8.498538810717267e-05, | |
"loss": 0.6679, | |
"step": 311 | |
}, | |
{ | |
"epoch": 1.793103448275862, | |
"grad_norm": 0.08421120047569275, | |
"learning_rate": 8.432347426274739e-05, | |
"loss": 0.642, | |
"step": 312 | |
}, | |
{ | |
"epoch": 1.7988505747126435, | |
"grad_norm": 0.10425391793251038, | |
"learning_rate": 8.366226381814697e-05, | |
"loss": 0.6354, | |
"step": 313 | |
}, | |
{ | |
"epoch": 1.8045977011494254, | |
"grad_norm": 0.08861584216356277, | |
"learning_rate": 8.300178644163594e-05, | |
"loss": 0.6397, | |
"step": 314 | |
}, | |
{ | |
"epoch": 1.8103448275862069, | |
"grad_norm": 0.08726219832897186, | |
"learning_rate": 8.234207176858614e-05, | |
"loss": 0.6474, | |
"step": 315 | |
}, | |
{ | |
"epoch": 1.8160919540229885, | |
"grad_norm": 0.12218604981899261, | |
"learning_rate": 8.16831494001474e-05, | |
"loss": 0.6459, | |
"step": 316 | |
}, | |
{ | |
"epoch": 1.8218390804597702, | |
"grad_norm": 0.08113615214824677, | |
"learning_rate": 8.102504890191892e-05, | |
"loss": 0.6114, | |
"step": 317 | |
}, | |
{ | |
"epoch": 1.8275862068965516, | |
"grad_norm": 0.08763635903596878, | |
"learning_rate": 8.036779980262311e-05, | |
"loss": 0.6602, | |
"step": 318 | |
}, | |
{ | |
"epoch": 1.8333333333333335, | |
"grad_norm": 0.1053246557712555, | |
"learning_rate": 7.971143159278026e-05, | |
"loss": 0.6182, | |
"step": 319 | |
}, | |
{ | |
"epoch": 1.839080459770115, | |
"grad_norm": 0.09522312134504318, | |
"learning_rate": 7.905597372338558e-05, | |
"loss": 0.6386, | |
"step": 320 | |
}, | |
{ | |
"epoch": 1.839080459770115, | |
"eval_loss": 0.6449984908103943, | |
"eval_runtime": 405.9165, | |
"eval_samples_per_second": 24.384, | |
"eval_steps_per_second": 0.382, | |
"step": 320 | |
}, | |
{ | |
"epoch": 1.8448275862068966, | |
"grad_norm": 0.09493348747491837, | |
"learning_rate": 7.840145560458756e-05, | |
"loss": 0.6522, | |
"step": 321 | |
}, | |
{ | |
"epoch": 1.8505747126436782, | |
"grad_norm": 0.10554379224777222, | |
"learning_rate": 7.774790660436858e-05, | |
"loss": 0.6401, | |
"step": 322 | |
}, | |
{ | |
"epoch": 1.8563218390804597, | |
"grad_norm": 0.09237196296453476, | |
"learning_rate": 7.709535604722684e-05, | |
"loss": 0.6315, | |
"step": 323 | |
}, | |
{ | |
"epoch": 1.8620689655172413, | |
"grad_norm": 0.07175464183092117, | |
"learning_rate": 7.644383321286094e-05, | |
"loss": 0.6559, | |
"step": 324 | |
}, | |
{ | |
"epoch": 1.867816091954023, | |
"grad_norm": 0.08578918129205704, | |
"learning_rate": 7.579336733485584e-05, | |
"loss": 0.6297, | |
"step": 325 | |
}, | |
{ | |
"epoch": 1.8735632183908046, | |
"grad_norm": 0.14390091598033905, | |
"learning_rate": 7.514398759937135e-05, | |
"loss": 0.6155, | |
"step": 326 | |
}, | |
{ | |
"epoch": 1.8793103448275863, | |
"grad_norm": 0.07774030417203903, | |
"learning_rate": 7.449572314383237e-05, | |
"loss": 0.6551, | |
"step": 327 | |
}, | |
{ | |
"epoch": 1.8850574712643677, | |
"grad_norm": 0.07927459478378296, | |
"learning_rate": 7.384860305562172e-05, | |
"loss": 0.6312, | |
"step": 328 | |
}, | |
{ | |
"epoch": 1.8908045977011494, | |
"grad_norm": 0.11287631094455719, | |
"learning_rate": 7.320265637077473e-05, | |
"loss": 0.66, | |
"step": 329 | |
}, | |
{ | |
"epoch": 1.896551724137931, | |
"grad_norm": 0.09955232590436935, | |
"learning_rate": 7.255791207267679e-05, | |
"loss": 0.6456, | |
"step": 330 | |
}, | |
{ | |
"epoch": 1.896551724137931, | |
"eval_loss": 0.6442980766296387, | |
"eval_runtime": 404.2901, | |
"eval_samples_per_second": 24.482, | |
"eval_steps_per_second": 0.383, | |
"step": 330 | |
}, | |
{ | |
"epoch": 1.9022988505747125, | |
"grad_norm": 0.07881880551576614, | |
"learning_rate": 7.191439909076243e-05, | |
"loss": 0.6398, | |
"step": 331 | |
}, | |
{ | |
"epoch": 1.9080459770114944, | |
"grad_norm": 0.15244217216968536, | |
"learning_rate": 7.127214629921765e-05, | |
"loss": 0.6614, | |
"step": 332 | |
}, | |
{ | |
"epoch": 1.9137931034482758, | |
"grad_norm": 0.07337264716625214, | |
"learning_rate": 7.0631182515684e-05, | |
"loss": 0.6294, | |
"step": 333 | |
}, | |
{ | |
"epoch": 1.9195402298850575, | |
"grad_norm": 0.07102935016155243, | |
"learning_rate": 6.999153649996595e-05, | |
"loss": 0.6237, | |
"step": 334 | |
}, | |
{ | |
"epoch": 1.9252873563218391, | |
"grad_norm": 0.09349462389945984, | |
"learning_rate": 6.935323695274002e-05, | |
"loss": 0.6051, | |
"step": 335 | |
}, | |
{ | |
"epoch": 1.9310344827586206, | |
"grad_norm": 0.0851803794503212, | |
"learning_rate": 6.871631251426728e-05, | |
"loss": 0.6548, | |
"step": 336 | |
}, | |
{ | |
"epoch": 1.9367816091954024, | |
"grad_norm": 0.08571562170982361, | |
"learning_rate": 6.808079176310827e-05, | |
"loss": 0.6136, | |
"step": 337 | |
}, | |
{ | |
"epoch": 1.9425287356321839, | |
"grad_norm": 0.0772768035531044, | |
"learning_rate": 6.744670321484043e-05, | |
"loss": 0.6668, | |
"step": 338 | |
}, | |
{ | |
"epoch": 1.9482758620689655, | |
"grad_norm": 0.08812547475099564, | |
"learning_rate": 6.681407532077895e-05, | |
"loss": 0.6427, | |
"step": 339 | |
}, | |
{ | |
"epoch": 1.9540229885057472, | |
"grad_norm": 0.09011583775281906, | |
"learning_rate": 6.618293646669986e-05, | |
"loss": 0.6402, | |
"step": 340 | |
}, | |
{ | |
"epoch": 1.9540229885057472, | |
"eval_loss": 0.6436823606491089, | |
"eval_runtime": 413.0204, | |
"eval_samples_per_second": 23.965, | |
"eval_steps_per_second": 0.375, | |
"step": 340 | |
}, | |
{ | |
"epoch": 1.9597701149425286, | |
"grad_norm": 0.08234158158302307, | |
"learning_rate": 6.555331497156672e-05, | |
"loss": 0.6362, | |
"step": 341 | |
}, | |
{ | |
"epoch": 1.9655172413793105, | |
"grad_norm": 0.0780014768242836, | |
"learning_rate": 6.492523908625959e-05, | |
"loss": 0.6454, | |
"step": 342 | |
}, | |
{ | |
"epoch": 1.971264367816092, | |
"grad_norm": 0.08458276093006134, | |
"learning_rate": 6.42987369923077e-05, | |
"loss": 0.6587, | |
"step": 343 | |
}, | |
{ | |
"epoch": 1.9770114942528736, | |
"grad_norm": 0.11979149281978607, | |
"learning_rate": 6.367383680062478e-05, | |
"loss": 0.6369, | |
"step": 344 | |
}, | |
{ | |
"epoch": 1.9827586206896552, | |
"grad_norm": 0.08782167732715607, | |
"learning_rate": 6.30505665502479e-05, | |
"loss": 0.6382, | |
"step": 345 | |
}, | |
{ | |
"epoch": 1.9885057471264367, | |
"grad_norm": 0.07542918622493744, | |
"learning_rate": 6.242895420707917e-05, | |
"loss": 0.6238, | |
"step": 346 | |
}, | |
{ | |
"epoch": 1.9942528735632183, | |
"grad_norm": 0.09390002489089966, | |
"learning_rate": 6.180902766263113e-05, | |
"loss": 0.632, | |
"step": 347 | |
}, | |
{ | |
"epoch": 2.0, | |
"grad_norm": 0.10154885053634644, | |
"learning_rate": 6.119081473277501e-05, | |
"loss": 0.6078, | |
"step": 348 | |
}, | |
{ | |
"epoch": 2.0057471264367814, | |
"grad_norm": 0.09035320580005646, | |
"learning_rate": 6.057434315649304e-05, | |
"loss": 0.6331, | |
"step": 349 | |
}, | |
{ | |
"epoch": 2.0114942528735633, | |
"grad_norm": 0.1151895746588707, | |
"learning_rate": 5.99596405946333e-05, | |
"loss": 0.6455, | |
"step": 350 | |
}, | |
{ | |
"epoch": 2.0114942528735633, | |
"eval_loss": 0.6433547139167786, | |
"eval_runtime": 409.0063, | |
"eval_samples_per_second": 24.2, | |
"eval_steps_per_second": 0.379, | |
"step": 350 | |
}, | |
{ | |
"epoch": 2.0172413793103448, | |
"grad_norm": 0.10666079819202423, | |
"learning_rate": 5.9346734628669065e-05, | |
"loss": 0.6473, | |
"step": 351 | |
}, | |
{ | |
"epoch": 2.0229885057471266, | |
"grad_norm": 0.09095422178506851, | |
"learning_rate": 5.873565275946088e-05, | |
"loss": 0.6335, | |
"step": 352 | |
}, | |
{ | |
"epoch": 2.028735632183908, | |
"grad_norm": 0.09256957471370697, | |
"learning_rate": 5.8126422406022885e-05, | |
"loss": 0.5969, | |
"step": 353 | |
}, | |
{ | |
"epoch": 2.0344827586206895, | |
"grad_norm": 0.1397576928138733, | |
"learning_rate": 5.7519070904292247e-05, | |
"loss": 0.5919, | |
"step": 354 | |
}, | |
{ | |
"epoch": 2.0402298850574714, | |
"grad_norm": 0.0867573469877243, | |
"learning_rate": 5.691362550590297e-05, | |
"loss": 0.5909, | |
"step": 355 | |
}, | |
{ | |
"epoch": 2.045977011494253, | |
"grad_norm": 0.07953327894210815, | |
"learning_rate": 5.631011337696271e-05, | |
"loss": 0.5959, | |
"step": 356 | |
}, | |
{ | |
"epoch": 2.0517241379310347, | |
"grad_norm": 0.09324570745229721, | |
"learning_rate": 5.570856159683418e-05, | |
"loss": 0.6216, | |
"step": 357 | |
}, | |
{ | |
"epoch": 2.057471264367816, | |
"grad_norm": 0.10510014742612839, | |
"learning_rate": 5.510899715691984e-05, | |
"loss": 0.6172, | |
"step": 358 | |
}, | |
{ | |
"epoch": 2.0632183908045976, | |
"grad_norm": 0.08669542521238327, | |
"learning_rate": 5.451144695945116e-05, | |
"loss": 0.5931, | |
"step": 359 | |
}, | |
{ | |
"epoch": 2.0689655172413794, | |
"grad_norm": 0.09054102748632431, | |
"learning_rate": 5.3915937816281095e-05, | |
"loss": 0.5888, | |
"step": 360 | |
}, | |
{ | |
"epoch": 2.0689655172413794, | |
"eval_loss": 0.643742024898529, | |
"eval_runtime": 404.2471, | |
"eval_samples_per_second": 24.485, | |
"eval_steps_per_second": 0.383, | |
"step": 360 | |
}, | |
{ | |
"epoch": 2.074712643678161, | |
"grad_norm": 0.11839323490858078, | |
"learning_rate": 5.3322496447681414e-05, | |
"loss": 0.6093, | |
"step": 361 | |
}, | |
{ | |
"epoch": 2.0804597701149423, | |
"grad_norm": 0.1050933375954628, | |
"learning_rate": 5.273114948114346e-05, | |
"loss": 0.6247, | |
"step": 362 | |
}, | |
{ | |
"epoch": 2.086206896551724, | |
"grad_norm": 0.09781333059072495, | |
"learning_rate": 5.214192345018374e-05, | |
"loss": 0.6274, | |
"step": 363 | |
}, | |
{ | |
"epoch": 2.0919540229885056, | |
"grad_norm": 0.09329628199338913, | |
"learning_rate": 5.1554844793153e-05, | |
"loss": 0.6243, | |
"step": 364 | |
}, | |
{ | |
"epoch": 2.0977011494252875, | |
"grad_norm": 0.08716364949941635, | |
"learning_rate": 5.096993985205023e-05, | |
"loss": 0.6149, | |
"step": 365 | |
}, | |
{ | |
"epoch": 2.103448275862069, | |
"grad_norm": 0.09969545155763626, | |
"learning_rate": 5.0387234871340486e-05, | |
"loss": 0.635, | |
"step": 366 | |
}, | |
{ | |
"epoch": 2.1091954022988504, | |
"grad_norm": 0.10841623693704605, | |
"learning_rate": 4.980675599677757e-05, | |
"loss": 0.6544, | |
"step": 367 | |
}, | |
{ | |
"epoch": 2.1149425287356323, | |
"grad_norm": 0.07902085781097412, | |
"learning_rate": 4.9228529274230695e-05, | |
"loss": 0.6144, | |
"step": 368 | |
}, | |
{ | |
"epoch": 2.1206896551724137, | |
"grad_norm": 0.11440268158912659, | |
"learning_rate": 4.865258064851579e-05, | |
"loss": 0.6217, | |
"step": 369 | |
}, | |
{ | |
"epoch": 2.1264367816091956, | |
"grad_norm": 0.09594007581472397, | |
"learning_rate": 4.807893596223152e-05, | |
"loss": 0.6267, | |
"step": 370 | |
}, | |
{ | |
"epoch": 2.1264367816091956, | |
"eval_loss": 0.6434890031814575, | |
"eval_runtime": 404.1508, | |
"eval_samples_per_second": 24.491, | |
"eval_steps_per_second": 0.384, | |
"step": 370 | |
}, | |
{ | |
"epoch": 2.132183908045977, | |
"grad_norm": 0.09025128185749054, | |
"learning_rate": 4.75076209545996e-05, | |
"loss": 0.6122, | |
"step": 371 | |
}, | |
{ | |
"epoch": 2.1379310344827585, | |
"grad_norm": 0.09677668660879135, | |
"learning_rate": 4.693866126030995e-05, | |
"loss": 0.6339, | |
"step": 372 | |
}, | |
{ | |
"epoch": 2.1436781609195403, | |
"grad_norm": 0.08178266882896423, | |
"learning_rate": 4.637208240837042e-05, | |
"loss": 0.6392, | |
"step": 373 | |
}, | |
{ | |
"epoch": 2.1494252873563218, | |
"grad_norm": 0.10616466403007507, | |
"learning_rate": 4.5807909820961494e-05, | |
"loss": 0.6207, | |
"step": 374 | |
}, | |
{ | |
"epoch": 2.1551724137931036, | |
"grad_norm": 0.08333076536655426, | |
"learning_rate": 4.5246168812295286e-05, | |
"loss": 0.6148, | |
"step": 375 | |
}, | |
{ | |
"epoch": 2.160919540229885, | |
"grad_norm": 0.1016552671790123, | |
"learning_rate": 4.468688458748006e-05, | |
"loss": 0.6306, | |
"step": 376 | |
}, | |
{ | |
"epoch": 2.1666666666666665, | |
"grad_norm": 0.08546506613492966, | |
"learning_rate": 4.413008224138897e-05, | |
"loss": 0.606, | |
"step": 377 | |
}, | |
{ | |
"epoch": 2.1724137931034484, | |
"grad_norm": 0.08369904011487961, | |
"learning_rate": 4.357578675753432e-05, | |
"loss": 0.6007, | |
"step": 378 | |
}, | |
{ | |
"epoch": 2.17816091954023, | |
"grad_norm": 0.08523935824632645, | |
"learning_rate": 4.302402300694636e-05, | |
"loss": 0.5884, | |
"step": 379 | |
}, | |
{ | |
"epoch": 2.1839080459770113, | |
"grad_norm": 0.0944519191980362, | |
"learning_rate": 4.247481574705744e-05, | |
"loss": 0.6292, | |
"step": 380 | |
}, | |
{ | |
"epoch": 2.1839080459770113, | |
"eval_loss": 0.6433520913124084, | |
"eval_runtime": 404.2218, | |
"eval_samples_per_second": 24.487, | |
"eval_steps_per_second": 0.383, | |
"step": 380 | |
}, | |
{ | |
"epoch": 2.189655172413793, | |
"grad_norm": 0.11311980336904526, | |
"learning_rate": 4.1928189620591116e-05, | |
"loss": 0.6103, | |
"step": 381 | |
}, | |
{ | |
"epoch": 2.1954022988505746, | |
"grad_norm": 0.08662451803684235, | |
"learning_rate": 4.138416915445655e-05, | |
"loss": 0.5852, | |
"step": 382 | |
}, | |
{ | |
"epoch": 2.2011494252873565, | |
"grad_norm": 0.09417479485273361, | |
"learning_rate": 4.084277875864776e-05, | |
"loss": 0.6467, | |
"step": 383 | |
}, | |
{ | |
"epoch": 2.206896551724138, | |
"grad_norm": 0.09818896651268005, | |
"learning_rate": 4.030404272514864e-05, | |
"loss": 0.6112, | |
"step": 384 | |
}, | |
{ | |
"epoch": 2.2126436781609193, | |
"grad_norm": 0.08806431293487549, | |
"learning_rate": 3.9767985226842696e-05, | |
"loss": 0.5822, | |
"step": 385 | |
}, | |
{ | |
"epoch": 2.218390804597701, | |
"grad_norm": 0.0837361216545105, | |
"learning_rate": 3.923463031642872e-05, | |
"loss": 0.6137, | |
"step": 386 | |
}, | |
{ | |
"epoch": 2.2241379310344827, | |
"grad_norm": 0.10712449252605438, | |
"learning_rate": 3.870400192534128e-05, | |
"loss": 0.602, | |
"step": 387 | |
}, | |
{ | |
"epoch": 2.2298850574712645, | |
"grad_norm": 0.11590448766946793, | |
"learning_rate": 3.81761238626771e-05, | |
"loss": 0.6215, | |
"step": 388 | |
}, | |
{ | |
"epoch": 2.235632183908046, | |
"grad_norm": 0.08264652639627457, | |
"learning_rate": 3.7651019814126654e-05, | |
"loss": 0.6002, | |
"step": 389 | |
}, | |
{ | |
"epoch": 2.2413793103448274, | |
"grad_norm": 0.08986306935548782, | |
"learning_rate": 3.7128713340911535e-05, | |
"loss": 0.6058, | |
"step": 390 | |
}, | |
{ | |
"epoch": 2.2413793103448274, | |
"eval_loss": 0.6431533098220825, | |
"eval_runtime": 419.2567, | |
"eval_samples_per_second": 23.608, | |
"eval_steps_per_second": 0.37, | |
"step": 390 | |
}, | |
{ | |
"epoch": 2.2471264367816093, | |
"grad_norm": 0.3949902057647705, | |
"learning_rate": 3.660922787872706e-05, | |
"loss": 0.643, | |
"step": 391 | |
}, | |
{ | |
"epoch": 2.2528735632183907, | |
"grad_norm": 0.09183293581008911, | |
"learning_rate": 3.609258673669097e-05, | |
"loss": 0.5931, | |
"step": 392 | |
}, | |
{ | |
"epoch": 2.2586206896551726, | |
"grad_norm": 0.0786626785993576, | |
"learning_rate": 3.557881309629729e-05, | |
"loss": 0.5795, | |
"step": 393 | |
}, | |
{ | |
"epoch": 2.264367816091954, | |
"grad_norm": 0.08318330347537994, | |
"learning_rate": 3.5067930010376484e-05, | |
"loss": 0.6173, | |
"step": 394 | |
}, | |
{ | |
"epoch": 2.2701149425287355, | |
"grad_norm": 0.09149078279733658, | |
"learning_rate": 3.455996040206076e-05, | |
"loss": 0.6238, | |
"step": 395 | |
}, | |
{ | |
"epoch": 2.2758620689655173, | |
"grad_norm": 0.09578599780797958, | |
"learning_rate": 3.4054927063755796e-05, | |
"loss": 0.6264, | |
"step": 396 | |
}, | |
{ | |
"epoch": 2.281609195402299, | |
"grad_norm": 0.08735264092683792, | |
"learning_rate": 3.355285265611784e-05, | |
"loss": 0.6269, | |
"step": 397 | |
}, | |
{ | |
"epoch": 2.2873563218390807, | |
"grad_norm": 0.0886816754937172, | |
"learning_rate": 3.305375970703711e-05, | |
"loss": 0.6043, | |
"step": 398 | |
}, | |
{ | |
"epoch": 2.293103448275862, | |
"grad_norm": 0.07559609413146973, | |
"learning_rate": 3.2557670610626925e-05, | |
"loss": 0.6416, | |
"step": 399 | |
}, | |
{ | |
"epoch": 2.2988505747126435, | |
"grad_norm": 0.11379113793373108, | |
"learning_rate": 3.206460762621873e-05, | |
"loss": 0.6221, | |
"step": 400 | |
}, | |
{ | |
"epoch": 2.2988505747126435, | |
"eval_loss": 0.6427375078201294, | |
"eval_runtime": 405.8229, | |
"eval_samples_per_second": 24.39, | |
"eval_steps_per_second": 0.382, | |
"step": 400 | |
}, | |
{ | |
"epoch": 2.3045977011494254, | |
"grad_norm": 0.08930199593305588, | |
"learning_rate": 3.157459287736362e-05, | |
"loss": 0.599, | |
"step": 401 | |
}, | |
{ | |
"epoch": 2.310344827586207, | |
"grad_norm": 0.11189960688352585, | |
"learning_rate": 3.108764835083938e-05, | |
"loss": 0.6243, | |
"step": 402 | |
}, | |
{ | |
"epoch": 2.3160919540229887, | |
"grad_norm": 0.0793476328253746, | |
"learning_rate": 3.0603795895664124e-05, | |
"loss": 0.615, | |
"step": 403 | |
}, | |
{ | |
"epoch": 2.32183908045977, | |
"grad_norm": 0.0860418751835823, | |
"learning_rate": 3.0123057222115836e-05, | |
"loss": 0.5968, | |
"step": 404 | |
}, | |
{ | |
"epoch": 2.3275862068965516, | |
"grad_norm": 0.08753317594528198, | |
"learning_rate": 2.964545390075841e-05, | |
"loss": 0.6192, | |
"step": 405 | |
}, | |
{ | |
"epoch": 2.3333333333333335, | |
"grad_norm": 0.09598301351070404, | |
"learning_rate": 2.9171007361473514e-05, | |
"loss": 0.6237, | |
"step": 406 | |
}, | |
{ | |
"epoch": 2.339080459770115, | |
"grad_norm": 0.10627751052379608, | |
"learning_rate": 2.8699738892499328e-05, | |
"loss": 0.6123, | |
"step": 407 | |
}, | |
{ | |
"epoch": 2.344827586206897, | |
"grad_norm": 0.08839675039052963, | |
"learning_rate": 2.8231669639475067e-05, | |
"loss": 0.6123, | |
"step": 408 | |
}, | |
{ | |
"epoch": 2.3505747126436782, | |
"grad_norm": 0.08533503860235214, | |
"learning_rate": 2.776682060449247e-05, | |
"loss": 0.6251, | |
"step": 409 | |
}, | |
{ | |
"epoch": 2.3563218390804597, | |
"grad_norm": 0.10517686605453491, | |
"learning_rate": 2.7305212645153212e-05, | |
"loss": 0.6254, | |
"step": 410 | |
}, | |
{ | |
"epoch": 2.3563218390804597, | |
"eval_loss": 0.6428195238113403, | |
"eval_runtime": 404.1758, | |
"eval_samples_per_second": 24.489, | |
"eval_steps_per_second": 0.383, | |
"step": 410 | |
}, | |
{ | |
"epoch": 2.3620689655172415, | |
"grad_norm": 0.10578128695487976, | |
"learning_rate": 2.6846866473633125e-05, | |
"loss": 0.6216, | |
"step": 411 | |
}, | |
{ | |
"epoch": 2.367816091954023, | |
"grad_norm": 0.10083532333374023, | |
"learning_rate": 2.6391802655752853e-05, | |
"loss": 0.6052, | |
"step": 412 | |
}, | |
{ | |
"epoch": 2.3735632183908044, | |
"grad_norm": 0.08413968980312347, | |
"learning_rate": 2.594004161005511e-05, | |
"loss": 0.6007, | |
"step": 413 | |
}, | |
{ | |
"epoch": 2.3793103448275863, | |
"grad_norm": 0.08840201050043106, | |
"learning_rate": 2.549160360688838e-05, | |
"loss": 0.5876, | |
"step": 414 | |
}, | |
{ | |
"epoch": 2.3850574712643677, | |
"grad_norm": 0.09680577367544174, | |
"learning_rate": 2.50465087674976e-05, | |
"loss": 0.6183, | |
"step": 415 | |
}, | |
{ | |
"epoch": 2.3908045977011496, | |
"grad_norm": 0.09196774661540985, | |
"learning_rate": 2.4604777063121033e-05, | |
"loss": 0.613, | |
"step": 416 | |
}, | |
{ | |
"epoch": 2.396551724137931, | |
"grad_norm": 0.0849708616733551, | |
"learning_rate": 2.4166428314094514e-05, | |
"loss": 0.6443, | |
"step": 417 | |
}, | |
{ | |
"epoch": 2.4022988505747125, | |
"grad_norm": 0.09316956251859665, | |
"learning_rate": 2.3731482188961818e-05, | |
"loss": 0.6062, | |
"step": 418 | |
}, | |
{ | |
"epoch": 2.4080459770114944, | |
"grad_norm": 0.08482903987169266, | |
"learning_rate": 2.32999582035923e-05, | |
"loss": 0.6099, | |
"step": 419 | |
}, | |
{ | |
"epoch": 2.413793103448276, | |
"grad_norm": 0.08352029323577881, | |
"learning_rate": 2.287187572030516e-05, | |
"loss": 0.6178, | |
"step": 420 | |
}, | |
{ | |
"epoch": 2.413793103448276, | |
"eval_loss": 0.6422638297080994, | |
"eval_runtime": 404.4609, | |
"eval_samples_per_second": 24.472, | |
"eval_steps_per_second": 0.383, | |
"step": 420 | |
}, | |
{ | |
"epoch": 2.4195402298850572, | |
"grad_norm": 0.09856913238763809, | |
"learning_rate": 2.244725394700079e-05, | |
"loss": 0.6166, | |
"step": 421 | |
}, | |
{ | |
"epoch": 2.425287356321839, | |
"grad_norm": 0.10127527266740799, | |
"learning_rate": 2.202611193629869e-05, | |
"loss": 0.6195, | |
"step": 422 | |
}, | |
{ | |
"epoch": 2.4310344827586206, | |
"grad_norm": 0.09415800124406815, | |
"learning_rate": 2.160846858468285e-05, | |
"loss": 0.6157, | |
"step": 423 | |
}, | |
{ | |
"epoch": 2.4367816091954024, | |
"grad_norm": 0.08563528954982758, | |
"learning_rate": 2.1194342631653607e-05, | |
"loss": 0.6212, | |
"step": 424 | |
}, | |
{ | |
"epoch": 2.442528735632184, | |
"grad_norm": 0.0861605629324913, | |
"learning_rate": 2.0783752658887066e-05, | |
"loss": 0.6095, | |
"step": 425 | |
}, | |
{ | |
"epoch": 2.4482758620689653, | |
"grad_norm": 0.1125798374414444, | |
"learning_rate": 2.0376717089401164e-05, | |
"loss": 0.606, | |
"step": 426 | |
}, | |
{ | |
"epoch": 2.454022988505747, | |
"grad_norm": 0.09633134305477142, | |
"learning_rate": 1.9973254186729086e-05, | |
"loss": 0.6109, | |
"step": 427 | |
}, | |
{ | |
"epoch": 2.4597701149425286, | |
"grad_norm": 0.08123010396957397, | |
"learning_rate": 1.9573382054099786e-05, | |
"loss": 0.5896, | |
"step": 428 | |
}, | |
{ | |
"epoch": 2.4655172413793105, | |
"grad_norm": 0.08620712906122208, | |
"learning_rate": 1.9177118633625814e-05, | |
"loss": 0.6022, | |
"step": 429 | |
}, | |
{ | |
"epoch": 2.471264367816092, | |
"grad_norm": 0.08710537105798721, | |
"learning_rate": 1.8784481705498015e-05, | |
"loss": 0.6161, | |
"step": 430 | |
}, | |
{ | |
"epoch": 2.471264367816092, | |
"eval_loss": 0.642048180103302, | |
"eval_runtime": 405.7821, | |
"eval_samples_per_second": 24.392, | |
"eval_steps_per_second": 0.382, | |
"step": 430 | |
}, | |
{ | |
"epoch": 2.4770114942528734, | |
"grad_norm": 0.08711250126361847, | |
"learning_rate": 1.8395488887188005e-05, | |
"loss": 0.581, | |
"step": 431 | |
}, | |
{ | |
"epoch": 2.4827586206896552, | |
"grad_norm": 0.08405685424804688, | |
"learning_rate": 1.8010157632657543e-05, | |
"loss": 0.6149, | |
"step": 432 | |
}, | |
{ | |
"epoch": 2.4885057471264367, | |
"grad_norm": 0.08080325275659561, | |
"learning_rate": 1.762850523157532e-05, | |
"loss": 0.6264, | |
"step": 433 | |
}, | |
{ | |
"epoch": 2.4942528735632186, | |
"grad_norm": 0.09836191684007645, | |
"learning_rate": 1.7250548808541322e-05, | |
"loss": 0.6055, | |
"step": 434 | |
}, | |
{ | |
"epoch": 2.5, | |
"grad_norm": 0.10626177489757538, | |
"learning_rate": 1.687630532231833e-05, | |
"loss": 0.5907, | |
"step": 435 | |
}, | |
{ | |
"epoch": 2.5057471264367814, | |
"grad_norm": 0.08308445662260056, | |
"learning_rate": 1.6505791565071138e-05, | |
"loss": 0.6189, | |
"step": 436 | |
}, | |
{ | |
"epoch": 2.5114942528735633, | |
"grad_norm": 0.10249936580657959, | |
"learning_rate": 1.613902416161288e-05, | |
"loss": 0.6084, | |
"step": 437 | |
}, | |
{ | |
"epoch": 2.5172413793103448, | |
"grad_norm": 0.08516431599855423, | |
"learning_rate": 1.5776019568659338e-05, | |
"loss": 0.624, | |
"step": 438 | |
}, | |
{ | |
"epoch": 2.5229885057471266, | |
"grad_norm": 0.08852159231901169, | |
"learning_rate": 1.5416794074090258e-05, | |
"loss": 0.6374, | |
"step": 439 | |
}, | |
{ | |
"epoch": 2.528735632183908, | |
"grad_norm": 0.09616044908761978, | |
"learning_rate": 1.5061363796218785e-05, | |
"loss": 0.634, | |
"step": 440 | |
}, | |
{ | |
"epoch": 2.528735632183908, | |
"eval_loss": 0.6419377326965332, | |
"eval_runtime": 416.5131, | |
"eval_samples_per_second": 23.764, | |
"eval_steps_per_second": 0.372, | |
"step": 440 | |
}, | |
{ | |
"epoch": 2.5344827586206895, | |
"grad_norm": 0.1012992411851883, | |
"learning_rate": 1.4709744683068039e-05, | |
"loss": 0.6443, | |
"step": 441 | |
}, | |
{ | |
"epoch": 2.5402298850574714, | |
"grad_norm": 0.102021224796772, | |
"learning_rate": 1.4361952511655618e-05, | |
"loss": 0.6111, | |
"step": 442 | |
}, | |
{ | |
"epoch": 2.545977011494253, | |
"grad_norm": 0.08464264124631882, | |
"learning_rate": 1.4018002887285687e-05, | |
"loss": 0.6007, | |
"step": 443 | |
}, | |
{ | |
"epoch": 2.5517241379310347, | |
"grad_norm": 0.0829034224152565, | |
"learning_rate": 1.3677911242848806e-05, | |
"loss": 0.6083, | |
"step": 444 | |
}, | |
{ | |
"epoch": 2.557471264367816, | |
"grad_norm": 0.08752921968698502, | |
"learning_rate": 1.334169283812936e-05, | |
"loss": 0.6227, | |
"step": 445 | |
}, | |
{ | |
"epoch": 2.5632183908045976, | |
"grad_norm": 0.080236054956913, | |
"learning_rate": 1.300936275912098e-05, | |
"loss": 0.6212, | |
"step": 446 | |
}, | |
{ | |
"epoch": 2.5689655172413794, | |
"grad_norm": 0.08524277061223984, | |
"learning_rate": 1.2680935917349523e-05, | |
"loss": 0.5915, | |
"step": 447 | |
}, | |
{ | |
"epoch": 2.574712643678161, | |
"grad_norm": 0.09109287708997726, | |
"learning_rate": 1.2356427049204122e-05, | |
"loss": 0.5972, | |
"step": 448 | |
}, | |
{ | |
"epoch": 2.5804597701149428, | |
"grad_norm": 0.11969230324029922, | |
"learning_rate": 1.2035850715275865e-05, | |
"loss": 0.6358, | |
"step": 449 | |
}, | |
{ | |
"epoch": 2.586206896551724, | |
"grad_norm": 0.08512509614229202, | |
"learning_rate": 1.1719221299704497e-05, | |
"loss": 0.6241, | |
"step": 450 | |
}, | |
{ | |
"epoch": 2.586206896551724, | |
"eval_loss": 0.641758382320404, | |
"eval_runtime": 404.7765, | |
"eval_samples_per_second": 24.453, | |
"eval_steps_per_second": 0.383, | |
"step": 450 | |
}, | |
{ | |
"epoch": 2.5919540229885056, | |
"grad_norm": 0.08563876152038574, | |
"learning_rate": 1.1406553009533027e-05, | |
"loss": 0.6027, | |
"step": 451 | |
}, | |
{ | |
"epoch": 2.5977011494252875, | |
"grad_norm": 0.07882750034332275, | |
"learning_rate": 1.1097859874070294e-05, | |
"loss": 0.6226, | |
"step": 452 | |
}, | |
{ | |
"epoch": 2.603448275862069, | |
"grad_norm": 0.08562333881855011, | |
"learning_rate": 1.0793155744261351e-05, | |
"loss": 0.6145, | |
"step": 453 | |
}, | |
{ | |
"epoch": 2.609195402298851, | |
"grad_norm": 0.08439898490905762, | |
"learning_rate": 1.0492454292066178e-05, | |
"loss": 0.6131, | |
"step": 454 | |
}, | |
{ | |
"epoch": 2.6149425287356323, | |
"grad_norm": 0.09046713262796402, | |
"learning_rate": 1.019576900984599e-05, | |
"loss": 0.6312, | |
"step": 455 | |
}, | |
{ | |
"epoch": 2.6206896551724137, | |
"grad_norm": 0.1001957505941391, | |
"learning_rate": 9.903113209758096e-06, | |
"loss": 0.6167, | |
"step": 456 | |
}, | |
{ | |
"epoch": 2.626436781609195, | |
"grad_norm": 0.08048044890165329, | |
"learning_rate": 9.614500023158336e-06, | |
"loss": 0.5969, | |
"step": 457 | |
}, | |
{ | |
"epoch": 2.632183908045977, | |
"grad_norm": 0.07949711382389069, | |
"learning_rate": 9.32994240001206e-06, | |
"loss": 0.6324, | |
"step": 458 | |
}, | |
{ | |
"epoch": 2.637931034482759, | |
"grad_norm": 0.0978640615940094, | |
"learning_rate": 9.049453108312966e-06, | |
"loss": 0.5779, | |
"step": 459 | |
}, | |
{ | |
"epoch": 2.6436781609195403, | |
"grad_norm": 0.08483273535966873, | |
"learning_rate": 8.773044733510338e-06, | |
"loss": 0.6084, | |
"step": 460 | |
}, | |
{ | |
"epoch": 2.6436781609195403, | |
"eval_loss": 0.6415662169456482, | |
"eval_runtime": 404.188, | |
"eval_samples_per_second": 24.489, | |
"eval_steps_per_second": 0.383, | |
"step": 460 | |
}, | |
{ | |
"epoch": 2.6494252873563218, | |
"grad_norm": 0.08597224205732346, | |
"learning_rate": 8.50072967794413e-06, | |
"loss": 0.5962, | |
"step": 461 | |
}, | |
{ | |
"epoch": 2.655172413793103, | |
"grad_norm": 0.08336161822080612, | |
"learning_rate": 8.232520160288704e-06, | |
"loss": 0.6276, | |
"step": 462 | |
}, | |
{ | |
"epoch": 2.660919540229885, | |
"grad_norm": 0.08224053680896759, | |
"learning_rate": 7.96842821500442e-06, | |
"loss": 0.6047, | |
"step": 463 | |
}, | |
{ | |
"epoch": 2.6666666666666665, | |
"grad_norm": 0.08457629382610321, | |
"learning_rate": 7.708465691797717e-06, | |
"loss": 0.6006, | |
"step": 464 | |
}, | |
{ | |
"epoch": 2.6724137931034484, | |
"grad_norm": 0.09363652020692825, | |
"learning_rate": 7.452644255089425e-06, | |
"loss": 0.6261, | |
"step": 465 | |
}, | |
{ | |
"epoch": 2.67816091954023, | |
"grad_norm": 0.08728937804698944, | |
"learning_rate": 7.20097538349136e-06, | |
"loss": 0.6146, | |
"step": 466 | |
}, | |
{ | |
"epoch": 2.6839080459770113, | |
"grad_norm": 0.08341008424758911, | |
"learning_rate": 6.953470369291348e-06, | |
"loss": 0.6237, | |
"step": 467 | |
}, | |
{ | |
"epoch": 2.689655172413793, | |
"grad_norm": 0.08936601877212524, | |
"learning_rate": 6.710140317946423e-06, | |
"loss": 0.643, | |
"step": 468 | |
}, | |
{ | |
"epoch": 2.6954022988505746, | |
"grad_norm": 0.09783781319856644, | |
"learning_rate": 6.470996147584685e-06, | |
"loss": 0.5764, | |
"step": 469 | |
}, | |
{ | |
"epoch": 2.7011494252873565, | |
"grad_norm": 0.08959370106458664, | |
"learning_rate": 6.236048588515242e-06, | |
"loss": 0.6264, | |
"step": 470 | |
}, | |
{ | |
"epoch": 2.7011494252873565, | |
"eval_loss": 0.6414589881896973, | |
"eval_runtime": 405.1776, | |
"eval_samples_per_second": 24.429, | |
"eval_steps_per_second": 0.383, | |
"step": 470 | |
}, | |
{ | |
"epoch": 2.706896551724138, | |
"grad_norm": 0.08131396770477295, | |
"learning_rate": 6.0053081827469045e-06, | |
"loss": 0.6455, | |
"step": 471 | |
}, | |
{ | |
"epoch": 2.7126436781609193, | |
"grad_norm": 0.08353292942047119, | |
"learning_rate": 5.778785283515053e-06, | |
"loss": 0.6254, | |
"step": 472 | |
}, | |
{ | |
"epoch": 2.718390804597701, | |
"grad_norm": 0.0802810862660408, | |
"learning_rate": 5.556490054817132e-06, | |
"loss": 0.6284, | |
"step": 473 | |
}, | |
{ | |
"epoch": 2.7241379310344827, | |
"grad_norm": 0.08118069916963577, | |
"learning_rate": 5.338432470956589e-06, | |
"loss": 0.6092, | |
"step": 474 | |
}, | |
{ | |
"epoch": 2.7298850574712645, | |
"grad_norm": 0.08621113002300262, | |
"learning_rate": 5.1246223160953845e-06, | |
"loss": 0.6489, | |
"step": 475 | |
}, | |
{ | |
"epoch": 2.735632183908046, | |
"grad_norm": 0.08560863137245178, | |
"learning_rate": 4.91506918381488e-06, | |
"loss": 0.6154, | |
"step": 476 | |
}, | |
{ | |
"epoch": 2.7413793103448274, | |
"grad_norm": 0.081720270216465, | |
"learning_rate": 4.7097824766854756e-06, | |
"loss": 0.6232, | |
"step": 477 | |
}, | |
{ | |
"epoch": 2.7471264367816093, | |
"grad_norm": 0.08384092152118683, | |
"learning_rate": 4.508771405844636e-06, | |
"loss": 0.6209, | |
"step": 478 | |
}, | |
{ | |
"epoch": 2.7528735632183907, | |
"grad_norm": 0.08142372965812683, | |
"learning_rate": 4.312044990583675e-06, | |
"loss": 0.6298, | |
"step": 479 | |
}, | |
{ | |
"epoch": 2.7586206896551726, | |
"grad_norm": 0.0810447633266449, | |
"learning_rate": 4.119612057942978e-06, | |
"loss": 0.608, | |
"step": 480 | |
}, | |
{ | |
"epoch": 2.7586206896551726, | |
"eval_loss": 0.6413341164588928, | |
"eval_runtime": 410.6577, | |
"eval_samples_per_second": 24.103, | |
"eval_steps_per_second": 0.377, | |
"step": 480 | |
}, | |
{ | |
"epoch": 2.764367816091954, | |
"grad_norm": 0.08321461826562881, | |
"learning_rate": 3.931481242315993e-06, | |
"loss": 0.6426, | |
"step": 481 | |
}, | |
{ | |
"epoch": 2.7701149425287355, | |
"grad_norm": 0.0784662514925003, | |
"learning_rate": 3.747660985061785e-06, | |
"loss": 0.6126, | |
"step": 482 | |
}, | |
{ | |
"epoch": 2.7758620689655173, | |
"grad_norm": 0.09238499402999878, | |
"learning_rate": 3.568159534126314e-06, | |
"loss": 0.5786, | |
"step": 483 | |
}, | |
{ | |
"epoch": 2.781609195402299, | |
"grad_norm": 0.08142554014921188, | |
"learning_rate": 3.3929849436722728e-06, | |
"loss": 0.6341, | |
"step": 484 | |
}, | |
{ | |
"epoch": 2.7873563218390807, | |
"grad_norm": 0.08540128916501999, | |
"learning_rate": 3.2221450737178083e-06, | |
"loss": 0.6062, | |
"step": 485 | |
}, | |
{ | |
"epoch": 2.793103448275862, | |
"grad_norm": 0.08547057211399078, | |
"learning_rate": 3.0556475897837166e-06, | |
"loss": 0.5974, | |
"step": 486 | |
}, | |
{ | |
"epoch": 2.7988505747126435, | |
"grad_norm": 0.1007808968424797, | |
"learning_rate": 2.8934999625496282e-06, | |
"loss": 0.6157, | |
"step": 487 | |
}, | |
{ | |
"epoch": 2.8045977011494254, | |
"grad_norm": 0.08533742278814316, | |
"learning_rate": 2.735709467518699e-06, | |
"loss": 0.625, | |
"step": 488 | |
}, | |
{ | |
"epoch": 2.810344827586207, | |
"grad_norm": 0.08325877785682678, | |
"learning_rate": 2.5822831846912033e-06, | |
"loss": 0.5991, | |
"step": 489 | |
}, | |
{ | |
"epoch": 2.8160919540229887, | |
"grad_norm": 0.08522289991378784, | |
"learning_rate": 2.4332279982468453e-06, | |
"loss": 0.6039, | |
"step": 490 | |
}, | |
{ | |
"epoch": 2.8160919540229887, | |
"eval_loss": 0.6412601470947266, | |
"eval_runtime": 405.8893, | |
"eval_samples_per_second": 24.386, | |
"eval_steps_per_second": 0.382, | |
"step": 490 | |
}, | |
{ | |
"epoch": 2.82183908045977, | |
"grad_norm": 0.08191868662834167, | |
"learning_rate": 2.2885505962359054e-06, | |
"loss": 0.5907, | |
"step": 491 | |
}, | |
{ | |
"epoch": 2.8275862068965516, | |
"grad_norm": 0.08263259381055832, | |
"learning_rate": 2.1482574702790803e-06, | |
"loss": 0.615, | |
"step": 492 | |
}, | |
{ | |
"epoch": 2.8333333333333335, | |
"grad_norm": 0.08231104165315628, | |
"learning_rate": 2.0123549152762823e-06, | |
"loss": 0.6334, | |
"step": 493 | |
}, | |
{ | |
"epoch": 2.839080459770115, | |
"grad_norm": 0.08760181069374084, | |
"learning_rate": 1.8808490291241432e-06, | |
"loss": 0.6186, | |
"step": 494 | |
}, | |
{ | |
"epoch": 2.844827586206897, | |
"grad_norm": 0.07865423709154129, | |
"learning_rate": 1.7537457124423895e-06, | |
"loss": 0.6324, | |
"step": 495 | |
}, | |
{ | |
"epoch": 2.8505747126436782, | |
"grad_norm": 0.08259916305541992, | |
"learning_rate": 1.631050668309131e-06, | |
"loss": 0.6406, | |
"step": 496 | |
}, | |
{ | |
"epoch": 2.8563218390804597, | |
"grad_norm": 0.08283340930938721, | |
"learning_rate": 1.5127694020049432e-06, | |
"loss": 0.6253, | |
"step": 497 | |
}, | |
{ | |
"epoch": 2.862068965517241, | |
"grad_norm": 0.0877593606710434, | |
"learning_rate": 1.3989072207658328e-06, | |
"loss": 0.6158, | |
"step": 498 | |
}, | |
{ | |
"epoch": 2.867816091954023, | |
"grad_norm": 0.08183769136667252, | |
"learning_rate": 1.2894692335451375e-06, | |
"loss": 0.6091, | |
"step": 499 | |
}, | |
{ | |
"epoch": 2.873563218390805, | |
"grad_norm": 0.08991672843694687, | |
"learning_rate": 1.1844603507842668e-06, | |
"loss": 0.6445, | |
"step": 500 | |
}, | |
{ | |
"epoch": 2.873563218390805, | |
"eval_loss": 0.641264796257019, | |
"eval_runtime": 405.0206, | |
"eval_samples_per_second": 24.438, | |
"eval_steps_per_second": 0.383, | |
"step": 500 | |
} | |
], | |
"logging_steps": 1.0, | |
"max_steps": 522, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 3, | |
"save_steps": 50, | |
"stateful_callbacks": { | |
"TrainerControl": { | |
"args": { | |
"should_epoch_stop": false, | |
"should_evaluate": false, | |
"should_log": false, | |
"should_save": true, | |
"should_training_stop": false | |
}, | |
"attributes": {} | |
} | |
}, | |
"total_flos": 4.880737746399789e+19, | |
"train_batch_size": 2, | |
"trial_name": null, | |
"trial_params": null | |
} | |