{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 20.0,
  "eval_steps": 1000,
  "global_step": 4320,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.046296296296296294,
      "grad_norm": 9.519277572631836,
      "learning_rate": 1.8254252421884168e-05,
      "loss": 2.9957,
      "step": 10
    },
    {
      "epoch": 0.09259259259259259,
      "grad_norm": 8.448235511779785,
      "learning_rate": 2.3749329949293174e-05,
      "loss": 0.7782,
      "step": 20
    },
    {
      "epoch": 0.1388888888888889,
      "grad_norm": 5.370626449584961,
      "learning_rate": 2.6963744241382978e-05,
      "loss": 0.6191,
      "step": 30
    },
    {
      "epoch": 0.18518518518518517,
      "grad_norm": 5.509120464324951,
      "learning_rate": 2.924440747670218e-05,
      "loss": 0.6412,
      "step": 40
    },
    {
      "epoch": 0.23148148148148148,
      "grad_norm": 4.508421897888184,
      "learning_rate": 3e-05,
      "loss": 0.528,
      "step": 50
    },
    {
      "epoch": 0.2777777777777778,
      "grad_norm": 3.5512583255767822,
      "learning_rate": 3e-05,
      "loss": 0.4154,
      "step": 60
    },
    {
      "epoch": 0.32407407407407407,
      "grad_norm": 4.571855545043945,
      "learning_rate": 3e-05,
      "loss": 0.4413,
      "step": 70
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 3.775933265686035,
      "learning_rate": 3e-05,
      "loss": 0.5107,
      "step": 80
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 6.0832414627075195,
      "learning_rate": 3e-05,
      "loss": 0.4848,
      "step": 90
    },
    {
      "epoch": 0.46296296296296297,
      "grad_norm": 5.910586357116699,
      "learning_rate": 3e-05,
      "loss": 0.5073,
      "step": 100
    },
    {
      "epoch": 0.5092592592592593,
      "grad_norm": 3.3788774013519287,
      "learning_rate": 3e-05,
      "loss": 0.383,
      "step": 110
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 2.238117218017578,
      "learning_rate": 3e-05,
      "loss": 0.4012,
      "step": 120
    },
    {
      "epoch": 0.6018518518518519,
      "grad_norm": 3.789665937423706,
      "learning_rate": 3e-05,
      "loss": 0.3533,
      "step": 130
    },
    {
      "epoch": 0.6481481481481481,
      "grad_norm": 3.239994764328003,
      "learning_rate": 3e-05,
      "loss": 0.4858,
      "step": 140
    },
    {
      "epoch": 0.6944444444444444,
      "grad_norm": 3.5634827613830566,
      "learning_rate": 3e-05,
      "loss": 0.3845,
      "step": 150
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 4.085621356964111,
      "learning_rate": 3e-05,
      "loss": 0.3996,
      "step": 160
    },
    {
      "epoch": 0.7870370370370371,
      "grad_norm": 2.4866838455200195,
      "learning_rate": 3e-05,
      "loss": 0.4485,
      "step": 170
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 2.403611421585083,
      "learning_rate": 3e-05,
      "loss": 0.3869,
      "step": 180
    },
    {
      "epoch": 0.8796296296296297,
      "grad_norm": 24.364118576049805,
      "learning_rate": 3e-05,
      "loss": 0.4605,
      "step": 190
    },
    {
      "epoch": 0.9259259259259259,
      "grad_norm": 3.141599655151367,
      "learning_rate": 3e-05,
      "loss": 0.4298,
      "step": 200
    },
    {
      "epoch": 0.9722222222222222,
      "grad_norm": 2.0329315662384033,
      "learning_rate": 3e-05,
      "loss": 0.4584,
      "step": 210
    },
    {
      "epoch": 1.0185185185185186,
      "grad_norm": 2.6605312824249268,
      "learning_rate": 3e-05,
      "loss": 0.3293,
      "step": 220
    },
    {
      "epoch": 1.0648148148148149,
      "grad_norm": 2.2286181449890137,
      "learning_rate": 3e-05,
      "loss": 0.3676,
      "step": 230
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 1.9300888776779175,
      "learning_rate": 3e-05,
      "loss": 0.3423,
      "step": 240
    },
    {
      "epoch": 1.1574074074074074,
      "grad_norm": 2.018017053604126,
      "learning_rate": 3e-05,
      "loss": 0.3378,
      "step": 250
    },
    {
      "epoch": 1.2037037037037037,
      "grad_norm": 2.760201930999756,
      "learning_rate": 3e-05,
      "loss": 0.334,
      "step": 260
    },
    {
      "epoch": 1.25,
      "grad_norm": 3.0350053310394287,
      "learning_rate": 3e-05,
      "loss": 0.3699,
      "step": 270
    },
    {
      "epoch": 1.2962962962962963,
      "grad_norm": 2.3771510124206543,
      "learning_rate": 3e-05,
      "loss": 0.3583,
      "step": 280
    },
    {
      "epoch": 1.3425925925925926,
      "grad_norm": 3.1241819858551025,
      "learning_rate": 3e-05,
      "loss": 0.3588,
      "step": 290
    },
    {
      "epoch": 1.3888888888888888,
      "grad_norm": 1.825257658958435,
      "learning_rate": 3e-05,
      "loss": 0.3715,
      "step": 300
    },
    {
      "epoch": 1.4351851851851851,
      "grad_norm": 2.6988837718963623,
      "learning_rate": 3e-05,
      "loss": 0.3707,
      "step": 310
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 4.2009406089782715,
      "learning_rate": 3e-05,
      "loss": 0.3721,
      "step": 320
    },
    {
      "epoch": 1.5277777777777777,
      "grad_norm": 3.3356688022613525,
      "learning_rate": 3e-05,
      "loss": 0.3076,
      "step": 330
    },
    {
      "epoch": 1.574074074074074,
      "grad_norm": 2.5690464973449707,
      "learning_rate": 3e-05,
      "loss": 0.3025,
      "step": 340
    },
    {
      "epoch": 1.6203703703703702,
      "grad_norm": 1.67151939868927,
      "learning_rate": 3e-05,
      "loss": 0.3098,
      "step": 350
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 1.9243090152740479,
      "learning_rate": 3e-05,
      "loss": 0.3651,
      "step": 360
    },
    {
      "epoch": 1.7129629629629628,
      "grad_norm": 3.038501739501953,
      "learning_rate": 3e-05,
      "loss": 0.3634,
      "step": 370
    },
    {
      "epoch": 1.7592592592592593,
      "grad_norm": 2.4423258304595947,
      "learning_rate": 3e-05,
      "loss": 0.3351,
      "step": 380
    },
    {
      "epoch": 1.8055555555555556,
      "grad_norm": 3.9061005115509033,
      "learning_rate": 3e-05,
      "loss": 0.356,
      "step": 390
    },
    {
      "epoch": 1.8518518518518519,
      "grad_norm": 2.0409352779388428,
      "learning_rate": 3e-05,
      "loss": 0.3341,
      "step": 400
    },
    {
      "epoch": 1.8981481481481481,
      "grad_norm": 1.5015701055526733,
      "learning_rate": 3e-05,
      "loss": 0.3501,
      "step": 410
    },
    {
      "epoch": 1.9444444444444444,
      "grad_norm": 1.3792345523834229,
      "learning_rate": 3e-05,
      "loss": 0.3143,
      "step": 420
    },
    {
      "epoch": 1.9907407407407407,
      "grad_norm": 1.581567406654358,
      "learning_rate": 3e-05,
      "loss": 0.3473,
      "step": 430
    },
    {
      "epoch": 2.037037037037037,
      "grad_norm": 1.8894872665405273,
      "learning_rate": 3e-05,
      "loss": 0.2751,
      "step": 440
    },
    {
      "epoch": 2.0833333333333335,
      "grad_norm": 2.4780213832855225,
      "learning_rate": 3e-05,
      "loss": 0.2993,
      "step": 450
    },
    {
      "epoch": 2.1296296296296298,
      "grad_norm": 3.4187729358673096,
      "learning_rate": 3e-05,
      "loss": 0.3286,
      "step": 460
    },
    {
      "epoch": 2.175925925925926,
      "grad_norm": 1.52755606174469,
      "learning_rate": 3e-05,
      "loss": 0.2814,
      "step": 470
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 1.6060500144958496,
      "learning_rate": 3e-05,
      "loss": 0.2999,
      "step": 480
    },
    {
      "epoch": 2.2685185185185186,
      "grad_norm": 2.4506280422210693,
      "learning_rate": 3e-05,
      "loss": 0.2929,
      "step": 490
    },
    {
      "epoch": 2.314814814814815,
      "grad_norm": 1.7601087093353271,
      "learning_rate": 3e-05,
      "loss": 0.351,
      "step": 500
    },
    {
      "epoch": 2.361111111111111,
      "grad_norm": 1.7349791526794434,
      "learning_rate": 3e-05,
      "loss": 0.2827,
      "step": 510
    },
    {
      "epoch": 2.4074074074074074,
      "grad_norm": 1.8765602111816406,
      "learning_rate": 3e-05,
      "loss": 0.3244,
      "step": 520
    },
    {
      "epoch": 2.4537037037037037,
      "grad_norm": 2.0878164768218994,
      "learning_rate": 3e-05,
      "loss": 0.323,
      "step": 530
    },
    {
      "epoch": 2.5,
      "grad_norm": 2.7491815090179443,
      "learning_rate": 3e-05,
      "loss": 0.2458,
      "step": 540
    },
    {
      "epoch": 2.5462962962962963,
      "grad_norm": 1.950523018836975,
      "learning_rate": 3e-05,
      "loss": 0.2599,
      "step": 550
    },
    {
      "epoch": 2.5925925925925926,
      "grad_norm": 2.113060712814331,
      "learning_rate": 3e-05,
      "loss": 0.309,
      "step": 560
    },
    {
      "epoch": 2.638888888888889,
      "grad_norm": 2.221663475036621,
      "learning_rate": 3e-05,
      "loss": 0.3166,
      "step": 570
    },
    {
      "epoch": 2.685185185185185,
      "grad_norm": 2.251364231109619,
      "learning_rate": 3e-05,
      "loss": 0.2997,
      "step": 580
    },
    {
      "epoch": 2.7314814814814814,
      "grad_norm": 3.0125315189361572,
      "learning_rate": 3e-05,
      "loss": 0.2885,
      "step": 590
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 1.529222846031189,
      "learning_rate": 3e-05,
      "loss": 0.2885,
      "step": 600
    },
    {
      "epoch": 2.824074074074074,
      "grad_norm": 1.5727730989456177,
      "learning_rate": 3e-05,
      "loss": 0.3328,
      "step": 610
    },
    {
      "epoch": 2.8703703703703702,
      "grad_norm": 1.7602397203445435,
      "learning_rate": 3e-05,
      "loss": 0.3089,
      "step": 620
    },
    {
      "epoch": 2.9166666666666665,
      "grad_norm": 1.2951428890228271,
      "learning_rate": 3e-05,
      "loss": 0.2912,
      "step": 630
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 2.3238091468811035,
      "learning_rate": 3e-05,
      "loss": 0.3007,
      "step": 640
    },
    {
      "epoch": 3.009259259259259,
      "grad_norm": 1.14413321018219,
      "learning_rate": 3e-05,
      "loss": 0.2939,
      "step": 650
    },
    {
      "epoch": 3.0555555555555554,
      "grad_norm": 1.6297560930252075,
      "learning_rate": 3e-05,
      "loss": 0.2275,
      "step": 660
    },
    {
      "epoch": 3.1018518518518516,
      "grad_norm": 1.8607372045516968,
      "learning_rate": 3e-05,
      "loss": 0.2504,
      "step": 670
    },
    {
      "epoch": 3.148148148148148,
      "grad_norm": 1.9002240896224976,
      "learning_rate": 3e-05,
      "loss": 0.2693,
      "step": 680
    },
    {
      "epoch": 3.1944444444444446,
      "grad_norm": 2.417104959487915,
      "learning_rate": 3e-05,
      "loss": 0.2609,
      "step": 690
    },
    {
      "epoch": 3.240740740740741,
      "grad_norm": 1.9233345985412598,
      "learning_rate": 3e-05,
      "loss": 0.2669,
      "step": 700
    },
    {
      "epoch": 3.287037037037037,
      "grad_norm": 1.330132246017456,
      "learning_rate": 3e-05,
      "loss": 0.2776,
      "step": 710
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 2.0991740226745605,
      "learning_rate": 3e-05,
      "loss": 0.2365,
      "step": 720
    },
    {
      "epoch": 3.3796296296296298,
      "grad_norm": 1.7612348794937134,
      "learning_rate": 3e-05,
      "loss": 0.229,
      "step": 730
    },
    {
      "epoch": 3.425925925925926,
      "grad_norm": 2.1555356979370117,
      "learning_rate": 3e-05,
      "loss": 0.3083,
      "step": 740
    },
    {
      "epoch": 3.4722222222222223,
      "grad_norm": 1.8027970790863037,
      "learning_rate": 3e-05,
      "loss": 0.2834,
      "step": 750
    },
    {
      "epoch": 3.5185185185185186,
      "grad_norm": 1.9787973165512085,
      "learning_rate": 3e-05,
      "loss": 0.2771,
      "step": 760
    },
    {
      "epoch": 3.564814814814815,
      "grad_norm": 1.3383827209472656,
      "learning_rate": 3e-05,
      "loss": 0.3039,
      "step": 770
    },
    {
      "epoch": 3.611111111111111,
      "grad_norm": 1.6759270429611206,
      "learning_rate": 3e-05,
      "loss": 0.2637,
      "step": 780
    },
    {
      "epoch": 3.6574074074074074,
      "grad_norm": 1.937604308128357,
      "learning_rate": 3e-05,
      "loss": 0.2702,
      "step": 790
    },
    {
      "epoch": 3.7037037037037037,
      "grad_norm": 1.4416754245758057,
      "learning_rate": 3e-05,
      "loss": 0.2522,
      "step": 800
    },
    {
      "epoch": 3.75,
      "grad_norm": 1.7105003595352173,
      "learning_rate": 3e-05,
      "loss": 0.2751,
      "step": 810
    },
    {
      "epoch": 3.7962962962962963,
      "grad_norm": 1.843482255935669,
      "learning_rate": 3e-05,
      "loss": 0.2753,
      "step": 820
    },
    {
      "epoch": 3.8425925925925926,
      "grad_norm": 1.3820255994796753,
      "learning_rate": 3e-05,
      "loss": 0.2542,
      "step": 830
    },
    {
      "epoch": 3.888888888888889,
      "grad_norm": 1.7006720304489136,
      "learning_rate": 3e-05,
      "loss": 0.2612,
      "step": 840
    },
    {
      "epoch": 3.935185185185185,
      "grad_norm": 1.5114960670471191,
      "learning_rate": 3e-05,
      "loss": 0.2849,
      "step": 850
    },
    {
      "epoch": 3.9814814814814814,
      "grad_norm": 2.0061888694763184,
      "learning_rate": 3e-05,
      "loss": 0.2932,
      "step": 860
    },
    {
      "epoch": 4.027777777777778,
      "grad_norm": 1.8746589422225952,
      "learning_rate": 3e-05,
      "loss": 0.2782,
      "step": 870
    },
    {
      "epoch": 4.074074074074074,
      "grad_norm": 3.473623037338257,
      "learning_rate": 3e-05,
      "loss": 0.2532,
      "step": 880
    },
    {
      "epoch": 4.12037037037037,
      "grad_norm": 1.4170212745666504,
      "learning_rate": 3e-05,
      "loss": 0.2259,
      "step": 890
    },
    {
      "epoch": 4.166666666666667,
      "grad_norm": 24.153331756591797,
      "learning_rate": 3e-05,
      "loss": 0.2273,
      "step": 900
    },
    {
      "epoch": 4.212962962962963,
      "grad_norm": 1.2688499689102173,
      "learning_rate": 3e-05,
      "loss": 0.2204,
      "step": 910
    },
    {
      "epoch": 4.2592592592592595,
      "grad_norm": 1.7967044115066528,
      "learning_rate": 3e-05,
      "loss": 0.2248,
      "step": 920
    },
    {
      "epoch": 4.305555555555555,
      "grad_norm": 1.8120778799057007,
      "learning_rate": 3e-05,
      "loss": 0.2409,
      "step": 930
    },
    {
      "epoch": 4.351851851851852,
      "grad_norm": 1.1904025077819824,
      "learning_rate": 3e-05,
      "loss": 0.2173,
      "step": 940
    },
    {
      "epoch": 4.398148148148148,
      "grad_norm": 1.1123757362365723,
      "learning_rate": 3e-05,
      "loss": 0.2418,
      "step": 950
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 1.3130488395690918,
      "learning_rate": 3e-05,
      "loss": 0.2456,
      "step": 960
    },
    {
      "epoch": 4.4907407407407405,
      "grad_norm": 2.091806173324585,
      "learning_rate": 3e-05,
      "loss": 0.2219,
      "step": 970
    },
    {
      "epoch": 4.537037037037037,
      "grad_norm": 3.246419906616211,
      "learning_rate": 3e-05,
      "loss": 0.2512,
      "step": 980
    },
    {
      "epoch": 4.583333333333333,
      "grad_norm": 1.6817961931228638,
      "learning_rate": 3e-05,
      "loss": 0.2635,
      "step": 990
    },
    {
      "epoch": 4.62962962962963,
      "grad_norm": 1.7699165344238281,
      "learning_rate": 3e-05,
      "loss": 0.2489,
      "step": 1000
    },
    {
      "epoch": 4.62962962962963,
      "eval_loss": 0.39369502663612366,
      "eval_runtime": 88.9571,
      "eval_samples_per_second": 6.07,
      "eval_steps_per_second": 0.304,
      "step": 1000
    },
    {
      "epoch": 4.675925925925926,
      "grad_norm": 1.4021247625350952,
      "learning_rate": 3e-05,
      "loss": 0.2433,
      "step": 1010
    },
    {
      "epoch": 4.722222222222222,
      "grad_norm": 1.8957483768463135,
      "learning_rate": 3e-05,
      "loss": 0.2328,
      "step": 1020
    },
    {
      "epoch": 4.768518518518518,
      "grad_norm": 1.1866583824157715,
      "learning_rate": 3e-05,
      "loss": 0.2477,
      "step": 1030
    },
    {
      "epoch": 4.814814814814815,
      "grad_norm": 1.4801381826400757,
      "learning_rate": 3e-05,
      "loss": 0.255,
      "step": 1040
    },
    {
      "epoch": 4.861111111111111,
      "grad_norm": 1.803208351135254,
      "learning_rate": 3e-05,
      "loss": 0.2645,
      "step": 1050
    },
    {
      "epoch": 4.907407407407407,
      "grad_norm": 1.838924527168274,
      "learning_rate": 3e-05,
      "loss": 0.2039,
      "step": 1060
    },
    {
      "epoch": 4.953703703703704,
      "grad_norm": 1.457726001739502,
      "learning_rate": 3e-05,
      "loss": 0.2714,
      "step": 1070
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.394857406616211,
      "learning_rate": 3e-05,
      "loss": 0.2533,
      "step": 1080
    },
    {
      "epoch": 5.046296296296297,
      "grad_norm": 1.6803613901138306,
      "learning_rate": 3e-05,
      "loss": 0.2275,
      "step": 1090
    },
    {
      "epoch": 5.092592592592593,
      "grad_norm": 0.976388156414032,
      "learning_rate": 3e-05,
      "loss": 0.2169,
      "step": 1100
    },
    {
      "epoch": 5.138888888888889,
      "grad_norm": 1.692513346672058,
      "learning_rate": 3e-05,
      "loss": 0.2132,
      "step": 1110
    },
    {
      "epoch": 5.185185185185185,
      "grad_norm": 1.3766963481903076,
      "learning_rate": 3e-05,
      "loss": 0.1997,
      "step": 1120
    },
    {
      "epoch": 5.231481481481482,
      "grad_norm": 2.006950616836548,
      "learning_rate": 3e-05,
      "loss": 0.2142,
      "step": 1130
    },
    {
      "epoch": 5.277777777777778,
      "grad_norm": 1.7959928512573242,
      "learning_rate": 3e-05,
      "loss": 0.213,
      "step": 1140
    },
    {
      "epoch": 5.324074074074074,
      "grad_norm": 1.3566243648529053,
      "learning_rate": 3e-05,
      "loss": 0.1878,
      "step": 1150
    },
    {
      "epoch": 5.37037037037037,
      "grad_norm": 2.6776769161224365,
      "learning_rate": 3e-05,
      "loss": 0.2325,
      "step": 1160
    },
    {
      "epoch": 5.416666666666667,
      "grad_norm": 2.4411234855651855,
      "learning_rate": 3e-05,
      "loss": 0.213,
      "step": 1170
    },
    {
      "epoch": 5.462962962962963,
      "grad_norm": 1.2669930458068848,
      "learning_rate": 3e-05,
      "loss": 0.2174,
      "step": 1180
    },
    {
      "epoch": 5.5092592592592595,
      "grad_norm": 1.9736443758010864,
      "learning_rate": 3e-05,
      "loss": 0.2309,
      "step": 1190
    },
    {
      "epoch": 5.555555555555555,
      "grad_norm": 1.8092678785324097,
      "learning_rate": 3e-05,
      "loss": 0.2403,
      "step": 1200
    },
    {
      "epoch": 5.601851851851852,
      "grad_norm": 1.1341086626052856,
      "learning_rate": 3e-05,
      "loss": 0.2329,
      "step": 1210
    },
    {
      "epoch": 5.648148148148148,
      "grad_norm": 1.8351995944976807,
      "learning_rate": 3e-05,
      "loss": 0.2112,
      "step": 1220
    },
    {
      "epoch": 5.694444444444445,
      "grad_norm": 1.6290438175201416,
      "learning_rate": 3e-05,
      "loss": 0.2387,
      "step": 1230
    },
    {
      "epoch": 5.7407407407407405,
      "grad_norm": 1.9721460342407227,
      "learning_rate": 3e-05,
      "loss": 0.2191,
      "step": 1240
    },
    {
      "epoch": 5.787037037037037,
      "grad_norm": 1.7176364660263062,
      "learning_rate": 3e-05,
      "loss": 0.2358,
      "step": 1250
    },
    {
      "epoch": 5.833333333333333,
      "grad_norm": 1.7181181907653809,
      "learning_rate": 3e-05,
      "loss": 0.2532,
      "step": 1260
    },
    {
      "epoch": 5.87962962962963,
      "grad_norm": 1.7120261192321777,
      "learning_rate": 3e-05,
      "loss": 0.2199,
      "step": 1270
    },
    {
      "epoch": 5.925925925925926,
      "grad_norm": 1.7863956689834595,
      "learning_rate": 3e-05,
      "loss": 0.2425,
      "step": 1280
    },
    {
      "epoch": 5.972222222222222,
      "grad_norm": 1.8824771642684937,
      "learning_rate": 3e-05,
      "loss": 0.2404,
      "step": 1290
    },
    {
      "epoch": 6.018518518518518,
      "grad_norm": 1.471390724182129,
      "learning_rate": 3e-05,
      "loss": 0.2045,
      "step": 1300
    },
    {
      "epoch": 6.064814814814815,
      "grad_norm": 1.220346212387085,
      "learning_rate": 3e-05,
      "loss": 0.1946,
      "step": 1310
    },
    {
      "epoch": 6.111111111111111,
      "grad_norm": 1.6119085550308228,
      "learning_rate": 3e-05,
      "loss": 0.2073,
      "step": 1320
    },
    {
      "epoch": 6.157407407407407,
      "grad_norm": 2.3939287662506104,
      "learning_rate": 3e-05,
      "loss": 0.1928,
      "step": 1330
    },
    {
      "epoch": 6.203703703703703,
      "grad_norm": 1.7139561176300049,
      "learning_rate": 3e-05,
      "loss": 0.2011,
      "step": 1340
    },
    {
      "epoch": 6.25,
      "grad_norm": 1.6871051788330078,
      "learning_rate": 3e-05,
      "loss": 0.2034,
      "step": 1350
    },
    {
      "epoch": 6.296296296296296,
      "grad_norm": 1.7590142488479614,
      "learning_rate": 3e-05,
      "loss": 0.1869,
      "step": 1360
    },
    {
      "epoch": 6.342592592592593,
      "grad_norm": 0.823594868183136,
      "learning_rate": 3e-05,
      "loss": 0.2,
      "step": 1370
    },
    {
      "epoch": 6.388888888888889,
      "grad_norm": 2.3577373027801514,
      "learning_rate": 3e-05,
      "loss": 0.2333,
      "step": 1380
    },
    {
      "epoch": 6.435185185185185,
      "grad_norm": 1.2093870639801025,
      "learning_rate": 3e-05,
      "loss": 0.2143,
      "step": 1390
    },
    {
      "epoch": 6.481481481481482,
      "grad_norm": 1.7904540300369263,
      "learning_rate": 3e-05,
      "loss": 0.23,
      "step": 1400
    },
    {
      "epoch": 6.527777777777778,
      "grad_norm": 1.5087484121322632,
      "learning_rate": 3e-05,
      "loss": 0.2576,
      "step": 1410
    },
    {
      "epoch": 6.574074074074074,
      "grad_norm": 1.7448809146881104,
      "learning_rate": 3e-05,
      "loss": 0.2294,
      "step": 1420
    },
    {
      "epoch": 6.62037037037037,
      "grad_norm": 1.4423376321792603,
      "learning_rate": 3e-05,
      "loss": 0.2051,
      "step": 1430
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 3.799774646759033,
      "learning_rate": 3e-05,
      "loss": 0.2014,
      "step": 1440
    },
    {
      "epoch": 6.712962962962963,
      "grad_norm": 2.199467420578003,
      "learning_rate": 3e-05,
      "loss": 0.2321,
      "step": 1450
    },
    {
      "epoch": 6.7592592592592595,
      "grad_norm": 9.484046936035156,
      "learning_rate": 3e-05,
      "loss": 0.1942,
      "step": 1460
    },
    {
      "epoch": 6.805555555555555,
      "grad_norm": 1.1799571514129639,
      "learning_rate": 3e-05,
      "loss": 0.2162,
      "step": 1470
    },
    {
      "epoch": 6.851851851851852,
      "grad_norm": 1.902580738067627,
      "learning_rate": 3e-05,
      "loss": 0.2172,
      "step": 1480
    },
    {
      "epoch": 6.898148148148148,
      "grad_norm": 1.4826103448867798,
      "learning_rate": 3e-05,
      "loss": 0.2115,
      "step": 1490
    },
    {
      "epoch": 6.944444444444445,
      "grad_norm": 1.7186169624328613,
      "learning_rate": 3e-05,
      "loss": 0.2578,
      "step": 1500
    },
    {
      "epoch": 6.9907407407407405,
      "grad_norm": 1.441232442855835,
      "learning_rate": 3e-05,
      "loss": 0.215,
      "step": 1510
    },
    {
      "epoch": 7.037037037037037,
      "grad_norm": 1.0220807790756226,
      "learning_rate": 3e-05,
      "loss": 0.181,
      "step": 1520
    },
    {
      "epoch": 7.083333333333333,
      "grad_norm": 1.1033027172088623,
      "learning_rate": 3e-05,
      "loss": 0.1875,
      "step": 1530
    },
    {
      "epoch": 7.12962962962963,
      "grad_norm": 1.4569389820098877,
      "learning_rate": 3e-05,
      "loss": 0.2119,
      "step": 1540
    },
    {
      "epoch": 7.175925925925926,
      "grad_norm": 1.2294921875,
      "learning_rate": 3e-05,
      "loss": 0.1978,
      "step": 1550
    },
    {
      "epoch": 7.222222222222222,
      "grad_norm": 1.2544004917144775,
      "learning_rate": 3e-05,
      "loss": 0.1752,
      "step": 1560
    },
    {
      "epoch": 7.268518518518518,
      "grad_norm": 1.6726148128509521,
      "learning_rate": 3e-05,
      "loss": 0.1952,
      "step": 1570
    },
    {
      "epoch": 7.314814814814815,
      "grad_norm": 1.556419849395752,
      "learning_rate": 3e-05,
      "loss": 0.2052,
      "step": 1580
    },
    {
      "epoch": 7.361111111111111,
      "grad_norm": 1.9425777196884155,
      "learning_rate": 3e-05,
      "loss": 0.1974,
      "step": 1590
    },
    {
      "epoch": 7.407407407407407,
      "grad_norm": 1.915613055229187,
      "learning_rate": 3e-05,
      "loss": 0.2083,
      "step": 1600
    },
    {
      "epoch": 7.453703703703704,
      "grad_norm": 2.244108200073242,
      "learning_rate": 3e-05,
      "loss": 0.2169,
      "step": 1610
    },
    {
      "epoch": 7.5,
      "grad_norm": 1.9481151103973389,
      "learning_rate": 3e-05,
      "loss": 0.1952,
      "step": 1620
    },
    {
      "epoch": 7.546296296296296,
      "grad_norm": 1.2878506183624268,
      "learning_rate": 3e-05,
      "loss": 0.2063,
      "step": 1630
    },
    {
      "epoch": 7.592592592592593,
      "grad_norm": 0.863036572933197,
      "learning_rate": 3e-05,
      "loss": 0.2,
      "step": 1640
    },
    {
      "epoch": 7.638888888888889,
      "grad_norm": 1.7689223289489746,
      "learning_rate": 3e-05,
      "loss": 0.1879,
      "step": 1650
    },
    {
      "epoch": 7.685185185185185,
      "grad_norm": 1.5072979927062988,
      "learning_rate": 3e-05,
      "loss": 0.2032,
      "step": 1660
    },
    {
      "epoch": 7.731481481481482,
      "grad_norm": 1.675126314163208,
      "learning_rate": 3e-05,
      "loss": 0.2017,
      "step": 1670
    },
    {
      "epoch": 7.777777777777778,
      "grad_norm": 1.5218942165374756,
      "learning_rate": 3e-05,
      "loss": 0.2051,
      "step": 1680
    },
    {
      "epoch": 7.824074074074074,
      "grad_norm": 1.1887863874435425,
      "learning_rate": 3e-05,
      "loss": 0.1767,
      "step": 1690
    },
    {
      "epoch": 7.87037037037037,
      "grad_norm": 3.391000270843506,
      "learning_rate": 3e-05,
      "loss": 0.2019,
      "step": 1700
    },
    {
      "epoch": 7.916666666666667,
      "grad_norm": 1.23732328414917,
      "learning_rate": 3e-05,
      "loss": 0.2008,
      "step": 1710
    },
    {
      "epoch": 7.962962962962963,
      "grad_norm": 1.2429234981536865,
      "learning_rate": 3e-05,
      "loss": 0.1795,
      "step": 1720
    },
    {
      "epoch": 8.00925925925926,
      "grad_norm": 1.0785852670669556,
      "learning_rate": 3e-05,
      "loss": 0.1828,
      "step": 1730
    },
    {
      "epoch": 8.055555555555555,
      "grad_norm": 1.2642838954925537,
      "learning_rate": 3e-05,
      "loss": 0.1909,
      "step": 1740
    },
    {
      "epoch": 8.101851851851851,
      "grad_norm": 1.251452922821045,
      "learning_rate": 3e-05,
      "loss": 0.1807,
      "step": 1750
    },
    {
      "epoch": 8.148148148148149,
      "grad_norm": 1.4957802295684814,
      "learning_rate": 3e-05,
      "loss": 0.1877,
      "step": 1760
    },
    {
      "epoch": 8.194444444444445,
      "grad_norm": 2.1930840015411377,
      "learning_rate": 3e-05,
      "loss": 0.1785,
      "step": 1770
    },
    {
      "epoch": 8.24074074074074,
      "grad_norm": 1.210434913635254,
      "learning_rate": 3e-05,
      "loss": 0.1713,
      "step": 1780
    },
    {
      "epoch": 8.287037037037036,
      "grad_norm": 0.8785271048545837,
      "learning_rate": 3e-05,
      "loss": 0.181,
      "step": 1790
    },
    {
      "epoch": 8.333333333333334,
      "grad_norm": 1.0910536050796509,
      "learning_rate": 3e-05,
      "loss": 0.1773,
      "step": 1800
    },
    {
      "epoch": 8.37962962962963,
      "grad_norm": 1.1678903102874756,
      "learning_rate": 3e-05,
      "loss": 0.1847,
      "step": 1810
    },
    {
      "epoch": 8.425925925925926,
      "grad_norm": 1.546592354774475,
      "learning_rate": 3e-05,
      "loss": 0.1938,
      "step": 1820
    },
    {
      "epoch": 8.472222222222221,
      "grad_norm": 0.8986390829086304,
      "learning_rate": 3e-05,
      "loss": 0.1872,
      "step": 1830
    },
    {
      "epoch": 8.518518518518519,
      "grad_norm": 1.2651722431182861,
      "learning_rate": 3e-05,
      "loss": 0.181,
      "step": 1840
    },
    {
      "epoch": 8.564814814814815,
      "grad_norm": 1.5655752420425415,
      "learning_rate": 3e-05,
      "loss": 0.1932,
      "step": 1850
    },
    {
      "epoch": 8.61111111111111,
      "grad_norm": 1.7608935832977295,
      "learning_rate": 3e-05,
      "loss": 0.1857,
      "step": 1860
    },
    {
      "epoch": 8.657407407407408,
      "grad_norm": 2.4741933345794678,
      "learning_rate": 3e-05,
      "loss": 0.1766,
      "step": 1870
    },
    {
      "epoch": 8.703703703703704,
      "grad_norm": 1.4740314483642578,
      "learning_rate": 3e-05,
      "loss": 0.1755,
      "step": 1880
    },
    {
      "epoch": 8.75,
      "grad_norm": 1.4977920055389404,
      "learning_rate": 3e-05,
      "loss": 0.1867,
      "step": 1890
    },
    {
      "epoch": 8.796296296296296,
      "grad_norm": 1.9219239950180054,
      "learning_rate": 3e-05,
      "loss": 0.1881,
      "step": 1900
    },
    {
      "epoch": 8.842592592592592,
      "grad_norm": 1.5815542936325073,
      "learning_rate": 3e-05,
      "loss": 0.2056,
      "step": 1910
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 1.3508894443511963,
      "learning_rate": 3e-05,
      "loss": 0.192,
      "step": 1920
    },
    {
      "epoch": 8.935185185185185,
      "grad_norm": 1.4464216232299805,
      "learning_rate": 3e-05,
      "loss": 0.1848,
      "step": 1930
    },
    {
      "epoch": 8.981481481481481,
      "grad_norm": 1.9274829626083374,
      "learning_rate": 3e-05,
      "loss": 0.2004,
      "step": 1940
    },
    {
      "epoch": 9.027777777777779,
      "grad_norm": 1.2753411531448364,
      "learning_rate": 3e-05,
      "loss": 0.1787,
      "step": 1950
    },
    {
      "epoch": 9.074074074074074,
      "grad_norm": 1.0816010236740112,
      "learning_rate": 3e-05,
      "loss": 0.1579,
      "step": 1960
    },
    {
      "epoch": 9.12037037037037,
      "grad_norm": 1.283239483833313,
      "learning_rate": 3e-05,
      "loss": 0.1713,
      "step": 1970
    },
    {
      "epoch": 9.166666666666666,
      "grad_norm": 1.5639042854309082,
      "learning_rate": 3e-05,
      "loss": 0.1651,
      "step": 1980
    },
    {
      "epoch": 9.212962962962964,
      "grad_norm": 1.0681788921356201,
      "learning_rate": 3e-05,
      "loss": 0.163,
      "step": 1990
    },
    {
      "epoch": 9.25925925925926,
      "grad_norm": 1.6595847606658936,
      "learning_rate": 3e-05,
      "loss": 0.173,
      "step": 2000
    },
    {
      "epoch": 9.25925925925926,
      "eval_loss": 0.4976138770580292,
      "eval_runtime": 89.2657,
      "eval_samples_per_second": 6.049,
      "eval_steps_per_second": 0.302,
      "step": 2000
    },
    {
      "epoch": 9.305555555555555,
      "grad_norm": 1.3802722692489624,
      "learning_rate": 3e-05,
      "loss": 0.1827,
      "step": 2010
    },
    {
      "epoch": 9.351851851851851,
      "grad_norm": 1.4063881635665894,
      "learning_rate": 3e-05,
      "loss": 0.1766,
      "step": 2020
    },
    {
      "epoch": 9.398148148148149,
      "grad_norm": 1.460650086402893,
      "learning_rate": 3e-05,
      "loss": 0.1745,
      "step": 2030
    },
    {
      "epoch": 9.444444444444445,
      "grad_norm": 1.2076504230499268,
      "learning_rate": 3e-05,
      "loss": 0.1604,
      "step": 2040
    },
    {
      "epoch": 9.49074074074074,
      "grad_norm": 2.070692539215088,
      "learning_rate": 3e-05,
      "loss": 0.1565,
      "step": 2050
    },
    {
      "epoch": 9.537037037037036,
      "grad_norm": 1.0405620336532593,
      "learning_rate": 3e-05,
      "loss": 0.1772,
      "step": 2060
    },
    {
      "epoch": 9.583333333333334,
      "grad_norm": 1.3936161994934082,
      "learning_rate": 3e-05,
      "loss": 0.1644,
      "step": 2070
    },
    {
      "epoch": 9.62962962962963,
      "grad_norm": 1.1049543619155884,
      "learning_rate": 3e-05,
      "loss": 0.1716,
      "step": 2080
    },
    {
      "epoch": 9.675925925925926,
      "grad_norm": 0.986007034778595,
      "learning_rate": 3e-05,
      "loss": 0.1733,
      "step": 2090
    },
    {
      "epoch": 9.722222222222221,
      "grad_norm": 0.7544007897377014,
      "learning_rate": 3e-05,
      "loss": 0.1845,
      "step": 2100
    },
    {
      "epoch": 9.768518518518519,
      "grad_norm": 1.322129249572754,
      "learning_rate": 3e-05,
      "loss": 0.1555,
      "step": 2110
    },
    {
      "epoch": 9.814814814814815,
      "grad_norm": 1.0886545181274414,
      "learning_rate": 3e-05,
      "loss": 0.1904,
      "step": 2120
    },
    {
      "epoch": 9.86111111111111,
      "grad_norm": 1.0224428176879883,
      "learning_rate": 3e-05,
      "loss": 0.1917,
      "step": 2130
    },
    {
      "epoch": 9.907407407407408,
      "grad_norm": 1.9668025970458984,
      "learning_rate": 3e-05,
      "loss": 0.1887,
      "step": 2140
    },
    {
      "epoch": 9.953703703703704,
      "grad_norm": 1.6308960914611816,
      "learning_rate": 3e-05,
      "loss": 0.2052,
      "step": 2150
    },
    {
      "epoch": 10.0,
      "grad_norm": 1.2425373792648315,
      "learning_rate": 3e-05,
      "loss": 0.1955,
      "step": 2160
    },
    {
      "epoch": 10.046296296296296,
      "grad_norm": 1.3138960599899292,
      "learning_rate": 3e-05,
      "loss": 0.1622,
      "step": 2170
    },
    {
      "epoch": 10.092592592592593,
      "grad_norm": 1.3728431463241577,
      "learning_rate": 3e-05,
      "loss": 0.1688,
      "step": 2180
    },
    {
      "epoch": 10.13888888888889,
      "grad_norm": 1.1341276168823242,
      "learning_rate": 3e-05,
      "loss": 0.1711,
      "step": 2190
    },
    {
      "epoch": 10.185185185185185,
      "grad_norm": 1.199234962463379,
      "learning_rate": 3e-05,
      "loss": 0.1638,
      "step": 2200
    },
    {
      "epoch": 10.231481481481481,
      "grad_norm": 1.1935800313949585,
      "learning_rate": 3e-05,
      "loss": 0.148,
      "step": 2210
    },
    {
      "epoch": 10.277777777777779,
      "grad_norm": 1.2901479005813599,
      "learning_rate": 3e-05,
      "loss": 0.1577,
      "step": 2220
    },
    {
      "epoch": 10.324074074074074,
      "grad_norm": 1.572486400604248,
      "learning_rate": 3e-05,
      "loss": 0.1511,
      "step": 2230
    },
    {
      "epoch": 10.37037037037037,
      "grad_norm": 1.2550723552703857,
      "learning_rate": 3e-05,
      "loss": 0.1523,
      "step": 2240
    },
    {
      "epoch": 10.416666666666666,
      "grad_norm": 1.5212897062301636,
      "learning_rate": 3e-05,
      "loss": 0.1701,
      "step": 2250
    },
    {
      "epoch": 10.462962962962964,
      "grad_norm": 1.4204086065292358,
      "learning_rate": 3e-05,
      "loss": 0.1695,
      "step": 2260
    },
    {
      "epoch": 10.50925925925926,
      "grad_norm": 1.7510631084442139,
      "learning_rate": 3e-05,
      "loss": 0.179,
      "step": 2270
    },
    {
      "epoch": 10.555555555555555,
      "grad_norm": 1.1196744441986084,
      "learning_rate": 3e-05,
      "loss": 0.1713,
      "step": 2280
    },
    {
      "epoch": 10.601851851851851,
      "grad_norm": 1.7313737869262695,
      "learning_rate": 3e-05,
      "loss": 0.1625,
      "step": 2290
    },
    {
      "epoch": 10.648148148148149,
      "grad_norm": 1.1136900186538696,
      "learning_rate": 3e-05,
      "loss": 0.1584,
      "step": 2300
    },
    {
      "epoch": 10.694444444444445,
      "grad_norm": 0.900171160697937,
      "learning_rate": 3e-05,
      "loss": 0.1746,
      "step": 2310
    },
    {
      "epoch": 10.74074074074074,
      "grad_norm": 1.5992180109024048,
      "learning_rate": 3e-05,
      "loss": 0.1766,
      "step": 2320
    },
    {
      "epoch": 10.787037037037036,
      "grad_norm": 1.1199150085449219,
      "learning_rate": 3e-05,
      "loss": 0.1641,
      "step": 2330
    },
    {
      "epoch": 10.833333333333334,
      "grad_norm": 1.3033232688903809,
      "learning_rate": 3e-05,
      "loss": 0.164,
      "step": 2340
    },
    {
      "epoch": 10.87962962962963,
      "grad_norm": 1.589557409286499,
      "learning_rate": 3e-05,
      "loss": 0.1629,
      "step": 2350
    },
    {
      "epoch": 10.925925925925926,
      "grad_norm": 1.715973973274231,
      "learning_rate": 3e-05,
      "loss": 0.1709,
      "step": 2360
    },
    {
      "epoch": 10.972222222222221,
      "grad_norm": 0.9544859528541565,
      "learning_rate": 3e-05,
      "loss": 0.169,
      "step": 2370
    },
    {
      "epoch": 11.018518518518519,
      "grad_norm": 2.256932497024536,
      "learning_rate": 3e-05,
      "loss": 0.1683,
      "step": 2380
    },
    {
      "epoch": 11.064814814814815,
      "grad_norm": 0.7787851691246033,
      "learning_rate": 3e-05,
      "loss": 0.1428,
      "step": 2390
    },
    {
      "epoch": 11.11111111111111,
      "grad_norm": 0.9551990628242493,
      "learning_rate": 3e-05,
      "loss": 0.1587,
      "step": 2400
    },
    {
      "epoch": 11.157407407407407,
      "grad_norm": 1.0669728517532349,
      "learning_rate": 3e-05,
      "loss": 0.1507,
      "step": 2410
    },
    {
      "epoch": 11.203703703703704,
      "grad_norm": 1.5834068059921265,
      "learning_rate": 3e-05,
      "loss": 0.1479,
      "step": 2420
    },
    {
      "epoch": 11.25,
      "grad_norm": 1.749540090560913,
      "learning_rate": 3e-05,
      "loss": 0.1605,
      "step": 2430
    },
    {
      "epoch": 11.296296296296296,
      "grad_norm": 1.4417942762374878,
      "learning_rate": 3e-05,
      "loss": 0.1636,
      "step": 2440
    },
    {
      "epoch": 11.342592592592593,
      "grad_norm": 1.16630220413208,
      "learning_rate": 3e-05,
      "loss": 0.1717,
      "step": 2450
    },
    {
      "epoch": 11.38888888888889,
      "grad_norm": 1.6200324296951294,
      "learning_rate": 3e-05,
      "loss": 0.1502,
      "step": 2460
    },
    {
      "epoch": 11.435185185185185,
      "grad_norm": 1.5884885787963867,
      "learning_rate": 3e-05,
      "loss": 0.1569,
      "step": 2470
    },
    {
      "epoch": 11.481481481481481,
      "grad_norm": 1.0750160217285156,
      "learning_rate": 3e-05,
      "loss": 0.1741,
      "step": 2480
    },
    {
      "epoch": 11.527777777777779,
      "grad_norm": 1.3603105545043945,
      "learning_rate": 3e-05,
      "loss": 0.14,
      "step": 2490
    },
    {
      "epoch": 11.574074074074074,
      "grad_norm": 0.7031620740890503,
      "learning_rate": 3e-05,
      "loss": 0.1589,
      "step": 2500
    },
    {
      "epoch": 11.62037037037037,
      "grad_norm": 0.9268414378166199,
      "learning_rate": 3e-05,
      "loss": 0.1679,
      "step": 2510
    },
    {
      "epoch": 11.666666666666666,
      "grad_norm": 1.3924790620803833,
      "learning_rate": 3e-05,
      "loss": 0.1681,
      "step": 2520
    },
    {
      "epoch": 11.712962962962964,
      "grad_norm": 1.586488962173462,
      "learning_rate": 3e-05,
      "loss": 0.1553,
      "step": 2530
    },
    {
      "epoch": 11.75925925925926,
      "grad_norm": 1.3625974655151367,
      "learning_rate": 3e-05,
      "loss": 0.1665,
      "step": 2540
    },
    {
      "epoch": 11.805555555555555,
      "grad_norm": 1.6760375499725342,
      "learning_rate": 3e-05,
      "loss": 0.1826,
      "step": 2550
    },
    {
      "epoch": 11.851851851851851,
      "grad_norm": 1.3086282014846802,
      "learning_rate": 3e-05,
      "loss": 0.1689,
      "step": 2560
    },
    {
      "epoch": 11.898148148148149,
      "grad_norm": 1.2318929433822632,
      "learning_rate": 3e-05,
      "loss": 0.1575,
      "step": 2570
    },
    {
      "epoch": 11.944444444444445,
      "grad_norm": 1.7143497467041016,
      "learning_rate": 3e-05,
      "loss": 0.1632,
      "step": 2580
    },
    {
      "epoch": 11.99074074074074,
      "grad_norm": 1.5415269136428833,
      "learning_rate": 3e-05,
      "loss": 0.177,
      "step": 2590
    },
    {
      "epoch": 12.037037037037036,
      "grad_norm": 1.0246485471725464,
      "learning_rate": 3e-05,
      "loss": 0.1289,
      "step": 2600
    },
    {
      "epoch": 12.083333333333334,
      "grad_norm": 0.7156587243080139,
      "learning_rate": 3e-05,
      "loss": 0.1552,
      "step": 2610
    },
    {
      "epoch": 12.12962962962963,
      "grad_norm": 1.3618501424789429,
      "learning_rate": 3e-05,
      "loss": 0.1599,
      "step": 2620
    },
    {
      "epoch": 12.175925925925926,
      "grad_norm": 1.4899464845657349,
      "learning_rate": 3e-05,
      "loss": 0.1421,
      "step": 2630
    },
    {
      "epoch": 12.222222222222221,
      "grad_norm": 0.9834685325622559,
      "learning_rate": 3e-05,
      "loss": 0.1471,
      "step": 2640
    },
    {
      "epoch": 12.268518518518519,
      "grad_norm": 1.0517306327819824,
      "learning_rate": 3e-05,
      "loss": 0.164,
      "step": 2650
    },
    {
      "epoch": 12.314814814814815,
      "grad_norm": 1.0001568794250488,
      "learning_rate": 3e-05,
      "loss": 0.1571,
      "step": 2660
    },
    {
      "epoch": 12.36111111111111,
      "grad_norm": 1.084662914276123,
      "learning_rate": 3e-05,
      "loss": 0.1585,
      "step": 2670
    },
    {
      "epoch": 12.407407407407407,
      "grad_norm": 1.2706600427627563,
      "learning_rate": 3e-05,
      "loss": 0.1452,
      "step": 2680
    },
    {
      "epoch": 12.453703703703704,
      "grad_norm": 1.2018481492996216,
      "learning_rate": 3e-05,
      "loss": 0.1556,
      "step": 2690
    },
    {
      "epoch": 12.5,
      "grad_norm": 1.043424367904663,
      "learning_rate": 3e-05,
      "loss": 0.1665,
      "step": 2700
    },
    {
      "epoch": 12.546296296296296,
      "grad_norm": 1.4996309280395508,
      "learning_rate": 3e-05,
      "loss": 0.1445,
      "step": 2710
    },
    {
      "epoch": 12.592592592592592,
      "grad_norm": 0.9113991856575012,
      "learning_rate": 3e-05,
      "loss": 0.1553,
      "step": 2720
    },
    {
      "epoch": 12.63888888888889,
      "grad_norm": 1.2703651189804077,
      "learning_rate": 3e-05,
      "loss": 0.1683,
      "step": 2730
    },
    {
      "epoch": 12.685185185185185,
      "grad_norm": 1.1912919282913208,
      "learning_rate": 3e-05,
      "loss": 0.1526,
      "step": 2740
    },
    {
      "epoch": 12.731481481481481,
      "grad_norm": 1.1190593242645264,
      "learning_rate": 3e-05,
      "loss": 0.1627,
      "step": 2750
    },
    {
      "epoch": 12.777777777777779,
      "grad_norm": 1.0739121437072754,
      "learning_rate": 3e-05,
      "loss": 0.1495,
      "step": 2760
    },
    {
      "epoch": 12.824074074074074,
      "grad_norm": 1.53770112991333,
      "learning_rate": 3e-05,
      "loss": 0.157,
      "step": 2770
    },
    {
      "epoch": 12.87037037037037,
      "grad_norm": 1.3282749652862549,
      "learning_rate": 3e-05,
      "loss": 0.1593,
      "step": 2780
    },
    {
      "epoch": 12.916666666666666,
      "grad_norm": 1.5690109729766846,
      "learning_rate": 3e-05,
      "loss": 0.1568,
      "step": 2790
    },
    {
      "epoch": 12.962962962962964,
      "grad_norm": 1.1614137887954712,
      "learning_rate": 3e-05,
      "loss": 0.1664,
      "step": 2800
    },
    {
      "epoch": 13.00925925925926,
      "grad_norm": 1.3638834953308105,
      "learning_rate": 3e-05,
      "loss": 0.1443,
      "step": 2810
    },
    {
      "epoch": 13.055555555555555,
      "grad_norm": 1.4731112718582153,
      "learning_rate": 3e-05,
      "loss": 0.1415,
      "step": 2820
    },
    {
      "epoch": 13.101851851851851,
      "grad_norm": 1.0417250394821167,
      "learning_rate": 3e-05,
      "loss": 0.1315,
      "step": 2830
    },
    {
      "epoch": 13.148148148148149,
      "grad_norm": 1.570486307144165,
      "learning_rate": 3e-05,
      "loss": 0.1473,
      "step": 2840
    },
    {
      "epoch": 13.194444444444445,
      "grad_norm": 1.0269405841827393,
      "learning_rate": 3e-05,
      "loss": 0.1541,
      "step": 2850
    },
    {
      "epoch": 13.24074074074074,
      "grad_norm": 1.0735430717468262,
      "learning_rate": 3e-05,
      "loss": 0.148,
      "step": 2860
    },
    {
      "epoch": 13.287037037037036,
      "grad_norm": 1.6693620681762695,
      "learning_rate": 3e-05,
      "loss": 0.1543,
      "step": 2870
    },
    {
      "epoch": 13.333333333333334,
      "grad_norm": 1.2953250408172607,
      "learning_rate": 3e-05,
      "loss": 0.1495,
      "step": 2880
    },
    {
      "epoch": 13.37962962962963,
      "grad_norm": 1.1142494678497314,
      "learning_rate": 3e-05,
      "loss": 0.1484,
      "step": 2890
    },
    {
      "epoch": 13.425925925925926,
      "grad_norm": 1.1380842924118042,
      "learning_rate": 3e-05,
      "loss": 0.1548,
      "step": 2900
    },
    {
      "epoch": 13.472222222222221,
      "grad_norm": 1.7548078298568726,
      "learning_rate": 3e-05,
      "loss": 0.1468,
      "step": 2910
    },
    {
      "epoch": 13.518518518518519,
      "grad_norm": 0.8444296717643738,
      "learning_rate": 3e-05,
      "loss": 0.1348,
      "step": 2920
    },
    {
      "epoch": 13.564814814814815,
      "grad_norm": 0.9462345242500305,
      "learning_rate": 3e-05,
      "loss": 0.156,
      "step": 2930
    },
    {
      "epoch": 13.61111111111111,
      "grad_norm": 1.155483365058899,
      "learning_rate": 3e-05,
      "loss": 0.1419,
      "step": 2940
    },
    {
      "epoch": 13.657407407407408,
      "grad_norm": 0.9843894243240356,
      "learning_rate": 3e-05,
      "loss": 0.1418,
      "step": 2950
    },
    {
      "epoch": 13.703703703703704,
      "grad_norm": 0.7480261325836182,
      "learning_rate": 3e-05,
      "loss": 0.1465,
      "step": 2960
    },
    {
      "epoch": 13.75,
      "grad_norm": 1.5990567207336426,
      "learning_rate": 3e-05,
      "loss": 0.1447,
      "step": 2970
    },
    {
      "epoch": 13.796296296296296,
      "grad_norm": 1.809402346611023,
      "learning_rate": 3e-05,
      "loss": 0.175,
      "step": 2980
    },
    {
      "epoch": 13.842592592592592,
      "grad_norm": 1.865115761756897,
      "learning_rate": 3e-05,
      "loss": 0.1418,
      "step": 2990
    },
    {
      "epoch": 13.88888888888889,
      "grad_norm": 1.2112107276916504,
      "learning_rate": 3e-05,
      "loss": 0.1715,
      "step": 3000
    },
    {
      "epoch": 13.88888888888889,
      "eval_loss": 0.5447816252708435,
      "eval_runtime": 89.1984,
      "eval_samples_per_second": 6.054,
      "eval_steps_per_second": 0.303,
      "step": 3000
    },
    {
      "epoch": 13.935185185185185,
      "grad_norm": 1.0728169679641724,
      "learning_rate": 3e-05,
      "loss": 0.1596,
      "step": 3010
    },
    {
      "epoch": 13.981481481481481,
      "grad_norm": 1.8573318719863892,
      "learning_rate": 3e-05,
      "loss": 0.1665,
      "step": 3020
    },
    {
      "epoch": 14.027777777777779,
      "grad_norm": 1.1071803569793701,
      "learning_rate": 3e-05,
      "loss": 0.1544,
      "step": 3030
    },
    {
      "epoch": 14.074074074074074,
      "grad_norm": 1.5413600206375122,
      "learning_rate": 3e-05,
      "loss": 0.1432,
      "step": 3040
    },
    {
      "epoch": 14.12037037037037,
      "grad_norm": 1.1424851417541504,
      "learning_rate": 3e-05,
      "loss": 0.1366,
      "step": 3050
    },
    {
      "epoch": 14.166666666666666,
      "grad_norm": 0.8150787353515625,
      "learning_rate": 3e-05,
      "loss": 0.154,
      "step": 3060
    },
    {
      "epoch": 14.212962962962964,
      "grad_norm": 1.217228889465332,
      "learning_rate": 3e-05,
      "loss": 0.1287,
      "step": 3070
    },
    {
      "epoch": 14.25925925925926,
      "grad_norm": 0.901224672794342,
      "learning_rate": 3e-05,
      "loss": 0.1401,
      "step": 3080
    },
    {
      "epoch": 14.305555555555555,
      "grad_norm": 1.5435835123062134,
      "learning_rate": 3e-05,
      "loss": 0.1436,
      "step": 3090
    },
    {
      "epoch": 14.351851851851851,
      "grad_norm": 1.2040764093399048,
      "learning_rate": 3e-05,
      "loss": 0.1432,
      "step": 3100
    },
    {
      "epoch": 14.398148148148149,
      "grad_norm": 0.8125296235084534,
      "learning_rate": 3e-05,
      "loss": 0.138,
      "step": 3110
    },
    {
      "epoch": 14.444444444444445,
      "grad_norm": 1.1610842943191528,
      "learning_rate": 3e-05,
      "loss": 0.145,
      "step": 3120
    },
    {
      "epoch": 14.49074074074074,
      "grad_norm": 1.8344600200653076,
      "learning_rate": 3e-05,
      "loss": 0.159,
      "step": 3130
    },
    {
      "epoch": 14.537037037037036,
      "grad_norm": 0.9230263829231262,
      "learning_rate": 3e-05,
      "loss": 0.1482,
      "step": 3140
    },
    {
      "epoch": 14.583333333333334,
      "grad_norm": 1.1558572053909302,
      "learning_rate": 3e-05,
      "loss": 0.1515,
      "step": 3150
    },
    {
      "epoch": 14.62962962962963,
      "grad_norm": 1.1165474653244019,
      "learning_rate": 3e-05,
      "loss": 0.1398,
      "step": 3160
    },
    {
      "epoch": 14.675925925925926,
      "grad_norm": 1.8796123266220093,
      "learning_rate": 3e-05,
      "loss": 0.1477,
      "step": 3170
    },
    {
      "epoch": 14.722222222222221,
      "grad_norm": 1.0877175331115723,
      "learning_rate": 3e-05,
      "loss": 0.1537,
      "step": 3180
    },
    {
      "epoch": 14.768518518518519,
      "grad_norm": 1.361430048942566,
      "learning_rate": 3e-05,
      "loss": 0.1467,
      "step": 3190
    },
    {
      "epoch": 14.814814814814815,
      "grad_norm": 0.8188061714172363,
      "learning_rate": 3e-05,
      "loss": 0.149,
      "step": 3200
    },
    {
      "epoch": 14.86111111111111,
      "grad_norm": 1.4913760423660278,
      "learning_rate": 3e-05,
      "loss": 0.1586,
      "step": 3210
    },
    {
      "epoch": 14.907407407407408,
      "grad_norm": 0.8685842156410217,
      "learning_rate": 3e-05,
      "loss": 0.1513,
      "step": 3220
    },
    {
      "epoch": 14.953703703703704,
      "grad_norm": 1.0157376527786255,
      "learning_rate": 3e-05,
      "loss": 0.1499,
      "step": 3230
    },
    {
      "epoch": 15.0,
      "grad_norm": 1.0665968656539917,
      "learning_rate": 3e-05,
      "loss": 0.1673,
      "step": 3240
    },
    {
      "epoch": 15.046296296296296,
      "grad_norm": 1.3810080289840698,
      "learning_rate": 3e-05,
      "loss": 0.1307,
      "step": 3250
    },
    {
      "epoch": 15.092592592592593,
      "grad_norm": 0.9249776601791382,
      "learning_rate": 3e-05,
      "loss": 0.1389,
      "step": 3260
    },
    {
      "epoch": 15.13888888888889,
      "grad_norm": 1.1835044622421265,
      "learning_rate": 3e-05,
      "loss": 0.1337,
      "step": 3270
    },
    {
      "epoch": 15.185185185185185,
      "grad_norm": 1.341606616973877,
      "learning_rate": 3e-05,
      "loss": 0.1596,
      "step": 3280
    },
    {
      "epoch": 15.231481481481481,
      "grad_norm": 1.2246119976043701,
      "learning_rate": 3e-05,
      "loss": 0.1368,
      "step": 3290
    },
    {
      "epoch": 15.277777777777779,
      "grad_norm": 1.2647781372070312,
      "learning_rate": 3e-05,
      "loss": 0.1442,
      "step": 3300
    },
    {
      "epoch": 15.324074074074074,
      "grad_norm": 1.158076524734497,
      "learning_rate": 3e-05,
      "loss": 0.1366,
      "step": 3310
    },
    {
      "epoch": 15.37037037037037,
      "grad_norm": 1.2419984340667725,
      "learning_rate": 3e-05,
      "loss": 0.1429,
      "step": 3320
    },
    {
      "epoch": 15.416666666666666,
      "grad_norm": 1.4107789993286133,
      "learning_rate": 3e-05,
      "loss": 0.1565,
      "step": 3330
    },
    {
      "epoch": 15.462962962962964,
      "grad_norm": 1.0196819305419922,
      "learning_rate": 3e-05,
      "loss": 0.1477,
      "step": 3340
    },
    {
      "epoch": 15.50925925925926,
      "grad_norm": 1.286967158317566,
      "learning_rate": 3e-05,
      "loss": 0.1418,
      "step": 3350
    },
    {
      "epoch": 15.555555555555555,
      "grad_norm": 0.8533580899238586,
      "learning_rate": 3e-05,
      "loss": 0.1438,
      "step": 3360
    },
    {
      "epoch": 15.601851851851851,
      "grad_norm": 3.57099986076355,
      "learning_rate": 3e-05,
      "loss": 0.1628,
      "step": 3370
    },
    {
      "epoch": 15.648148148148149,
      "grad_norm": 0.7320424914360046,
      "learning_rate": 3e-05,
      "loss": 0.152,
      "step": 3380
    },
    {
      "epoch": 15.694444444444445,
      "grad_norm": 1.4328055381774902,
      "learning_rate": 3e-05,
      "loss": 0.1394,
      "step": 3390
    },
    {
      "epoch": 15.74074074074074,
      "grad_norm": 1.4889895915985107,
      "learning_rate": 3e-05,
      "loss": 0.1419,
      "step": 3400
    },
    {
      "epoch": 15.787037037037036,
      "grad_norm": 1.5108295679092407,
      "learning_rate": 3e-05,
      "loss": 0.1519,
      "step": 3410
    },
    {
      "epoch": 15.833333333333334,
      "grad_norm": 0.8050726652145386,
      "learning_rate": 3e-05,
      "loss": 0.1447,
      "step": 3420
    },
    {
      "epoch": 15.87962962962963,
      "grad_norm": 0.9805731177330017,
      "learning_rate": 3e-05,
      "loss": 0.1489,
      "step": 3430
    },
    {
      "epoch": 15.925925925925926,
      "grad_norm": 0.8113260865211487,
      "learning_rate": 3e-05,
      "loss": 0.1542,
      "step": 3440
    },
    {
      "epoch": 15.972222222222221,
      "grad_norm": 1.0790350437164307,
      "learning_rate": 3e-05,
      "loss": 0.1551,
      "step": 3450
    },
    {
      "epoch": 16.01851851851852,
      "grad_norm": 0.9748013615608215,
      "learning_rate": 3e-05,
      "loss": 0.1509,
      "step": 3460
    },
    {
      "epoch": 16.064814814814813,
      "grad_norm": 0.8632538914680481,
      "learning_rate": 3e-05,
      "loss": 0.1272,
      "step": 3470
    },
    {
      "epoch": 16.11111111111111,
      "grad_norm": 0.9647319912910461,
      "learning_rate": 3e-05,
      "loss": 0.1283,
      "step": 3480
    },
    {
      "epoch": 16.15740740740741,
      "grad_norm": 1.337175726890564,
      "learning_rate": 3e-05,
      "loss": 0.133,
      "step": 3490
    },
    {
      "epoch": 16.203703703703702,
      "grad_norm": 1.582036018371582,
      "learning_rate": 3e-05,
      "loss": 0.139,
      "step": 3500
    },
    {
      "epoch": 16.25,
      "grad_norm": 0.8627964854240417,
      "learning_rate": 3e-05,
      "loss": 0.1382,
      "step": 3510
    },
    {
      "epoch": 16.296296296296298,
      "grad_norm": 0.9686309099197388,
      "learning_rate": 3e-05,
      "loss": 0.132,
      "step": 3520
    },
    {
      "epoch": 16.34259259259259,
      "grad_norm": 1.2272250652313232,
      "learning_rate": 3e-05,
      "loss": 0.1503,
      "step": 3530
    },
    {
      "epoch": 16.38888888888889,
      "grad_norm": 1.213373064994812,
      "learning_rate": 3e-05,
      "loss": 0.1428,
      "step": 3540
    },
    {
      "epoch": 16.435185185185187,
      "grad_norm": 1.6020861864089966,
      "learning_rate": 3e-05,
      "loss": 0.1392,
      "step": 3550
    },
    {
      "epoch": 16.48148148148148,
      "grad_norm": 0.9201906323432922,
      "learning_rate": 3e-05,
      "loss": 0.1421,
      "step": 3560
    },
    {
      "epoch": 16.52777777777778,
      "grad_norm": 0.764667272567749,
      "learning_rate": 3e-05,
      "loss": 0.14,
      "step": 3570
    },
    {
      "epoch": 16.574074074074073,
      "grad_norm": 0.983479917049408,
      "learning_rate": 3e-05,
      "loss": 0.141,
      "step": 3580
    },
    {
      "epoch": 16.62037037037037,
      "grad_norm": 1.4715135097503662,
      "learning_rate": 3e-05,
      "loss": 0.1447,
      "step": 3590
    },
    {
      "epoch": 16.666666666666668,
      "grad_norm": 0.8298927545547485,
      "learning_rate": 3e-05,
      "loss": 0.1466,
      "step": 3600
    },
    {
      "epoch": 16.712962962962962,
      "grad_norm": 13.994155883789062,
      "learning_rate": 3e-05,
      "loss": 0.1519,
      "step": 3610
    },
    {
      "epoch": 16.75925925925926,
      "grad_norm": 0.9934417605400085,
      "learning_rate": 3e-05,
      "loss": 0.1476,
      "step": 3620
    },
    {
      "epoch": 16.805555555555557,
      "grad_norm": 0.7088779211044312,
      "learning_rate": 3e-05,
      "loss": 0.149,
      "step": 3630
    },
    {
      "epoch": 16.85185185185185,
      "grad_norm": 0.7401979565620422,
      "learning_rate": 3e-05,
      "loss": 0.1295,
      "step": 3640
    },
    {
      "epoch": 16.89814814814815,
      "grad_norm": 1.1984256505966187,
      "learning_rate": 3e-05,
      "loss": 0.1532,
      "step": 3650
    },
    {
      "epoch": 16.944444444444443,
      "grad_norm": 0.974200963973999,
      "learning_rate": 3e-05,
      "loss": 0.1461,
      "step": 3660
    },
    {
      "epoch": 16.99074074074074,
      "grad_norm": 0.9635884761810303,
      "learning_rate": 3e-05,
      "loss": 0.1406,
      "step": 3670
    },
    {
      "epoch": 17.037037037037038,
      "grad_norm": 1.3428035974502563,
      "learning_rate": 3e-05,
      "loss": 0.1376,
      "step": 3680
    },
    {
      "epoch": 17.083333333333332,
      "grad_norm": 5.439419269561768,
      "learning_rate": 3e-05,
      "loss": 0.1282,
      "step": 3690
    },
    {
      "epoch": 17.12962962962963,
      "grad_norm": 0.9634020924568176,
      "learning_rate": 3e-05,
      "loss": 0.1331,
      "step": 3700
    },
    {
      "epoch": 17.175925925925927,
      "grad_norm": 1.4761830568313599,
      "learning_rate": 3e-05,
      "loss": 0.1303,
      "step": 3710
    },
    {
      "epoch": 17.22222222222222,
      "grad_norm": 0.9295877814292908,
      "learning_rate": 3e-05,
      "loss": 0.1395,
      "step": 3720
    },
    {
      "epoch": 17.26851851851852,
      "grad_norm": 0.9482799172401428,
      "learning_rate": 3e-05,
      "loss": 0.1363,
      "step": 3730
    },
    {
      "epoch": 17.314814814814813,
      "grad_norm": 1.4431120157241821,
      "learning_rate": 3e-05,
      "loss": 0.1396,
      "step": 3740
    },
    {
      "epoch": 17.36111111111111,
      "grad_norm": 1.708990454673767,
      "learning_rate": 3e-05,
      "loss": 0.1376,
      "step": 3750
    },
    {
      "epoch": 17.40740740740741,
      "grad_norm": 1.1707541942596436,
      "learning_rate": 3e-05,
      "loss": 0.1482,
      "step": 3760
    },
    {
      "epoch": 17.453703703703702,
      "grad_norm": 1.2253369092941284,
      "learning_rate": 3e-05,
      "loss": 0.1424,
      "step": 3770
    },
    {
      "epoch": 17.5,
      "grad_norm": 1.2547239065170288,
      "learning_rate": 3e-05,
      "loss": 0.1426,
      "step": 3780
    },
    {
      "epoch": 17.546296296296298,
      "grad_norm": 0.9535191655158997,
      "learning_rate": 3e-05,
      "loss": 0.1467,
      "step": 3790
    },
    {
      "epoch": 17.59259259259259,
      "grad_norm": 1.2108802795410156,
      "learning_rate": 3e-05,
      "loss": 0.1417,
      "step": 3800
    },
    {
      "epoch": 17.63888888888889,
      "grad_norm": 0.9786614179611206,
      "learning_rate": 3e-05,
      "loss": 0.1341,
      "step": 3810
    },
    {
      "epoch": 17.685185185185187,
      "grad_norm": 1.202910304069519,
      "learning_rate": 3e-05,
      "loss": 0.1424,
      "step": 3820
    },
    {
      "epoch": 17.73148148148148,
      "grad_norm": 0.8020798563957214,
      "learning_rate": 3e-05,
      "loss": 0.1351,
      "step": 3830
    },
    {
      "epoch": 17.77777777777778,
      "grad_norm": 1.3408881425857544,
      "learning_rate": 3e-05,
      "loss": 0.1423,
      "step": 3840
    },
    {
      "epoch": 17.824074074074073,
      "grad_norm": 1.0835148096084595,
      "learning_rate": 3e-05,
      "loss": 0.1278,
      "step": 3850
    },
    {
      "epoch": 17.87037037037037,
      "grad_norm": 0.8985652327537537,
      "learning_rate": 3e-05,
      "loss": 0.1543,
      "step": 3860
    },
    {
      "epoch": 17.916666666666668,
      "grad_norm": 1.2214702367782593,
      "learning_rate": 3e-05,
      "loss": 0.1471,
      "step": 3870
    },
    {
      "epoch": 17.962962962962962,
      "grad_norm": 0.89570152759552,
      "learning_rate": 3e-05,
      "loss": 0.1403,
      "step": 3880
    },
    {
      "epoch": 18.00925925925926,
      "grad_norm": 1.1677643060684204,
      "learning_rate": 3e-05,
      "loss": 0.1481,
      "step": 3890
    },
    {
      "epoch": 18.055555555555557,
      "grad_norm": 1.2765920162200928,
      "learning_rate": 3e-05,
      "loss": 0.1281,
      "step": 3900
    },
    {
      "epoch": 18.10185185185185,
      "grad_norm": 1.1932995319366455,
      "learning_rate": 3e-05,
      "loss": 0.1319,
      "step": 3910
    },
    {
      "epoch": 18.14814814814815,
      "grad_norm": 0.8899739384651184,
      "learning_rate": 3e-05,
      "loss": 0.1315,
      "step": 3920
    },
    {
      "epoch": 18.194444444444443,
      "grad_norm": 0.9039879441261292,
      "learning_rate": 3e-05,
      "loss": 0.1367,
      "step": 3930
    },
    {
      "epoch": 18.24074074074074,
      "grad_norm": 1.6004914045333862,
      "learning_rate": 3e-05,
      "loss": 0.1373,
      "step": 3940
    },
    {
      "epoch": 18.287037037037038,
      "grad_norm": 0.8183736205101013,
      "learning_rate": 3e-05,
      "loss": 0.1353,
      "step": 3950
    },
    {
      "epoch": 18.333333333333332,
      "grad_norm": 0.9930373430252075,
      "learning_rate": 3e-05,
      "loss": 0.1485,
      "step": 3960
    },
    {
      "epoch": 18.37962962962963,
      "grad_norm": 1.204205870628357,
      "learning_rate": 3e-05,
      "loss": 0.1436,
      "step": 3970
    },
    {
      "epoch": 18.425925925925927,
      "grad_norm": 0.8955985903739929,
      "learning_rate": 3e-05,
      "loss": 0.1307,
      "step": 3980
    },
    {
      "epoch": 18.47222222222222,
      "grad_norm": 0.5730966329574585,
      "learning_rate": 3e-05,
      "loss": 0.1405,
      "step": 3990
    },
    {
      "epoch": 18.51851851851852,
      "grad_norm": 0.8282036781311035,
      "learning_rate": 3e-05,
      "loss": 0.1352,
      "step": 4000
    },
    {
      "epoch": 18.51851851851852,
      "eval_loss": 0.57914799451828,
      "eval_runtime": 89.2393,
      "eval_samples_per_second": 6.051,
      "eval_steps_per_second": 0.303,
      "step": 4000
    },
    {
      "epoch": 18.564814814814813,
      "grad_norm": 1.324410319328308,
      "learning_rate": 3e-05,
      "loss": 0.1455,
      "step": 4010
    },
    {
      "epoch": 18.61111111111111,
      "grad_norm": 1.626372218132019,
      "learning_rate": 3e-05,
      "loss": 0.1249,
      "step": 4020
    },
    {
      "epoch": 18.65740740740741,
      "grad_norm": 0.9152678847312927,
      "learning_rate": 3e-05,
      "loss": 0.1318,
      "step": 4030
    },
    {
      "epoch": 18.703703703703702,
      "grad_norm": 0.8717418909072876,
      "learning_rate": 3e-05,
      "loss": 0.1274,
      "step": 4040
    },
    {
      "epoch": 18.75,
      "grad_norm": 1.3345868587493896,
      "learning_rate": 3e-05,
      "loss": 0.1534,
      "step": 4050
    },
    {
      "epoch": 18.796296296296298,
      "grad_norm": 0.9496395587921143,
      "learning_rate": 3e-05,
|
"loss": 0.1384, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 18.84259259259259, |
|
"grad_norm": 1.159990906715393, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1428, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 18.88888888888889, |
|
"grad_norm": 0.9783802032470703, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1454, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 18.935185185185187, |
|
"grad_norm": 1.1276887655258179, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1342, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 18.98148148148148, |
|
"grad_norm": 1.6755024194717407, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1441, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 19.02777777777778, |
|
"grad_norm": 0.9260687828063965, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1236, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 19.074074074074073, |
|
"grad_norm": 1.0526010990142822, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1209, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 19.12037037037037, |
|
"grad_norm": 1.0216201543807983, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1217, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 19.166666666666668, |
|
"grad_norm": 1.3933924436569214, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1361, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 19.212962962962962, |
|
"grad_norm": 1.1197868585586548, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1342, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 19.25925925925926, |
|
"grad_norm": 0.781484842300415, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1331, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 19.305555555555557, |
|
"grad_norm": 1.4528825283050537, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1388, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 19.35185185185185, |
|
"grad_norm": 0.972441554069519, |
|
"learning_rate": 3e-05, |
|
"loss": 0.139, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 19.39814814814815, |
|
"grad_norm": 1.5952485799789429, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1388, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 19.444444444444443, |
|
"grad_norm": 1.5413490533828735, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1361, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 19.49074074074074, |
|
"grad_norm": 1.033690094947815, |
|
"learning_rate": 3e-05, |
|
"loss": 0.131, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 19.537037037037038, |
|
"grad_norm": 0.702571451663971, |
|
"learning_rate": 3e-05, |
|
"loss": 0.15, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 19.583333333333332, |
|
"grad_norm": 1.326290488243103, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1325, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 19.62962962962963, |
|
"grad_norm": 0.7209446430206299, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1346, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 19.675925925925927, |
|
"grad_norm": 0.8662400841712952, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1507, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 19.72222222222222, |
|
"grad_norm": 1.1025499105453491, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1498, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 19.76851851851852, |
|
"grad_norm": 1.2367596626281738, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1469, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 19.814814814814813, |
|
"grad_norm": 1.2120156288146973, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1384, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 19.86111111111111, |
|
"grad_norm": 1.5859310626983643, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1296, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 19.90740740740741, |
|
"grad_norm": 1.9393326044082642, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1517, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 19.953703703703702, |
|
"grad_norm": 0.6213334202766418, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1368, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 1.004947543144226, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1345, |
|
"step": 4320 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4320, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 65323465064448.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|