{
  "best_metric": 0.8745647668838501,
  "best_model_checkpoint": "./beans_outputs/checkpoint-1680",
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 1680,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.02976190476190476, "grad_norm": 3.170905113220215, "learning_rate": 1.9880952380952384e-05, "loss": 3.6048, "step": 10},
    {"epoch": 0.05952380952380952, "grad_norm": 2.51796555519104, "learning_rate": 1.9761904761904763e-05, "loss": 3.5551, "step": 20},
    {"epoch": 0.08928571428571429, "grad_norm": 2.868872880935669, "learning_rate": 1.9642857142857145e-05, "loss": 3.4868, "step": 30},
    {"epoch": 0.11904761904761904, "grad_norm": 3.2086801528930664, "learning_rate": 1.9523809523809524e-05, "loss": 3.4105, "step": 40},
    {"epoch": 0.1488095238095238, "grad_norm": 2.825397253036499, "learning_rate": 1.9404761904761906e-05, "loss": 3.3538, "step": 50},
    {"epoch": 0.17857142857142858, "grad_norm": 3.486938238143921, "learning_rate": 1.928571428571429e-05, "loss": 3.3201, "step": 60},
    {"epoch": 0.20833333333333334, "grad_norm": 2.802475929260254, "learning_rate": 1.916666666666667e-05, "loss": 3.2432, "step": 70},
    {"epoch": 0.23809523809523808, "grad_norm": 2.789459228515625, "learning_rate": 1.904761904761905e-05, "loss": 3.2041, "step": 80},
    {"epoch": 0.26785714285714285, "grad_norm": 3.008307933807373, "learning_rate": 1.892857142857143e-05, "loss": 3.1679, "step": 90},
    {"epoch": 0.2976190476190476, "grad_norm": 2.6487619876861572, "learning_rate": 1.880952380952381e-05, "loss": 3.1249, "step": 100},
    {"epoch": 0.3273809523809524, "grad_norm": 2.947179079055786, "learning_rate": 1.8690476190476193e-05, "loss": 3.0909, "step": 110},
    {"epoch": 0.35714285714285715, "grad_norm": 3.1243131160736084, "learning_rate": 1.8571428571428575e-05, "loss": 3.0953, "step": 120},
    {"epoch": 0.3869047619047619, "grad_norm": 2.9400837421417236, "learning_rate": 1.8452380952380954e-05, "loss": 2.9629, "step": 130},
    {"epoch": 0.4166666666666667, "grad_norm": 2.7061338424682617, "learning_rate": 1.8333333333333333e-05, "loss": 2.9307, "step": 140},
    {"epoch": 0.44642857142857145, "grad_norm": 2.6359243392944336, "learning_rate": 1.8214285714285715e-05, "loss": 2.8238, "step": 150},
    {"epoch": 0.47619047619047616, "grad_norm": 2.740408420562744, "learning_rate": 1.8095238095238097e-05, "loss": 2.8961, "step": 160},
    {"epoch": 0.5059523809523809, "grad_norm": 2.858968496322632, "learning_rate": 1.797619047619048e-05, "loss": 2.7505, "step": 170},
    {"epoch": 0.5357142857142857, "grad_norm": 2.7578256130218506, "learning_rate": 1.785714285714286e-05, "loss": 2.7989, "step": 180},
    {"epoch": 0.5654761904761905, "grad_norm": 2.9766931533813477, "learning_rate": 1.7738095238095237e-05, "loss": 2.6722, "step": 190},
    {"epoch": 0.5952380952380952, "grad_norm": 2.7900352478027344, "learning_rate": 1.761904761904762e-05, "loss": 2.7213, "step": 200},
    {"epoch": 0.625, "grad_norm": 3.004939556121826, "learning_rate": 1.7500000000000002e-05, "loss": 2.7287, "step": 210},
    {"epoch": 0.6547619047619048, "grad_norm": 2.7375917434692383, "learning_rate": 1.7380952380952384e-05, "loss": 2.6691, "step": 220},
    {"epoch": 0.6845238095238095, "grad_norm": 3.2530713081359863, "learning_rate": 1.7261904761904763e-05, "loss": 2.5742, "step": 230},
    {"epoch": 0.7142857142857143, "grad_norm": 3.0463545322418213, "learning_rate": 1.7142857142857142e-05, "loss": 2.4523, "step": 240},
    {"epoch": 0.7440476190476191, "grad_norm": 3.0471720695495605, "learning_rate": 1.7023809523809524e-05, "loss": 2.4592, "step": 250},
    {"epoch": 0.7738095238095238, "grad_norm": 3.4415907859802246, "learning_rate": 1.6904761904761906e-05, "loss": 2.4316, "step": 260},
    {"epoch": 0.8035714285714286, "grad_norm": 2.830673933029175, "learning_rate": 1.678571428571429e-05, "loss": 2.3903, "step": 270},
    {"epoch": 0.8333333333333334, "grad_norm": 3.584303617477417, "learning_rate": 1.6666666666666667e-05, "loss": 2.4643, "step": 280},
    {"epoch": 0.8630952380952381, "grad_norm": 3.9748589992523193, "learning_rate": 1.6547619047619046e-05, "loss": 2.3237, "step": 290},
    {"epoch": 0.8928571428571429, "grad_norm": 2.929922103881836, "learning_rate": 1.642857142857143e-05, "loss": 2.2639, "step": 300},
    {"epoch": 0.9226190476190477, "grad_norm": 4.647745132446289, "learning_rate": 1.630952380952381e-05, "loss": 2.4637, "step": 310},
    {"epoch": 0.9523809523809523, "grad_norm": 3.6543118953704834, "learning_rate": 1.6190476190476193e-05, "loss": 2.2519, "step": 320},
    {"epoch": 0.9821428571428571, "grad_norm": 3.3143322467803955, "learning_rate": 1.6071428571428572e-05, "loss": 2.1775, "step": 330},
    {"epoch": 1.0, "eval_accuracy": 0.7616033755274262, "eval_loss": 2.1820600032806396, "eval_runtime": 50.7645, "eval_samples_per_second": 9.337, "eval_steps_per_second": 1.182, "step": 336},
    {"epoch": 1.0119047619047619, "grad_norm": 3.666236639022827, "learning_rate": 1.5952380952380954e-05, "loss": 2.1187, "step": 340},
    {"epoch": 1.0416666666666667, "grad_norm": 3.736830472946167, "learning_rate": 1.5833333333333333e-05, "loss": 2.1312, "step": 350},
    {"epoch": 1.0714285714285714, "grad_norm": 3.002455711364746, "learning_rate": 1.5714285714285715e-05, "loss": 2.2274, "step": 360},
    {"epoch": 1.1011904761904763, "grad_norm": 3.2685108184814453, "learning_rate": 1.5595238095238098e-05, "loss": 2.1347, "step": 370},
    {"epoch": 1.130952380952381, "grad_norm": 3.4998621940612793, "learning_rate": 1.5476190476190476e-05, "loss": 2.0757, "step": 380},
    {"epoch": 1.1607142857142858, "grad_norm": 3.306267738342285, "learning_rate": 1.535714285714286e-05, "loss": 2.0177, "step": 390},
    {"epoch": 1.1904761904761905, "grad_norm": 3.8774032592773438, "learning_rate": 1.523809523809524e-05, "loss": 1.9748, "step": 400},
    {"epoch": 1.2202380952380953, "grad_norm": 2.662797212600708, "learning_rate": 1.511904761904762e-05, "loss": 1.9628, "step": 410},
    {"epoch": 1.25, "grad_norm": 3.9353742599487305, "learning_rate": 1.5000000000000002e-05, "loss": 2.0104, "step": 420},
    {"epoch": 1.2797619047619047, "grad_norm": 3.3460521697998047, "learning_rate": 1.4880952380952383e-05, "loss": 2.0678, "step": 430},
    {"epoch": 1.3095238095238095, "grad_norm": 3.0211353302001953, "learning_rate": 1.4761904761904763e-05, "loss": 2.0294, "step": 440},
    {"epoch": 1.3392857142857144, "grad_norm": 2.827756404876709, "learning_rate": 1.4642857142857144e-05, "loss": 1.9104, "step": 450},
    {"epoch": 1.369047619047619, "grad_norm": 2.606844663619995, "learning_rate": 1.4523809523809524e-05, "loss": 1.933, "step": 460},
    {"epoch": 1.3988095238095237, "grad_norm": 3.994950294494629, "learning_rate": 1.4404761904761907e-05, "loss": 1.9977, "step": 470},
    {"epoch": 1.4285714285714286, "grad_norm": 3.6433207988739014, "learning_rate": 1.4285714285714287e-05, "loss": 1.897, "step": 480},
    {"epoch": 1.4583333333333333, "grad_norm": 3.1899826526641846, "learning_rate": 1.416666666666667e-05, "loss": 1.9046, "step": 490},
    {"epoch": 1.4880952380952381, "grad_norm": 3.352928638458252, "learning_rate": 1.4047619047619048e-05, "loss": 1.7378, "step": 500},
    {"epoch": 1.5178571428571428, "grad_norm": 4.73577880859375, "learning_rate": 1.3928571428571429e-05, "loss": 1.7998, "step": 510},
    {"epoch": 1.5476190476190477, "grad_norm": 3.118739366531372, "learning_rate": 1.3809523809523811e-05, "loss": 1.7316, "step": 520},
    {"epoch": 1.5773809523809523, "grad_norm": 2.617877721786499, "learning_rate": 1.3690476190476192e-05, "loss": 1.6478, "step": 530},
    {"epoch": 1.6071428571428572, "grad_norm": 3.3894600868225098, "learning_rate": 1.3571428571428574e-05, "loss": 1.7311, "step": 540},
    {"epoch": 1.6369047619047619, "grad_norm": 4.088054656982422, "learning_rate": 1.3452380952380954e-05, "loss": 1.5008, "step": 550},
    {"epoch": 1.6666666666666665, "grad_norm": 3.2209737300872803, "learning_rate": 1.3333333333333333e-05, "loss": 1.6994, "step": 560},
    {"epoch": 1.6964285714285714, "grad_norm": 3.8286681175231934, "learning_rate": 1.3214285714285716e-05, "loss": 1.6879, "step": 570},
    {"epoch": 1.7261904761904763, "grad_norm": 2.611720561981201, "learning_rate": 1.3095238095238096e-05, "loss": 1.6061, "step": 580},
    {"epoch": 1.755952380952381, "grad_norm": 2.898097276687622, "learning_rate": 1.2976190476190478e-05, "loss": 1.5223, "step": 590},
    {"epoch": 1.7857142857142856, "grad_norm": 2.2522895336151123, "learning_rate": 1.2857142857142859e-05, "loss": 1.5095, "step": 600},
    {"epoch": 1.8154761904761905, "grad_norm": 3.5610804557800293, "learning_rate": 1.2738095238095238e-05, "loss": 1.6524, "step": 610},
    {"epoch": 1.8452380952380953, "grad_norm": 3.532130002975464, "learning_rate": 1.261904761904762e-05, "loss": 1.5345, "step": 620},
    {"epoch": 1.875, "grad_norm": 3.8648953437805176, "learning_rate": 1.25e-05, "loss": 1.691, "step": 630},
    {"epoch": 1.9047619047619047, "grad_norm": 2.4936046600341797, "learning_rate": 1.2380952380952383e-05, "loss": 1.4573, "step": 640},
    {"epoch": 1.9345238095238095, "grad_norm": 3.499699592590332, "learning_rate": 1.2261904761904763e-05, "loss": 1.5181, "step": 650},
    {"epoch": 1.9642857142857144, "grad_norm": 2.7815959453582764, "learning_rate": 1.2142857142857142e-05, "loss": 1.4333, "step": 660},
    {"epoch": 1.994047619047619, "grad_norm": 3.007183790206909, "learning_rate": 1.2023809523809525e-05, "loss": 1.4653, "step": 670},
    {"epoch": 2.0, "eval_accuracy": 0.8839662447257384, "eval_loss": 1.4698303937911987, "eval_runtime": 51.4369, "eval_samples_per_second": 9.215, "eval_steps_per_second": 1.166, "step": 672},
    {"epoch": 2.0238095238095237, "grad_norm": 3.4663267135620117, "learning_rate": 1.1904761904761905e-05, "loss": 1.4428, "step": 680},
    {"epoch": 2.0535714285714284, "grad_norm": 2.2934768199920654, "learning_rate": 1.1785714285714287e-05, "loss": 1.4135, "step": 690},
    {"epoch": 2.0833333333333335, "grad_norm": 2.601954221725464, "learning_rate": 1.1666666666666668e-05, "loss": 1.456, "step": 700},
    {"epoch": 2.113095238095238, "grad_norm": 3.2254600524902344, "learning_rate": 1.1547619047619047e-05, "loss": 1.5227, "step": 710},
    {"epoch": 2.142857142857143, "grad_norm": 3.2958316802978516, "learning_rate": 1.1428571428571429e-05, "loss": 1.4248, "step": 720},
    {"epoch": 2.1726190476190474, "grad_norm": 4.993536472320557, "learning_rate": 1.130952380952381e-05, "loss": 1.4717, "step": 730},
    {"epoch": 2.2023809523809526, "grad_norm": 3.3640084266662598, "learning_rate": 1.1190476190476192e-05, "loss": 1.4265, "step": 740},
    {"epoch": 2.232142857142857, "grad_norm": 2.6835250854492188, "learning_rate": 1.1071428571428572e-05, "loss": 1.408, "step": 750},
    {"epoch": 2.261904761904762, "grad_norm": 3.8518381118774414, "learning_rate": 1.0952380952380955e-05, "loss": 1.2666, "step": 760},
    {"epoch": 2.2916666666666665, "grad_norm": 3.553366184234619, "learning_rate": 1.0833333333333334e-05, "loss": 1.4052, "step": 770},
    {"epoch": 2.3214285714285716, "grad_norm": 2.657440423965454, "learning_rate": 1.0714285714285714e-05, "loss": 1.3953, "step": 780},
    {"epoch": 2.3511904761904763, "grad_norm": 4.050617694854736, "learning_rate": 1.0595238095238096e-05, "loss": 1.3073, "step": 790},
    {"epoch": 2.380952380952381, "grad_norm": 3.039287567138672, "learning_rate": 1.0476190476190477e-05, "loss": 1.3765, "step": 800},
    {"epoch": 2.4107142857142856, "grad_norm": 3.350076913833618, "learning_rate": 1.0357142857142859e-05, "loss": 1.2713, "step": 810},
    {"epoch": 2.4404761904761907, "grad_norm": 4.112967491149902, "learning_rate": 1.0238095238095238e-05, "loss": 1.3557, "step": 820},
    {"epoch": 2.4702380952380953, "grad_norm": 2.587895154953003, "learning_rate": 1.011904761904762e-05, "loss": 1.2106, "step": 830},
    {"epoch": 2.5, "grad_norm": 2.2189221382141113, "learning_rate": 1e-05, "loss": 1.1529, "step": 840},
    {"epoch": 2.5297619047619047, "grad_norm": 1.7763313055038452, "learning_rate": 9.880952380952381e-06, "loss": 1.2066, "step": 850},
    {"epoch": 2.5595238095238093, "grad_norm": 2.5652577877044678, "learning_rate": 9.761904761904762e-06, "loss": 1.2206, "step": 860},
    {"epoch": 2.5892857142857144, "grad_norm": 2.4081642627716064, "learning_rate": 9.642857142857144e-06, "loss": 1.2288, "step": 870},
    {"epoch": 2.619047619047619, "grad_norm": 3.4448933601379395, "learning_rate": 9.523809523809525e-06, "loss": 1.2764, "step": 880},
    {"epoch": 2.6488095238095237, "grad_norm": 3.65535044670105, "learning_rate": 9.404761904761905e-06, "loss": 1.1818, "step": 890},
    {"epoch": 2.678571428571429, "grad_norm": 2.902886152267456, "learning_rate": 9.285714285714288e-06, "loss": 1.2662, "step": 900},
    {"epoch": 2.7083333333333335, "grad_norm": 2.8251378536224365, "learning_rate": 9.166666666666666e-06, "loss": 1.1246, "step": 910},
    {"epoch": 2.738095238095238, "grad_norm": 2.1443264484405518, "learning_rate": 9.047619047619049e-06, "loss": 1.2486, "step": 920},
    {"epoch": 2.767857142857143, "grad_norm": 4.930934429168701, "learning_rate": 8.92857142857143e-06, "loss": 1.1865, "step": 930},
    {"epoch": 2.7976190476190474, "grad_norm": 3.2018985748291016, "learning_rate": 8.80952380952381e-06, "loss": 1.1047, "step": 940},
    {"epoch": 2.8273809523809526, "grad_norm": 3.2998268604278564, "learning_rate": 8.690476190476192e-06, "loss": 1.2098, "step": 950},
    {"epoch": 2.857142857142857, "grad_norm": 2.1316542625427246, "learning_rate": 8.571428571428571e-06, "loss": 1.0918, "step": 960},
    {"epoch": 2.886904761904762, "grad_norm": 3.8014087677001953, "learning_rate": 8.452380952380953e-06, "loss": 1.1139, "step": 970},
    {"epoch": 2.9166666666666665, "grad_norm": 2.8320999145507812, "learning_rate": 8.333333333333334e-06, "loss": 1.213, "step": 980},
    {"epoch": 2.946428571428571, "grad_norm": 3.016481876373291, "learning_rate": 8.214285714285714e-06, "loss": 1.1398, "step": 990},
    {"epoch": 2.9761904761904763, "grad_norm": 3.9006187915802, "learning_rate": 8.095238095238097e-06, "loss": 1.1052, "step": 1000},
    {"epoch": 3.0, "eval_accuracy": 0.930379746835443, "eval_loss": 1.0801581144332886, "eval_runtime": 51.0077, "eval_samples_per_second": 9.293, "eval_steps_per_second": 1.176, "step": 1008},
    {"epoch": 3.005952380952381, "grad_norm": 2.796464204788208, "learning_rate": 7.976190476190477e-06, "loss": 1.1341, "step": 1010},
    {"epoch": 3.0357142857142856, "grad_norm": 2.1846368312835693, "learning_rate": 7.857142857142858e-06, "loss": 1.173, "step": 1020},
    {"epoch": 3.0654761904761907, "grad_norm": 3.3909096717834473, "learning_rate": 7.738095238095238e-06, "loss": 1.0198, "step": 1030},
    {"epoch": 3.0952380952380953, "grad_norm": 3.5887138843536377, "learning_rate": 7.61904761904762e-06, "loss": 1.0729, "step": 1040},
    {"epoch": 3.125, "grad_norm": 2.7871737480163574, "learning_rate": 7.500000000000001e-06, "loss": 0.9676, "step": 1050},
    {"epoch": 3.1547619047619047, "grad_norm": 3.3368754386901855, "learning_rate": 7.380952380952382e-06, "loss": 0.9599, "step": 1060},
    {"epoch": 3.1845238095238093, "grad_norm": 3.748992919921875, "learning_rate": 7.261904761904762e-06, "loss": 1.1599, "step": 1070},
    {"epoch": 3.2142857142857144, "grad_norm": 4.470694065093994, "learning_rate": 7.1428571428571436e-06, "loss": 1.155, "step": 1080},
    {"epoch": 3.244047619047619, "grad_norm": 1.8315823078155518, "learning_rate": 7.023809523809524e-06, "loss": 0.979, "step": 1090},
    {"epoch": 3.2738095238095237, "grad_norm": 2.505209445953369, "learning_rate": 6.9047619047619055e-06, "loss": 1.142, "step": 1100},
    {"epoch": 3.3035714285714284, "grad_norm": 3.056353807449341, "learning_rate": 6.785714285714287e-06, "loss": 1.0072, "step": 1110},
    {"epoch": 3.3333333333333335, "grad_norm": 3.9302310943603516, "learning_rate": 6.666666666666667e-06, "loss": 1.0705, "step": 1120},
    {"epoch": 3.363095238095238, "grad_norm": 4.6520490646362305, "learning_rate": 6.547619047619048e-06, "loss": 1.0325, "step": 1130},
    {"epoch": 3.392857142857143, "grad_norm": 3.9381701946258545, "learning_rate": 6.4285714285714295e-06, "loss": 0.9674, "step": 1140},
    {"epoch": 3.4226190476190474, "grad_norm": 5.080965042114258, "learning_rate": 6.30952380952381e-06, "loss": 0.9812, "step": 1150},
    {"epoch": 3.4523809523809526, "grad_norm": 4.649317264556885, "learning_rate": 6.1904761904761914e-06, "loss": 1.1093, "step": 1160},
    {"epoch": 3.482142857142857, "grad_norm": 5.5956315994262695, "learning_rate": 6.071428571428571e-06, "loss": 1.0133, "step": 1170},
    {"epoch": 3.511904761904762, "grad_norm": 4.99602746963501, "learning_rate": 5.9523809523809525e-06, "loss": 1.075, "step": 1180},
    {"epoch": 3.5416666666666665, "grad_norm": 3.875300407409668, "learning_rate": 5.833333333333334e-06, "loss": 1.1469, "step": 1190},
    {"epoch": 3.571428571428571, "grad_norm": 2.9351279735565186, "learning_rate": 5.7142857142857145e-06, "loss": 1.1746, "step": 1200},
    {"epoch": 3.6011904761904763, "grad_norm": 3.581909418106079, "learning_rate": 5.595238095238096e-06, "loss": 1.0452, "step": 1210},
    {"epoch": 3.630952380952381, "grad_norm": 2.4383697509765625, "learning_rate": 5.476190476190477e-06, "loss": 0.884, "step": 1220},
    {"epoch": 3.6607142857142856, "grad_norm": 3.386600971221924, "learning_rate": 5.357142857142857e-06, "loss": 0.9479, "step": 1230},
    {"epoch": 3.6904761904761907, "grad_norm": 1.5890535116195679, "learning_rate": 5.2380952380952384e-06, "loss": 0.8953, "step": 1240},
    {"epoch": 3.7202380952380953, "grad_norm": 2.729491710662842, "learning_rate": 5.119047619047619e-06, "loss": 0.9071, "step": 1250},
    {"epoch": 3.75, "grad_norm": 4.265748977661133, "learning_rate": 5e-06, "loss": 1.0496, "step": 1260},
    {"epoch": 3.7797619047619047, "grad_norm": 3.6234512329101562, "learning_rate": 4.880952380952381e-06, "loss": 0.9945, "step": 1270},
    {"epoch": 3.8095238095238093, "grad_norm": 3.0296449661254883, "learning_rate": 4.761904761904762e-06, "loss": 1.0592, "step": 1280},
    {"epoch": 3.8392857142857144, "grad_norm": 3.7550673484802246, "learning_rate": 4.642857142857144e-06, "loss": 0.9102, "step": 1290},
    {"epoch": 3.869047619047619, "grad_norm": 2.3732712268829346, "learning_rate": 4.523809523809524e-06, "loss": 0.9721, "step": 1300},
    {"epoch": 3.8988095238095237, "grad_norm": 4.049142360687256, "learning_rate": 4.404761904761905e-06, "loss": 0.9409, "step": 1310},
    {"epoch": 3.928571428571429, "grad_norm": 2.1877949237823486, "learning_rate": 4.2857142857142855e-06, "loss": 1.0235, "step": 1320},
    {"epoch": 3.9583333333333335, "grad_norm": 1.8449411392211914, "learning_rate": 4.166666666666667e-06, "loss": 0.978, "step": 1330},
    {"epoch": 3.988095238095238, "grad_norm": 2.8841190338134766, "learning_rate": 4.047619047619048e-06, "loss": 1.0055, "step": 1340},
    {"epoch": 4.0, "eval_accuracy": 0.9493670886075949, "eval_loss": 0.9248189926147461, "eval_runtime": 51.0423, "eval_samples_per_second": 9.286, "eval_steps_per_second": 1.175, "step": 1344},
    {"epoch": 4.017857142857143, "grad_norm": 2.242076873779297, "learning_rate": 3.928571428571429e-06, "loss": 0.9244, "step": 1350},
    {"epoch": 4.0476190476190474, "grad_norm": 1.98090660572052, "learning_rate": 3.80952380952381e-06, "loss": 0.8568, "step": 1360},
    {"epoch": 4.0773809523809526, "grad_norm": 3.927706718444824, "learning_rate": 3.690476190476191e-06, "loss": 0.9644, "step": 1370},
    {"epoch": 4.107142857142857, "grad_norm": 2.3780994415283203, "learning_rate": 3.5714285714285718e-06, "loss": 0.97, "step": 1380},
    {"epoch": 4.136904761904762, "grad_norm": 2.21608304977417, "learning_rate": 3.4523809523809528e-06, "loss": 0.9728, "step": 1390},
    {"epoch": 4.166666666666667, "grad_norm": 6.764073848724365, "learning_rate": 3.3333333333333333e-06, "loss": 0.8729, "step": 1400},
    {"epoch": 4.196428571428571, "grad_norm": 1.5746071338653564, "learning_rate": 3.2142857142857147e-06, "loss": 0.7702, "step": 1410},
    {"epoch": 4.226190476190476, "grad_norm": 1.8241825103759766, "learning_rate": 3.0952380952380957e-06, "loss": 0.9121, "step": 1420},
    {"epoch": 4.255952380952381, "grad_norm": 3.9683926105499268, "learning_rate": 2.9761904761904763e-06, "loss": 0.8749, "step": 1430},
    {"epoch": 4.285714285714286, "grad_norm": 1.5732113122940063, "learning_rate": 2.8571428571428573e-06, "loss": 0.9421, "step": 1440},
    {"epoch": 4.315476190476191, "grad_norm": 2.5848405361175537, "learning_rate": 2.7380952380952387e-06, "loss": 0.9617, "step": 1450},
    {"epoch": 4.345238095238095, "grad_norm": 3.7017910480499268, "learning_rate": 2.6190476190476192e-06, "loss": 0.905, "step": 1460},
    {"epoch": 4.375, "grad_norm": 5.973739147186279, "learning_rate": 2.5e-06, "loss": 0.89, "step": 1470},
    {"epoch": 4.404761904761905, "grad_norm": 1.8716737031936646, "learning_rate": 2.380952380952381e-06, "loss": 0.9635, "step": 1480},
    {"epoch": 4.434523809523809, "grad_norm": 3.3029792308807373, "learning_rate": 2.261904761904762e-06, "loss": 0.933, "step": 1490},
    {"epoch": 4.464285714285714, "grad_norm": 2.5819740295410156, "learning_rate": 2.1428571428571427e-06, "loss": 0.8899, "step": 1500},
    {"epoch": 4.494047619047619, "grad_norm": 3.5635058879852295, "learning_rate": 2.023809523809524e-06, "loss": 0.8539, "step": 1510},
    {"epoch": 4.523809523809524, "grad_norm": 2.5672874450683594, "learning_rate": 1.904761904761905e-06, "loss": 1.0972, "step": 1520},
    {"epoch": 4.553571428571429, "grad_norm": 5.11098051071167, "learning_rate": 1.7857142857142859e-06, "loss": 0.9862, "step": 1530},
    {"epoch": 4.583333333333333, "grad_norm": 2.5244972705841064, "learning_rate": 1.6666666666666667e-06, "loss": 1.0213, "step": 1540},
    {"epoch": 4.613095238095238, "grad_norm": 3.5044398307800293, "learning_rate": 1.5476190476190479e-06, "loss": 0.9144, "step": 1550},
    {"epoch": 4.642857142857143, "grad_norm": 2.4903435707092285, "learning_rate": 1.4285714285714286e-06, "loss": 0.9331, "step": 1560},
    {"epoch": 4.6726190476190474, "grad_norm": 3.208696126937866, "learning_rate": 1.3095238095238096e-06, "loss": 1.013, "step": 1570},
    {"epoch": 4.7023809523809526, "grad_norm": 2.255563735961914, "learning_rate": 1.1904761904761906e-06, "loss": 0.7625, "step": 1580},
    {"epoch": 4.732142857142857, "grad_norm": 2.1157748699188232, "learning_rate": 1.0714285714285714e-06, "loss": 0.8885, "step": 1590},
    {"epoch": 4.761904761904762, "grad_norm": 3.0076255798339844, "learning_rate": 9.523809523809525e-07, "loss": 1.0166, "step": 1600},
    {"epoch": 4.791666666666667, "grad_norm": 2.899481773376465, "learning_rate": 8.333333333333333e-07, "loss": 0.9983, "step": 1610},
    {"epoch": 4.821428571428571, "grad_norm": 6.084941387176514, "learning_rate": 7.142857142857143e-07, "loss": 1.1526, "step": 1620},
    {"epoch": 4.851190476190476, "grad_norm": 3.8710179328918457, "learning_rate": 5.952380952380953e-07, "loss": 0.8589, "step": 1630},
    {"epoch": 4.880952380952381, "grad_norm": 2.1053106784820557, "learning_rate": 4.7619047619047623e-07, "loss": 0.8788, "step": 1640},
    {"epoch": 4.910714285714286, "grad_norm": 2.2121217250823975, "learning_rate": 3.5714285714285716e-07, "loss": 0.8718, "step": 1650},
    {"epoch": 4.940476190476191, "grad_norm": 2.3137481212615967, "learning_rate": 2.3809523809523811e-07, "loss": 0.7878, "step": 1660},
    {"epoch": 4.970238095238095, "grad_norm": 2.676529884338379, "learning_rate": 1.1904761904761906e-07, "loss": 0.7782, "step": 1670},
    {"epoch": 5.0, "grad_norm": 7.775545597076416, "learning_rate": 0.0, "loss": 0.7847, "step": 1680},
    {"epoch": 5.0, "eval_accuracy": 0.9514767932489452, "eval_loss": 0.8745647668838501, "eval_runtime": 50.8678, "eval_samples_per_second": 9.318, "eval_steps_per_second": 1.18, "step": 1680},
    {"epoch": 5.0, "step": 1680, "total_flos": 1.0410532148820787e+18, "train_loss": 1.5688391100792658, "train_runtime": 1801.1044, "train_samples_per_second": 7.457, "train_steps_per_second": 0.933}
  ],
  "logging_steps": 10,
  "max_steps": 1680,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0410532148820787e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}