{
"best_metric": 0.8745647668838501,
"best_model_checkpoint": "./beans_outputs/checkpoint-1680",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 1680,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02976190476190476,
"grad_norm": 3.170905113220215,
"learning_rate": 1.9880952380952384e-05,
"loss": 3.6048,
"step": 10
},
{
"epoch": 0.05952380952380952,
"grad_norm": 2.51796555519104,
"learning_rate": 1.9761904761904763e-05,
"loss": 3.5551,
"step": 20
},
{
"epoch": 0.08928571428571429,
"grad_norm": 2.868872880935669,
"learning_rate": 1.9642857142857145e-05,
"loss": 3.4868,
"step": 30
},
{
"epoch": 0.11904761904761904,
"grad_norm": 3.2086801528930664,
"learning_rate": 1.9523809523809524e-05,
"loss": 3.4105,
"step": 40
},
{
"epoch": 0.1488095238095238,
"grad_norm": 2.825397253036499,
"learning_rate": 1.9404761904761906e-05,
"loss": 3.3538,
"step": 50
},
{
"epoch": 0.17857142857142858,
"grad_norm": 3.486938238143921,
"learning_rate": 1.928571428571429e-05,
"loss": 3.3201,
"step": 60
},
{
"epoch": 0.20833333333333334,
"grad_norm": 2.802475929260254,
"learning_rate": 1.916666666666667e-05,
"loss": 3.2432,
"step": 70
},
{
"epoch": 0.23809523809523808,
"grad_norm": 2.789459228515625,
"learning_rate": 1.904761904761905e-05,
"loss": 3.2041,
"step": 80
},
{
"epoch": 0.26785714285714285,
"grad_norm": 3.008307933807373,
"learning_rate": 1.892857142857143e-05,
"loss": 3.1679,
"step": 90
},
{
"epoch": 0.2976190476190476,
"grad_norm": 2.6487619876861572,
"learning_rate": 1.880952380952381e-05,
"loss": 3.1249,
"step": 100
},
{
"epoch": 0.3273809523809524,
"grad_norm": 2.947179079055786,
"learning_rate": 1.8690476190476193e-05,
"loss": 3.0909,
"step": 110
},
{
"epoch": 0.35714285714285715,
"grad_norm": 3.1243131160736084,
"learning_rate": 1.8571428571428575e-05,
"loss": 3.0953,
"step": 120
},
{
"epoch": 0.3869047619047619,
"grad_norm": 2.9400837421417236,
"learning_rate": 1.8452380952380954e-05,
"loss": 2.9629,
"step": 130
},
{
"epoch": 0.4166666666666667,
"grad_norm": 2.7061338424682617,
"learning_rate": 1.8333333333333333e-05,
"loss": 2.9307,
"step": 140
},
{
"epoch": 0.44642857142857145,
"grad_norm": 2.6359243392944336,
"learning_rate": 1.8214285714285715e-05,
"loss": 2.8238,
"step": 150
},
{
"epoch": 0.47619047619047616,
"grad_norm": 2.740408420562744,
"learning_rate": 1.8095238095238097e-05,
"loss": 2.8961,
"step": 160
},
{
"epoch": 0.5059523809523809,
"grad_norm": 2.858968496322632,
"learning_rate": 1.797619047619048e-05,
"loss": 2.7505,
"step": 170
},
{
"epoch": 0.5357142857142857,
"grad_norm": 2.7578256130218506,
"learning_rate": 1.785714285714286e-05,
"loss": 2.7989,
"step": 180
},
{
"epoch": 0.5654761904761905,
"grad_norm": 2.9766931533813477,
"learning_rate": 1.7738095238095237e-05,
"loss": 2.6722,
"step": 190
},
{
"epoch": 0.5952380952380952,
"grad_norm": 2.7900352478027344,
"learning_rate": 1.761904761904762e-05,
"loss": 2.7213,
"step": 200
},
{
"epoch": 0.625,
"grad_norm": 3.004939556121826,
"learning_rate": 1.7500000000000002e-05,
"loss": 2.7287,
"step": 210
},
{
"epoch": 0.6547619047619048,
"grad_norm": 2.7375917434692383,
"learning_rate": 1.7380952380952384e-05,
"loss": 2.6691,
"step": 220
},
{
"epoch": 0.6845238095238095,
"grad_norm": 3.2530713081359863,
"learning_rate": 1.7261904761904763e-05,
"loss": 2.5742,
"step": 230
},
{
"epoch": 0.7142857142857143,
"grad_norm": 3.0463545322418213,
"learning_rate": 1.7142857142857142e-05,
"loss": 2.4523,
"step": 240
},
{
"epoch": 0.7440476190476191,
"grad_norm": 3.0471720695495605,
"learning_rate": 1.7023809523809524e-05,
"loss": 2.4592,
"step": 250
},
{
"epoch": 0.7738095238095238,
"grad_norm": 3.4415907859802246,
"learning_rate": 1.6904761904761906e-05,
"loss": 2.4316,
"step": 260
},
{
"epoch": 0.8035714285714286,
"grad_norm": 2.830673933029175,
"learning_rate": 1.678571428571429e-05,
"loss": 2.3903,
"step": 270
},
{
"epoch": 0.8333333333333334,
"grad_norm": 3.584303617477417,
"learning_rate": 1.6666666666666667e-05,
"loss": 2.4643,
"step": 280
},
{
"epoch": 0.8630952380952381,
"grad_norm": 3.9748589992523193,
"learning_rate": 1.6547619047619046e-05,
"loss": 2.3237,
"step": 290
},
{
"epoch": 0.8928571428571429,
"grad_norm": 2.929922103881836,
"learning_rate": 1.642857142857143e-05,
"loss": 2.2639,
"step": 300
},
{
"epoch": 0.9226190476190477,
"grad_norm": 4.647745132446289,
"learning_rate": 1.630952380952381e-05,
"loss": 2.4637,
"step": 310
},
{
"epoch": 0.9523809523809523,
"grad_norm": 3.6543118953704834,
"learning_rate": 1.6190476190476193e-05,
"loss": 2.2519,
"step": 320
},
{
"epoch": 0.9821428571428571,
"grad_norm": 3.3143322467803955,
"learning_rate": 1.6071428571428572e-05,
"loss": 2.1775,
"step": 330
},
{
"epoch": 1.0,
"eval_accuracy": 0.7616033755274262,
"eval_loss": 2.1820600032806396,
"eval_runtime": 50.7645,
"eval_samples_per_second": 9.337,
"eval_steps_per_second": 1.182,
"step": 336
},
{
"epoch": 1.0119047619047619,
"grad_norm": 3.666236639022827,
"learning_rate": 1.5952380952380954e-05,
"loss": 2.1187,
"step": 340
},
{
"epoch": 1.0416666666666667,
"grad_norm": 3.736830472946167,
"learning_rate": 1.5833333333333333e-05,
"loss": 2.1312,
"step": 350
},
{
"epoch": 1.0714285714285714,
"grad_norm": 3.002455711364746,
"learning_rate": 1.5714285714285715e-05,
"loss": 2.2274,
"step": 360
},
{
"epoch": 1.1011904761904763,
"grad_norm": 3.2685108184814453,
"learning_rate": 1.5595238095238098e-05,
"loss": 2.1347,
"step": 370
},
{
"epoch": 1.130952380952381,
"grad_norm": 3.4998621940612793,
"learning_rate": 1.5476190476190476e-05,
"loss": 2.0757,
"step": 380
},
{
"epoch": 1.1607142857142858,
"grad_norm": 3.306267738342285,
"learning_rate": 1.535714285714286e-05,
"loss": 2.0177,
"step": 390
},
{
"epoch": 1.1904761904761905,
"grad_norm": 3.8774032592773438,
"learning_rate": 1.523809523809524e-05,
"loss": 1.9748,
"step": 400
},
{
"epoch": 1.2202380952380953,
"grad_norm": 2.662797212600708,
"learning_rate": 1.511904761904762e-05,
"loss": 1.9628,
"step": 410
},
{
"epoch": 1.25,
"grad_norm": 3.9353742599487305,
"learning_rate": 1.5000000000000002e-05,
"loss": 2.0104,
"step": 420
},
{
"epoch": 1.2797619047619047,
"grad_norm": 3.3460521697998047,
"learning_rate": 1.4880952380952383e-05,
"loss": 2.0678,
"step": 430
},
{
"epoch": 1.3095238095238095,
"grad_norm": 3.0211353302001953,
"learning_rate": 1.4761904761904763e-05,
"loss": 2.0294,
"step": 440
},
{
"epoch": 1.3392857142857144,
"grad_norm": 2.827756404876709,
"learning_rate": 1.4642857142857144e-05,
"loss": 1.9104,
"step": 450
},
{
"epoch": 1.369047619047619,
"grad_norm": 2.606844663619995,
"learning_rate": 1.4523809523809524e-05,
"loss": 1.933,
"step": 460
},
{
"epoch": 1.3988095238095237,
"grad_norm": 3.994950294494629,
"learning_rate": 1.4404761904761907e-05,
"loss": 1.9977,
"step": 470
},
{
"epoch": 1.4285714285714286,
"grad_norm": 3.6433207988739014,
"learning_rate": 1.4285714285714287e-05,
"loss": 1.897,
"step": 480
},
{
"epoch": 1.4583333333333333,
"grad_norm": 3.1899826526641846,
"learning_rate": 1.416666666666667e-05,
"loss": 1.9046,
"step": 490
},
{
"epoch": 1.4880952380952381,
"grad_norm": 3.352928638458252,
"learning_rate": 1.4047619047619048e-05,
"loss": 1.7378,
"step": 500
},
{
"epoch": 1.5178571428571428,
"grad_norm": 4.73577880859375,
"learning_rate": 1.3928571428571429e-05,
"loss": 1.7998,
"step": 510
},
{
"epoch": 1.5476190476190477,
"grad_norm": 3.118739366531372,
"learning_rate": 1.3809523809523811e-05,
"loss": 1.7316,
"step": 520
},
{
"epoch": 1.5773809523809523,
"grad_norm": 2.617877721786499,
"learning_rate": 1.3690476190476192e-05,
"loss": 1.6478,
"step": 530
},
{
"epoch": 1.6071428571428572,
"grad_norm": 3.3894600868225098,
"learning_rate": 1.3571428571428574e-05,
"loss": 1.7311,
"step": 540
},
{
"epoch": 1.6369047619047619,
"grad_norm": 4.088054656982422,
"learning_rate": 1.3452380952380954e-05,
"loss": 1.5008,
"step": 550
},
{
"epoch": 1.6666666666666665,
"grad_norm": 3.2209737300872803,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.6994,
"step": 560
},
{
"epoch": 1.6964285714285714,
"grad_norm": 3.8286681175231934,
"learning_rate": 1.3214285714285716e-05,
"loss": 1.6879,
"step": 570
},
{
"epoch": 1.7261904761904763,
"grad_norm": 2.611720561981201,
"learning_rate": 1.3095238095238096e-05,
"loss": 1.6061,
"step": 580
},
{
"epoch": 1.755952380952381,
"grad_norm": 2.898097276687622,
"learning_rate": 1.2976190476190478e-05,
"loss": 1.5223,
"step": 590
},
{
"epoch": 1.7857142857142856,
"grad_norm": 2.2522895336151123,
"learning_rate": 1.2857142857142859e-05,
"loss": 1.5095,
"step": 600
},
{
"epoch": 1.8154761904761905,
"grad_norm": 3.5610804557800293,
"learning_rate": 1.2738095238095238e-05,
"loss": 1.6524,
"step": 610
},
{
"epoch": 1.8452380952380953,
"grad_norm": 3.532130002975464,
"learning_rate": 1.261904761904762e-05,
"loss": 1.5345,
"step": 620
},
{
"epoch": 1.875,
"grad_norm": 3.8648953437805176,
"learning_rate": 1.25e-05,
"loss": 1.691,
"step": 630
},
{
"epoch": 1.9047619047619047,
"grad_norm": 2.4936046600341797,
"learning_rate": 1.2380952380952383e-05,
"loss": 1.4573,
"step": 640
},
{
"epoch": 1.9345238095238095,
"grad_norm": 3.499699592590332,
"learning_rate": 1.2261904761904763e-05,
"loss": 1.5181,
"step": 650
},
{
"epoch": 1.9642857142857144,
"grad_norm": 2.7815959453582764,
"learning_rate": 1.2142857142857142e-05,
"loss": 1.4333,
"step": 660
},
{
"epoch": 1.994047619047619,
"grad_norm": 3.007183790206909,
"learning_rate": 1.2023809523809525e-05,
"loss": 1.4653,
"step": 670
},
{
"epoch": 2.0,
"eval_accuracy": 0.8839662447257384,
"eval_loss": 1.4698303937911987,
"eval_runtime": 51.4369,
"eval_samples_per_second": 9.215,
"eval_steps_per_second": 1.166,
"step": 672
},
{
"epoch": 2.0238095238095237,
"grad_norm": 3.4663267135620117,
"learning_rate": 1.1904761904761905e-05,
"loss": 1.4428,
"step": 680
},
{
"epoch": 2.0535714285714284,
"grad_norm": 2.2934768199920654,
"learning_rate": 1.1785714285714287e-05,
"loss": 1.4135,
"step": 690
},
{
"epoch": 2.0833333333333335,
"grad_norm": 2.601954221725464,
"learning_rate": 1.1666666666666668e-05,
"loss": 1.456,
"step": 700
},
{
"epoch": 2.113095238095238,
"grad_norm": 3.2254600524902344,
"learning_rate": 1.1547619047619047e-05,
"loss": 1.5227,
"step": 710
},
{
"epoch": 2.142857142857143,
"grad_norm": 3.2958316802978516,
"learning_rate": 1.1428571428571429e-05,
"loss": 1.4248,
"step": 720
},
{
"epoch": 2.1726190476190474,
"grad_norm": 4.993536472320557,
"learning_rate": 1.130952380952381e-05,
"loss": 1.4717,
"step": 730
},
{
"epoch": 2.2023809523809526,
"grad_norm": 3.3640084266662598,
"learning_rate": 1.1190476190476192e-05,
"loss": 1.4265,
"step": 740
},
{
"epoch": 2.232142857142857,
"grad_norm": 2.6835250854492188,
"learning_rate": 1.1071428571428572e-05,
"loss": 1.408,
"step": 750
},
{
"epoch": 2.261904761904762,
"grad_norm": 3.8518381118774414,
"learning_rate": 1.0952380952380955e-05,
"loss": 1.2666,
"step": 760
},
{
"epoch": 2.2916666666666665,
"grad_norm": 3.553366184234619,
"learning_rate": 1.0833333333333334e-05,
"loss": 1.4052,
"step": 770
},
{
"epoch": 2.3214285714285716,
"grad_norm": 2.657440423965454,
"learning_rate": 1.0714285714285714e-05,
"loss": 1.3953,
"step": 780
},
{
"epoch": 2.3511904761904763,
"grad_norm": 4.050617694854736,
"learning_rate": 1.0595238095238096e-05,
"loss": 1.3073,
"step": 790
},
{
"epoch": 2.380952380952381,
"grad_norm": 3.039287567138672,
"learning_rate": 1.0476190476190477e-05,
"loss": 1.3765,
"step": 800
},
{
"epoch": 2.4107142857142856,
"grad_norm": 3.350076913833618,
"learning_rate": 1.0357142857142859e-05,
"loss": 1.2713,
"step": 810
},
{
"epoch": 2.4404761904761907,
"grad_norm": 4.112967491149902,
"learning_rate": 1.0238095238095238e-05,
"loss": 1.3557,
"step": 820
},
{
"epoch": 2.4702380952380953,
"grad_norm": 2.587895154953003,
"learning_rate": 1.011904761904762e-05,
"loss": 1.2106,
"step": 830
},
{
"epoch": 2.5,
"grad_norm": 2.2189221382141113,
"learning_rate": 1e-05,
"loss": 1.1529,
"step": 840
},
{
"epoch": 2.5297619047619047,
"grad_norm": 1.7763313055038452,
"learning_rate": 9.880952380952381e-06,
"loss": 1.2066,
"step": 850
},
{
"epoch": 2.5595238095238093,
"grad_norm": 2.5652577877044678,
"learning_rate": 9.761904761904762e-06,
"loss": 1.2206,
"step": 860
},
{
"epoch": 2.5892857142857144,
"grad_norm": 2.4081642627716064,
"learning_rate": 9.642857142857144e-06,
"loss": 1.2288,
"step": 870
},
{
"epoch": 2.619047619047619,
"grad_norm": 3.4448933601379395,
"learning_rate": 9.523809523809525e-06,
"loss": 1.2764,
"step": 880
},
{
"epoch": 2.6488095238095237,
"grad_norm": 3.65535044670105,
"learning_rate": 9.404761904761905e-06,
"loss": 1.1818,
"step": 890
},
{
"epoch": 2.678571428571429,
"grad_norm": 2.902886152267456,
"learning_rate": 9.285714285714288e-06,
"loss": 1.2662,
"step": 900
},
{
"epoch": 2.7083333333333335,
"grad_norm": 2.8251378536224365,
"learning_rate": 9.166666666666666e-06,
"loss": 1.1246,
"step": 910
},
{
"epoch": 2.738095238095238,
"grad_norm": 2.1443264484405518,
"learning_rate": 9.047619047619049e-06,
"loss": 1.2486,
"step": 920
},
{
"epoch": 2.767857142857143,
"grad_norm": 4.930934429168701,
"learning_rate": 8.92857142857143e-06,
"loss": 1.1865,
"step": 930
},
{
"epoch": 2.7976190476190474,
"grad_norm": 3.2018985748291016,
"learning_rate": 8.80952380952381e-06,
"loss": 1.1047,
"step": 940
},
{
"epoch": 2.8273809523809526,
"grad_norm": 3.2998268604278564,
"learning_rate": 8.690476190476192e-06,
"loss": 1.2098,
"step": 950
},
{
"epoch": 2.857142857142857,
"grad_norm": 2.1316542625427246,
"learning_rate": 8.571428571428571e-06,
"loss": 1.0918,
"step": 960
},
{
"epoch": 2.886904761904762,
"grad_norm": 3.8014087677001953,
"learning_rate": 8.452380952380953e-06,
"loss": 1.1139,
"step": 970
},
{
"epoch": 2.9166666666666665,
"grad_norm": 2.8320999145507812,
"learning_rate": 8.333333333333334e-06,
"loss": 1.213,
"step": 980
},
{
"epoch": 2.946428571428571,
"grad_norm": 3.016481876373291,
"learning_rate": 8.214285714285714e-06,
"loss": 1.1398,
"step": 990
},
{
"epoch": 2.9761904761904763,
"grad_norm": 3.9006187915802,
"learning_rate": 8.095238095238097e-06,
"loss": 1.1052,
"step": 1000
},
{
"epoch": 3.0,
"eval_accuracy": 0.930379746835443,
"eval_loss": 1.0801581144332886,
"eval_runtime": 51.0077,
"eval_samples_per_second": 9.293,
"eval_steps_per_second": 1.176,
"step": 1008
},
{
"epoch": 3.005952380952381,
"grad_norm": 2.796464204788208,
"learning_rate": 7.976190476190477e-06,
"loss": 1.1341,
"step": 1010
},
{
"epoch": 3.0357142857142856,
"grad_norm": 2.1846368312835693,
"learning_rate": 7.857142857142858e-06,
"loss": 1.173,
"step": 1020
},
{
"epoch": 3.0654761904761907,
"grad_norm": 3.3909096717834473,
"learning_rate": 7.738095238095238e-06,
"loss": 1.0198,
"step": 1030
},
{
"epoch": 3.0952380952380953,
"grad_norm": 3.5887138843536377,
"learning_rate": 7.61904761904762e-06,
"loss": 1.0729,
"step": 1040
},
{
"epoch": 3.125,
"grad_norm": 2.7871737480163574,
"learning_rate": 7.500000000000001e-06,
"loss": 0.9676,
"step": 1050
},
{
"epoch": 3.1547619047619047,
"grad_norm": 3.3368754386901855,
"learning_rate": 7.380952380952382e-06,
"loss": 0.9599,
"step": 1060
},
{
"epoch": 3.1845238095238093,
"grad_norm": 3.748992919921875,
"learning_rate": 7.261904761904762e-06,
"loss": 1.1599,
"step": 1070
},
{
"epoch": 3.2142857142857144,
"grad_norm": 4.470694065093994,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.155,
"step": 1080
},
{
"epoch": 3.244047619047619,
"grad_norm": 1.8315823078155518,
"learning_rate": 7.023809523809524e-06,
"loss": 0.979,
"step": 1090
},
{
"epoch": 3.2738095238095237,
"grad_norm": 2.505209445953369,
"learning_rate": 6.9047619047619055e-06,
"loss": 1.142,
"step": 1100
},
{
"epoch": 3.3035714285714284,
"grad_norm": 3.056353807449341,
"learning_rate": 6.785714285714287e-06,
"loss": 1.0072,
"step": 1110
},
{
"epoch": 3.3333333333333335,
"grad_norm": 3.9302310943603516,
"learning_rate": 6.666666666666667e-06,
"loss": 1.0705,
"step": 1120
},
{
"epoch": 3.363095238095238,
"grad_norm": 4.6520490646362305,
"learning_rate": 6.547619047619048e-06,
"loss": 1.0325,
"step": 1130
},
{
"epoch": 3.392857142857143,
"grad_norm": 3.9381701946258545,
"learning_rate": 6.4285714285714295e-06,
"loss": 0.9674,
"step": 1140
},
{
"epoch": 3.4226190476190474,
"grad_norm": 5.080965042114258,
"learning_rate": 6.30952380952381e-06,
"loss": 0.9812,
"step": 1150
},
{
"epoch": 3.4523809523809526,
"grad_norm": 4.649317264556885,
"learning_rate": 6.1904761904761914e-06,
"loss": 1.1093,
"step": 1160
},
{
"epoch": 3.482142857142857,
"grad_norm": 5.5956315994262695,
"learning_rate": 6.071428571428571e-06,
"loss": 1.0133,
"step": 1170
},
{
"epoch": 3.511904761904762,
"grad_norm": 4.99602746963501,
"learning_rate": 5.9523809523809525e-06,
"loss": 1.075,
"step": 1180
},
{
"epoch": 3.5416666666666665,
"grad_norm": 3.875300407409668,
"learning_rate": 5.833333333333334e-06,
"loss": 1.1469,
"step": 1190
},
{
"epoch": 3.571428571428571,
"grad_norm": 2.9351279735565186,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.1746,
"step": 1200
},
{
"epoch": 3.6011904761904763,
"grad_norm": 3.581909418106079,
"learning_rate": 5.595238095238096e-06,
"loss": 1.0452,
"step": 1210
},
{
"epoch": 3.630952380952381,
"grad_norm": 2.4383697509765625,
"learning_rate": 5.476190476190477e-06,
"loss": 0.884,
"step": 1220
},
{
"epoch": 3.6607142857142856,
"grad_norm": 3.386600971221924,
"learning_rate": 5.357142857142857e-06,
"loss": 0.9479,
"step": 1230
},
{
"epoch": 3.6904761904761907,
"grad_norm": 1.5890535116195679,
"learning_rate": 5.2380952380952384e-06,
"loss": 0.8953,
"step": 1240
},
{
"epoch": 3.7202380952380953,
"grad_norm": 2.729491710662842,
"learning_rate": 5.119047619047619e-06,
"loss": 0.9071,
"step": 1250
},
{
"epoch": 3.75,
"grad_norm": 4.265748977661133,
"learning_rate": 5e-06,
"loss": 1.0496,
"step": 1260
},
{
"epoch": 3.7797619047619047,
"grad_norm": 3.6234512329101562,
"learning_rate": 4.880952380952381e-06,
"loss": 0.9945,
"step": 1270
},
{
"epoch": 3.8095238095238093,
"grad_norm": 3.0296449661254883,
"learning_rate": 4.761904761904762e-06,
"loss": 1.0592,
"step": 1280
},
{
"epoch": 3.8392857142857144,
"grad_norm": 3.7550673484802246,
"learning_rate": 4.642857142857144e-06,
"loss": 0.9102,
"step": 1290
},
{
"epoch": 3.869047619047619,
"grad_norm": 2.3732712268829346,
"learning_rate": 4.523809523809524e-06,
"loss": 0.9721,
"step": 1300
},
{
"epoch": 3.8988095238095237,
"grad_norm": 4.049142360687256,
"learning_rate": 4.404761904761905e-06,
"loss": 0.9409,
"step": 1310
},
{
"epoch": 3.928571428571429,
"grad_norm": 2.1877949237823486,
"learning_rate": 4.2857142857142855e-06,
"loss": 1.0235,
"step": 1320
},
{
"epoch": 3.9583333333333335,
"grad_norm": 1.8449411392211914,
"learning_rate": 4.166666666666667e-06,
"loss": 0.978,
"step": 1330
},
{
"epoch": 3.988095238095238,
"grad_norm": 2.8841190338134766,
"learning_rate": 4.047619047619048e-06,
"loss": 1.0055,
"step": 1340
},
{
"epoch": 4.0,
"eval_accuracy": 0.9493670886075949,
"eval_loss": 0.9248189926147461,
"eval_runtime": 51.0423,
"eval_samples_per_second": 9.286,
"eval_steps_per_second": 1.175,
"step": 1344
},
{
"epoch": 4.017857142857143,
"grad_norm": 2.242076873779297,
"learning_rate": 3.928571428571429e-06,
"loss": 0.9244,
"step": 1350
},
{
"epoch": 4.0476190476190474,
"grad_norm": 1.98090660572052,
"learning_rate": 3.80952380952381e-06,
"loss": 0.8568,
"step": 1360
},
{
"epoch": 4.0773809523809526,
"grad_norm": 3.927706718444824,
"learning_rate": 3.690476190476191e-06,
"loss": 0.9644,
"step": 1370
},
{
"epoch": 4.107142857142857,
"grad_norm": 2.3780994415283203,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.97,
"step": 1380
},
{
"epoch": 4.136904761904762,
"grad_norm": 2.21608304977417,
"learning_rate": 3.4523809523809528e-06,
"loss": 0.9728,
"step": 1390
},
{
"epoch": 4.166666666666667,
"grad_norm": 6.764073848724365,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.8729,
"step": 1400
},
{
"epoch": 4.196428571428571,
"grad_norm": 1.5746071338653564,
"learning_rate": 3.2142857142857147e-06,
"loss": 0.7702,
"step": 1410
},
{
"epoch": 4.226190476190476,
"grad_norm": 1.8241825103759766,
"learning_rate": 3.0952380952380957e-06,
"loss": 0.9121,
"step": 1420
},
{
"epoch": 4.255952380952381,
"grad_norm": 3.9683926105499268,
"learning_rate": 2.9761904761904763e-06,
"loss": 0.8749,
"step": 1430
},
{
"epoch": 4.285714285714286,
"grad_norm": 1.5732113122940063,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.9421,
"step": 1440
},
{
"epoch": 4.315476190476191,
"grad_norm": 2.5848405361175537,
"learning_rate": 2.7380952380952387e-06,
"loss": 0.9617,
"step": 1450
},
{
"epoch": 4.345238095238095,
"grad_norm": 3.7017910480499268,
"learning_rate": 2.6190476190476192e-06,
"loss": 0.905,
"step": 1460
},
{
"epoch": 4.375,
"grad_norm": 5.973739147186279,
"learning_rate": 2.5e-06,
"loss": 0.89,
"step": 1470
},
{
"epoch": 4.404761904761905,
"grad_norm": 1.8716737031936646,
"learning_rate": 2.380952380952381e-06,
"loss": 0.9635,
"step": 1480
},
{
"epoch": 4.434523809523809,
"grad_norm": 3.3029792308807373,
"learning_rate": 2.261904761904762e-06,
"loss": 0.933,
"step": 1490
},
{
"epoch": 4.464285714285714,
"grad_norm": 2.5819740295410156,
"learning_rate": 2.1428571428571427e-06,
"loss": 0.8899,
"step": 1500
},
{
"epoch": 4.494047619047619,
"grad_norm": 3.5635058879852295,
"learning_rate": 2.023809523809524e-06,
"loss": 0.8539,
"step": 1510
},
{
"epoch": 4.523809523809524,
"grad_norm": 2.5672874450683594,
"learning_rate": 1.904761904761905e-06,
"loss": 1.0972,
"step": 1520
},
{
"epoch": 4.553571428571429,
"grad_norm": 5.11098051071167,
"learning_rate": 1.7857142857142859e-06,
"loss": 0.9862,
"step": 1530
},
{
"epoch": 4.583333333333333,
"grad_norm": 2.5244972705841064,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.0213,
"step": 1540
},
{
"epoch": 4.613095238095238,
"grad_norm": 3.5044398307800293,
"learning_rate": 1.5476190476190479e-06,
"loss": 0.9144,
"step": 1550
},
{
"epoch": 4.642857142857143,
"grad_norm": 2.4903435707092285,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.9331,
"step": 1560
},
{
"epoch": 4.6726190476190474,
"grad_norm": 3.208696126937866,
"learning_rate": 1.3095238095238096e-06,
"loss": 1.013,
"step": 1570
},
{
"epoch": 4.7023809523809526,
"grad_norm": 2.255563735961914,
"learning_rate": 1.1904761904761906e-06,
"loss": 0.7625,
"step": 1580
},
{
"epoch": 4.732142857142857,
"grad_norm": 2.1157748699188232,
"learning_rate": 1.0714285714285714e-06,
"loss": 0.8885,
"step": 1590
},
{
"epoch": 4.761904761904762,
"grad_norm": 3.0076255798339844,
"learning_rate": 9.523809523809525e-07,
"loss": 1.0166,
"step": 1600
},
{
"epoch": 4.791666666666667,
"grad_norm": 2.899481773376465,
"learning_rate": 8.333333333333333e-07,
"loss": 0.9983,
"step": 1610
},
{
"epoch": 4.821428571428571,
"grad_norm": 6.084941387176514,
"learning_rate": 7.142857142857143e-07,
"loss": 1.1526,
"step": 1620
},
{
"epoch": 4.851190476190476,
"grad_norm": 3.8710179328918457,
"learning_rate": 5.952380952380953e-07,
"loss": 0.8589,
"step": 1630
},
{
"epoch": 4.880952380952381,
"grad_norm": 2.1053106784820557,
"learning_rate": 4.7619047619047623e-07,
"loss": 0.8788,
"step": 1640
},
{
"epoch": 4.910714285714286,
"grad_norm": 2.2121217250823975,
"learning_rate": 3.5714285714285716e-07,
"loss": 0.8718,
"step": 1650
},
{
"epoch": 4.940476190476191,
"grad_norm": 2.3137481212615967,
"learning_rate": 2.3809523809523811e-07,
"loss": 0.7878,
"step": 1660
},
{
"epoch": 4.970238095238095,
"grad_norm": 2.676529884338379,
"learning_rate": 1.1904761904761906e-07,
"loss": 0.7782,
"step": 1670
},
{
"epoch": 5.0,
"grad_norm": 7.775545597076416,
"learning_rate": 0.0,
"loss": 0.7847,
"step": 1680
},
{
"epoch": 5.0,
"eval_accuracy": 0.9514767932489452,
"eval_loss": 0.8745647668838501,
"eval_runtime": 50.8678,
"eval_samples_per_second": 9.318,
"eval_steps_per_second": 1.18,
"step": 1680
},
{
"epoch": 5.0,
"step": 1680,
"total_flos": 1.0410532148820787e+18,
"train_loss": 1.5688391100792658,
"train_runtime": 1801.1044,
"train_samples_per_second": 7.457,
"train_steps_per_second": 0.933
}
],
"logging_steps": 10,
"max_steps": 1680,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0410532148820787e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}