{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 25.0, "global_step": 116, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03463203463203463, "grad_norm": null, "learning_rate": 0.0002, "loss": 73.3116, "step": 1 }, { "epoch": 0.06926406926406926, "grad_norm": 20.237655639648438, "learning_rate": 0.0002, "loss": 74.1209, "step": 2 }, { "epoch": 0.1038961038961039, "grad_norm": 22.083927154541016, "learning_rate": 0.00019827586206896554, "loss": 70.1702, "step": 3 }, { "epoch": 0.13852813852813853, "grad_norm": 24.736431121826172, "learning_rate": 0.00019655172413793104, "loss": 69.115, "step": 4 }, { "epoch": 0.17316017316017315, "grad_norm": 30.25436782836914, "learning_rate": 0.00019482758620689657, "loss": 62.927, "step": 5 }, { "epoch": 0.2077922077922078, "grad_norm": 32.069908142089844, "learning_rate": 0.0001931034482758621, "loss": 60.1745, "step": 6 }, { "epoch": 0.24242424242424243, "grad_norm": 30.88319969177246, "learning_rate": 0.0001913793103448276, "loss": 51.702, "step": 7 }, { "epoch": 0.27705627705627706, "grad_norm": 32.261192321777344, "learning_rate": 0.00018965517241379312, "loss": 50.5157, "step": 8 }, { "epoch": 0.3116883116883117, "grad_norm": 33.49891662597656, "learning_rate": 0.00018793103448275865, "loss": 51.5589, "step": 9 }, { "epoch": 0.3463203463203463, "grad_norm": 24.870943069458008, "learning_rate": 0.00018620689655172415, "loss": 50.8659, "step": 10 }, { "epoch": 0.38095238095238093, "grad_norm": 25.71776580810547, "learning_rate": 0.00018448275862068968, "loss": 51.3838, "step": 11 }, { "epoch": 0.4155844155844156, "grad_norm": null, "learning_rate": 0.00018275862068965518, "loss": 43.7134, "step": 12 }, { "epoch": 0.45021645021645024, "grad_norm": 30.822378158569336, "learning_rate": 0.00018275862068965518, "loss": 47.239, "step": 13 }, { "epoch": 0.48484848484848486, "grad_norm": 
22.92112159729004, "learning_rate": 0.0001810344827586207, "loss": 47.168, "step": 14 }, { "epoch": 0.5194805194805194, "grad_norm": 28.55995750427246, "learning_rate": 0.0001793103448275862, "loss": 46.0921, "step": 15 }, { "epoch": 0.5541125541125541, "grad_norm": 28.638643264770508, "learning_rate": 0.00017758620689655173, "loss": 43.2701, "step": 16 }, { "epoch": 0.5887445887445888, "grad_norm": 13.875872611999512, "learning_rate": 0.00017586206896551723, "loss": 40.5568, "step": 17 }, { "epoch": 0.6233766233766234, "grad_norm": 15.718329429626465, "learning_rate": 0.00017413793103448276, "loss": 39.8683, "step": 18 }, { "epoch": 0.658008658008658, "grad_norm": 20.134868621826172, "learning_rate": 0.00017241379310344826, "loss": 43.4786, "step": 19 }, { "epoch": 0.6926406926406926, "grad_norm": 17.95608139038086, "learning_rate": 0.0001706896551724138, "loss": 39.757, "step": 20 }, { "epoch": 0.7272727272727273, "grad_norm": 12.064809799194336, "learning_rate": 0.00016896551724137932, "loss": 37.5556, "step": 21 }, { "epoch": 0.7619047619047619, "grad_norm": 22.8414249420166, "learning_rate": 0.00016724137931034482, "loss": 47.1283, "step": 22 }, { "epoch": 0.7965367965367965, "grad_norm": 12.476370811462402, "learning_rate": 0.00016551724137931035, "loss": 44.1665, "step": 23 }, { "epoch": 0.8311688311688312, "grad_norm": 19.783571243286133, "learning_rate": 0.00016379310344827587, "loss": 40.582, "step": 24 }, { "epoch": 0.8658008658008658, "grad_norm": 20.8145751953125, "learning_rate": 0.00016206896551724137, "loss": 41.1066, "step": 25 }, { "epoch": 0.9004329004329005, "grad_norm": 18.939414978027344, "learning_rate": 0.0001603448275862069, "loss": 40.8233, "step": 26 }, { "epoch": 0.935064935064935, "grad_norm": 16.40350914001465, "learning_rate": 0.00015862068965517243, "loss": 38.9867, "step": 27 }, { "epoch": 0.9696969696969697, "grad_norm": 12.22852897644043, "learning_rate": 0.00015689655172413793, "loss": 37.3049, "step": 28 }, { "epoch": 1.0, 
"grad_norm": 13.104403495788574, "learning_rate": 0.00015517241379310346, "loss": 35.9869, "step": 29 }, { "epoch": 1.0346320346320346, "grad_norm": 18.48545265197754, "learning_rate": 0.00015344827586206899, "loss": 41.11, "step": 30 }, { "epoch": 1.0692640692640694, "grad_norm": 22.71863555908203, "learning_rate": 0.00015172413793103449, "loss": 42.4246, "step": 31 }, { "epoch": 1.103896103896104, "grad_norm": 17.317848205566406, "learning_rate": 0.00015000000000000001, "loss": 41.0728, "step": 32 }, { "epoch": 1.1385281385281385, "grad_norm": 9.099658012390137, "learning_rate": 0.00014827586206896554, "loss": 35.8977, "step": 33 }, { "epoch": 1.173160173160173, "grad_norm": 18.253623962402344, "learning_rate": 0.00014655172413793104, "loss": 45.2388, "step": 34 }, { "epoch": 1.2077922077922079, "grad_norm": 11.06932258605957, "learning_rate": 0.00014482758620689657, "loss": 38.9349, "step": 35 }, { "epoch": 1.2424242424242424, "grad_norm": 13.193914413452148, "learning_rate": 0.0001431034482758621, "loss": 39.5845, "step": 36 }, { "epoch": 1.277056277056277, "grad_norm": 17.09387969970703, "learning_rate": 0.0001413793103448276, "loss": 37.5467, "step": 37 }, { "epoch": 1.3116883116883118, "grad_norm": 10.892671585083008, "learning_rate": 0.0001396551724137931, "loss": 40.0311, "step": 38 }, { "epoch": 1.3463203463203464, "grad_norm": 12.092562675476074, "learning_rate": 0.00013793103448275863, "loss": 37.58, "step": 39 }, { "epoch": 1.380952380952381, "grad_norm": 14.318760871887207, "learning_rate": 0.00013620689655172413, "loss": 44.1895, "step": 40 }, { "epoch": 1.4155844155844157, "grad_norm": 8.378585815429688, "learning_rate": 0.00013448275862068965, "loss": 40.4597, "step": 41 }, { "epoch": 1.4502164502164503, "grad_norm": 12.586594581604004, "learning_rate": 0.00013275862068965518, "loss": 34.0271, "step": 42 }, { "epoch": 1.4848484848484849, "grad_norm": 13.52869701385498, "learning_rate": 0.00013103448275862068, "loss": 40.9999, "step": 43 }, { 
"epoch": 1.5194805194805194, "grad_norm": 9.904869079589844, "learning_rate": 0.0001293103448275862, "loss": 39.7496, "step": 44 }, { "epoch": 1.554112554112554, "grad_norm": 17.99386978149414, "learning_rate": 0.00012758620689655174, "loss": 44.0324, "step": 45 }, { "epoch": 1.5887445887445888, "grad_norm": 13.183422088623047, "learning_rate": 0.00012586206896551724, "loss": 43.4125, "step": 46 }, { "epoch": 1.6233766233766234, "grad_norm": 10.06049919128418, "learning_rate": 0.00012413793103448277, "loss": 39.1023, "step": 47 }, { "epoch": 1.658008658008658, "grad_norm": 15.202058792114258, "learning_rate": 0.00012241379310344827, "loss": 37.0854, "step": 48 }, { "epoch": 1.6926406926406927, "grad_norm": 14.160669326782227, "learning_rate": 0.0001206896551724138, "loss": 36.6519, "step": 49 }, { "epoch": 1.7272727272727273, "grad_norm": 10.129295349121094, "learning_rate": 0.00011896551724137932, "loss": 40.1695, "step": 50 }, { "epoch": 1.7619047619047619, "grad_norm": 10.78943920135498, "learning_rate": 0.00011724137931034482, "loss": 40.4436, "step": 51 }, { "epoch": 1.7965367965367967, "grad_norm": 9.714445114135742, "learning_rate": 0.00011551724137931035, "loss": 41.4942, "step": 52 }, { "epoch": 1.8311688311688312, "grad_norm": 17.142423629760742, "learning_rate": 0.00011379310344827588, "loss": 45.8405, "step": 53 }, { "epoch": 1.8658008658008658, "grad_norm": 14.116847038269043, "learning_rate": 0.00011206896551724138, "loss": 38.918, "step": 54 }, { "epoch": 1.9004329004329006, "grad_norm": 8.169567108154297, "learning_rate": 0.0001103448275862069, "loss": 39.6313, "step": 55 }, { "epoch": 1.935064935064935, "grad_norm": 10.515144348144531, "learning_rate": 0.00010862068965517242, "loss": 42.3468, "step": 56 }, { "epoch": 1.9696969696969697, "grad_norm": 9.634146690368652, "learning_rate": 0.00010689655172413792, "loss": 43.1881, "step": 57 }, { "epoch": 2.0, "grad_norm": 16.54326629638672, "learning_rate": 0.00010517241379310345, "loss": 31.4997, 
"step": 58 }, { "epoch": 2.034632034632035, "grad_norm": 13.723237991333008, "learning_rate": 0.00010344827586206898, "loss": 39.0238, "step": 59 }, { "epoch": 2.069264069264069, "grad_norm": 11.90444564819336, "learning_rate": 0.00010172413793103448, "loss": 45.2243, "step": 60 }, { "epoch": 2.103896103896104, "grad_norm": 10.994711875915527, "learning_rate": 0.0001, "loss": 39.9984, "step": 61 }, { "epoch": 2.1385281385281387, "grad_norm": 11.435210227966309, "learning_rate": 9.827586206896552e-05, "loss": 43.82, "step": 62 }, { "epoch": 2.173160173160173, "grad_norm": 13.717733383178711, "learning_rate": 9.655172413793105e-05, "loss": 38.5156, "step": 63 }, { "epoch": 2.207792207792208, "grad_norm": 10.357719421386719, "learning_rate": 9.482758620689656e-05, "loss": 43.2746, "step": 64 }, { "epoch": 2.242424242424242, "grad_norm": 14.937288284301758, "learning_rate": 9.310344827586207e-05, "loss": 38.4409, "step": 65 }, { "epoch": 2.277056277056277, "grad_norm": 14.609394073486328, "learning_rate": 9.137931034482759e-05, "loss": 37.9292, "step": 66 }, { "epoch": 2.311688311688312, "grad_norm": 14.482377052307129, "learning_rate": 8.96551724137931e-05, "loss": 37.5054, "step": 67 }, { "epoch": 2.346320346320346, "grad_norm": 27.845836639404297, "learning_rate": 8.793103448275862e-05, "loss": 31.3865, "step": 68 }, { "epoch": 2.380952380952381, "grad_norm": 12.11486530303955, "learning_rate": 8.620689655172413e-05, "loss": 39.3685, "step": 69 }, { "epoch": 2.4155844155844157, "grad_norm": 12.73064136505127, "learning_rate": 8.448275862068966e-05, "loss": 39.5741, "step": 70 }, { "epoch": 2.45021645021645, "grad_norm": 12.202842712402344, "learning_rate": 8.275862068965517e-05, "loss": 39.1132, "step": 71 }, { "epoch": 2.484848484848485, "grad_norm": 10.822341918945312, "learning_rate": 8.103448275862069e-05, "loss": 37.3896, "step": 72 }, { "epoch": 2.5194805194805197, "grad_norm": 18.123933792114258, "learning_rate": 7.931034482758621e-05, "loss": 40.2132, 
"step": 73 }, { "epoch": 2.554112554112554, "grad_norm": 11.42330265045166, "learning_rate": 7.758620689655173e-05, "loss": 36.5826, "step": 74 }, { "epoch": 2.588744588744589, "grad_norm": 14.098088264465332, "learning_rate": 7.586206896551724e-05, "loss": 36.8519, "step": 75 }, { "epoch": 2.6233766233766236, "grad_norm": 21.339242935180664, "learning_rate": 7.413793103448277e-05, "loss": 44.4439, "step": 76 }, { "epoch": 2.658008658008658, "grad_norm": 12.531341552734375, "learning_rate": 7.241379310344828e-05, "loss": 36.067, "step": 77 }, { "epoch": 2.6926406926406927, "grad_norm": 14.97744083404541, "learning_rate": 7.06896551724138e-05, "loss": 43.6598, "step": 78 }, { "epoch": 2.7272727272727275, "grad_norm": 12.019708633422852, "learning_rate": 6.896551724137931e-05, "loss": 41.1947, "step": 79 }, { "epoch": 2.761904761904762, "grad_norm": 11.916413307189941, "learning_rate": 6.724137931034483e-05, "loss": 38.0417, "step": 80 }, { "epoch": 2.7965367965367967, "grad_norm": 24.30942726135254, "learning_rate": 6.551724137931034e-05, "loss": 34.9891, "step": 81 }, { "epoch": 2.8311688311688314, "grad_norm": 13.871292114257812, "learning_rate": 6.379310344827587e-05, "loss": 38.7756, "step": 82 }, { "epoch": 2.865800865800866, "grad_norm": 13.42234992980957, "learning_rate": 6.206896551724138e-05, "loss": 37.8582, "step": 83 }, { "epoch": 2.9004329004329006, "grad_norm": 10.090055465698242, "learning_rate": 6.03448275862069e-05, "loss": 41.1415, "step": 84 }, { "epoch": 2.935064935064935, "grad_norm": 14.918863296508789, "learning_rate": 5.862068965517241e-05, "loss": 35.2293, "step": 85 }, { "epoch": 2.9696969696969697, "grad_norm": 16.72150993347168, "learning_rate": 5.689655172413794e-05, "loss": 34.67, "step": 86 }, { "epoch": 3.0, "grad_norm": 10.314674377441406, "learning_rate": 5.517241379310345e-05, "loss": 34.2737, "step": 87 }, { "epoch": 3.034632034632035, "grad_norm": 11.975030899047852, "learning_rate": 5.344827586206896e-05, "loss": 36.4709, 
"step": 88 }, { "epoch": 3.069264069264069, "grad_norm": 12.001708984375, "learning_rate": 5.172413793103449e-05, "loss": 37.6535, "step": 89 }, { "epoch": 3.103896103896104, "grad_norm": 24.412235260009766, "learning_rate": 5e-05, "loss": 42.9897, "step": 90 }, { "epoch": 3.1385281385281387, "grad_norm": 18.475101470947266, "learning_rate": 4.827586206896552e-05, "loss": 40.2994, "step": 91 }, { "epoch": 3.173160173160173, "grad_norm": 16.03579330444336, "learning_rate": 4.655172413793104e-05, "loss": 39.8546, "step": 92 }, { "epoch": 3.207792207792208, "grad_norm": 14.9187650680542, "learning_rate": 4.482758620689655e-05, "loss": 36.7785, "step": 93 }, { "epoch": 3.242424242424242, "grad_norm": 12.255768775939941, "learning_rate": 4.3103448275862066e-05, "loss": 36.3773, "step": 94 }, { "epoch": 3.277056277056277, "grad_norm": 10.56830883026123, "learning_rate": 4.1379310344827587e-05, "loss": 36.5246, "step": 95 }, { "epoch": 3.311688311688312, "grad_norm": 11.949898719787598, "learning_rate": 3.965517241379311e-05, "loss": 37.3966, "step": 96 }, { "epoch": 3.346320346320346, "grad_norm": 12.314105987548828, "learning_rate": 3.793103448275862e-05, "loss": 39.7101, "step": 97 }, { "epoch": 3.380952380952381, "grad_norm": 12.357243537902832, "learning_rate": 3.620689655172414e-05, "loss": 40.0369, "step": 98 }, { "epoch": 3.4155844155844157, "grad_norm": 16.851078033447266, "learning_rate": 3.4482758620689657e-05, "loss": 36.3794, "step": 99 }, { "epoch": 3.45021645021645, "grad_norm": 14.092662811279297, "learning_rate": 3.275862068965517e-05, "loss": 40.9533, "step": 100 }, { "epoch": 3.484848484848485, "grad_norm": 11.862981796264648, "learning_rate": 3.103448275862069e-05, "loss": 40.7241, "step": 101 }, { "epoch": 3.5194805194805197, "grad_norm": 24.208559036254883, "learning_rate": 2.9310344827586206e-05, "loss": 33.0654, "step": 102 }, { "epoch": 3.554112554112554, "grad_norm": 11.848682403564453, "learning_rate": 2.7586206896551727e-05, "loss": 38.5032, 
"step": 103 }, { "epoch": 3.588744588744589, "grad_norm": 12.40089225769043, "learning_rate": 2.5862068965517244e-05, "loss": 40.0596, "step": 104 }, { "epoch": 3.6233766233766236, "grad_norm": 22.941667556762695, "learning_rate": 2.413793103448276e-05, "loss": 35.6433, "step": 105 }, { "epoch": 3.658008658008658, "grad_norm": 20.270925521850586, "learning_rate": 2.2413793103448276e-05, "loss": 36.9345, "step": 106 }, { "epoch": 3.6926406926406927, "grad_norm": 11.919129371643066, "learning_rate": 2.0689655172413793e-05, "loss": 38.8896, "step": 107 }, { "epoch": 3.7272727272727275, "grad_norm": 11.444902420043945, "learning_rate": 1.896551724137931e-05, "loss": 40.4934, "step": 108 }, { "epoch": 3.761904761904762, "grad_norm": 10.795780181884766, "learning_rate": 1.7241379310344828e-05, "loss": 37.4473, "step": 109 }, { "epoch": 3.7965367965367967, "grad_norm": 29.648780822753906, "learning_rate": 1.5517241379310346e-05, "loss": 32.2658, "step": 110 }, { "epoch": 3.8311688311688314, "grad_norm": 11.925921440124512, "learning_rate": 1.3793103448275863e-05, "loss": 39.5132, "step": 111 }, { "epoch": 3.865800865800866, "grad_norm": 13.764852523803711, "learning_rate": 1.206896551724138e-05, "loss": 34.6145, "step": 112 }, { "epoch": 3.9004329004329006, "grad_norm": 11.574369430541992, "learning_rate": 1.0344827586206897e-05, "loss": 38.1931, "step": 113 }, { "epoch": 3.935064935064935, "grad_norm": 13.354058265686035, "learning_rate": 8.620689655172414e-06, "loss": 41.1929, "step": 114 }, { "epoch": 3.9696969696969697, "grad_norm": 12.630919456481934, "learning_rate": 6.896551724137932e-06, "loss": 35.7694, "step": 115 }, { "epoch": 4.0, "grad_norm": 12.714266777038574, "learning_rate": 5.172413793103448e-06, "loss": 29.9168, "step": 116 }, { "epoch": 4.0, "step": 116, "total_flos": 34002530627808.0, "train_loss": 41.40085841869486, "train_runtime": 6671.9706, "train_samples_per_second": 0.276, "train_steps_per_second": 0.017 } ], "logging_steps": 1.0, "max_steps": 
116, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 34002530627808.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }