Stewart Slocum
Add fine-tuned model
e4f2b95
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 365,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0027397260273972603,
"grad_norm": 0.34919899702072144,
"learning_rate": 1e-05,
"loss": 1.6546,
"step": 1
},
{
"epoch": 0.005479452054794521,
"grad_norm": 0.35103511810302734,
"learning_rate": 9.972602739726028e-06,
"loss": 1.6666,
"step": 2
},
{
"epoch": 0.00821917808219178,
"grad_norm": 0.3742208182811737,
"learning_rate": 9.945205479452056e-06,
"loss": 1.735,
"step": 3
},
{
"epoch": 0.010958904109589041,
"grad_norm": 0.38575077056884766,
"learning_rate": 9.917808219178083e-06,
"loss": 1.7269,
"step": 4
},
{
"epoch": 0.0136986301369863,
"grad_norm": 0.361535906791687,
"learning_rate": 9.89041095890411e-06,
"loss": 1.6679,
"step": 5
},
{
"epoch": 0.01643835616438356,
"grad_norm": 0.3729570209980011,
"learning_rate": 9.863013698630138e-06,
"loss": 1.6534,
"step": 6
},
{
"epoch": 0.019178082191780823,
"grad_norm": 0.3666209876537323,
"learning_rate": 9.835616438356166e-06,
"loss": 1.6515,
"step": 7
},
{
"epoch": 0.021917808219178082,
"grad_norm": 0.3736323118209839,
"learning_rate": 9.808219178082193e-06,
"loss": 1.6758,
"step": 8
},
{
"epoch": 0.024657534246575342,
"grad_norm": 0.3758656084537506,
"learning_rate": 9.78082191780822e-06,
"loss": 1.7094,
"step": 9
},
{
"epoch": 0.0273972602739726,
"grad_norm": 0.37960362434387207,
"learning_rate": 9.753424657534248e-06,
"loss": 1.6872,
"step": 10
},
{
"epoch": 0.030136986301369864,
"grad_norm": 0.39145174622535706,
"learning_rate": 9.726027397260275e-06,
"loss": 1.7096,
"step": 11
},
{
"epoch": 0.03287671232876712,
"grad_norm": 0.36531391739845276,
"learning_rate": 9.698630136986303e-06,
"loss": 1.6133,
"step": 12
},
{
"epoch": 0.03561643835616438,
"grad_norm": 0.35493168234825134,
"learning_rate": 9.67123287671233e-06,
"loss": 1.5616,
"step": 13
},
{
"epoch": 0.038356164383561646,
"grad_norm": 0.3559548258781433,
"learning_rate": 9.643835616438358e-06,
"loss": 1.6144,
"step": 14
},
{
"epoch": 0.0410958904109589,
"grad_norm": 0.339250773191452,
"learning_rate": 9.616438356164385e-06,
"loss": 1.5508,
"step": 15
},
{
"epoch": 0.043835616438356165,
"grad_norm": 0.31973907351493835,
"learning_rate": 9.589041095890411e-06,
"loss": 1.5232,
"step": 16
},
{
"epoch": 0.04657534246575343,
"grad_norm": 0.31599804759025574,
"learning_rate": 9.561643835616438e-06,
"loss": 1.5018,
"step": 17
},
{
"epoch": 0.049315068493150684,
"grad_norm": 0.3060247004032135,
"learning_rate": 9.534246575342466e-06,
"loss": 1.5041,
"step": 18
},
{
"epoch": 0.052054794520547946,
"grad_norm": 0.2997976541519165,
"learning_rate": 9.506849315068493e-06,
"loss": 1.4911,
"step": 19
},
{
"epoch": 0.0547945205479452,
"grad_norm": 0.32057899236679077,
"learning_rate": 9.47945205479452e-06,
"loss": 1.5419,
"step": 20
},
{
"epoch": 0.057534246575342465,
"grad_norm": 0.27622660994529724,
"learning_rate": 9.452054794520548e-06,
"loss": 1.4222,
"step": 21
},
{
"epoch": 0.06027397260273973,
"grad_norm": 0.29135027527809143,
"learning_rate": 9.424657534246576e-06,
"loss": 1.435,
"step": 22
},
{
"epoch": 0.06301369863013699,
"grad_norm": 0.26389017701148987,
"learning_rate": 9.397260273972603e-06,
"loss": 1.4286,
"step": 23
},
{
"epoch": 0.06575342465753424,
"grad_norm": 0.2440488189458847,
"learning_rate": 9.36986301369863e-06,
"loss": 1.3593,
"step": 24
},
{
"epoch": 0.0684931506849315,
"grad_norm": 0.25608792901039124,
"learning_rate": 9.342465753424658e-06,
"loss": 1.4187,
"step": 25
},
{
"epoch": 0.07123287671232877,
"grad_norm": 0.2347661852836609,
"learning_rate": 9.315068493150685e-06,
"loss": 1.3816,
"step": 26
},
{
"epoch": 0.07397260273972603,
"grad_norm": 0.2517848610877991,
"learning_rate": 9.287671232876713e-06,
"loss": 1.2992,
"step": 27
},
{
"epoch": 0.07671232876712329,
"grad_norm": 0.24535977840423584,
"learning_rate": 9.26027397260274e-06,
"loss": 1.4509,
"step": 28
},
{
"epoch": 0.07945205479452055,
"grad_norm": 0.23485970497131348,
"learning_rate": 9.232876712328768e-06,
"loss": 1.4156,
"step": 29
},
{
"epoch": 0.0821917808219178,
"grad_norm": 0.24302135407924652,
"learning_rate": 9.205479452054795e-06,
"loss": 1.4287,
"step": 30
},
{
"epoch": 0.08493150684931507,
"grad_norm": 0.20637400448322296,
"learning_rate": 9.178082191780823e-06,
"loss": 1.3158,
"step": 31
},
{
"epoch": 0.08767123287671233,
"grad_norm": 0.20781908929347992,
"learning_rate": 9.15068493150685e-06,
"loss": 1.3538,
"step": 32
},
{
"epoch": 0.09041095890410959,
"grad_norm": 0.2159215360879898,
"learning_rate": 9.123287671232878e-06,
"loss": 1.4117,
"step": 33
},
{
"epoch": 0.09315068493150686,
"grad_norm": 0.21819128096103668,
"learning_rate": 9.095890410958905e-06,
"loss": 1.3963,
"step": 34
},
{
"epoch": 0.0958904109589041,
"grad_norm": 0.2067425698041916,
"learning_rate": 9.068493150684932e-06,
"loss": 1.3363,
"step": 35
},
{
"epoch": 0.09863013698630137,
"grad_norm": 0.1994429975748062,
"learning_rate": 9.04109589041096e-06,
"loss": 1.3555,
"step": 36
},
{
"epoch": 0.10136986301369863,
"grad_norm": 0.1868618130683899,
"learning_rate": 9.013698630136987e-06,
"loss": 1.3077,
"step": 37
},
{
"epoch": 0.10410958904109589,
"grad_norm": 0.20949655771255493,
"learning_rate": 8.986301369863015e-06,
"loss": 1.3717,
"step": 38
},
{
"epoch": 0.10684931506849316,
"grad_norm": 0.17898470163345337,
"learning_rate": 8.958904109589042e-06,
"loss": 1.26,
"step": 39
},
{
"epoch": 0.1095890410958904,
"grad_norm": 0.1768495738506317,
"learning_rate": 8.93150684931507e-06,
"loss": 1.262,
"step": 40
},
{
"epoch": 0.11232876712328767,
"grad_norm": 0.1851891726255417,
"learning_rate": 8.904109589041097e-06,
"loss": 1.2827,
"step": 41
},
{
"epoch": 0.11506849315068493,
"grad_norm": 0.18280373513698578,
"learning_rate": 8.876712328767125e-06,
"loss": 1.3053,
"step": 42
},
{
"epoch": 0.1178082191780822,
"grad_norm": 0.17816248536109924,
"learning_rate": 8.849315068493152e-06,
"loss": 1.2808,
"step": 43
},
{
"epoch": 0.12054794520547946,
"grad_norm": 0.17815659940242767,
"learning_rate": 8.82191780821918e-06,
"loss": 1.28,
"step": 44
},
{
"epoch": 0.1232876712328767,
"grad_norm": 0.17877663671970367,
"learning_rate": 8.794520547945207e-06,
"loss": 1.3015,
"step": 45
},
{
"epoch": 0.12602739726027398,
"grad_norm": 0.18253077566623688,
"learning_rate": 8.767123287671233e-06,
"loss": 1.2905,
"step": 46
},
{
"epoch": 0.12876712328767123,
"grad_norm": 0.16359081864356995,
"learning_rate": 8.73972602739726e-06,
"loss": 1.2685,
"step": 47
},
{
"epoch": 0.13150684931506848,
"grad_norm": 0.19661103188991547,
"learning_rate": 8.712328767123288e-06,
"loss": 1.2991,
"step": 48
},
{
"epoch": 0.13424657534246576,
"grad_norm": 0.1715298444032669,
"learning_rate": 8.684931506849315e-06,
"loss": 1.2592,
"step": 49
},
{
"epoch": 0.136986301369863,
"grad_norm": 0.1640940010547638,
"learning_rate": 8.657534246575343e-06,
"loss": 1.2517,
"step": 50
},
{
"epoch": 0.13972602739726028,
"grad_norm": 0.22945547103881836,
"learning_rate": 8.63013698630137e-06,
"loss": 1.3143,
"step": 51
},
{
"epoch": 0.14246575342465753,
"grad_norm": 0.1699182689189911,
"learning_rate": 8.602739726027397e-06,
"loss": 1.2361,
"step": 52
},
{
"epoch": 0.14520547945205478,
"grad_norm": 0.17926287651062012,
"learning_rate": 8.575342465753425e-06,
"loss": 1.189,
"step": 53
},
{
"epoch": 0.14794520547945206,
"grad_norm": 0.1543978899717331,
"learning_rate": 8.547945205479454e-06,
"loss": 1.2138,
"step": 54
},
{
"epoch": 0.1506849315068493,
"grad_norm": 0.14743275940418243,
"learning_rate": 8.520547945205481e-06,
"loss": 1.1915,
"step": 55
},
{
"epoch": 0.15342465753424658,
"grad_norm": 0.1501009166240692,
"learning_rate": 8.493150684931507e-06,
"loss": 1.1943,
"step": 56
},
{
"epoch": 0.15616438356164383,
"grad_norm": 0.15394991636276245,
"learning_rate": 8.465753424657535e-06,
"loss": 1.2293,
"step": 57
},
{
"epoch": 0.1589041095890411,
"grad_norm": 0.22368541359901428,
"learning_rate": 8.438356164383562e-06,
"loss": 1.2566,
"step": 58
},
{
"epoch": 0.16164383561643836,
"grad_norm": 0.15473651885986328,
"learning_rate": 8.41095890410959e-06,
"loss": 1.236,
"step": 59
},
{
"epoch": 0.1643835616438356,
"grad_norm": 0.14430241286754608,
"learning_rate": 8.383561643835617e-06,
"loss": 1.1609,
"step": 60
},
{
"epoch": 0.16712328767123288,
"grad_norm": 0.1418728530406952,
"learning_rate": 8.356164383561644e-06,
"loss": 1.1768,
"step": 61
},
{
"epoch": 0.16986301369863013,
"grad_norm": 0.1435127854347229,
"learning_rate": 8.328767123287672e-06,
"loss": 1.1784,
"step": 62
},
{
"epoch": 0.1726027397260274,
"grad_norm": 0.14738471806049347,
"learning_rate": 8.3013698630137e-06,
"loss": 1.1672,
"step": 63
},
{
"epoch": 0.17534246575342466,
"grad_norm": 0.16719698905944824,
"learning_rate": 8.273972602739727e-06,
"loss": 1.1992,
"step": 64
},
{
"epoch": 0.1780821917808219,
"grad_norm": 0.1518981009721756,
"learning_rate": 8.246575342465754e-06,
"loss": 1.1971,
"step": 65
},
{
"epoch": 0.18082191780821918,
"grad_norm": 0.1392986923456192,
"learning_rate": 8.219178082191782e-06,
"loss": 1.1768,
"step": 66
},
{
"epoch": 0.18356164383561643,
"grad_norm": 0.13890501856803894,
"learning_rate": 8.19178082191781e-06,
"loss": 1.1672,
"step": 67
},
{
"epoch": 0.1863013698630137,
"grad_norm": 0.1505006104707718,
"learning_rate": 8.164383561643837e-06,
"loss": 1.1564,
"step": 68
},
{
"epoch": 0.18904109589041096,
"grad_norm": 0.13556960225105286,
"learning_rate": 8.136986301369864e-06,
"loss": 1.1818,
"step": 69
},
{
"epoch": 0.1917808219178082,
"grad_norm": 0.13326483964920044,
"learning_rate": 8.109589041095892e-06,
"loss": 1.1334,
"step": 70
},
{
"epoch": 0.19452054794520549,
"grad_norm": 0.15403442084789276,
"learning_rate": 8.082191780821919e-06,
"loss": 1.2299,
"step": 71
},
{
"epoch": 0.19726027397260273,
"grad_norm": 0.22601965069770813,
"learning_rate": 8.054794520547946e-06,
"loss": 1.1585,
"step": 72
},
{
"epoch": 0.2,
"grad_norm": 0.13847771286964417,
"learning_rate": 8.027397260273974e-06,
"loss": 1.1656,
"step": 73
},
{
"epoch": 0.20273972602739726,
"grad_norm": 0.14086662232875824,
"learning_rate": 8.000000000000001e-06,
"loss": 1.149,
"step": 74
},
{
"epoch": 0.2054794520547945,
"grad_norm": 0.14320115745067596,
"learning_rate": 7.972602739726027e-06,
"loss": 1.1623,
"step": 75
},
{
"epoch": 0.20821917808219179,
"grad_norm": 0.15054814517498016,
"learning_rate": 7.945205479452055e-06,
"loss": 1.148,
"step": 76
},
{
"epoch": 0.21095890410958903,
"grad_norm": 0.1444290280342102,
"learning_rate": 7.917808219178082e-06,
"loss": 1.1139,
"step": 77
},
{
"epoch": 0.2136986301369863,
"grad_norm": 0.13065987825393677,
"learning_rate": 7.89041095890411e-06,
"loss": 1.1368,
"step": 78
},
{
"epoch": 0.21643835616438356,
"grad_norm": 0.14221784472465515,
"learning_rate": 7.863013698630137e-06,
"loss": 1.1211,
"step": 79
},
{
"epoch": 0.2191780821917808,
"grad_norm": 0.139826700091362,
"learning_rate": 7.835616438356164e-06,
"loss": 1.1495,
"step": 80
},
{
"epoch": 0.2219178082191781,
"grad_norm": 0.13123205304145813,
"learning_rate": 7.808219178082192e-06,
"loss": 1.1235,
"step": 81
},
{
"epoch": 0.22465753424657534,
"grad_norm": 0.1347399801015854,
"learning_rate": 7.78082191780822e-06,
"loss": 1.0951,
"step": 82
},
{
"epoch": 0.2273972602739726,
"grad_norm": 0.12708094716072083,
"learning_rate": 7.753424657534248e-06,
"loss": 1.1141,
"step": 83
},
{
"epoch": 0.23013698630136986,
"grad_norm": 0.134403258562088,
"learning_rate": 7.726027397260276e-06,
"loss": 1.1198,
"step": 84
},
{
"epoch": 0.2328767123287671,
"grad_norm": 0.1293765902519226,
"learning_rate": 7.698630136986302e-06,
"loss": 1.074,
"step": 85
},
{
"epoch": 0.2356164383561644,
"grad_norm": 0.1433599442243576,
"learning_rate": 7.671232876712329e-06,
"loss": 1.1273,
"step": 86
},
{
"epoch": 0.23835616438356164,
"grad_norm": 0.14464539289474487,
"learning_rate": 7.643835616438356e-06,
"loss": 1.1404,
"step": 87
},
{
"epoch": 0.2410958904109589,
"grad_norm": 0.15533122420310974,
"learning_rate": 7.616438356164384e-06,
"loss": 1.1486,
"step": 88
},
{
"epoch": 0.24383561643835616,
"grad_norm": 0.13592854142189026,
"learning_rate": 7.589041095890411e-06,
"loss": 1.1036,
"step": 89
},
{
"epoch": 0.2465753424657534,
"grad_norm": 0.14032554626464844,
"learning_rate": 7.561643835616439e-06,
"loss": 1.1046,
"step": 90
},
{
"epoch": 0.2493150684931507,
"grad_norm": 0.13799989223480225,
"learning_rate": 7.534246575342466e-06,
"loss": 1.0813,
"step": 91
},
{
"epoch": 0.25205479452054796,
"grad_norm": 0.1336989402770996,
"learning_rate": 7.506849315068494e-06,
"loss": 1.0856,
"step": 92
},
{
"epoch": 0.2547945205479452,
"grad_norm": 0.14835570752620697,
"learning_rate": 7.479452054794521e-06,
"loss": 1.098,
"step": 93
},
{
"epoch": 0.25753424657534246,
"grad_norm": 0.14027932286262512,
"learning_rate": 7.452054794520549e-06,
"loss": 1.1069,
"step": 94
},
{
"epoch": 0.2602739726027397,
"grad_norm": 0.13933329284191132,
"learning_rate": 7.424657534246575e-06,
"loss": 1.0936,
"step": 95
},
{
"epoch": 0.26301369863013696,
"grad_norm": 0.15654663741588593,
"learning_rate": 7.397260273972603e-06,
"loss": 1.0935,
"step": 96
},
{
"epoch": 0.26575342465753427,
"grad_norm": 0.17175213992595673,
"learning_rate": 7.36986301369863e-06,
"loss": 1.0783,
"step": 97
},
{
"epoch": 0.2684931506849315,
"grad_norm": 0.14236947894096375,
"learning_rate": 7.342465753424658e-06,
"loss": 1.081,
"step": 98
},
{
"epoch": 0.27123287671232876,
"grad_norm": 0.18027353286743164,
"learning_rate": 7.315068493150685e-06,
"loss": 1.1059,
"step": 99
},
{
"epoch": 0.273972602739726,
"grad_norm": 0.16158424317836761,
"learning_rate": 7.287671232876713e-06,
"loss": 1.0841,
"step": 100
},
{
"epoch": 0.27671232876712326,
"grad_norm": 0.15303635597229004,
"learning_rate": 7.260273972602741e-06,
"loss": 1.116,
"step": 101
},
{
"epoch": 0.27945205479452057,
"grad_norm": 0.15861615538597107,
"learning_rate": 7.232876712328768e-06,
"loss": 1.1038,
"step": 102
},
{
"epoch": 0.2821917808219178,
"grad_norm": 0.17147208750247955,
"learning_rate": 7.205479452054796e-06,
"loss": 1.115,
"step": 103
},
{
"epoch": 0.28493150684931506,
"grad_norm": 0.1569274365901947,
"learning_rate": 7.178082191780823e-06,
"loss": 1.0547,
"step": 104
},
{
"epoch": 0.2876712328767123,
"grad_norm": 0.153569296002388,
"learning_rate": 7.15068493150685e-06,
"loss": 1.0811,
"step": 105
},
{
"epoch": 0.29041095890410956,
"grad_norm": 0.14510026574134827,
"learning_rate": 7.123287671232877e-06,
"loss": 1.0283,
"step": 106
},
{
"epoch": 0.29315068493150687,
"grad_norm": 0.15047655999660492,
"learning_rate": 7.095890410958905e-06,
"loss": 1.0623,
"step": 107
},
{
"epoch": 0.2958904109589041,
"grad_norm": 0.15017667412757874,
"learning_rate": 7.068493150684932e-06,
"loss": 1.0856,
"step": 108
},
{
"epoch": 0.29863013698630136,
"grad_norm": 0.1588776707649231,
"learning_rate": 7.0410958904109596e-06,
"loss": 1.1063,
"step": 109
},
{
"epoch": 0.3013698630136986,
"grad_norm": 0.15009823441505432,
"learning_rate": 7.013698630136987e-06,
"loss": 1.063,
"step": 110
},
{
"epoch": 0.3041095890410959,
"grad_norm": 0.15306375920772552,
"learning_rate": 6.9863013698630145e-06,
"loss": 1.0374,
"step": 111
},
{
"epoch": 0.30684931506849317,
"grad_norm": 0.15214037895202637,
"learning_rate": 6.958904109589042e-06,
"loss": 1.0603,
"step": 112
},
{
"epoch": 0.3095890410958904,
"grad_norm": 0.15497593581676483,
"learning_rate": 6.931506849315069e-06,
"loss": 1.0633,
"step": 113
},
{
"epoch": 0.31232876712328766,
"grad_norm": 0.1648060530424118,
"learning_rate": 6.904109589041097e-06,
"loss": 1.0393,
"step": 114
},
{
"epoch": 0.3150684931506849,
"grad_norm": 0.21841637790203094,
"learning_rate": 6.876712328767123e-06,
"loss": 1.0306,
"step": 115
},
{
"epoch": 0.3178082191780822,
"grad_norm": 0.1690291315317154,
"learning_rate": 6.849315068493151e-06,
"loss": 1.0928,
"step": 116
},
{
"epoch": 0.32054794520547947,
"grad_norm": 0.15783993899822235,
"learning_rate": 6.821917808219178e-06,
"loss": 1.0192,
"step": 117
},
{
"epoch": 0.3232876712328767,
"grad_norm": 0.15355956554412842,
"learning_rate": 6.794520547945206e-06,
"loss": 1.0457,
"step": 118
},
{
"epoch": 0.32602739726027397,
"grad_norm": 0.17985540628433228,
"learning_rate": 6.767123287671233e-06,
"loss": 1.0741,
"step": 119
},
{
"epoch": 0.3287671232876712,
"grad_norm": 0.1796933263540268,
"learning_rate": 6.739726027397261e-06,
"loss": 1.0805,
"step": 120
},
{
"epoch": 0.3315068493150685,
"grad_norm": 0.17129400372505188,
"learning_rate": 6.712328767123288e-06,
"loss": 1.0443,
"step": 121
},
{
"epoch": 0.33424657534246577,
"grad_norm": 0.17514698207378387,
"learning_rate": 6.684931506849316e-06,
"loss": 1.0465,
"step": 122
},
{
"epoch": 0.336986301369863,
"grad_norm": 0.17114493250846863,
"learning_rate": 6.657534246575343e-06,
"loss": 1.0575,
"step": 123
},
{
"epoch": 0.33972602739726027,
"grad_norm": 0.1876526176929474,
"learning_rate": 6.630136986301371e-06,
"loss": 1.0671,
"step": 124
},
{
"epoch": 0.3424657534246575,
"grad_norm": 0.1693662852048874,
"learning_rate": 6.602739726027397e-06,
"loss": 1.0336,
"step": 125
},
{
"epoch": 0.3452054794520548,
"grad_norm": 0.22197027504444122,
"learning_rate": 6.5753424657534245e-06,
"loss": 1.0381,
"step": 126
},
{
"epoch": 0.34794520547945207,
"grad_norm": 0.15992571413516998,
"learning_rate": 6.547945205479452e-06,
"loss": 1.0552,
"step": 127
},
{
"epoch": 0.3506849315068493,
"grad_norm": 0.19459573924541473,
"learning_rate": 6.5205479452054794e-06,
"loss": 1.0716,
"step": 128
},
{
"epoch": 0.35342465753424657,
"grad_norm": 0.16909852623939514,
"learning_rate": 6.493150684931508e-06,
"loss": 1.0082,
"step": 129
},
{
"epoch": 0.3561643835616438,
"grad_norm": 0.18583637475967407,
"learning_rate": 6.465753424657535e-06,
"loss": 1.0846,
"step": 130
},
{
"epoch": 0.3589041095890411,
"grad_norm": 0.16531455516815186,
"learning_rate": 6.438356164383563e-06,
"loss": 1.0579,
"step": 131
},
{
"epoch": 0.36164383561643837,
"grad_norm": 0.15629485249519348,
"learning_rate": 6.41095890410959e-06,
"loss": 1.041,
"step": 132
},
{
"epoch": 0.3643835616438356,
"grad_norm": 0.15893372893333435,
"learning_rate": 6.3835616438356175e-06,
"loss": 1.0363,
"step": 133
},
{
"epoch": 0.36712328767123287,
"grad_norm": 0.17731238901615143,
"learning_rate": 6.356164383561645e-06,
"loss": 1.0379,
"step": 134
},
{
"epoch": 0.3698630136986301,
"grad_norm": 0.20481380820274353,
"learning_rate": 6.328767123287672e-06,
"loss": 1.0506,
"step": 135
},
{
"epoch": 0.3726027397260274,
"grad_norm": 0.15704141557216644,
"learning_rate": 6.301369863013699e-06,
"loss": 1.0326,
"step": 136
},
{
"epoch": 0.37534246575342467,
"grad_norm": 0.17464356124401093,
"learning_rate": 6.2739726027397265e-06,
"loss": 1.0506,
"step": 137
},
{
"epoch": 0.3780821917808219,
"grad_norm": 0.15756955742835999,
"learning_rate": 6.246575342465754e-06,
"loss": 1.0493,
"step": 138
},
{
"epoch": 0.38082191780821917,
"grad_norm": 0.14617951214313507,
"learning_rate": 6.219178082191781e-06,
"loss": 1.0398,
"step": 139
},
{
"epoch": 0.3835616438356164,
"grad_norm": 0.1518000364303589,
"learning_rate": 6.191780821917809e-06,
"loss": 1.029,
"step": 140
},
{
"epoch": 0.3863013698630137,
"grad_norm": 0.15911860764026642,
"learning_rate": 6.164383561643836e-06,
"loss": 1.0632,
"step": 141
},
{
"epoch": 0.38904109589041097,
"grad_norm": 0.14329439401626587,
"learning_rate": 6.136986301369864e-06,
"loss": 0.9982,
"step": 142
},
{
"epoch": 0.3917808219178082,
"grad_norm": 0.14678213000297546,
"learning_rate": 6.109589041095891e-06,
"loss": 1.0269,
"step": 143
},
{
"epoch": 0.39452054794520547,
"grad_norm": 0.1700473427772522,
"learning_rate": 6.082191780821919e-06,
"loss": 1.0589,
"step": 144
},
{
"epoch": 0.3972602739726027,
"grad_norm": 0.15550148487091064,
"learning_rate": 6.054794520547945e-06,
"loss": 1.0175,
"step": 145
},
{
"epoch": 0.4,
"grad_norm": 0.1483745574951172,
"learning_rate": 6.027397260273973e-06,
"loss": 1.0354,
"step": 146
},
{
"epoch": 0.40273972602739727,
"grad_norm": 0.14025282859802246,
"learning_rate": 6e-06,
"loss": 0.9656,
"step": 147
},
{
"epoch": 0.4054794520547945,
"grad_norm": 0.15070085227489471,
"learning_rate": 5.972602739726028e-06,
"loss": 1.0103,
"step": 148
},
{
"epoch": 0.40821917808219177,
"grad_norm": 0.16396139562129974,
"learning_rate": 5.945205479452055e-06,
"loss": 1.0236,
"step": 149
},
{
"epoch": 0.410958904109589,
"grad_norm": 0.14801575243473053,
"learning_rate": 5.9178082191780825e-06,
"loss": 1.0818,
"step": 150
},
{
"epoch": 0.4136986301369863,
"grad_norm": 0.15337041020393372,
"learning_rate": 5.89041095890411e-06,
"loss": 1.041,
"step": 151
},
{
"epoch": 0.41643835616438357,
"grad_norm": 0.1659669727087021,
"learning_rate": 5.863013698630137e-06,
"loss": 0.9632,
"step": 152
},
{
"epoch": 0.4191780821917808,
"grad_norm": 0.19434167444705963,
"learning_rate": 5.835616438356166e-06,
"loss": 1.0365,
"step": 153
},
{
"epoch": 0.42191780821917807,
"grad_norm": 0.1424913853406906,
"learning_rate": 5.8082191780821915e-06,
"loss": 1.0185,
"step": 154
},
{
"epoch": 0.4246575342465753,
"grad_norm": 0.159869983792305,
"learning_rate": 5.780821917808219e-06,
"loss": 1.0568,
"step": 155
},
{
"epoch": 0.4273972602739726,
"grad_norm": 0.14181528985500336,
"learning_rate": 5.753424657534246e-06,
"loss": 1.0098,
"step": 156
},
{
"epoch": 0.4301369863013699,
"grad_norm": 0.17228105664253235,
"learning_rate": 5.726027397260274e-06,
"loss": 1.0654,
"step": 157
},
{
"epoch": 0.4328767123287671,
"grad_norm": 0.15940870344638824,
"learning_rate": 5.698630136986302e-06,
"loss": 1.0354,
"step": 158
},
{
"epoch": 0.43561643835616437,
"grad_norm": 0.1820170134305954,
"learning_rate": 5.6712328767123296e-06,
"loss": 1.0184,
"step": 159
},
{
"epoch": 0.4383561643835616,
"grad_norm": 0.14352162182331085,
"learning_rate": 5.643835616438357e-06,
"loss": 1.0303,
"step": 160
},
{
"epoch": 0.4410958904109589,
"grad_norm": 0.14085696637630463,
"learning_rate": 5.6164383561643845e-06,
"loss": 1.0275,
"step": 161
},
{
"epoch": 0.4438356164383562,
"grad_norm": 0.14605316519737244,
"learning_rate": 5.589041095890412e-06,
"loss": 1.0231,
"step": 162
},
{
"epoch": 0.4465753424657534,
"grad_norm": 0.137510746717453,
"learning_rate": 5.561643835616439e-06,
"loss": 0.9733,
"step": 163
},
{
"epoch": 0.44931506849315067,
"grad_norm": 0.1410391479730606,
"learning_rate": 5.534246575342466e-06,
"loss": 1.0079,
"step": 164
},
{
"epoch": 0.4520547945205479,
"grad_norm": 0.17175598442554474,
"learning_rate": 5.506849315068493e-06,
"loss": 1.0035,
"step": 165
},
{
"epoch": 0.4547945205479452,
"grad_norm": 0.14537735283374786,
"learning_rate": 5.479452054794521e-06,
"loss": 1.0061,
"step": 166
},
{
"epoch": 0.4575342465753425,
"grad_norm": 0.2274583876132965,
"learning_rate": 5.452054794520548e-06,
"loss": 1.0426,
"step": 167
},
{
"epoch": 0.4602739726027397,
"grad_norm": 0.14900068938732147,
"learning_rate": 5.424657534246576e-06,
"loss": 1.0255,
"step": 168
},
{
"epoch": 0.46301369863013697,
"grad_norm": 0.14231647551059723,
"learning_rate": 5.397260273972603e-06,
"loss": 1.0428,
"step": 169
},
{
"epoch": 0.4657534246575342,
"grad_norm": 0.1493074893951416,
"learning_rate": 5.369863013698631e-06,
"loss": 0.9676,
"step": 170
},
{
"epoch": 0.4684931506849315,
"grad_norm": 0.18627113103866577,
"learning_rate": 5.342465753424658e-06,
"loss": 1.009,
"step": 171
},
{
"epoch": 0.4712328767123288,
"grad_norm": 0.16311538219451904,
"learning_rate": 5.3150684931506856e-06,
"loss": 1.0585,
"step": 172
},
{
"epoch": 0.473972602739726,
"grad_norm": 0.15902486443519592,
"learning_rate": 5.287671232876713e-06,
"loss": 0.987,
"step": 173
},
{
"epoch": 0.4767123287671233,
"grad_norm": 0.1851184368133545,
"learning_rate": 5.26027397260274e-06,
"loss": 1.0593,
"step": 174
},
{
"epoch": 0.4794520547945205,
"grad_norm": 0.14445023238658905,
"learning_rate": 5.232876712328767e-06,
"loss": 1.0439,
"step": 175
},
{
"epoch": 0.4821917808219178,
"grad_norm": 0.16020196676254272,
"learning_rate": 5.2054794520547945e-06,
"loss": 1.0296,
"step": 176
},
{
"epoch": 0.4849315068493151,
"grad_norm": 0.13849924504756927,
"learning_rate": 5.178082191780822e-06,
"loss": 1.0116,
"step": 177
},
{
"epoch": 0.4876712328767123,
"grad_norm": 0.13803769648075104,
"learning_rate": 5.1506849315068494e-06,
"loss": 0.9989,
"step": 178
},
{
"epoch": 0.4904109589041096,
"grad_norm": 0.24443377554416656,
"learning_rate": 5.123287671232877e-06,
"loss": 0.9311,
"step": 179
},
{
"epoch": 0.4931506849315068,
"grad_norm": 0.18483811616897583,
"learning_rate": 5.095890410958904e-06,
"loss": 0.965,
"step": 180
},
{
"epoch": 0.4958904109589041,
"grad_norm": 0.13710997998714447,
"learning_rate": 5.068493150684932e-06,
"loss": 0.9735,
"step": 181
},
{
"epoch": 0.4986301369863014,
"grad_norm": 0.1499159336090088,
"learning_rate": 5.04109589041096e-06,
"loss": 1.0204,
"step": 182
},
{
"epoch": 0.5013698630136987,
"grad_norm": 0.1443200260400772,
"learning_rate": 5.0136986301369875e-06,
"loss": 0.9589,
"step": 183
},
{
"epoch": 0.5041095890410959,
"grad_norm": 0.14708077907562256,
"learning_rate": 4.986301369863014e-06,
"loss": 0.9893,
"step": 184
},
{
"epoch": 0.5068493150684932,
"grad_norm": 0.1446971595287323,
"learning_rate": 4.958904109589042e-06,
"loss": 1.0013,
"step": 185
},
{
"epoch": 0.5095890410958904,
"grad_norm": 0.14119097590446472,
"learning_rate": 4.931506849315069e-06,
"loss": 1.0061,
"step": 186
},
{
"epoch": 0.5123287671232877,
"grad_norm": 0.14687605202198029,
"learning_rate": 4.9041095890410965e-06,
"loss": 1.0259,
"step": 187
},
{
"epoch": 0.5150684931506849,
"grad_norm": 0.14031365513801575,
"learning_rate": 4.876712328767124e-06,
"loss": 1.0262,
"step": 188
},
{
"epoch": 0.5178082191780822,
"grad_norm": 0.1647661030292511,
"learning_rate": 4.849315068493151e-06,
"loss": 1.0225,
"step": 189
},
{
"epoch": 0.5205479452054794,
"grad_norm": 0.14606985449790955,
"learning_rate": 4.821917808219179e-06,
"loss": 0.9771,
"step": 190
},
{
"epoch": 0.5232876712328767,
"grad_norm": 0.15172745287418365,
"learning_rate": 4.7945205479452054e-06,
"loss": 1.0252,
"step": 191
},
{
"epoch": 0.5260273972602739,
"grad_norm": 0.1375645399093628,
"learning_rate": 4.767123287671233e-06,
"loss": 1.0064,
"step": 192
},
{
"epoch": 0.5287671232876713,
"grad_norm": 0.1723964810371399,
"learning_rate": 4.73972602739726e-06,
"loss": 0.9935,
"step": 193
},
{
"epoch": 0.5315068493150685,
"grad_norm": 0.1510409563779831,
"learning_rate": 4.712328767123288e-06,
"loss": 1.0253,
"step": 194
},
{
"epoch": 0.5342465753424658,
"grad_norm": 0.14847567677497864,
"learning_rate": 4.684931506849315e-06,
"loss": 0.9796,
"step": 195
},
{
"epoch": 0.536986301369863,
"grad_norm": 0.14238575100898743,
"learning_rate": 4.657534246575343e-06,
"loss": 0.9835,
"step": 196
},
{
"epoch": 0.5397260273972603,
"grad_norm": 0.15265069901943207,
"learning_rate": 4.63013698630137e-06,
"loss": 1.0312,
"step": 197
},
{
"epoch": 0.5424657534246575,
"grad_norm": 0.13926750421524048,
"learning_rate": 4.602739726027398e-06,
"loss": 0.9606,
"step": 198
},
{
"epoch": 0.5452054794520548,
"grad_norm": 0.1447301059961319,
"learning_rate": 4.575342465753425e-06,
"loss": 0.9946,
"step": 199
},
{
"epoch": 0.547945205479452,
"grad_norm": 0.18391898274421692,
"learning_rate": 4.5479452054794525e-06,
"loss": 0.9846,
"step": 200
},
{
"epoch": 0.5506849315068493,
"grad_norm": 0.13554824888706207,
"learning_rate": 4.52054794520548e-06,
"loss": 1.0005,
"step": 201
},
{
"epoch": 0.5534246575342465,
"grad_norm": 0.14264196157455444,
"learning_rate": 4.493150684931507e-06,
"loss": 0.9962,
"step": 202
},
{
"epoch": 0.5561643835616439,
"grad_norm": 0.14450880885124207,
"learning_rate": 4.465753424657535e-06,
"loss": 0.9783,
"step": 203
},
{
"epoch": 0.5589041095890411,
"grad_norm": 0.14610551297664642,
"learning_rate": 4.438356164383562e-06,
"loss": 1.0051,
"step": 204
},
{
"epoch": 0.5616438356164384,
"grad_norm": 0.1378694474697113,
"learning_rate": 4.41095890410959e-06,
"loss": 0.9696,
"step": 205
},
{
"epoch": 0.5643835616438356,
"grad_norm": 0.14404141902923584,
"learning_rate": 4.383561643835616e-06,
"loss": 0.9924,
"step": 206
},
{
"epoch": 0.5671232876712329,
"grad_norm": 0.2318737804889679,
"learning_rate": 4.356164383561644e-06,
"loss": 0.9768,
"step": 207
},
{
"epoch": 0.5698630136986301,
"grad_norm": 0.14497965574264526,
"learning_rate": 4.328767123287671e-06,
"loss": 0.9792,
"step": 208
},
{
"epoch": 0.5726027397260274,
"grad_norm": 0.2267143577337265,
"learning_rate": 4.301369863013699e-06,
"loss": 0.9884,
"step": 209
},
{
"epoch": 0.5753424657534246,
"grad_norm": 0.14338290691375732,
"learning_rate": 4.273972602739727e-06,
"loss": 0.9647,
"step": 210
},
{
"epoch": 0.5780821917808219,
"grad_norm": 0.1473742127418518,
"learning_rate": 4.246575342465754e-06,
"loss": 1.0206,
"step": 211
},
{
"epoch": 0.5808219178082191,
"grad_norm": 0.18932686746120453,
"learning_rate": 4.219178082191781e-06,
"loss": 0.9899,
"step": 212
},
{
"epoch": 0.5835616438356165,
"grad_norm": 0.179109126329422,
"learning_rate": 4.1917808219178085e-06,
"loss": 0.9942,
"step": 213
},
{
"epoch": 0.5863013698630137,
"grad_norm": 0.16374213993549347,
"learning_rate": 4.164383561643836e-06,
"loss": 0.987,
"step": 214
},
{
"epoch": 0.589041095890411,
"grad_norm": 0.15565191209316254,
"learning_rate": 4.136986301369863e-06,
"loss": 1.0136,
"step": 215
},
{
"epoch": 0.5917808219178082,
"grad_norm": 0.15738491714000702,
"learning_rate": 4.109589041095891e-06,
"loss": 0.9797,
"step": 216
},
{
"epoch": 0.5945205479452055,
"grad_norm": 0.1617341786623001,
"learning_rate": 4.082191780821918e-06,
"loss": 1.004,
"step": 217
},
{
"epoch": 0.5972602739726027,
"grad_norm": 0.15519091486930847,
"learning_rate": 4.054794520547946e-06,
"loss": 0.9476,
"step": 218
},
{
"epoch": 0.6,
"grad_norm": 0.14793817698955536,
"learning_rate": 4.027397260273973e-06,
"loss": 0.9594,
"step": 219
},
{
"epoch": 0.6027397260273972,
"grad_norm": 0.1568097174167633,
"learning_rate": 4.000000000000001e-06,
"loss": 0.9404,
"step": 220
},
{
"epoch": 0.6054794520547945,
"grad_norm": 0.1501924991607666,
"learning_rate": 3.972602739726027e-06,
"loss": 0.9825,
"step": 221
},
{
"epoch": 0.6082191780821918,
"grad_norm": 0.2527293562889099,
"learning_rate": 3.945205479452055e-06,
"loss": 0.9469,
"step": 222
},
{
"epoch": 0.6109589041095891,
"grad_norm": 0.2237492948770523,
"learning_rate": 3.917808219178082e-06,
"loss": 0.9647,
"step": 223
},
{
"epoch": 0.6136986301369863,
"grad_norm": 0.1720942109823227,
"learning_rate": 3.89041095890411e-06,
"loss": 0.9707,
"step": 224
},
{
"epoch": 0.6164383561643836,
"grad_norm": 0.17925786972045898,
"learning_rate": 3.863013698630138e-06,
"loss": 0.9851,
"step": 225
},
{
"epoch": 0.6191780821917808,
"grad_norm": 0.185493603348732,
"learning_rate": 3.8356164383561645e-06,
"loss": 0.9851,
"step": 226
},
{
"epoch": 0.6219178082191781,
"grad_norm": 0.15975800156593323,
"learning_rate": 3.808219178082192e-06,
"loss": 0.9983,
"step": 227
},
{
"epoch": 0.6246575342465753,
"grad_norm": 0.1643008291721344,
"learning_rate": 3.7808219178082194e-06,
"loss": 0.9997,
"step": 228
},
{
"epoch": 0.6273972602739726,
"grad_norm": 0.15068687498569489,
"learning_rate": 3.753424657534247e-06,
"loss": 0.9795,
"step": 229
},
{
"epoch": 0.6301369863013698,
"grad_norm": 0.15405616164207458,
"learning_rate": 3.7260273972602743e-06,
"loss": 0.9906,
"step": 230
},
{
"epoch": 0.6328767123287671,
"grad_norm": 0.15264445543289185,
"learning_rate": 3.6986301369863014e-06,
"loss": 1.0125,
"step": 231
},
{
"epoch": 0.6356164383561644,
"grad_norm": 0.14779290556907654,
"learning_rate": 3.671232876712329e-06,
"loss": 0.9795,
"step": 232
},
{
"epoch": 0.6383561643835617,
"grad_norm": 0.16402508318424225,
"learning_rate": 3.6438356164383567e-06,
"loss": 1.002,
"step": 233
},
{
"epoch": 0.6410958904109589,
"grad_norm": 0.15044072270393372,
"learning_rate": 3.616438356164384e-06,
"loss": 0.9679,
"step": 234
},
{
"epoch": 0.6438356164383562,
"grad_norm": 0.1484067738056183,
"learning_rate": 3.5890410958904116e-06,
"loss": 1.0121,
"step": 235
},
{
"epoch": 0.6465753424657534,
"grad_norm": 0.14875975251197815,
"learning_rate": 3.5616438356164386e-06,
"loss": 0.9647,
"step": 236
},
{
"epoch": 0.6493150684931507,
"grad_norm": 0.1453394740819931,
"learning_rate": 3.534246575342466e-06,
"loss": 0.9703,
"step": 237
},
{
"epoch": 0.6520547945205479,
"grad_norm": 0.17434316873550415,
"learning_rate": 3.5068493150684935e-06,
"loss": 0.9591,
"step": 238
},
{
"epoch": 0.6547945205479452,
"grad_norm": 0.1531129777431488,
"learning_rate": 3.479452054794521e-06,
"loss": 0.9752,
"step": 239
},
{
"epoch": 0.6575342465753424,
"grad_norm": 0.16097253561019897,
"learning_rate": 3.4520547945205484e-06,
"loss": 0.9915,
"step": 240
},
{
"epoch": 0.6602739726027397,
"grad_norm": 0.14477920532226562,
"learning_rate": 3.4246575342465754e-06,
"loss": 0.995,
"step": 241
},
{
"epoch": 0.663013698630137,
"grad_norm": 0.1426738202571869,
"learning_rate": 3.397260273972603e-06,
"loss": 0.9429,
"step": 242
},
{
"epoch": 0.6657534246575343,
"grad_norm": 0.16615210473537445,
"learning_rate": 3.3698630136986303e-06,
"loss": 0.9951,
"step": 243
},
{
"epoch": 0.6684931506849315,
"grad_norm": 0.1474033147096634,
"learning_rate": 3.342465753424658e-06,
"loss": 0.958,
"step": 244
},
{
"epoch": 0.6712328767123288,
"grad_norm": 0.14540505409240723,
"learning_rate": 3.3150684931506857e-06,
"loss": 0.9452,
"step": 245
},
{
"epoch": 0.673972602739726,
"grad_norm": 0.22462385892868042,
"learning_rate": 3.2876712328767123e-06,
"loss": 0.9503,
"step": 246
},
{
"epoch": 0.6767123287671233,
"grad_norm": 0.14661099016666412,
"learning_rate": 3.2602739726027397e-06,
"loss": 0.9807,
"step": 247
},
{
"epoch": 0.6794520547945205,
"grad_norm": 0.14879795908927917,
"learning_rate": 3.2328767123287676e-06,
"loss": 0.9578,
"step": 248
},
{
"epoch": 0.6821917808219178,
"grad_norm": 0.16298916935920715,
"learning_rate": 3.205479452054795e-06,
"loss": 0.9525,
"step": 249
},
{
"epoch": 0.684931506849315,
"grad_norm": 0.1545659750699997,
"learning_rate": 3.1780821917808225e-06,
"loss": 0.9884,
"step": 250
},
{
"epoch": 0.6876712328767123,
"grad_norm": 0.14649316668510437,
"learning_rate": 3.1506849315068495e-06,
"loss": 0.9831,
"step": 251
},
{
"epoch": 0.6904109589041096,
"grad_norm": 0.25621578097343445,
"learning_rate": 3.123287671232877e-06,
"loss": 1.0186,
"step": 252
},
{
"epoch": 0.6931506849315069,
"grad_norm": 0.17166805267333984,
"learning_rate": 3.0958904109589044e-06,
"loss": 0.976,
"step": 253
},
{
"epoch": 0.6958904109589041,
"grad_norm": 0.2713611125946045,
"learning_rate": 3.068493150684932e-06,
"loss": 1.0367,
"step": 254
},
{
"epoch": 0.6986301369863014,
"grad_norm": 0.21252557635307312,
"learning_rate": 3.0410958904109593e-06,
"loss": 0.9695,
"step": 255
},
{
"epoch": 0.7013698630136986,
"grad_norm": 0.15245412290096283,
"learning_rate": 3.0136986301369864e-06,
"loss": 0.903,
"step": 256
},
{
"epoch": 0.7041095890410959,
"grad_norm": 0.16625380516052246,
"learning_rate": 2.986301369863014e-06,
"loss": 1.0032,
"step": 257
},
{
"epoch": 0.7068493150684931,
"grad_norm": 0.1509459912776947,
"learning_rate": 2.9589041095890413e-06,
"loss": 0.9269,
"step": 258
},
{
"epoch": 0.7095890410958904,
"grad_norm": 0.1466090828180313,
"learning_rate": 2.9315068493150687e-06,
"loss": 0.9565,
"step": 259
},
{
"epoch": 0.7123287671232876,
"grad_norm": 0.14883840084075928,
"learning_rate": 2.9041095890410957e-06,
"loss": 0.9848,
"step": 260
},
{
"epoch": 0.7150684931506849,
"grad_norm": 0.16292427480220795,
"learning_rate": 2.876712328767123e-06,
"loss": 1.0061,
"step": 261
},
{
"epoch": 0.7178082191780822,
"grad_norm": 0.2424314171075821,
"learning_rate": 2.849315068493151e-06,
"loss": 0.9698,
"step": 262
},
{
"epoch": 0.7205479452054795,
"grad_norm": 0.15062043070793152,
"learning_rate": 2.8219178082191785e-06,
"loss": 0.9648,
"step": 263
},
{
"epoch": 0.7232876712328767,
"grad_norm": 0.14946803450584412,
"learning_rate": 2.794520547945206e-06,
"loss": 0.9775,
"step": 264
},
{
"epoch": 0.726027397260274,
"grad_norm": 0.15413372218608856,
"learning_rate": 2.767123287671233e-06,
"loss": 0.9674,
"step": 265
},
{
"epoch": 0.7287671232876712,
"grad_norm": 0.15046992897987366,
"learning_rate": 2.7397260273972604e-06,
"loss": 0.9858,
"step": 266
},
{
"epoch": 0.7315068493150685,
"grad_norm": 0.14373600482940674,
"learning_rate": 2.712328767123288e-06,
"loss": 0.9536,
"step": 267
},
{
"epoch": 0.7342465753424657,
"grad_norm": 0.14666618406772614,
"learning_rate": 2.6849315068493153e-06,
"loss": 0.9525,
"step": 268
},
{
"epoch": 0.736986301369863,
"grad_norm": 0.21461984515190125,
"learning_rate": 2.6575342465753428e-06,
"loss": 0.9445,
"step": 269
},
{
"epoch": 0.7397260273972602,
"grad_norm": 0.1491764336824417,
"learning_rate": 2.63013698630137e-06,
"loss": 0.9414,
"step": 270
},
{
"epoch": 0.7424657534246575,
"grad_norm": 0.14540155231952667,
"learning_rate": 2.6027397260273973e-06,
"loss": 0.9715,
"step": 271
},
{
"epoch": 0.7452054794520548,
"grad_norm": 0.14945320785045624,
"learning_rate": 2.5753424657534247e-06,
"loss": 0.9837,
"step": 272
},
{
"epoch": 0.7479452054794521,
"grad_norm": 0.2210550606250763,
"learning_rate": 2.547945205479452e-06,
"loss": 0.9103,
"step": 273
},
{
"epoch": 0.7506849315068493,
"grad_norm": 0.15028400719165802,
"learning_rate": 2.52054794520548e-06,
"loss": 0.9913,
"step": 274
},
{
"epoch": 0.7534246575342466,
"grad_norm": 0.1455719918012619,
"learning_rate": 2.493150684931507e-06,
"loss": 0.9497,
"step": 275
},
{
"epoch": 0.7561643835616438,
"grad_norm": 0.22942738234996796,
"learning_rate": 2.4657534246575345e-06,
"loss": 0.9701,
"step": 276
},
{
"epoch": 0.7589041095890411,
"grad_norm": 0.1671881526708603,
"learning_rate": 2.438356164383562e-06,
"loss": 0.9739,
"step": 277
},
{
"epoch": 0.7616438356164383,
"grad_norm": 0.15146781504154205,
"learning_rate": 2.4109589041095894e-06,
"loss": 0.9913,
"step": 278
},
{
"epoch": 0.7643835616438356,
"grad_norm": 0.1616286039352417,
"learning_rate": 2.3835616438356164e-06,
"loss": 0.9819,
"step": 279
},
{
"epoch": 0.7671232876712328,
"grad_norm": 0.15679773688316345,
"learning_rate": 2.356164383561644e-06,
"loss": 0.9558,
"step": 280
},
{
"epoch": 0.7698630136986301,
"grad_norm": 0.14924940466880798,
"learning_rate": 2.3287671232876713e-06,
"loss": 0.9881,
"step": 281
},
{
"epoch": 0.7726027397260274,
"grad_norm": 0.16193532943725586,
"learning_rate": 2.301369863013699e-06,
"loss": 0.9702,
"step": 282
},
{
"epoch": 0.7753424657534247,
"grad_norm": 0.17239750921726227,
"learning_rate": 2.2739726027397262e-06,
"loss": 0.9642,
"step": 283
},
{
"epoch": 0.7780821917808219,
"grad_norm": 0.14394605159759521,
"learning_rate": 2.2465753424657537e-06,
"loss": 0.928,
"step": 284
},
{
"epoch": 0.7808219178082192,
"grad_norm": 0.22770152986049652,
"learning_rate": 2.219178082191781e-06,
"loss": 0.9784,
"step": 285
},
{
"epoch": 0.7835616438356164,
"grad_norm": 0.15883322060108185,
"learning_rate": 2.191780821917808e-06,
"loss": 0.9588,
"step": 286
},
{
"epoch": 0.7863013698630137,
"grad_norm": 0.17964458465576172,
"learning_rate": 2.1643835616438356e-06,
"loss": 0.9669,
"step": 287
},
{
"epoch": 0.7890410958904109,
"grad_norm": 0.15230049192905426,
"learning_rate": 2.1369863013698635e-06,
"loss": 0.9646,
"step": 288
},
{
"epoch": 0.7917808219178082,
"grad_norm": 0.16122160851955414,
"learning_rate": 2.1095890410958905e-06,
"loss": 0.9775,
"step": 289
},
{
"epoch": 0.7945205479452054,
"grad_norm": 0.1844584345817566,
"learning_rate": 2.082191780821918e-06,
"loss": 0.9625,
"step": 290
},
{
"epoch": 0.7972602739726027,
"grad_norm": 0.14859682321548462,
"learning_rate": 2.0547945205479454e-06,
"loss": 0.9514,
"step": 291
},
{
"epoch": 0.8,
"grad_norm": 0.22821080684661865,
"learning_rate": 2.027397260273973e-06,
"loss": 1.0101,
"step": 292
},
{
"epoch": 0.8027397260273973,
"grad_norm": 0.28670504689216614,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.0066,
"step": 293
},
{
"epoch": 0.8054794520547945,
"grad_norm": 0.18800397217273712,
"learning_rate": 1.9726027397260274e-06,
"loss": 0.9509,
"step": 294
},
{
"epoch": 0.8082191780821918,
"grad_norm": 0.17831219732761383,
"learning_rate": 1.945205479452055e-06,
"loss": 0.9519,
"step": 295
},
{
"epoch": 0.810958904109589,
"grad_norm": 0.16340646147727966,
"learning_rate": 1.9178082191780823e-06,
"loss": 0.9664,
"step": 296
},
{
"epoch": 0.8136986301369863,
"grad_norm": 0.25938671827316284,
"learning_rate": 1.8904109589041097e-06,
"loss": 0.9929,
"step": 297
},
{
"epoch": 0.8164383561643835,
"grad_norm": 0.20670604705810547,
"learning_rate": 1.8630136986301372e-06,
"loss": 0.955,
"step": 298
},
{
"epoch": 0.8191780821917808,
"grad_norm": 0.16003543138504028,
"learning_rate": 1.8356164383561644e-06,
"loss": 0.9041,
"step": 299
},
{
"epoch": 0.821917808219178,
"grad_norm": 0.17774829268455505,
"learning_rate": 1.808219178082192e-06,
"loss": 0.9766,
"step": 300
},
{
"epoch": 0.8246575342465754,
"grad_norm": 0.1567939966917038,
"learning_rate": 1.7808219178082193e-06,
"loss": 0.9925,
"step": 301
},
{
"epoch": 0.8273972602739726,
"grad_norm": 0.17240066826343536,
"learning_rate": 1.7534246575342468e-06,
"loss": 0.9758,
"step": 302
},
{
"epoch": 0.8301369863013699,
"grad_norm": 0.19978557527065277,
"learning_rate": 1.7260273972602742e-06,
"loss": 1.028,
"step": 303
},
{
"epoch": 0.8328767123287671,
"grad_norm": 0.2522045075893402,
"learning_rate": 1.6986301369863014e-06,
"loss": 0.9918,
"step": 304
},
{
"epoch": 0.8356164383561644,
"grad_norm": 0.15379253029823303,
"learning_rate": 1.671232876712329e-06,
"loss": 0.9816,
"step": 305
},
{
"epoch": 0.8383561643835616,
"grad_norm": 0.1589875966310501,
"learning_rate": 1.6438356164383561e-06,
"loss": 0.9529,
"step": 306
},
{
"epoch": 0.8410958904109589,
"grad_norm": 0.1517769694328308,
"learning_rate": 1.6164383561643838e-06,
"loss": 0.9073,
"step": 307
},
{
"epoch": 0.8438356164383561,
"grad_norm": 0.14468395709991455,
"learning_rate": 1.5890410958904112e-06,
"loss": 0.9402,
"step": 308
},
{
"epoch": 0.8465753424657534,
"grad_norm": 0.2625545561313629,
"learning_rate": 1.5616438356164385e-06,
"loss": 1.0263,
"step": 309
},
{
"epoch": 0.8493150684931506,
"grad_norm": 0.1752869337797165,
"learning_rate": 1.534246575342466e-06,
"loss": 0.9552,
"step": 310
},
{
"epoch": 0.852054794520548,
"grad_norm": 0.2208409607410431,
"learning_rate": 1.5068493150684932e-06,
"loss": 0.963,
"step": 311
},
{
"epoch": 0.8547945205479452,
"grad_norm": 0.148488387465477,
"learning_rate": 1.4794520547945206e-06,
"loss": 0.9358,
"step": 312
},
{
"epoch": 0.8575342465753425,
"grad_norm": 0.1534607708454132,
"learning_rate": 1.4520547945205479e-06,
"loss": 0.9557,
"step": 313
},
{
"epoch": 0.8602739726027397,
"grad_norm": 0.24213606119155884,
"learning_rate": 1.4246575342465755e-06,
"loss": 0.9558,
"step": 314
},
{
"epoch": 0.863013698630137,
"grad_norm": 0.1569008082151413,
"learning_rate": 1.397260273972603e-06,
"loss": 0.9572,
"step": 315
},
{
"epoch": 0.8657534246575342,
"grad_norm": 0.14338001608848572,
"learning_rate": 1.3698630136986302e-06,
"loss": 0.9624,
"step": 316
},
{
"epoch": 0.8684931506849315,
"grad_norm": 0.15615947544574738,
"learning_rate": 1.3424657534246577e-06,
"loss": 0.9637,
"step": 317
},
{
"epoch": 0.8712328767123287,
"grad_norm": 0.1520787477493286,
"learning_rate": 1.315068493150685e-06,
"loss": 0.9364,
"step": 318
},
{
"epoch": 0.873972602739726,
"grad_norm": 0.15411341190338135,
"learning_rate": 1.2876712328767124e-06,
"loss": 0.9946,
"step": 319
},
{
"epoch": 0.8767123287671232,
"grad_norm": 0.1493544578552246,
"learning_rate": 1.26027397260274e-06,
"loss": 0.9843,
"step": 320
},
{
"epoch": 0.8794520547945206,
"grad_norm": 0.17391565442085266,
"learning_rate": 1.2328767123287673e-06,
"loss": 0.9841,
"step": 321
},
{
"epoch": 0.8821917808219178,
"grad_norm": 0.15456560254096985,
"learning_rate": 1.2054794520547947e-06,
"loss": 0.9773,
"step": 322
},
{
"epoch": 0.8849315068493151,
"grad_norm": 0.14900638163089752,
"learning_rate": 1.178082191780822e-06,
"loss": 0.9829,
"step": 323
},
{
"epoch": 0.8876712328767123,
"grad_norm": 0.1501474678516388,
"learning_rate": 1.1506849315068494e-06,
"loss": 0.9348,
"step": 324
},
{
"epoch": 0.8904109589041096,
"grad_norm": 0.14925050735473633,
"learning_rate": 1.1232876712328769e-06,
"loss": 0.9322,
"step": 325
},
{
"epoch": 0.8931506849315068,
"grad_norm": 0.16502335667610168,
"learning_rate": 1.095890410958904e-06,
"loss": 0.9881,
"step": 326
},
{
"epoch": 0.8958904109589041,
"grad_norm": 0.24135267734527588,
"learning_rate": 1.0684931506849318e-06,
"loss": 0.9633,
"step": 327
},
{
"epoch": 0.8986301369863013,
"grad_norm": 0.15312564373016357,
"learning_rate": 1.041095890410959e-06,
"loss": 0.97,
"step": 328
},
{
"epoch": 0.9013698630136986,
"grad_norm": 0.15663985908031464,
"learning_rate": 1.0136986301369864e-06,
"loss": 0.9347,
"step": 329
},
{
"epoch": 0.9041095890410958,
"grad_norm": 0.1708153784275055,
"learning_rate": 9.863013698630137e-07,
"loss": 0.9524,
"step": 330
},
{
"epoch": 0.9068493150684932,
"grad_norm": 0.2211901694536209,
"learning_rate": 9.589041095890411e-07,
"loss": 0.9804,
"step": 331
},
{
"epoch": 0.9095890410958904,
"grad_norm": 0.16123130917549133,
"learning_rate": 9.315068493150686e-07,
"loss": 0.9688,
"step": 332
},
{
"epoch": 0.9123287671232877,
"grad_norm": 0.17667633295059204,
"learning_rate": 9.04109589041096e-07,
"loss": 0.9707,
"step": 333
},
{
"epoch": 0.915068493150685,
"grad_norm": 0.15364082157611847,
"learning_rate": 8.767123287671234e-07,
"loss": 0.991,
"step": 334
},
{
"epoch": 0.9178082191780822,
"grad_norm": 0.15868520736694336,
"learning_rate": 8.493150684931507e-07,
"loss": 0.9807,
"step": 335
},
{
"epoch": 0.9205479452054794,
"grad_norm": 0.1566782146692276,
"learning_rate": 8.219178082191781e-07,
"loss": 0.9354,
"step": 336
},
{
"epoch": 0.9232876712328767,
"grad_norm": 0.17358021438121796,
"learning_rate": 7.945205479452056e-07,
"loss": 0.9728,
"step": 337
},
{
"epoch": 0.9260273972602739,
"grad_norm": 0.20244623720645905,
"learning_rate": 7.67123287671233e-07,
"loss": 0.9716,
"step": 338
},
{
"epoch": 0.9287671232876712,
"grad_norm": 0.15061792731285095,
"learning_rate": 7.397260273972603e-07,
"loss": 0.9489,
"step": 339
},
{
"epoch": 0.9315068493150684,
"grad_norm": 0.1528923660516739,
"learning_rate": 7.123287671232878e-07,
"loss": 0.9975,
"step": 340
},
{
"epoch": 0.9342465753424658,
"grad_norm": 0.15784519910812378,
"learning_rate": 6.849315068493151e-07,
"loss": 0.9525,
"step": 341
},
{
"epoch": 0.936986301369863,
"grad_norm": 0.15129871666431427,
"learning_rate": 6.575342465753425e-07,
"loss": 0.9815,
"step": 342
},
{
"epoch": 0.9397260273972603,
"grad_norm": 0.18728306889533997,
"learning_rate": 6.3013698630137e-07,
"loss": 0.9315,
"step": 343
},
{
"epoch": 0.9424657534246575,
"grad_norm": 0.15884990990161896,
"learning_rate": 6.027397260273974e-07,
"loss": 0.9915,
"step": 344
},
{
"epoch": 0.9452054794520548,
"grad_norm": 0.16598787903785706,
"learning_rate": 5.753424657534247e-07,
"loss": 0.9847,
"step": 345
},
{
"epoch": 0.947945205479452,
"grad_norm": 0.15402452647686005,
"learning_rate": 5.47945205479452e-07,
"loss": 0.9558,
"step": 346
},
{
"epoch": 0.9506849315068493,
"grad_norm": 0.1826355904340744,
"learning_rate": 5.205479452054795e-07,
"loss": 0.9863,
"step": 347
},
{
"epoch": 0.9534246575342465,
"grad_norm": 0.15361925959587097,
"learning_rate": 4.931506849315068e-07,
"loss": 0.9806,
"step": 348
},
{
"epoch": 0.9561643835616438,
"grad_norm": 0.1521327793598175,
"learning_rate": 4.657534246575343e-07,
"loss": 0.966,
"step": 349
},
{
"epoch": 0.958904109589041,
"grad_norm": 0.15801185369491577,
"learning_rate": 4.383561643835617e-07,
"loss": 0.95,
"step": 350
},
{
"epoch": 0.9616438356164384,
"grad_norm": 0.14759479463100433,
"learning_rate": 4.1095890410958903e-07,
"loss": 0.9835,
"step": 351
},
{
"epoch": 0.9643835616438357,
"grad_norm": 0.15526413917541504,
"learning_rate": 3.835616438356165e-07,
"loss": 0.9359,
"step": 352
},
{
"epoch": 0.9671232876712329,
"grad_norm": 0.152072012424469,
"learning_rate": 3.561643835616439e-07,
"loss": 0.943,
"step": 353
},
{
"epoch": 0.9698630136986301,
"grad_norm": 0.14854729175567627,
"learning_rate": 3.2876712328767123e-07,
"loss": 0.9297,
"step": 354
},
{
"epoch": 0.9726027397260274,
"grad_norm": 0.1651182621717453,
"learning_rate": 3.013698630136987e-07,
"loss": 0.9714,
"step": 355
},
{
"epoch": 0.9753424657534246,
"grad_norm": 0.15493687987327576,
"learning_rate": 2.73972602739726e-07,
"loss": 0.9792,
"step": 356
},
{
"epoch": 0.9780821917808219,
"grad_norm": 0.16965237259864807,
"learning_rate": 2.465753424657534e-07,
"loss": 0.9676,
"step": 357
},
{
"epoch": 0.9808219178082191,
"grad_norm": 0.2606389820575714,
"learning_rate": 2.1917808219178084e-07,
"loss": 0.9751,
"step": 358
},
{
"epoch": 0.9835616438356164,
"grad_norm": 0.17312797904014587,
"learning_rate": 1.9178082191780824e-07,
"loss": 1.0047,
"step": 359
},
{
"epoch": 0.9863013698630136,
"grad_norm": 0.16929611563682556,
"learning_rate": 1.6438356164383561e-07,
"loss": 0.9507,
"step": 360
},
{
"epoch": 0.989041095890411,
"grad_norm": 0.16281668841838837,
"learning_rate": 1.36986301369863e-07,
"loss": 0.9427,
"step": 361
},
{
"epoch": 0.9917808219178083,
"grad_norm": 0.15664711594581604,
"learning_rate": 1.0958904109589042e-07,
"loss": 0.9845,
"step": 362
},
{
"epoch": 0.9945205479452055,
"grad_norm": 0.14752177894115448,
"learning_rate": 8.219178082191781e-08,
"loss": 0.9271,
"step": 363
},
{
"epoch": 0.9972602739726028,
"grad_norm": 0.1772214025259018,
"learning_rate": 5.479452054794521e-08,
"loss": 1.0036,
"step": 364
},
{
"epoch": 1.0,
"grad_norm": 0.1503649801015854,
"learning_rate": 2.7397260273972606e-08,
"loss": 0.9636,
"step": 365
}
],
"logging_steps": 1.0,
"max_steps": 365,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.375240306981601e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}