diff --git "a/outputs/qlora-out/checkpoint-885/trainer_state.json" "b/outputs/qlora-out/checkpoint-885/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/outputs/qlora-out/checkpoint-885/trainer_state.json"
@@ -0,0 +1,6260 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9997175939000282,
+  "eval_steps": 295,
+  "global_step": 885,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0011296243998870376,
+      "grad_norm": 0.03433802351355553,
+      "learning_rate": 2e-05,
+      "loss": 1.1396,
+      "step": 1
+    },
+    {
+      "epoch": 0.0011296243998870376,
+      "eval_loss": 1.1312694549560547,
+      "eval_runtime": 554.2279,
+      "eval_samples_per_second": 17.652,
+      "eval_steps_per_second": 8.827,
+      "step": 1
+    },
+    {
+      "epoch": 0.0022592487997740753,
+      "grad_norm": 0.03576268255710602,
+      "learning_rate": 4e-05,
+      "loss": 1.1188,
+      "step": 2
+    },
+    {
+      "epoch": 0.003388873199661113,
+      "grad_norm": 0.03207174688577652,
+      "learning_rate": 6e-05,
+      "loss": 1.2194,
+      "step": 3
+    },
+    {
+      "epoch": 0.0045184975995481505,
+      "grad_norm": 0.03231927007436752,
+      "learning_rate": 8e-05,
+      "loss": 1.1735,
+      "step": 4
+    },
+    {
+      "epoch": 0.005648121999435188,
+      "grad_norm": 0.03306754678487778,
+      "learning_rate": 0.0001,
+      "loss": 1.1689,
+      "step": 5
+    },
+    {
+      "epoch": 0.006777746399322226,
+      "grad_norm": 0.035009413957595825,
+      "learning_rate": 0.00012,
+      "loss": 1.1808,
+      "step": 6
+    },
+    {
+      "epoch": 0.007907370799209263,
+      "grad_norm": 0.035900842398405075,
+      "learning_rate": 0.00014,
+      "loss": 1.0441,
+      "step": 7
+    },
+    {
+      "epoch": 0.009036995199096301,
+      "grad_norm": 0.028419604524970055,
+      "learning_rate": 0.00016,
+      "loss": 1.0858,
+      "step": 8
+    },
+    {
+      "epoch": 0.010166619598983339,
+      "grad_norm": 0.024826928973197937,
+      "learning_rate": 0.00018,
+      "loss": 1.1937,
+      "step": 9
+    },
+    {
+      "epoch": 0.011296243998870376,
+      "grad_norm": 0.026519587263464928,
+      "learning_rate": 0.0002,
+      "loss": 1.0726,
+      "step": 10
+    },
+    {
+      "epoch": 0.012425868398757414,
+      "grad_norm": 0.024713166058063507,
+      "learning_rate": 0.00019999992946277893,
+      "loss": 1.1191,
+      "step": 11
+    },
+    {
+      "epoch": 0.013555492798644452,
+      "grad_norm": 0.02494538575410843,
+      "learning_rate": 0.00019999971785121523,
+      "loss": 1.1751,
+      "step": 12
+    },
+    {
+      "epoch": 0.014685117198531489,
+      "grad_norm": 0.023202786222100258,
+      "learning_rate": 0.00019999936516560744,
+      "loss": 1.1801,
+      "step": 13
+    },
+    {
+      "epoch": 0.015814741598418527,
+      "grad_norm": 0.021448194980621338,
+      "learning_rate": 0.00019999887140645308,
+      "loss": 1.1676,
+      "step": 14
+    },
+    {
+      "epoch": 0.016944365998305563,
+      "grad_norm": 0.021449347957968712,
+      "learning_rate": 0.00019999823657444873,
+      "loss": 1.1224,
+      "step": 15
+    },
+    {
+      "epoch": 0.018073990398192602,
+      "grad_norm": 0.021438075229525566,
+      "learning_rate": 0.00019999746067049,
+      "loss": 1.0713,
+      "step": 16
+    },
+    {
+      "epoch": 0.019203614798079638,
+      "grad_norm": 0.020727725699543953,
+      "learning_rate": 0.00019999654369567147,
+      "loss": 1.1317,
+      "step": 17
+    },
+    {
+      "epoch": 0.020333239197966677,
+      "grad_norm": 0.023839153349399567,
+      "learning_rate": 0.00019999548565128678,
+      "loss": 0.994,
+      "step": 18
+    },
+    {
+      "epoch": 0.021462863597853713,
+      "grad_norm": 0.020854901522397995,
+      "learning_rate": 0.0001999942865388285,
+      "loss": 1.0086,
+      "step": 19
+    },
+    {
+      "epoch": 0.022592487997740753,
+      "grad_norm": 0.019373737275600433,
+      "learning_rate": 0.00019999294635998833,
+      "loss": 1.0284,
+      "step": 20
+    },
+    {
+      "epoch": 0.02372211239762779,
+      "grad_norm": 0.020383458584547043,
+      "learning_rate": 0.00019999146511665692,
+      "loss": 1.0926,
+      "step": 21
+    },
+    {
+      "epoch": 0.024851736797514828,
+      "grad_norm": 0.018995020538568497,
+      "learning_rate": 0.0001999898428109239,
+      "loss": 1.0333,
+      "step": 22
+    },
+    {
+      "epoch": 0.025981361197401864,
+      "grad_norm": 0.018192732706665993,
+      "learning_rate": 0.00019998807944507791,
+      "loss": 1.1471,
+      "step": 23
+    },
+    {
+      "epoch": 0.027110985597288903,
+      "grad_norm": 0.017383620142936707,
+      "learning_rate": 0.00019998617502160664,
+      "loss": 1.0417,
+      "step": 24
+    },
+    {
+      "epoch": 0.02824060999717594,
+      "grad_norm": 0.0169441569596529,
+      "learning_rate": 0.00019998412954319675,
+      "loss": 1.0375,
+      "step": 25
+    },
+    {
+      "epoch": 0.029370234397062978,
+      "grad_norm": 0.017483294010162354,
+      "learning_rate": 0.0001999819430127339,
+      "loss": 1.1357,
+      "step": 26
+    },
+    {
+      "epoch": 0.030499858796950014,
+      "grad_norm": 0.016769247129559517,
+      "learning_rate": 0.00019997961543330269,
+      "loss": 1.0641,
+      "step": 27
+    },
+    {
+      "epoch": 0.031629483196837054,
+      "grad_norm": 0.017466655001044273,
+      "learning_rate": 0.00019997714680818673,
+      "loss": 1.0919,
+      "step": 28
+    },
+    {
+      "epoch": 0.03275910759672409,
+      "grad_norm": 0.01684914343059063,
+      "learning_rate": 0.00019997453714086866,
+      "loss": 1.1458,
+      "step": 29
+    },
+    {
+      "epoch": 0.033888731996611125,
+      "grad_norm": 0.017144447192549706,
+      "learning_rate": 0.00019997178643503004,
+      "loss": 1.0499,
+      "step": 30
+    },
+    {
+      "epoch": 0.03501835639649816,
+      "grad_norm": 0.01914755254983902,
+      "learning_rate": 0.0001999688946945514,
+      "loss": 1.03,
+      "step": 31
+    },
+    {
+      "epoch": 0.036147980796385204,
+      "grad_norm": 0.017046675086021423,
+      "learning_rate": 0.00019996586192351225,
+      "loss": 1.0747,
+      "step": 32
+    },
+    {
+      "epoch": 0.03727760519627224,
+      "grad_norm": 0.01652969978749752,
+      "learning_rate": 0.00019996268812619107,
+      "loss": 1.1694,
+      "step": 33
+    },
+    {
+      "epoch": 0.038407229596159276,
+      "grad_norm": 0.016702843829989433,
+      "learning_rate": 0.00019995937330706526,
+      "loss": 1.0339,
+      "step": 34
+    },
+    {
+      "epoch": 0.03953685399604631,
+      "grad_norm": 0.016058262437582016,
+      "learning_rate": 0.00019995591747081122,
+      "loss": 1.1002,
+      "step": 35
+    },
+    {
+      "epoch": 0.040666478395933355,
+      "grad_norm": 0.01609026826918125,
+      "learning_rate": 0.0001999523206223042,
+      "loss": 1.0934,
+      "step": 36
+    },
+    {
+      "epoch": 0.04179610279582039,
+      "grad_norm": 0.017004741355776787,
+      "learning_rate": 0.00019994858276661844,
+      "loss": 1.0748,
+      "step": 37
+    },
+    {
+      "epoch": 0.042925727195707426,
+      "grad_norm": 0.017046496272087097,
+      "learning_rate": 0.00019994470390902712,
+      "loss": 1.0851,
+      "step": 38
+    },
+    {
+      "epoch": 0.04405535159559446,
+      "grad_norm": 0.016196010634303093,
+      "learning_rate": 0.0001999406840550023,
+      "loss": 0.9796,
+      "step": 39
+    },
+    {
+      "epoch": 0.045184975995481505,
+      "grad_norm": 0.01703856885433197,
+      "learning_rate": 0.000199936523210215,
+      "loss": 1.0511,
+      "step": 40
+    },
+    {
+      "epoch": 0.04631460039536854,
+      "grad_norm": 0.017249640077352524,
+      "learning_rate": 0.00019993222138053507,
+      "loss": 0.9521,
+      "step": 41
+    },
+    {
+      "epoch": 0.04744422479525558,
+      "grad_norm": 0.01665697991847992,
+      "learning_rate": 0.0001999277785720313,
+      "loss": 1.0732,
+      "step": 42
+    },
+    {
+      "epoch": 0.04857384919514261,
+      "grad_norm": 0.016691800206899643,
+      "learning_rate": 0.0001999231947909714,
+      "loss": 1.1504,
+      "step": 43
+    },
+    {
+      "epoch": 0.049703473595029656,
+      "grad_norm": 0.016521582379937172,
+      "learning_rate": 0.00019991847004382186,
+      "loss": 1.1022,
+      "step": 44
+    },
+    {
+      "epoch": 0.05083309799491669,
+      "grad_norm": 0.017150631174445152,
+      "learning_rate": 0.00019991360433724813,
+      "loss": 1.0843,
+      "step": 45
+    },
+    {
+      "epoch": 0.05196272239480373,
+      "grad_norm": 0.016666896641254425,
+      "learning_rate": 0.00019990859767811444,
+      "loss": 1.0328,
+      "step": 46
+    },
+    {
+      "epoch": 0.05309234679469076,
+      "grad_norm": 0.016831668093800545,
+      "learning_rate": 0.0001999034500734839,
+      "loss": 1.0285,
+      "step": 47
+    },
+    {
+      "epoch": 0.054221971194577806,
+      "grad_norm": 0.01656479388475418,
+      "learning_rate": 0.00019989816153061853,
+      "loss": 0.9825,
+      "step": 48
+    },
+    {
+      "epoch": 0.05535159559446484,
+      "grad_norm": 0.016812896355986595,
+      "learning_rate": 0.00019989273205697904,
+      "loss": 1.0912,
+      "step": 49
+    },
+    {
+      "epoch": 0.05648121999435188,
+      "grad_norm": 0.01740356534719467,
+      "learning_rate": 0.0001998871616602251,
+      "loss": 1.0601,
+      "step": 50
+    },
+    {
+      "epoch": 0.057610844394238914,
+      "grad_norm": 0.017337938770651817,
+      "learning_rate": 0.00019988145034821502,
+      "loss": 1.1482,
+      "step": 51
+    },
+    {
+      "epoch": 0.058740468794125957,
+      "grad_norm": 0.017546923831105232,
+      "learning_rate": 0.0001998755981290061,
+      "loss": 1.0472,
+      "step": 52
+    },
+    {
+      "epoch": 0.05987009319401299,
+      "grad_norm": 0.017129387706518173,
+      "learning_rate": 0.00019986960501085428,
+      "loss": 1.1083,
+      "step": 53
+    },
+    {
+      "epoch": 0.06099971759390003,
+      "grad_norm": 0.01688031479716301,
+      "learning_rate": 0.00019986347100221433,
+      "loss": 1.0589,
+      "step": 54
+    },
+    {
+      "epoch": 0.062129341993787064,
+      "grad_norm": 0.017208745703101158,
+      "learning_rate": 0.00019985719611173973,
+      "loss": 1.1292,
+      "step": 55
+    },
+    {
+      "epoch": 0.06325896639367411,
+      "grad_norm": 0.01749996654689312,
+      "learning_rate": 0.0001998507803482828,
+      "loss": 1.1252,
+      "step": 56
+    },
+    {
+      "epoch": 0.06438859079356114,
+      "grad_norm": 0.017861831933259964,
+      "learning_rate": 0.00019984422372089453,
+      "loss": 1.104,
+      "step": 57
+    },
+    {
+      "epoch": 0.06551821519344818,
+      "grad_norm": 0.01797177456319332,
+      "learning_rate": 0.00019983752623882462,
+      "loss": 1.0569,
+      "step": 58
+    },
+    {
+      "epoch": 0.06664783959333521,
+      "grad_norm": 0.01702985167503357,
+      "learning_rate": 0.00019983068791152152,
+      "loss": 1.0238,
+      "step": 59
+    },
+    {
+      "epoch": 0.06777746399322225,
+      "grad_norm": 0.01766786351799965,
+      "learning_rate": 0.00019982370874863236,
+      "loss": 1.0509,
+      "step": 60
+    },
+    {
+      "epoch": 0.06890708839310929,
+      "grad_norm": 0.017592614516615868,
+      "learning_rate": 0.00019981658876000298,
+      "loss": 1.0613,
+      "step": 61
+    },
+    {
+      "epoch": 0.07003671279299632,
+      "grad_norm": 0.018118126317858696,
+      "learning_rate": 0.00019980932795567782,
+      "loss": 1.1727,
+      "step": 62
+    },
+    {
+      "epoch": 0.07116633719288337,
+      "grad_norm": 0.017846032977104187,
+      "learning_rate": 0.00019980192634590007,
+      "loss": 1.0042,
+      "step": 63
+    },
+    {
+      "epoch": 0.07229596159277041,
+      "grad_norm": 0.018457984551787376,
+      "learning_rate": 0.00019979438394111145,
+      "loss": 1.0648,
+      "step": 64
+    },
+    {
+      "epoch": 0.07342558599265744,
+      "grad_norm": 0.019480090588331223,
+      "learning_rate": 0.0001997867007519524,
+      "loss": 1.0307,
+      "step": 65
+    },
+    {
+      "epoch": 0.07455521039254448,
+      "grad_norm": 0.018102938309311867,
+      "learning_rate": 0.00019977887678926195,
+      "loss": 1.1129,
+      "step": 66
+    },
+    {
+      "epoch": 0.07568483479243152,
+      "grad_norm": 0.017858000472187996,
+      "learning_rate": 0.00019977091206407768,
+      "loss": 1.1574,
+      "step": 67
+    },
+    {
+      "epoch": 0.07681445919231855,
+      "grad_norm": 0.019296329468488693,
+      "learning_rate": 0.0001997628065876358,
+      "loss": 1.046,
+      "step": 68
+    },
+    {
+      "epoch": 0.07794408359220559,
+      "grad_norm": 0.017915885895490646,
+      "learning_rate": 0.0001997545603713711,
+      "loss": 1.0357,
+      "step": 69
+    },
+    {
+      "epoch": 0.07907370799209262,
+      "grad_norm": 0.01840789057314396,
+      "learning_rate": 0.00019974617342691678,
+      "loss": 1.0383,
+      "step": 70
+    },
+    {
+      "epoch": 0.08020333239197967,
+      "grad_norm": 0.019900545477867126,
+      "learning_rate": 0.00019973764576610478,
+      "loss": 0.9994,
+      "step": 71
+    },
+    {
+      "epoch": 0.08133295679186671,
+      "grad_norm": 0.0189906544983387,
+      "learning_rate": 0.0001997289774009654,
+      "loss": 0.9418,
+      "step": 72
+    },
+    {
+      "epoch": 0.08246258119175374,
+      "grad_norm": 0.01873377151787281,
+      "learning_rate": 0.00019972016834372749,
+      "loss": 0.9937,
+      "step": 73
+    },
+    {
+      "epoch": 0.08359220559164078,
+      "grad_norm": 0.019596470519900322,
+      "learning_rate": 0.0001997112186068184,
+      "loss": 1.0887,
+      "step": 74
+    },
+    {
+      "epoch": 0.08472182999152782,
+      "grad_norm": 0.020303891971707344,
+      "learning_rate": 0.00019970212820286394,
+      "loss": 1.0142,
+      "step": 75
+    },
+    {
+      "epoch": 0.08585145439141485,
+      "grad_norm": 0.019804317504167557,
+      "learning_rate": 0.00019969289714468825,
+      "loss": 1.0394,
+      "step": 76
+    },
+    {
+      "epoch": 0.08698107879130189,
+      "grad_norm": 0.019536610692739487,
+      "learning_rate": 0.0001996835254453141,
+      "loss": 0.9576,
+      "step": 77
+    },
+    {
+      "epoch": 0.08811070319118892,
+      "grad_norm": 0.019902685657143593,
+      "learning_rate": 0.0001996740131179625,
+      "loss": 0.9835,
+      "step": 78
+    },
+    {
+      "epoch": 0.08924032759107597,
+      "grad_norm": 0.01986609399318695,
+      "learning_rate": 0.00019966436017605297,
+      "loss": 1.0597,
+      "step": 79
+    },
+    {
+      "epoch": 0.09036995199096301,
+      "grad_norm": 0.019435487687587738,
+      "learning_rate": 0.00019965456663320329,
+      "loss": 1.0863,
+      "step": 80
+    },
+    {
+      "epoch": 0.09149957639085005,
+      "grad_norm": 0.019000260159373283,
+      "learning_rate": 0.00019964463250322966,
+      "loss": 1.0935,
+      "step": 81
+    },
+    {
+      "epoch": 0.09262920079073708,
+      "grad_norm": 0.018888210877776146,
+      "learning_rate": 0.0001996345578001466,
+      "loss": 1.0399,
+      "step": 82
+    },
+    {
+      "epoch": 0.09375882519062412,
+      "grad_norm": 0.019765490666031837,
+      "learning_rate": 0.00019962434253816694,
+      "loss": 1.0265,
+      "step": 83
+    },
+    {
+      "epoch": 0.09488844959051115,
+      "grad_norm": 0.01926722563803196,
+      "learning_rate": 0.00019961398673170181,
+      "loss": 1.0307,
+      "step": 84
+    },
+    {
+      "epoch": 0.09601807399039819,
+      "grad_norm": 0.019572502002120018,
+      "learning_rate": 0.00019960349039536062,
+      "loss": 1.0217,
+      "step": 85
+    },
+    {
+      "epoch": 0.09714769839028523,
+      "grad_norm": 0.024138784036040306,
+      "learning_rate": 0.000199592853543951,
+      "loss": 1.1376,
+      "step": 86
+    },
+    {
+      "epoch": 0.09827732279017226,
+      "grad_norm": 0.03155818581581116,
+      "learning_rate": 0.0001995820761924788,
+      "loss": 1.1029,
+      "step": 87
+    },
+    {
+      "epoch": 0.09940694719005931,
+      "grad_norm": 0.019589390605688095,
+      "learning_rate": 0.00019957115835614816,
+      "loss": 1.0353,
+      "step": 88
+    },
+    {
+      "epoch": 0.10053657158994635,
+      "grad_norm": 0.020419439300894737,
+      "learning_rate": 0.00019956010005036133,
+      "loss": 1.0228,
+      "step": 89
+    },
+    {
+      "epoch": 0.10166619598983338,
+      "grad_norm": 0.02149847149848938,
+      "learning_rate": 0.00019954890129071876,
+      "loss": 1.1214,
+      "step": 90
+    },
+    {
+      "epoch": 0.10279582038972042,
+      "grad_norm": 0.01982825994491577,
+      "learning_rate": 0.00019953756209301903,
+      "loss": 1.0302,
+      "step": 91
+    },
+    {
+      "epoch": 0.10392544478960745,
+      "grad_norm": 0.01985330507159233,
+      "learning_rate": 0.00019952608247325885,
+      "loss": 1.0674,
+      "step": 92
+    },
+    {
+      "epoch": 0.10505506918949449,
+      "grad_norm": 0.020196454599499702,
+      "learning_rate": 0.00019951446244763309,
+      "loss": 1.0113,
+      "step": 93
+    },
+    {
+      "epoch": 0.10618469358938153,
+      "grad_norm": 0.020652327686548233,
+      "learning_rate": 0.00019950270203253454,
+      "loss": 1.0635,
+      "step": 94
+    },
+    {
+      "epoch": 0.10731431798926856,
+      "grad_norm": 0.020714478567242622,
+      "learning_rate": 0.00019949080124455416,
+      "loss": 1.0226,
+      "step": 95
+    },
+    {
+      "epoch": 0.10844394238915561,
+      "grad_norm": 0.021647842600941658,
+      "learning_rate": 0.000199478760100481,
+      "loss": 1.0575,
+      "step": 96
+    },
+    {
+      "epoch": 0.10957356678904265,
+      "grad_norm": 0.02076675370335579,
+      "learning_rate": 0.00019946657861730194,
+      "loss": 1.1146,
+      "step": 97
+    },
+    {
+      "epoch": 0.11070319118892968,
+      "grad_norm": 0.02103651687502861,
+      "learning_rate": 0.000199454256812202,
+      "loss": 0.9953,
+      "step": 98
+    },
+    {
+      "epoch": 0.11183281558881672,
+      "grad_norm": 0.02276523970067501,
+      "learning_rate": 0.00019944179470256405,
+      "loss": 1.021,
+      "step": 99
+    },
+    {
+      "epoch": 0.11296243998870376,
+      "grad_norm": 0.020957166329026222,
+      "learning_rate": 0.00019942919230596896,
+      "loss": 0.9838,
+      "step": 100
+    },
+    {
+      "epoch": 0.11409206438859079,
+      "grad_norm": 0.022394055500626564,
+      "learning_rate": 0.00019941644964019552,
+      "loss": 1.0169,
+      "step": 101
+    },
+    {
+      "epoch": 0.11522168878847783,
+      "grad_norm": 0.02139163948595524,
+      "learning_rate": 0.00019940356672322037,
+      "loss": 1.0788,
+      "step": 102
+    },
+    {
+      "epoch": 0.11635131318836486,
+      "grad_norm": 0.021381577476859093,
+      "learning_rate": 0.00019939054357321799,
+      "loss": 1.0669,
+      "step": 103
+    },
+    {
+      "epoch": 0.11748093758825191,
+      "grad_norm": 0.02302641049027443,
+      "learning_rate": 0.00019937738020856072,
+      "loss": 1.0122,
+      "step": 104
+    },
+    {
+      "epoch": 0.11861056198813895,
+      "grad_norm": 0.021372724324464798,
+      "learning_rate": 0.00019936407664781868,
+      "loss": 1.0974,
+      "step": 105
+    },
+    {
+      "epoch": 0.11974018638802598,
+      "grad_norm": 0.021260784938931465,
+      "learning_rate": 0.00019935063290975986,
+      "loss": 0.9996,
+      "step": 106
+    },
+    {
+      "epoch": 0.12086981078791302,
+      "grad_norm": 0.021557705476880074,
+      "learning_rate": 0.0001993370490133499,
+      "loss": 1.0215,
+      "step": 107
+    },
+    {
+      "epoch": 0.12199943518780006,
+      "grad_norm": 0.023252975195646286,
+      "learning_rate": 0.00019932332497775215,
+      "loss": 1.0908,
+      "step": 108
+    },
+    {
+      "epoch": 0.12312905958768709,
+      "grad_norm": 0.02185026742517948,
+      "learning_rate": 0.00019930946082232783,
+      "loss": 1.0751,
+      "step": 109
+    },
+    {
+      "epoch": 0.12425868398757413,
+      "grad_norm": 0.022223595529794693,
+      "learning_rate": 0.00019929545656663562,
+      "loss": 0.9737,
+      "step": 110
+    },
+    {
+      "epoch": 0.12538830838746118,
+      "grad_norm": 0.021415019407868385,
+      "learning_rate": 0.000199281312230432,
+      "loss": 1.0864,
+      "step": 111
+    },
+    {
+      "epoch": 0.12651793278734821,
+      "grad_norm": 0.02144046686589718,
+      "learning_rate": 0.000199267027833671,
+      "loss": 0.9984,
+      "step": 112
+    },
+    {
+      "epoch": 0.12764755718723525,
+      "grad_norm": 0.0225879717618227,
+      "learning_rate": 0.00019925260339650428,
+      "loss": 1.0685,
+      "step": 113
+    },
+    {
+      "epoch": 0.12877718158712229,
+      "grad_norm": 0.022809404879808426,
+      "learning_rate": 0.000199238038939281,
+      "loss": 1.0733,
+      "step": 114
+    },
+    {
+      "epoch": 0.12990680598700932,
+      "grad_norm": 0.023381488397717476,
+      "learning_rate": 0.00019922333448254786,
+      "loss": 1.0107,
+      "step": 115
+    },
+    {
+      "epoch": 0.13103643038689636,
+      "grad_norm": 0.022633766755461693,
+      "learning_rate": 0.00019920849004704914,
+      "loss": 0.9885,
+      "step": 116
+    },
+    {
+      "epoch": 0.1321660547867834,
+      "grad_norm": 0.02235741913318634,
+      "learning_rate": 0.00019919350565372656,
+      "loss": 1.0714,
+      "step": 117
+    },
+    {
+      "epoch": 0.13329567918667043,
+      "grad_norm": 0.02206304483115673,
+      "learning_rate": 0.00019917838132371923,
+      "loss": 1.0749,
+      "step": 118
+    },
+    {
+      "epoch": 0.13442530358655747,
+      "grad_norm": 0.022310512140393257,
+      "learning_rate": 0.0001991631170783637,
+      "loss": 1.0437,
+      "step": 119
+    },
+    {
+      "epoch": 0.1355549279864445,
+      "grad_norm": 0.021498341113328934,
+      "learning_rate": 0.00019914771293919395,
+      "loss": 1.0317,
+      "step": 120
+    },
+    {
+      "epoch": 0.13668455238633154,
+      "grad_norm": 0.021773051470518112,
+      "learning_rate": 0.0001991321689279413,
+      "loss": 1.0489,
+      "step": 121
+    },
+    {
+      "epoch": 0.13781417678621857,
+      "grad_norm": 0.021639568731188774,
+      "learning_rate": 0.0001991164850665343,
+      "loss": 0.9893,
+      "step": 122
+    },
+    {
+      "epoch": 0.1389438011861056,
+      "grad_norm": 0.022304847836494446,
+      "learning_rate": 0.00019910066137709896,
+      "loss": 1.0542,
+      "step": 123
+    },
+    {
+      "epoch": 0.14007342558599264,
+      "grad_norm": 0.022173380479216576,
+      "learning_rate": 0.0001990846978819584,
+      "loss": 1.0776,
+      "step": 124
+    },
+    {
+      "epoch": 0.1412030499858797,
+      "grad_norm": 0.023623231798410416,
+      "learning_rate": 0.00019906859460363307,
+      "loss": 1.1279,
+      "step": 125
+    },
+    {
+      "epoch": 0.14233267438576674,
+      "grad_norm": 0.022697214037179947,
+      "learning_rate": 0.0001990523515648406,
+      "loss": 0.9973,
+      "step": 126
+    },
+    {
+      "epoch": 0.14346229878565378,
+      "grad_norm": 0.02267601527273655,
+      "learning_rate": 0.00019903596878849568,
+      "loss": 1.1131,
+      "step": 127
+    },
+    {
+      "epoch": 0.14459192318554082,
+      "grad_norm": 0.02244328148663044,
+      "learning_rate": 0.0001990194462977103,
+      "loss": 1.0157,
+      "step": 128
+    },
+    {
+      "epoch": 0.14572154758542785,
+      "grad_norm": 0.02371121197938919,
+      "learning_rate": 0.00019900278411579344,
+      "loss": 0.9888,
+      "step": 129
+    },
+    {
+      "epoch": 0.1468511719853149,
+      "grad_norm": 0.0227819811552763,
+      "learning_rate": 0.00019898598226625119,
+      "loss": 1.0003,
+      "step": 130
+    },
+    {
+      "epoch": 0.14798079638520192,
+      "grad_norm": 0.02316221408545971,
+      "learning_rate": 0.00019896904077278663,
+      "loss": 1.0181,
+      "step": 131
+    },
+    {
+      "epoch": 0.14911042078508896,
+      "grad_norm": 0.022808320820331573,
+      "learning_rate": 0.00019895195965929994,
+      "loss": 1.0546,
+      "step": 132
+    },
+    {
+      "epoch": 0.150240045184976,
+      "grad_norm": 0.022865859791636467,
+      "learning_rate": 0.00019893473894988815,
+      "loss": 1.1513,
+      "step": 133
+    },
+    {
+      "epoch": 0.15136966958486303,
+      "grad_norm": 0.024973077699542046,
+      "learning_rate": 0.0001989173786688453,
+      "loss": 1.0109,
+      "step": 134
+    },
+    {
+      "epoch": 0.15249929398475007,
+      "grad_norm": 0.02237241342663765,
+      "learning_rate": 0.00019889987884066237,
+      "loss": 1.0991,
+      "step": 135
+    },
+    {
+      "epoch": 0.1536289183846371,
+      "grad_norm": 0.023280750960111618,
+      "learning_rate": 0.000198882239490027,
+      "loss": 1.0501,
+      "step": 136
+    },
+    {
+      "epoch": 0.15475854278452414,
+      "grad_norm": 0.022803954780101776,
+      "learning_rate": 0.00019886446064182396,
+      "loss": 1.0033,
+      "step": 137
+    },
+    {
+      "epoch": 0.15588816718441117,
+      "grad_norm": 0.02270156517624855,
+      "learning_rate": 0.0001988465423211346,
+      "loss": 1.0715,
+      "step": 138
+    },
+    {
+      "epoch": 0.1570177915842982,
+      "grad_norm": 0.023484455421566963,
+      "learning_rate": 0.00019882848455323704,
+      "loss": 1.1598,
+      "step": 139
+    },
+    {
+      "epoch": 0.15814741598418525,
+      "grad_norm": 0.02300065942108631,
+      "learning_rate": 0.00019881028736360622,
+      "loss": 1.0813,
+      "step": 140
+    },
+    {
+      "epoch": 0.15927704038407228,
+      "grad_norm": 0.02320142462849617,
+      "learning_rate": 0.00019879195077791376,
+      "loss": 1.0169,
+      "step": 141
+    },
+    {
+      "epoch": 0.16040666478395935,
+      "grad_norm": 0.02317328006029129,
+      "learning_rate": 0.00019877347482202785,
+      "loss": 1.0301,
+      "step": 142
+    },
+    {
+      "epoch": 0.16153628918384638,
+      "grad_norm": 0.02378895878791809,
+      "learning_rate": 0.0001987548595220133,
+      "loss": 0.977,
+      "step": 143
+    },
+    {
+      "epoch": 0.16266591358373342,
+      "grad_norm": 0.023976027965545654,
+      "learning_rate": 0.00019873610490413166,
+      "loss": 1.0859,
+      "step": 144
+    },
+    {
+      "epoch": 0.16379553798362045,
+      "grad_norm": 0.022843923419713974,
+      "learning_rate": 0.0001987172109948408,
+      "loss": 1.0103,
+      "step": 145
+    },
+    {
+      "epoch": 0.1649251623835075,
+      "grad_norm": 0.023723525926470757,
+      "learning_rate": 0.00019869817782079525,
+      "loss": 1.0704,
+      "step": 146
+    },
+    {
+      "epoch": 0.16605478678339453,
+      "grad_norm": 0.02391059510409832,
+      "learning_rate": 0.00019867900540884592,
+      "loss": 1.058,
+      "step": 147
+    },
+    {
+      "epoch": 0.16718441118328156,
+      "grad_norm": 0.023995989933609962,
+      "learning_rate": 0.0001986596937860402,
+      "loss": 1.0034,
+      "step": 148
+    },
+    {
+      "epoch": 0.1683140355831686,
+      "grad_norm": 0.024091636762022972,
+      "learning_rate": 0.00019864024297962186,
+      "loss": 1.1214,
+      "step": 149
+    },
+    {
+      "epoch": 0.16944365998305563,
+      "grad_norm": 0.024035949259996414,
+      "learning_rate": 0.000198620653017031,
+      "loss": 1.0219,
+      "step": 150
+    },
+    {
+      "epoch": 0.17057328438294267,
+      "grad_norm": 0.02359904535114765,
+      "learning_rate": 0.00019860092392590408,
+      "loss": 0.9627,
+      "step": 151
+    },
+    {
+      "epoch": 0.1717029087828297,
+      "grad_norm": 0.023622050881385803,
+      "learning_rate": 0.00019858105573407377,
+      "loss": 1.0582,
+      "step": 152
+    },
+    {
+      "epoch": 0.17283253318271674,
+      "grad_norm": 0.02392633818089962,
+      "learning_rate": 0.00019856104846956906,
+      "loss": 1.0089,
+      "step": 153
+    },
+    {
+      "epoch": 0.17396215758260378,
+      "grad_norm": 0.024305053055286407,
+      "learning_rate": 0.00019854090216061502,
+      "loss": 1.0222,
+      "step": 154
+    },
+    {
+      "epoch": 0.1750917819824908,
+      "grad_norm": 0.024212822318077087,
+      "learning_rate": 0.00019852061683563296,
+      "loss": 1.0429,
+      "step": 155
+    },
+    {
+      "epoch": 0.17622140638237785,
+      "grad_norm": 0.024261048063635826,
+      "learning_rate": 0.00019850019252324032,
+      "loss": 1.0506,
+      "step": 156
+    },
+    {
+      "epoch": 0.17735103078226488,
+      "grad_norm": 0.022689295932650566,
+      "learning_rate": 0.0001984796292522506,
+      "loss": 0.9999,
+      "step": 157
+    },
+    {
+      "epoch": 0.17848065518215195,
+      "grad_norm": 0.023288823664188385,
+      "learning_rate": 0.00019845892705167324,
+      "loss": 1.0242,
+      "step": 158
+    },
+    {
+      "epoch": 0.17961027958203898,
+      "grad_norm": 0.030173135921359062,
+      "learning_rate": 0.00019843808595071383,
+      "loss": 1.0641,
+      "step": 159
+    },
+    {
+      "epoch": 0.18073990398192602,
+      "grad_norm": 0.024562738835811615,
+      "learning_rate": 0.00019841710597877382,
+      "loss": 0.9781,
+      "step": 160
+    },
+    {
+      "epoch": 0.18186952838181306,
+      "grad_norm": 0.024897055700421333,
+      "learning_rate": 0.00019839598716545057,
+      "loss": 1.1015,
+      "step": 161
+    },
+    {
+      "epoch": 0.1829991527817001,
+      "grad_norm": 0.023950692266225815,
+      "learning_rate": 0.00019837472954053732,
+      "loss": 1.125,
+      "step": 162
+    },
+    {
+      "epoch": 0.18412877718158713,
+      "grad_norm": 0.025099674239754677,
+      "learning_rate": 0.00019835333313402318,
+      "loss": 1.0359,
+      "step": 163
+    },
+    {
+      "epoch": 0.18525840158147416,
+      "grad_norm": 0.025351393967866898,
+      "learning_rate": 0.000198331797976093,
+      "loss": 1.0281,
+      "step": 164
+    },
+    {
+      "epoch": 0.1863880259813612,
+      "grad_norm": 0.024693114683032036,
+      "learning_rate": 0.00019831012409712737,
+      "loss": 1.1521,
+      "step": 165
+    },
+    {
+      "epoch": 0.18751765038124824,
+      "grad_norm": 0.024255136027932167,
+      "learning_rate": 0.0001982883115277026,
+      "loss": 1.0842,
+      "step": 166
+    },
+    {
+      "epoch": 0.18864727478113527,
+      "grad_norm": 0.02501499280333519,
+      "learning_rate": 0.00019826636029859066,
+      "loss": 0.9975,
+      "step": 167
+    },
+    {
+      "epoch": 0.1897768991810223,
+      "grad_norm": 0.025276506319642067,
+      "learning_rate": 0.00019824427044075912,
+      "loss": 1.0119,
+      "step": 168
+    },
+    {
+      "epoch": 0.19090652358090934,
+      "grad_norm": 0.024857770651578903,
+      "learning_rate": 0.0001982220419853711,
+      "loss": 0.9733,
+      "step": 169
+    },
+    {
+      "epoch": 0.19203614798079638,
+      "grad_norm": 0.02459135465323925,
+      "learning_rate": 0.0001981996749637853,
+      "loss": 1.0791,
+      "step": 170
+    },
+    {
+      "epoch": 0.19316577238068341,
+      "grad_norm": 0.026056725531816483,
+      "learning_rate": 0.00019817716940755586,
+      "loss": 1.0849,
+      "step": 171
+    },
+    {
+      "epoch": 0.19429539678057045,
+      "grad_norm": 0.024211106821894646,
+      "learning_rate": 0.0001981545253484324,
+      "loss": 1.0253,
+      "step": 172
+    },
+    {
+      "epoch": 0.1954250211804575,
+      "grad_norm": 0.024450423195958138,
+      "learning_rate": 0.00019813174281835982,
+      "loss": 1.1101,
+      "step": 173
+    },
+    {
+      "epoch": 0.19655464558034452,
+      "grad_norm": 0.02433086559176445,
+      "learning_rate": 0.0001981088218494785,
+      "loss": 0.9887,
+      "step": 174
+    },
+    {
+      "epoch": 0.1976842699802316,
+      "grad_norm": 0.02424442023038864,
+      "learning_rate": 0.0001980857624741241,
+      "loss": 1.074,
+      "step": 175
+    },
+    {
+      "epoch": 0.19881389438011862,
+      "grad_norm": 0.02318243682384491,
+      "learning_rate": 0.00019806256472482744,
+      "loss": 1.1045,
+      "step": 176
+    },
+    {
+      "epoch": 0.19994351878000566,
+      "grad_norm": 0.02407553791999817,
+      "learning_rate": 0.00019803922863431467,
+      "loss": 1.0062,
+      "step": 177
+    },
+    {
+      "epoch": 0.2010731431798927,
+      "grad_norm": 0.02463892102241516,
+      "learning_rate": 0.000198015754235507,
+      "loss": 1.0689,
+      "step": 178
+    },
+    {
+      "epoch": 0.20220276757977973,
+      "grad_norm": 0.023701028898358345,
+      "learning_rate": 0.00019799214156152083,
+      "loss": 1.0672,
+      "step": 179
+    },
+    {
+      "epoch": 0.20333239197966677,
+      "grad_norm": 0.02471453696489334,
+      "learning_rate": 0.00019796839064566761,
+      "loss": 1.033,
+      "step": 180
+    },
+    {
+      "epoch": 0.2044620163795538,
+      "grad_norm": 0.02426736056804657,
+      "learning_rate": 0.00019794450152145382,
+      "loss": 1.0831,
+      "step": 181
+    },
+    {
+      "epoch": 0.20559164077944084,
+      "grad_norm": 0.0243529062718153,
+      "learning_rate": 0.0001979204742225809,
+      "loss": 1.0815,
+      "step": 182
+    },
+    {
+      "epoch": 0.20672126517932787,
+      "grad_norm": 0.0243973471224308,
+      "learning_rate": 0.00019789630878294526,
+      "loss": 1.0541,
+      "step": 183
+    },
+    {
+      "epoch": 0.2078508895792149,
+      "grad_norm": 0.02461186796426773,
+      "learning_rate": 0.0001978720052366381,
+      "loss": 1.1203,
+      "step": 184
+    },
+    {
+      "epoch": 0.20898051397910195,
+      "grad_norm": 0.02479882724583149,
+      "learning_rate": 0.00019784756361794555,
+      "loss": 1.078,
+      "step": 185
+    },
+    {
+      "epoch": 0.21011013837898898,
+      "grad_norm": 0.02605288103222847,
+      "learning_rate": 0.00019782298396134844,
+      "loss": 1.01,
+      "step": 186
+    },
+    {
+      "epoch": 0.21123976277887602,
+      "grad_norm": 0.025911834090948105,
+      "learning_rate": 0.00019779826630152245,
+      "loss": 1.1173,
+      "step": 187
+    },
+    {
+      "epoch": 0.21236938717876305,
+      "grad_norm": 0.024420902132987976,
+      "learning_rate": 0.00019777341067333786,
+      "loss": 1.0023,
+      "step": 188
+    },
+    {
+      "epoch": 0.2134990115786501,
+      "grad_norm": 0.024010393768548965,
+      "learning_rate": 0.0001977484171118596,
+      "loss": 1.1382,
+      "step": 189
+    },
+    {
+      "epoch": 0.21462863597853712,
+      "grad_norm": 0.024915101006627083,
+      "learning_rate": 0.00019772328565234717,
+      "loss": 1.0734,
+      "step": 190
+    },
+    {
+      "epoch": 0.21575826037842416,
+      "grad_norm": 0.025032367557287216,
+      "learning_rate": 0.0001976980163302547,
+      "loss": 0.9585,
+      "step": 191
+    },
+    {
+      "epoch": 0.21688788477831122,
+      "grad_norm": 0.024727528914809227,
+      "learning_rate": 0.0001976726091812307,
+      "loss": 1.0731,
+      "step": 192
+    },
+    {
+      "epoch": 0.21801750917819826,
+      "grad_norm": 0.024914614856243134,
+      "learning_rate": 0.00019764706424111816,
+      "loss": 0.9522,
+      "step": 193
+    },
+    {
+      "epoch": 0.2191471335780853,
+      "grad_norm": 0.024750174954533577,
+      "learning_rate": 0.00019762138154595446,
+      "loss": 0.9646,
+      "step": 194
+    },
+    {
+      "epoch": 0.22027675797797233,
+      "grad_norm": 0.02512511797249317,
+      "learning_rate": 0.00019759556113197135,
+      "loss": 1.0643,
+      "step": 195
+    },
+    {
+      "epoch": 0.22140638237785937,
+      "grad_norm": 0.026546582579612732,
+      "learning_rate": 0.00019756960303559483,
+      "loss": 1.1158,
+      "step": 196
+    },
+    {
+      "epoch": 0.2225360067777464,
+      "grad_norm": 0.02506748028099537,
+      "learning_rate": 0.0001975435072934451,
+      "loss": 1.0261,
+      "step": 197
+    },
+    {
+      "epoch": 0.22366563117763344,
+      "grad_norm": 0.024585796520113945,
+      "learning_rate": 0.00019751727394233667,
+      "loss": 1.017,
+      "step": 198
+    },
+    {
+      "epoch": 0.22479525557752048,
+      "grad_norm": 0.02528531290590763,
+      "learning_rate": 0.00019749090301927796,
+      "loss": 1.042,
+      "step": 199
+    },
+    {
+      "epoch": 0.2259248799774075,
+      "grad_norm": 0.025023646652698517,
+      "learning_rate": 0.00019746439456147172,
+      "loss": 0.9618,
+      "step": 200
+    },
+    {
+      "epoch": 0.22705450437729455,
+      "grad_norm": 0.025859549641609192,
+      "learning_rate": 0.00019743774860631457,
+      "loss": 0.9982,
+      "step": 201
+    },
+    {
+      "epoch": 0.22818412877718158,
+      "grad_norm": 0.026021264493465424,
+      "learning_rate": 0.00019741096519139713,
+      "loss": 1.0131,
+      "step": 202
+    },
+    {
+      "epoch": 0.22931375317706862,
+      "grad_norm": 0.025675011798739433,
+      "learning_rate": 0.00019738404435450395,
+      "loss": 1.0186,
+      "step": 203
+    },
+    {
+      "epoch": 0.23044337757695565,
+      "grad_norm": 0.025758078321814537,
+      "learning_rate": 0.00019735698613361347,
+      "loss": 1.0869,
+      "step": 204
+    },
+    {
+      "epoch": 0.2315730019768427,
+      "grad_norm": 0.02666814811527729,
+      "learning_rate": 0.00019732979056689794,
+      "loss": 1.0894,
+      "step": 205
+    },
+    {
+      "epoch": 0.23270262637672973,
+      "grad_norm": 0.024690723046660423,
+      "learning_rate": 0.0001973024576927233,
+      "loss": 1.0898,
+      "step": 206
+    },
+    {
+      "epoch": 0.23383225077661676,
+      "grad_norm": 0.025678694248199463,
+      "learning_rate": 0.00019727498754964928,
+      "loss": 1.091,
+      "step": 207
+    },
+    {
+      "epoch": 0.23496187517650383,
+      "grad_norm": 0.025275958701968193,
+      "learning_rate": 0.00019724738017642924,
+      "loss": 1.089,
+      "step": 208
+    },
+    {
+      "epoch": 0.23609149957639086,
+      "grad_norm": 0.02560093067586422,
+      "learning_rate": 0.00019721963561201012,
+      "loss": 0.9755,
+      "step": 209
+    },
+    {
+      "epoch": 0.2372211239762779,
+      "grad_norm": 0.026244761422276497,
+      "learning_rate": 0.00019719175389553242,
+      "loss": 1.0696,
+      "step": 210
+    },
+    {
+      "epoch": 0.23835074837616493,
+      "grad_norm": 0.025443457067012787,
+      "learning_rate": 0.0001971637350663301,
+      "loss": 1.0032,
+      "step": 211
+    },
+    {
+      "epoch": 0.23948037277605197,
+      "grad_norm": 0.027356769889593124,
+      "learning_rate": 0.00019713557916393058,
+      "loss": 1.0393,
+      "step": 212
+    },
+    {
+      "epoch": 0.240609997175939,
+      "grad_norm": 0.025765880942344666,
+      "learning_rate": 0.0001971072862280546,
+      "loss": 1.015,
+      "step": 213
+    },
+    {
+      "epoch": 0.24173962157582604,
+      "grad_norm": 0.025718411430716515,
+      "learning_rate": 0.00019707885629861632,
+      "loss": 1.0343,
+      "step": 214
+    },
+    {
+      "epoch": 0.24286924597571308,
+      "grad_norm": 0.026691369712352753,
+      "learning_rate": 0.00019705028941572307,
+      "loss": 1.0896,
+      "step": 215
+    },
+    {
+      "epoch": 0.2439988703756001,
+      "grad_norm": 0.025440771132707596,
+      "learning_rate": 0.00019702158561967544,
+      "loss": 0.9986,
+      "step": 216
+    },
+    {
+      "epoch": 0.24512849477548715,
+      "grad_norm": 0.02483600750565529,
+      "learning_rate": 0.00019699274495096712,
+      "loss": 1.0287,
+      "step": 217
+    },
+    {
+      "epoch": 0.24625811917537418,
+      "grad_norm": 0.027423838153481483,
+      "learning_rate": 0.00019696376745028497,
+      "loss": 1.0626,
+      "step": 218
+    },
+    {
+      "epoch": 0.24738774357526122,
+      "grad_norm": 0.026005201041698456,
+      "learning_rate": 0.0001969346531585088,
+      "loss": 1.0203,
+      "step": 219
+    },
+    {
+      "epoch": 0.24851736797514826,
+      "grad_norm": 0.026350049301981926,
+      "learning_rate": 0.00019690540211671144,
+      "loss": 1.0482,
+      "step": 220
+    },
+    {
+      "epoch": 0.2496469923750353,
+      "grad_norm": 0.026930196210741997,
+      "learning_rate": 0.00019687601436615864,
+      "loss": 1.0258,
+      "step": 221
+    },
+    {
+      "epoch": 0.25077661677492236,
+      "grad_norm": 0.025890439748764038,
+      "learning_rate": 0.00019684648994830903,
+      "loss": 1.0886,
+      "step": 222
+    },
+    {
+      "epoch": 0.25190624117480936,
+      "grad_norm": 0.025864360854029655,
+      "learning_rate": 0.00019681682890481398,
+      "loss": 0.976,
+      "step": 223
+    },
+    {
+      "epoch": 0.25303586557469643,
+      "grad_norm": 0.025524241849780083,
+      "learning_rate": 0.00019678703127751763,
+      "loss": 1.0251,
+      "step": 224
+    },
+    {
+      "epoch": 0.25416548997458344,
+      "grad_norm": 0.02650127001106739,
+      "learning_rate": 0.00019675709710845687,
+      "loss": 1.0435,
+      "step": 225
+    },
+    {
+      "epoch": 0.2552951143744705,
+      "grad_norm": 0.025557860732078552,
+      "learning_rate": 0.00019672702643986113,
+      "loss": 1.0555,
+      "step": 226
+    },
+    {
+      "epoch": 0.2564247387743575,
+      "grad_norm": 0.027075499296188354,
+      "learning_rate": 0.0001966968193141524,
+      "loss": 0.9965,
+      "step": 227
+    },
+    {
+      "epoch": 0.25755436317424457,
+      "grad_norm": 0.025682270526885986,
+      "learning_rate": 0.00019666647577394527,
+      "loss": 1.0151,
+      "step": 228
+    },
+    {
+      "epoch": 0.2586839875741316,
+      "grad_norm": 0.026663288474082947,
+      "learning_rate": 0.00019663599586204673,
+      "loss": 1.0354,
+      "step": 229
+    },
+    {
+      "epoch": 0.25981361197401864,
+      "grad_norm": 0.026434747502207756,
+      "learning_rate": 0.0001966053796214561,
+      "loss": 1.0551,
+      "step": 230
+    },
+    {
+      "epoch": 0.26094323637390565,
+      "grad_norm": 0.025536926463246346,
+      "learning_rate": 0.0001965746270953651,
+      "loss": 0.9731,
+      "step": 231
+    },
+    {
+      "epoch": 0.2620728607737927,
+      "grad_norm": 0.07522192597389221,
+      "learning_rate": 0.0001965437383271577,
+      "loss": 0.9796,
+      "step": 232
+    },
+    {
+      "epoch": 0.2632024851736798,
+      "grad_norm": 0.027285447344183922,
+      "learning_rate": 0.00019651271336040997,
+      "loss": 1.011,
+      "step": 233
+    },
+    {
+      "epoch": 0.2643321095735668,
+      "grad_norm": 0.026399778202176094,
+      "learning_rate": 0.0001964815522388903,
+      "loss": 1.0199,
+      "step": 234
+    },
+    {
+      "epoch": 0.26546173397345385,
+      "grad_norm": 0.026532689109444618,
+      "learning_rate": 0.00019645025500655906,
+      "loss": 0.9918,
+      "step": 235
+    },
+    {
+      "epoch": 0.26659135837334086,
+      "grad_norm": 0.025576921179890633,
+      "learning_rate": 0.00019641882170756862,
+      "loss": 1.0198,
+      "step": 236
+    },
+    {
+      "epoch": 0.2677209827732279,
+      "grad_norm": 0.026158379390835762,
+      "learning_rate": 0.00019638725238626335,
+      "loss": 1.0204,
+      "step": 237
+    },
+    {
+      "epoch": 0.26885060717311493,
+      "grad_norm": 0.025530420243740082,
+      "learning_rate": 0.00019635554708717946,
+      "loss": 1.0885,
+      "step": 238
+    },
+    {
+      "epoch": 0.269980231573002,
+      "grad_norm": 0.02707337960600853,
+      "learning_rate": 0.00019632370585504502,
+      "loss": 1.0649,
+      "step": 239
+    },
+    {
+      "epoch": 0.271109855972889,
+      "grad_norm": 0.027028286829590797,
+      "learning_rate": 0.00019629172873477995,
+      "loss": 1.0544,
+      "step": 240
+    },
+    {
+      "epoch": 0.27223948037277607,
+      "grad_norm": 0.02564058266580105,
+      "learning_rate": 0.0001962596157714957,
+      "loss": 1.0481,
+      "step": 241
+    },
+    {
+      "epoch": 0.2733691047726631,
+      "grad_norm": 0.026479296386241913,
+      "learning_rate": 0.0001962273670104955,
+      "loss": 1.0413,
+      "step": 242
+    },
+    {
+      "epoch": 0.27449872917255014,
+      "grad_norm": 0.0330955870449543,
+      "learning_rate": 0.00019619498249727412,
+      "loss": 1.0292,
+      "step": 243
+    },
+    {
+      "epoch": 0.27562835357243715,
+      "grad_norm": 0.02611500211060047,
+      "learning_rate": 0.0001961624622775178,
+      "loss": 1.009,
+      "step": 244
+    },
+    {
+      "epoch": 0.2767579779723242,
+      "grad_norm": 0.026876097545027733,
+      "learning_rate": 0.00019612980639710428,
+      "loss": 0.9854,
+      "step": 245
+    },
+    {
+      "epoch": 0.2778876023722112,
+      "grad_norm": 0.02685077115893364,
+      "learning_rate": 0.00019609701490210264,
+      "loss": 1.0282,
+      "step": 246
+    },
+    {
+      "epoch": 0.2790172267720983,
+      "grad_norm": 0.026131028309464455,
+      "learning_rate": 0.00019606408783877334,
+      "loss": 1.0673,
+      "step": 247
+    },
+    {
+      "epoch": 0.2801468511719853,
+      "grad_norm": 0.02628222666680813,
+      "learning_rate": 0.00019603102525356798,
+      "loss": 1.0659,
+      "step": 248
+    },
+    {
+      "epoch": 0.28127647557187235,
+      "grad_norm": 0.027401477098464966,
+      "learning_rate": 0.00019599782719312948,
+      "loss": 0.9942,
+      "step": 249
+    },
+    {
+      "epoch": 0.2824060999717594,
+      "grad_norm": 0.02594529278576374,
+      "learning_rate": 0.00019596449370429183,
+      "loss": 1.0091,
+      "step": 250
+    },
+    {
+      "epoch": 0.2835357243716464,
+      "grad_norm": 0.028301890939474106,
+      "learning_rate": 0.00019593102483408,
+      "loss": 1.0083,
+      "step": 251
+    },
+    {
+      "epoch": 0.2846653487715335,
+      "grad_norm": 0.02808901108801365,
+      "learning_rate": 0.00019589742062971007,
+      "loss": 1.071,
+      "step": 252
+    },
+    {
+      "epoch": 0.2857949731714205,
+      "grad_norm": 0.02654552273452282,
+      "learning_rate": 0.00019586368113858892,
+      "loss": 1.0865,
+      "step": 253
+    },
+    {
+      "epoch": 0.28692459757130756,
+      "grad_norm": 0.02610975131392479,
+      "learning_rate": 0.00019582980640831443,
+      "loss": 1.1093,
+      "step": 254
+    },
+    {
+      "epoch": 0.28805422197119457,
+      "grad_norm": 0.027240293100476265,
+      "learning_rate": 0.0001957957964866751,
+      "loss": 1.0822,
+      "step": 255
+    },
+    {
+      "epoch": 0.28918384637108163,
+      "grad_norm": 0.027821950614452362,
+      "learning_rate": 0.00019576165142165032,
+      "loss": 1.0371,
+      "step": 256
+    },
+    {
+      "epoch": 0.29031347077096864,
+      "grad_norm": 0.02755453623831272,
+      "learning_rate": 0.00019572737126141002,
+      "loss": 1.0752,
+      "step": 257
+    },
+    {
+      "epoch": 0.2914430951708557,
+      "grad_norm": 0.02676587551832199,
+      "learning_rate": 0.0001956929560543147,
+      "loss": 1.0599,
+      "step": 258
+    },
+    {
+      "epoch": 0.2925727195707427,
+      "grad_norm": 0.02904544584453106,
+      "learning_rate": 0.00019565840584891549,
+      "loss": 1.0568,
+      "step": 259
+    },
+    {
+      "epoch": 0.2937023439706298,
+      "grad_norm": 0.027289781719446182,
+      "learning_rate": 0.00019562372069395384,
+      "loss": 1.0671,
+      "step": 260
+    },
+    {
+      "epoch": 0.2948319683705168,
+      "grad_norm": 0.025955747812986374,
+      "learning_rate": 0.00019558890063836167,
+      "loss": 0.9118,
+      "step": 261
+    },
+    {
+      "epoch": 0.29596159277040385,
+      "grad_norm": 0.028641648590564728,
+      "learning_rate": 0.00019555394573126118,
+      "loss": 1.0498,
+      "step": 262
+    },
+    {
+      "epoch": 0.29709121717029086,
+      "grad_norm": 0.028356773778796196,
+      "learning_rate": 0.0001955188560219648,
+      "loss": 1.0238,
+      "step": 263
+    },
+    {
+      "epoch": 0.2982208415701779,
+      "grad_norm": 0.02746075950562954,
+      "learning_rate": 0.00019548363155997517,
+      "loss": 1.0741,
+      "step": 264
+    },
+    {
+      "epoch": 0.2993504659700649,
+      "grad_norm": 0.02712567336857319,
+      "learning_rate": 0.000195448272394985,
+      "loss": 1.0861,
+      "step": 265
+    },
+    {
+      "epoch": 0.300480090369952,
+      "grad_norm": 0.026709580793976784,
+      "learning_rate": 0.00019541277857687694,
+      "loss": 1.0024,
+      "step": 266
+    },
+    {
+      "epoch": 0.30160971476983905,
+      "grad_norm": 0.027716003358364105,
+      "learning_rate": 0.00019537715015572382,
+      "loss": 1.0406,
+      "step": 267
+    },
+    {
+      "epoch": 0.30273933916972606,
+      "grad_norm": 0.02704858034849167,
+      "learning_rate": 0.00019534138718178818,
+      "loss": 1.0088,
+      "step": 268
+    },
+    {
+      "epoch": 0.3038689635696131,
+      "grad_norm": 0.026793915778398514,
+      "learning_rate": 0.00019530548970552247,
+      "loss": 1.0556,
+      "step": 269
+    },
+    {
+      "epoch": 0.30499858796950013,
+      "grad_norm": 0.028323287144303322,
+      "learning_rate": 0.00019526945777756879,
+      "loss": 1.057,
+      "step": 270
+    },
+    {
+      "epoch": 0.3061282123693872,
+      "grad_norm": 0.0279136560857296,
+      "learning_rate": 0.00019523329144875904,
+      "loss": 1.0654,
+      "step": 271
+    },
+    {
+      "epoch": 0.3072578367692742,
+      "grad_norm": 0.02878638356924057,
+      "learning_rate": 0.00019519699077011465,
+      "loss": 1.0357,
+      "step": 272
+    },
+    {
+      "epoch": 0.30838746116916127,
+      "grad_norm": 0.026021145284175873,
+      "learning_rate": 0.00019516055579284658,
+      "loss": 1.092,
+      "step": 273
+    },
+    {
+      "epoch": 0.3095170855690483,
+      "grad_norm": 0.0282638818025589,
+      "learning_rate": 0.00019512398656835528,
+      "loss": 1.0242,
+      "step": 274
+    },
+    {
+      "epoch": 0.31064670996893534,
+      "grad_norm": 0.0277785062789917,
+      "learning_rate": 0.00019508728314823062,
+      "loss": 1.0922,
+      "step": 275
+    },
+    {
+      "epoch": 0.31177633436882235,
+      "grad_norm": 0.027666205540299416,
+      "learning_rate": 0.00019505044558425168,
+      "loss": 1.0434,
+      "step": 276
+    },
+    {
+      "epoch": 0.3129059587687094,
+      "grad_norm": 0.02734490856528282,
+      "learning_rate": 0.0001950134739283869,
+      "loss": 1.0726,
+      "step": 277
+    },
+    {
+      "epoch": 0.3140355831685964,
+      "grad_norm": 0.026907166466116905,
+      "learning_rate": 0.0001949763682327938,
+      "loss": 1.0807,
+      "step": 278
+    },
+    {
+      "epoch": 0.3151652075684835,
+      "grad_norm": 0.02773541398346424,
+      "learning_rate": 0.00019493912854981905,
+      "loss": 1.0941,
+      "step": 279
+    },
+    {
+      "epoch": 0.3162948319683705,
+      "grad_norm": 0.027467425912618637,
+      "learning_rate": 0.00019490175493199833,
+      "loss": 1.031,
+      "step": 280
+    },
+    {
+      "epoch": 0.31742445636825756,
+      "grad_norm": 0.02712651528418064,
+      "learning_rate": 0.00019486424743205626,
+      "loss": 1.0015,
+      "step": 281
+    },
+    {
+      "epoch": 0.31855408076814457,
+      "grad_norm": 0.026572776958346367,
+      "learning_rate": 0.00019482660610290636,
+      "loss": 0.9459,
+      "step": 282
+    },
+    {
+      "epoch": 0.31968370516803163,
+      "grad_norm": 0.02701294608414173,
+      "learning_rate": 0.00019478883099765086,
+      "loss": 1.0652,
+      "step": 283
+    },
+    {
+      "epoch": 0.3208133295679187,
+      "grad_norm": 0.02713761292397976,
+      "learning_rate": 0.0001947509221695808,
+      "loss": 1.0455,
+      "step": 284
+    },
+    {
+      "epoch": 0.3219429539678057,
+      "grad_norm": 0.028251413255929947,
+      "learning_rate": 0.00019471287967217594,
+      "loss": 0.9885,
+      "step": 285
+    },
+    {
+      "epoch": 0.32307257836769276,
+      "grad_norm": 0.028362903743982315,
+      "learning_rate": 0.00019467470355910438,
+      "loss": 1.0896,
+      "step": 286
+    },
+    {
+      "epoch": 0.3242022027675798,
+      "grad_norm": 0.027835773304104805,
+      "learning_rate": 0.00019463639388422297,
+      "loss": 0.9381,
+      "step": 287
+    },
+    {
+      "epoch": 0.32533182716746684,
+      "grad_norm": 0.026659086346626282,
+      "learning_rate": 0.0001945979507015768,
+      "loss": 0.9987,
+      "step": 288
+    },
+    {
+      "epoch": 0.32646145156735384,
+      "grad_norm": 0.028285473585128784,
+      "learning_rate": 0.0001945593740653994,
+      "loss": 1.0055,
+      "step": 289
+    },
+    {
+      "epoch": 0.3275910759672409,
+      "grad_norm": 0.027459239587187767,
+      "learning_rate": 0.00019452066403011253,
+      "loss": 1.0468,
+      "step": 290
+    },
+    {
+      "epoch": 0.3287207003671279,
+      "grad_norm": 0.028836321085691452,
+      "learning_rate": 0.00019448182065032621,
+      "loss": 1.0855,
+      "step": 291
+    },
+    {
+      "epoch": 0.329850324767015,
+      "grad_norm": 0.029597043991088867,
+      "learning_rate": 0.00019444284398083847,
+      "loss": 1.1135,
+      "step": 292
+    },
+    {
+      "epoch": 0.330979949166902,
+      "grad_norm": 0.029845820739865303,
+      "learning_rate": 0.00019440373407663542,
+      "loss": 1.0117,
+      "step": 293
+    },
+    {
+      "epoch": 0.33210957356678905,
+      "grad_norm": 0.027042267844080925,
+      "learning_rate": 0.00019436449099289119,
+      "loss": 1.0173,
+      "step": 294
+    },
+    {
+      "epoch": 0.33323919796667606,
+      "grad_norm": 0.027646934613585472,
+      "learning_rate": 0.00019432511478496768,
+      "loss": 1.0777,
+      "step": 295
+    },
+    {
+      "epoch": 0.33323919796667606,
+      "eval_loss": 1.0277949571609497,
+      "eval_runtime": 565.1236,
+      "eval_samples_per_second": 17.311,
+      "eval_steps_per_second": 8.657,
+      "step": 295
+    },
+    {
+      "epoch": 0.3343688223665631,
+      "grad_norm": 0.026499278843402863,
+      "learning_rate": 0.00019428560550841472,
+      "loss": 0.9618,
+      "step": 296
+    },
+    {
+      "epoch": 0.33549844676645013,
+      "grad_norm": 0.027500445023179054,
+      "learning_rate": 0.00019424596321896976,
+      "loss": 0.9794,
+      "step": 297
+    },
+    {
+      "epoch": 0.3366280711663372,
+      "grad_norm": 0.027349818497896194,
+      "learning_rate": 0.00019420618797255795,
+      "loss": 1.1008,
+      "step": 298
+    },
+    {
+      "epoch": 0.3377576955662242,
+      "grad_norm": 0.027657683938741684,
+      "learning_rate": 0.000194166279825292,
+      "loss": 1.0801,
+      "step": 299
+    },
+    {
+      "epoch": 0.33888731996611127,
+      "grad_norm": 0.027384718880057335,
+      "learning_rate": 0.00019412623883347207,
+      "loss": 1.038,
+      "step": 300
+    },
+    {
+      "epoch": 0.34001694436599833,
+      "grad_norm": 0.026920663192868233,
+      "learning_rate": 0.00019408606505358583,
+      "loss": 0.9868,
+      "step": 301
+    },
+    {
+      "epoch": 0.34114656876588534,
+      "grad_norm": 0.028844624757766724,
+      "learning_rate": 0.00019404575854230818,
+      "loss": 1.0293,
+      "step": 302
+    },
+    {
+      "epoch": 0.3422761931657724,
+      "grad_norm": 0.02755833975970745,
+      "learning_rate": 0.00019400531935650128,
+      "loss": 1.0087,
+      "step": 303
+    },
+    {
+      "epoch": 0.3434058175656594,
+      "grad_norm": 0.027301400899887085,
+      "learning_rate": 0.00019396474755321456,
+      "loss": 1.0318,
+      "step": 304
+    },
+    {
+      "epoch": 0.3445354419655465,
+      "grad_norm": 0.02760390006005764,
+      "learning_rate": 0.0001939240431896844,
+      "loss": 0.9421,
+      "step": 305
+    },
+    {
+      "epoch": 0.3456650663654335,
+      "grad_norm": 0.027442464604973793,
+      "learning_rate": 0.00019388320632333429,
+      "loss": 1.0801,
+      "step": 306
+    },
+    {
+      "epoch": 0.34679469076532055,
+      "grad_norm": 0.027593247592449188,
+      "learning_rate": 0.00019384223701177455,
+      "loss": 1.0607,
+      "step": 307
+    },
+    {
+      "epoch": 0.34792431516520755,
+      "grad_norm": 0.028117630630731583,
+      "learning_rate": 0.00019380113531280245,
+      "loss": 1.054,
+      "step": 308
+    },
+    {
+      "epoch": 0.3490539395650946,
+      "grad_norm": 0.029217706993222237,
+      "learning_rate": 0.00019375990128440204,
+      "loss": 1.0997,
+      "step": 309
+    },
+    {
+      "epoch": 0.3501835639649816,
+      "grad_norm": 0.027274932712316513,
+      "learning_rate": 0.0001937185349847439,
+      "loss": 1.0051,
+      "step": 310
+    },
+    {
+      "epoch": 0.3513131883648687,
+      "grad_norm": 0.03279178589582443,
+      "learning_rate": 0.0001936770364721854,
+      "loss": 1.0293,
+      "step": 311
+    },
+    {
+      "epoch": 0.3524428127647557,
+      "grad_norm": 0.026957320049405098,
+      "learning_rate": 0.00019363540580527025,
+      "loss": 1.0358,
+      "step": 312
+    },
+    {
+      "epoch": 0.35357243716464276,
+      "grad_norm": 0.029469158500432968,
+      "learning_rate": 0.0001935936430427287,
+      "loss": 1.1446,
+      "step": 313
+    },
+    {
+      "epoch": 0.35470206156452977,
+      "grad_norm": 0.03025597333908081,
+      "learning_rate": 0.00019355174824347735,
+      "loss": 1.0722,
+      "step": 314
+    },
+    {
+      "epoch": 0.35583168596441683,
+      "grad_norm": 0.02727232687175274,
+      "learning_rate": 0.00019350972146661905,
+      "loss": 1.0592,
+      "step": 315
+    },
+    {
+      "epoch": 0.3569613103643039,
+      "grad_norm": 0.028911981731653214,
+      "learning_rate": 0.00019346756277144285,
+      "loss": 1.1644,
+      "step": 316
+    },
+    {
+      "epoch": 0.3580909347641909,
+      "grad_norm": 0.02783570997416973,
+      "learning_rate": 0.0001934252722174239,
+      "loss": 0.9406,
+      "step": 317
+    },
+    {
+      "epoch": 0.35922055916407797,
+      "grad_norm": 0.02677338756620884,
+      "learning_rate": 0.00019338284986422335,
+      "loss": 0.9287,
+      "step": 318
+    },
+    {
+      "epoch": 0.360350183563965,
+      "grad_norm": 0.027951853349804878,
+      "learning_rate": 0.00019334029577168827,
+      "loss": 0.9541,
+      "step": 319
+    },
+    {
+      "epoch": 0.36147980796385204,
+      "grad_norm": 0.028323214501142502,
+      "learning_rate": 0.00019329760999985167,
+      "loss": 1.1566,
+      "step": 320
+    },
+    {
+      "epoch": 0.36260943236373905,
+      "grad_norm": 0.027881423011422157,
+      "learning_rate": 0.00019325479260893223,
+      "loss": 1.0662,
+      "step": 321
+    },
+    {
+      "epoch": 0.3637390567636261,
+      "grad_norm": 0.02717737667262554,
+      "learning_rate": 0.00019321184365933433,
+      "loss": 1.0317,
+      "step": 322
+    },
+    {
+      "epoch": 0.3648686811635131,
+      "grad_norm": 0.028628146275877953,
+      "learning_rate": 0.00019316876321164798,
+      "loss": 1.0503,
+      "step": 323
+    },
+    {
+      "epoch": 0.3659983055634002,
+      "grad_norm": 0.02851051092147827,
+      "learning_rate": 0.0001931255513266487,
+      "loss": 1.0565,
+      "step": 324
+    },
+    {
+      "epoch": 0.3671279299632872,
+      "grad_norm": 0.02863175794482231,
+      "learning_rate": 0.00019308220806529738,
+      "loss": 1.0243,
+      "step": 325
+    },
+    {
+      "epoch": 0.36825755436317426,
+      "grad_norm": 0.03015504591166973,
+      "learning_rate": 0.0001930387334887403,
+      "loss": 1.0208,
+      "step": 326
+    },
+    {
+      "epoch": 0.36938717876306126,
+      "grad_norm": 0.02771030366420746,
+      "learning_rate": 0.00019299512765830895,
+      "loss": 1.0094,
+      "step": 327
+    },
+    {
+      "epoch": 0.3705168031629483,
+      "grad_norm": 0.027864158153533936,
+      "learning_rate": 0.00019295139063552007,
+      "loss": 0.9863,
+      "step": 328
+    },
+    {
+      "epoch": 0.37164642756283534,
+      "grad_norm": 0.028755534440279007,
+      "learning_rate": 0.00019290752248207537,
+      "loss": 1.0542,
+      "step": 329
+    },
+    {
+      "epoch": 0.3727760519627224,
+      "grad_norm": 0.029860056936740875,
+      "learning_rate": 0.00019286352325986164,
+      "loss": 1.0006,
+      "step": 330
+    },
+    {
+      "epoch": 0.3739056763626094,
+      "grad_norm": 0.027963971719145775,
+      "learning_rate": 0.0001928193930309505,
+      "loss": 0.9609,
+      "step": 331
+    },
+    {
+      "epoch": 0.37503530076249647,
+      "grad_norm": 0.02750619500875473,
+      "learning_rate": 0.00019277513185759844,
+      "loss": 1.0076,
+      "step": 332
+    },
+    {
+      "epoch": 0.37616492516238353,
+      "grad_norm": 0.02815542183816433,
+      "learning_rate": 0.0001927307398022467,
+      "loss": 1.04,
+      "step": 333
+    },
+    {
+      "epoch": 0.37729454956227054,
+      "grad_norm": 0.028742128983139992,
+      "learning_rate": 0.00019268621692752108,
+      "loss": 0.9947,
+      "step": 334
+    },
+    {
+      "epoch": 0.3784241739621576,
+      "grad_norm": 0.027735736221075058,
+      "learning_rate": 0.00019264156329623197,
+      "loss": 1.0265,
+      "step": 335
+    },
+    {
+      "epoch": 0.3795537983620446,
+      "grad_norm": 0.02745204232633114,
+      "learning_rate": 0.00019259677897137426,
+      "loss": 1.0308,
+      "step": 336
+    },
+    {
+      "epoch": 0.3806834227619317,
+      "grad_norm": 0.028459064662456512,
+      "learning_rate": 0.00019255186401612718,
+      "loss": 1.0069,
+      "step": 337
+    },
+    {
+      "epoch": 0.3818130471618187,
+      "grad_norm": 0.028107335790991783,
+      "learning_rate": 0.00019250681849385424,
+      "loss": 1.0812,
+      "step": 338
+    },
+    {
+      "epoch": 0.38294267156170575,
+      "grad_norm": 0.029490889981389046,
+      "learning_rate": 0.00019246164246810316,
+      "loss": 1.0247,
+      "step": 339
+    },
+    {
+      "epoch": 0.38407229596159276,
+      "grad_norm": 0.027926163747906685,
+      "learning_rate": 0.00019241633600260578,
+      "loss": 0.9761,
+      "step": 340
+    },
+    {
+      "epoch": 0.3852019203614798,
+      "grad_norm": 0.02847837097942829,
+      "learning_rate": 0.00019237089916127793,
+      "loss": 1.0841,
+      "step": 341
+    },
+    {
+      "epoch": 0.38633154476136683,
+      "grad_norm": 0.027178598567843437,
+      "learning_rate": 0.00019232533200821942,
+      "loss": 1.1123,
+      "step": 342
+    },
+    {
+      "epoch": 0.3874611691612539,
+      "grad_norm": 0.027773573994636536,
+      "learning_rate": 0.00019227963460771377,
+      "loss": 0.9871,
+      "step": 343
+    },
+    {
+      "epoch": 0.3885907935611409,
+      "grad_norm": 0.027409275993704796,
+      "learning_rate": 0.00019223380702422844,
+      "loss": 1.0916,
+      "step": 344
+    },
+    {
+      "epoch": 0.38972041796102797,
+      "grad_norm": 0.028152553364634514,
+      "learning_rate": 0.00019218784932241434,
+      "loss": 1.0301,
+      "step": 345
+    },
+    {
+      "epoch": 0.390850042360915,
+      "grad_norm": 0.028817711398005486,
+      "learning_rate": 0.00019214176156710612,
+      "loss": 1.0203,
+      "step": 346
+    },
+    {
+      "epoch": 0.39197966676080204,
+      "grad_norm": 0.02772883139550686,
+      "learning_rate": 0.0001920955438233218,
+      "loss": 0.9991,
+      "step": 347
+    },
+    {
+      "epoch": 0.39310929116068904,
+      "grad_norm": 0.028133943676948547,
+      "learning_rate": 0.00019204919615626275,
+      "loss": 0.9834,
+      "step": 348
+    },
+    {
+      "epoch": 0.3942389155605761,
+      "grad_norm": 0.02936532348394394,
+      "learning_rate": 0.00019200271863131375,
+      "loss": 1.0227,
+      "step": 349
+    },
+    {
+      "epoch": 0.3953685399604632,
+      "grad_norm": 0.028890248388051987,
+      "learning_rate": 0.0001919561113140427,
+      "loss": 0.9551,
+      "step": 350
+    },
+    {
+      "epoch": 0.3964981643603502,
+      "grad_norm": 0.02820666879415512,
+      "learning_rate": 0.0001919093742702006,
+      "loss": 1.0343,
+      "step": 351
+    },
+    {
+      "epoch": 0.39762778876023724,
+      "grad_norm": 0.029474567621946335,
+      "learning_rate": 0.00019186250756572144,
+      "loss": 0.9853,
+      "step": 352
+    },
+    {
+      "epoch": 0.39875741316012425,
+      "grad_norm": 0.02914329618215561,
+      "learning_rate": 0.0001918155112667222,
+      "loss": 0.9542,
+      "step": 353
+    },
+    {
+      "epoch": 0.3998870375600113,
+      "grad_norm": 0.028036657720804214,
+      "learning_rate": 0.00019176838543950267,
+      "loss": 0.945,
+      "step": 354
+    },
+    {
+      "epoch": 0.4010166619598983,
+      "grad_norm": 0.027309326454997063,
+      "learning_rate": 0.00019172113015054532,
+      "loss": 0.977,
+      "step": 355
+    },
+    {
+      "epoch": 0.4021462863597854,
+      "grad_norm": 0.027427159249782562,
+      "learning_rate": 0.00019167374546651526,
+      "loss": 1.0505,
+      "step": 356
+    },
+    {
+      "epoch": 0.4032759107596724,
+      "grad_norm": 0.03023376129567623,
+      "learning_rate": 0.0001916262314542602,
+      "loss": 1.1378,
+      "step": 357
+    },
+    {
+      "epoch": 0.40440553515955946,
+      "grad_norm": 0.027807191014289856,
+      "learning_rate": 0.00019157858818081026,
+      "loss": 1.0516,
+      "step": 358
+    },
+    {
+      "epoch": 0.40553515955944647,
+      "grad_norm": 0.028308499604463577,
+      "learning_rate": 0.00019153081571337795,
+      "loss": 1.0673,
+      "step": 359
+    },
+    {
+      "epoch": 0.40666478395933353,
+      "grad_norm": 0.028541473671793938,
+      "learning_rate": 0.00019148291411935796,
+      "loss": 1.0567,
+      "step": 360
+    },
+    {
+      "epoch": 0.40779440835922054,
+      "grad_norm": 0.027455326169729233,
+      "learning_rate": 0.00019143488346632723,
+      "loss": 1.0078,
+      "step": 361
+    },
+    {
+      "epoch": 0.4089240327591076,
+      "grad_norm": 0.02952658385038376,
+      "learning_rate": 0.00019138672382204471,
+      "loss": 1.0686,
+      "step": 362
+    },
+    {
+      "epoch": 0.4100536571589946,
+      "grad_norm": 0.028435127809643745,
+      "learning_rate": 0.0001913384352544514,
+      "loss": 0.9846,
+      "step": 363
+    },
+    {
+      "epoch": 0.4111832815588817,
+      "grad_norm": 0.028838949277997017,
+      "learning_rate": 0.00019129001783167005,
+      "loss": 1.0602,
+      "step": 364
+    },
+    {
+      "epoch": 0.4123129059587687,
+      "grad_norm": 0.029650872573256493,
+      "learning_rate": 0.00019124147162200535,
+      "loss": 0.9967,
+      "step": 365
+    },
+    {
+      "epoch": 0.41344253035865575,
+      "grad_norm": 0.028792966157197952,
+      "learning_rate": 0.00019119279669394353,
+      "loss": 1.0562,
+      "step": 366
+    },
+    {
+      "epoch": 0.4145721547585428,
+      "grad_norm": 0.029962720349431038,
+      "learning_rate": 0.00019114399311615253,
+      "loss": 1.0016,
+      "step": 367
+    },
+    {
+      "epoch": 0.4157017791584298,
+      "grad_norm": 0.029513955116271973,
+      "learning_rate": 0.00019109506095748167,
+      "loss": 1.007,
+      "step": 368
+    },
+    {
+      "epoch": 0.4168314035583169,
+      "grad_norm": 0.028869032859802246,
+      "learning_rate": 0.00019104600028696175,
+      "loss": 1.033,
+      "step": 369
+    },
+    {
+      "epoch": 0.4179610279582039,
+      "grad_norm": 0.02818440832197666,
+      "learning_rate": 0.00019099681117380486,
+      "loss": 0.9947,
+      "step": 370
+    },
+    {
+      "epoch": 0.41909065235809095,
+      "grad_norm": 0.030735397711396217,
+      "learning_rate": 0.00019094749368740423,
+      "loss": 1.031,
+      "step": 371
+    },
+    {
+      "epoch": 0.42022027675797796,
+      "grad_norm": 0.029516831040382385,
+      "learning_rate": 0.00019089804789733424,
+      "loss": 1.1093,
+      "step": 372
+    },
+    {
+      "epoch": 0.421349901157865,
+      "grad_norm": 0.028589509427547455,
+      "learning_rate": 0.00019084847387335025,
+      "loss": 1.0524,
+      "step": 373
+    },
+    {
+      "epoch": 0.42247952555775203,
+      "grad_norm": 0.029599323868751526,
+      "learning_rate": 0.00019079877168538855,
+      "loss": 1.0867,
+      "step": 374
+    },
+    {
+      "epoch": 0.4236091499576391,
+      "grad_norm": 0.029633615165948868,
+      "learning_rate": 0.00019074894140356624,
+      "loss": 1.0187,
+      "step": 375
+    },
+    {
+      "epoch": 0.4247387743575261,
+      "grad_norm": 0.029569542035460472,
+      "learning_rate": 0.00019069898309818106,
+      "loss": 1.0172,
+      "step": 376
+    },
+    {
+      "epoch": 0.42586839875741317,
+      "grad_norm": 0.02864873595535755,
+      "learning_rate": 0.00019064889683971149,
+      "loss": 1.0408,
+      "step": 377
+    },
+    {
+      "epoch": 0.4269980231573002,
+      "grad_norm": 0.02849559485912323,
+      "learning_rate": 0.0001905986826988164,
+      "loss": 1.0513,
+      "step": 378
+    },
+    {
+      "epoch": 0.42812764755718724,
+      "grad_norm": 0.028202759101986885,
+      "learning_rate": 0.00019054834074633506,
+      "loss": 1.0536,
+      "step": 379
+    },
+    {
+      "epoch": 0.42925727195707425,
+      "grad_norm": 0.02983192540705204,
+      "learning_rate": 0.00019049787105328715,
+      "loss": 1.0294,
+      "step": 380
+    },
+    {
+      "epoch": 0.4303868963569613,
+      "grad_norm": 0.028043275699019432,
+      "learning_rate": 0.0001904472736908725,
+      "loss": 0.9645,
+      "step": 381
+    },
+    {
+      "epoch": 0.4315165207568483,
+      "grad_norm": 0.02895670384168625,
+      "learning_rate": 0.0001903965487304711,
+      "loss": 1.154,
+      "step": 382
+    },
+    {
+      "epoch": 0.4326461451567354,
+      "grad_norm": 0.02832162007689476,
+      "learning_rate": 0.0001903456962436428,
+      "loss": 1.0332,
+      "step": 383
+    },
+    {
+      "epoch": 0.43377576955662245,
+      "grad_norm": 0.029863545671105385,
+      "learning_rate": 0.00019029471630212762,
+      "loss": 1.0002,
+      "step": 384
+    },
+    {
+      "epoch": 0.43490539395650946,
+      "grad_norm": 0.02890811115503311,
+      "learning_rate": 0.00019024360897784508,
+      "loss": 1.0644,
+      "step": 385
+    },
+    {
+      "epoch": 0.4360350183563965,
+      "grad_norm": 0.03050493635237217,
+      "learning_rate": 0.0001901923743428946,
+      "loss": 1.0324,
+      "step": 386
+    },
+    {
+      "epoch": 0.43716464275628353,
+      "grad_norm": 0.029246153309941292,
+      "learning_rate": 0.00019014101246955515,
+      "loss": 1.0591,
+      "step": 387
+    },
+    {
+      "epoch": 0.4382942671561706,
+      "grad_norm": 0.02876698225736618,
+      "learning_rate": 0.00019008952343028526,
+      "loss": 0.9519,
+      "step": 388
+    },
+    {
+      "epoch": 0.4394238915560576,
+      "grad_norm": 0.029059743508696556,
+      "learning_rate": 0.00019003790729772273,
+      "loss": 1.0165,
+      "step": 389
+    },
+    {
+      "epoch": 0.44055351595594466,
+      "grad_norm": 0.02885555475950241,
+      "learning_rate": 0.00018998616414468478,
+      "loss": 1.004,
+      "step": 390
+    },
+    {
+      "epoch": 0.44168314035583167,
+      "grad_norm": 0.02809917740523815,
+      "learning_rate": 0.00018993429404416773,
+      "loss": 0.9685,
+      "step": 391
+    },
+    {
+      "epoch": 0.44281276475571874,
+      "grad_norm": 0.028004605323076248,
+      "learning_rate": 0.0001898822970693471,
+      "loss": 0.9923,
+      "step": 392
+    },
+    {
+      "epoch": 0.44394238915560574,
+      "grad_norm": 0.029958872124552727,
+      "learning_rate": 0.00018983017329357729,
+      "loss": 1.0468,
+      "step": 393
+    },
+    {
+      "epoch": 0.4450720135554928,
+      "grad_norm": 0.03032870590686798,
+      "learning_rate": 0.00018977792279039162,
+      "loss": 0.9573,
+      "step": 394
+    },
+    {
+      "epoch": 0.4462016379553798,
+      "grad_norm": 0.029365211725234985,
+      "learning_rate": 0.0001897255456335022,
+      "loss": 0.9673,
+      "step": 395
+    },
+    {
+      "epoch": 0.4473312623552669,
+      "grad_norm": 0.03092394582927227,
+      "learning_rate": 0.00018967304189679984,
+      "loss": 1.1468,
+      "step": 396
+    },
+    {
+      "epoch": 0.4484608867551539,
+      "grad_norm": 0.029345886781811714,
+      "learning_rate": 0.00018962041165435388,
+      "loss": 1.1213,
+      "step": 397
+    },
+    {
+      "epoch": 0.44959051115504095,
+      "grad_norm": 0.029504388570785522,
+      "learning_rate": 0.0001895676549804121,
+      "loss": 1.0483,
+      "step": 398
+    },
+    {
+      "epoch": 0.450720135554928,
+      "grad_norm": 0.029384993016719818,
+      "learning_rate": 0.00018951477194940075,
+      "loss": 0.9973,
+      "step": 399
+    },
+    {
+      "epoch": 0.451849759954815,
+      "grad_norm": 0.02798447571694851,
+      "learning_rate": 0.0001894617626359242,
+      "loss": 1.0041,
+      "step": 400
+    },
+    {
+      "epoch": 0.4529793843547021,
+      "grad_norm": 0.028576720505952835,
+      "learning_rate": 0.00018940862711476513,
+      "loss": 1.0699,
+      "step": 401
+    },
+    {
+      "epoch": 0.4541090087545891,
+      "grad_norm": 0.029531830921769142,
+      "learning_rate": 0.0001893553654608841,
+      "loss": 1.0396,
+      "step": 402
+    },
+    {
+      "epoch": 0.45523863315447616,
+      "grad_norm": 0.02875913865864277,
+      "learning_rate": 0.00018930197774941974,
+      "loss": 1.0302,
+      "step": 403
+    },
+    {
+      "epoch": 0.45636825755436317,
+      "grad_norm": 0.02790944278240204,
+      "learning_rate": 0.00018924846405568845,
+      "loss": 1.1243,
+      "step": 404
+    },
+    {
+      "epoch": 0.45749788195425023,
+      "grad_norm": 0.02811037190258503,
+      "learning_rate": 0.00018919482445518436,
+      "loss": 1.0377,
+      "step": 405
+    },
+    {
+      "epoch": 0.45862750635413724,
+      "grad_norm": 0.029786163941025734,
+      "learning_rate": 0.00018914105902357925,
+      "loss": 0.9825,
+      "step": 406
+    },
+    {
+      "epoch": 0.4597571307540243,
+      "grad_norm": 0.028242526575922966,
+      "learning_rate": 0.0001890871678367224,
+      "loss": 1.0738,
+      "step": 407
+    },
+    {
+      "epoch": 0.4608867551539113,
+      "grad_norm": 0.028527051210403442,
+      "learning_rate": 0.00018903315097064055,
+      "loss": 1.0024,
+      "step": 408
+    },
+    {
+      "epoch": 0.4620163795537984,
+      "grad_norm": 0.02773975394666195,
+      "learning_rate": 0.0001889790085015376,
+      "loss": 1.0042,
+      "step": 409
+    },
+    {
+      "epoch": 0.4631460039536854,
+      "grad_norm": 0.028500793501734734,
+      "learning_rate": 0.0001889247405057948,
+      "loss": 1.0938,
+      "step": 410
+    },
+    {
+      "epoch": 0.46427562835357244,
+      "grad_norm": 0.028347400948405266,
+      "learning_rate": 0.0001888703470599704,
+      "loss": 0.9892,
+      "step": 411
+    },
+    {
+      "epoch": 0.46540525275345945,
+      "grad_norm": 0.030584534630179405,
+      "learning_rate": 0.00018881582824079965,
+      "loss": 0.9977,
+      "step": 412
+    },
+    {
+      "epoch": 0.4665348771533465,
+      "grad_norm": 0.030196473002433777,
+      "learning_rate": 0.0001887611841251947,
+      "loss": 1.0442,
+      "step": 413
+    },
+    {
+      "epoch": 0.4676645015532335,
+      "grad_norm": 0.02942134439945221,
+      "learning_rate": 0.00018870641479024438,
+      "loss": 1.0096,
+      "step": 414
+    },
+    {
+      "epoch": 0.4687941259531206,
+      "grad_norm": 0.0283603947609663,
+      "learning_rate": 0.00018865152031321427,
+      "loss": 1.1341,
+      "step": 415
+    },
+    {
+      "epoch": 0.46992375035300765,
+      "grad_norm": 0.02936590276658535,
+      "learning_rate": 0.0001885965007715464,
+      "loss": 1.0823,
+      "step": 416
+    },
+    {
+      "epoch": 0.47105337475289466,
+      "grad_norm": 0.029375478625297546,
+      "learning_rate": 0.00018854135624285935,
+      "loss": 1.1148,
+      "step": 417
+    },
+    {
+      "epoch": 0.4721829991527817,
+      "grad_norm": 0.02892325632274151,
+      "learning_rate": 0.00018848608680494788,
+      "loss": 1.0905,
+      "step": 418
+    },
+    {
+      "epoch": 0.47331262355266873,
+      "grad_norm": 0.028916003182530403,
+      "learning_rate": 0.00018843069253578312,
+      "loss": 1.0133,
+      "step": 419
+    },
+    {
+      "epoch": 0.4744422479525558,
+      "grad_norm": 0.03031068667769432,
+      "learning_rate": 0.00018837517351351214,
+      "loss": 0.9835,
+      "step": 420
+    },
+    {
+      "epoch": 0.4755718723524428,
+      "grad_norm": 0.02931569144129753,
+      "learning_rate": 0.00018831952981645817,
+      "loss": 0.9664,
+      "step": 421
+    },
+    {
+      "epoch": 0.47670149675232987,
+      "grad_norm": 0.029150547459721565,
+      "learning_rate": 0.0001882637615231202,
+      "loss": 0.9604,
+      "step": 422
+    },
+    {
+      "epoch": 0.4778311211522169,
+      "grad_norm": 0.03003125637769699,
+      "learning_rate": 0.00018820786871217305,
+      "loss": 1.0735,
+      "step": 423
+    },
+    {
+      "epoch": 0.47896074555210394,
+      "grad_norm": 0.030021261423826218,
+      "learning_rate": 0.00018815185146246716,
+      "loss": 1.0005,
+      "step": 424
+    },
+    {
+      "epoch": 0.48009036995199095,
+      "grad_norm": 0.029816657304763794,
+      "learning_rate": 0.00018809570985302862,
+      "loss": 0.9366,
+      "step": 425
+    },
+    {
+      "epoch": 0.481219994351878,
+      "grad_norm": 0.02971251681447029,
+      "learning_rate": 0.00018803944396305884,
+      "loss": 1.0121,
+      "step": 426
+    },
+    {
+      "epoch": 0.482349618751765,
+      "grad_norm": 0.03110647387802601,
+      "learning_rate": 0.00018798305387193463,
+      "loss": 1.0021,
+      "step": 427
+    },
+    {
+      "epoch": 0.4834792431516521,
+      "grad_norm": 0.030216267332434654,
+      "learning_rate": 0.000187926539659208,
+      "loss": 0.9594,
+      "step": 428
+    },
+    {
+      "epoch": 0.4846088675515391,
+      "grad_norm": 0.030311699956655502,
+      "learning_rate": 0.000187869901404606,
+      "loss": 1.0478,
+      "step": 429
+    },
+    {
+      "epoch": 0.48573849195142615,
+      "grad_norm": 0.028579862788319588,
+      "learning_rate": 0.00018781313918803086,
+      "loss": 0.9539,
+      "step": 430
+    },
+    {
+      "epoch": 0.48686811635131316,
+      "grad_norm": 0.03003637120127678,
+      "learning_rate": 0.00018775625308955942,
+      "loss": 1.0172,
+      "step": 431
+    },
+    {
+      "epoch": 0.4879977407512002,
+      "grad_norm": 0.03043578751385212,
+      "learning_rate": 0.0001876992431894435,
+      "loss": 0.9997,
+      "step": 432
+    },
+    {
+      "epoch": 0.4891273651510873,
+      "grad_norm": 0.03140099346637726,
+      "learning_rate": 0.0001876421095681095,
+      "loss": 1.0307,
+      "step": 433
+    },
+    {
+      "epoch": 0.4902569895509743,
+      "grad_norm": 0.03060254082083702,
+      "learning_rate": 0.00018758485230615837,
+      "loss": 0.9873,
+      "step": 434
+    },
+    {
+      "epoch": 0.49138661395086136,
+      "grad_norm": 0.030223416164517403,
+      "learning_rate": 0.00018752747148436543,
+      "loss": 1.0629,
+      "step": 435
+    },
+    {
+      "epoch": 0.49251623835074837,
+      "grad_norm": 0.030368085950613022,
+      "learning_rate": 0.00018746996718368037,
+      "loss": 0.9692,
+      "step": 436
+    },
+    {
+      "epoch": 0.49364586275063543,
+      "grad_norm": 0.03002486564218998,
+      "learning_rate": 0.00018741233948522707,
+      "loss": 1.0334,
+      "step": 437
+    },
+    {
+      "epoch": 0.49477548715052244,
+      "grad_norm": 0.029050812125205994,
+      "learning_rate": 0.0001873545884703035,
+      "loss": 0.9861,
+      "step": 438
+    },
+    {
+      "epoch": 0.4959051115504095,
+      "grad_norm": 0.030488910153508186,
+      "learning_rate": 0.0001872967142203815,
+      "loss": 1.1141,
+      "step": 439
+    },
+    {
+      "epoch": 0.4970347359502965,
+      "grad_norm": 0.029405072331428528,
+      "learning_rate": 0.00018723871681710697,
+      "loss": 1.0318,
+      "step": 440
+    },
+    {
+      "epoch": 0.4981643603501836,
+      "grad_norm": 0.030446210876107216,
+      "learning_rate": 0.0001871805963422993,
+      "loss": 0.9895,
+      "step": 441
+    },
+    {
+      "epoch": 0.4992939847500706,
+      "grad_norm": 0.029718847945332527,
+      "learning_rate": 0.00018712235287795176,
+      "loss": 1.1104,
+      "step": 442
+    },
+    {
+      "epoch": 0.5004236091499576,
+      "grad_norm": 0.03045968897640705,
+      "learning_rate": 0.00018706398650623088,
+      "loss": 0.9305,
+      "step": 443
+    },
+    {
+      "epoch": 0.5015532335498447,
+      "grad_norm": 0.030085409060120583,
+      "learning_rate": 0.0001870054973094767,
+      "loss": 1.0243,
+      "step": 444
+    },
+    {
+      "epoch": 0.5026828579497317,
+      "grad_norm": 0.030122725293040276,
+      "learning_rate": 0.0001869468853702026,
+      "loss": 1.0977,
+      "step": 445
+    },
+    {
+      "epoch": 0.5038124823496187,
+      "grad_norm": 0.03070569783449173,
+      "learning_rate": 0.00018688815077109498,
+      "loss": 1.0352,
+      "step": 446
+    },
+    {
+      "epoch": 0.5049421067495058,
+      "grad_norm": 0.029172202572226524,
+      "learning_rate": 0.00018682929359501338,
+      "loss": 1.0018,
+      "step": 447
+    },
+    {
+      "epoch": 0.5060717311493929,
+      "grad_norm": 0.02992609702050686,
+      "learning_rate": 0.00018677031392499023,
+      "loss": 1.0543,
+      "step": 448
+    },
+    {
+      "epoch": 0.5072013555492799,
+      "grad_norm": 0.03060738928616047,
+      "learning_rate": 0.00018671121184423076,
+      "loss": 0.9548,
+      "step": 449
+    },
+    {
+      "epoch": 0.5083309799491669,
+      "grad_norm": 0.03061763569712639,
+      "learning_rate": 0.0001866519874361129,
+      "loss": 1.0017,
+      "step": 450
+    },
+    {
+      "epoch": 0.5094606043490539,
+      "grad_norm": 0.031224450096488,
+      "learning_rate": 0.00018659264078418718,
+      "loss": 1.0203,
+      "step": 451
+    },
+    {
+      "epoch": 0.510590228748941,
+      "grad_norm": 0.028874509036540985,
+      "learning_rate": 0.00018653317197217653,
+      "loss": 1.0266,
+      "step": 452
+    },
+    {
+      "epoch": 0.5117198531488281,
+      "grad_norm": 0.029967116191983223,
+      "learning_rate": 0.00018647358108397625,
+      "loss": 1.0335,
+      "step": 453
+    },
+    {
+      "epoch": 0.512849477548715,
+      "grad_norm": 0.030794909223914146,
+      "learning_rate": 0.00018641386820365385,
+      "loss": 1.0284,
+      "step": 454
+    },
+    {
+      "epoch": 0.5139791019486021,
+      "grad_norm": 0.031100483611226082,
+      "learning_rate": 0.000186354033415449,
+      "loss": 1.0486,
+      "step": 455
+    },
+    {
+      "epoch": 0.5151087263484891,
+      "grad_norm": 0.030945099890232086,
+      "learning_rate": 0.00018629407680377318,
+      "loss": 1.0685,
+      "step": 456
+    },
+    {
+      "epoch": 0.5162383507483762,
+      "grad_norm": 0.030694004148244858,
+      "learning_rate": 0.00018623399845320993,
+      "loss": 0.9765,
+      "step": 457
+    },
+    {
+      "epoch": 0.5173679751482632,
+      "grad_norm": 0.03131450340151787,
+      "learning_rate": 0.00018617379844851443,
+      "loss": 1.0927,
+      "step": 458
+    },
+    {
+      "epoch": 0.5184975995481502,
+      "grad_norm": 0.030793707817792892,
+      "learning_rate": 0.00018611347687461349,
+      "loss": 0.9999,
+      "step": 459
+    },
+    {
+      "epoch": 0.5196272239480373,
+      "grad_norm": 0.029182102531194687,
+      "learning_rate": 0.00018605303381660543,
+      "loss": 0.967,
+      "step": 460
+    },
+    {
+      "epoch": 0.5207568483479244,
+      "grad_norm": 0.030693160369992256,
+      "learning_rate": 0.00018599246935976,
+      "loss": 1.084,
+      "step": 461
+    },
+    {
+      "epoch": 0.5218864727478113,
+      "grad_norm": 0.030196724459528923,
+      "learning_rate": 0.0001859317835895181,
+      "loss": 1.024,
+      "step": 462
+    },
+    {
+      "epoch": 0.5230160971476984,
+      "grad_norm": 0.029934274032711983,
+      "learning_rate": 0.0001858709765914919,
+      "loss": 1.0975,
+      "step": 463
+    },
+    {
+      "epoch": 0.5241457215475854,
+      "grad_norm": 0.030209926888346672,
+      "learning_rate": 0.00018581004845146453,
+      "loss": 1.0485,
+      "step": 464
+    },
+    {
+      "epoch": 0.5252753459474725,
+      "grad_norm": 0.0305222999304533,
+      "learning_rate": 0.00018574899925538998,
+      "loss": 1.0272,
+      "step": 465
+    },
+    {
+      "epoch": 0.5264049703473596,
+      "grad_norm": 0.029943542554974556,
+      "learning_rate": 0.00018568782908939309,
+      "loss": 1.0122,
+      "step": 466
+    },
+    {
+      "epoch": 0.5275345947472465,
+      "grad_norm": 0.02910439483821392,
+      "learning_rate": 0.00018562653803976936,
+      "loss": 0.8831,
+      "step": 467
+    },
+    {
+      "epoch": 0.5286642191471336,
+      "grad_norm": 0.030156375840306282,
+      "learning_rate": 0.00018556512619298472,
+      "loss": 1.0245,
+      "step": 468
+    },
+    {
+      "epoch": 0.5297938435470206,
+      "grad_norm": 0.029457733035087585,
+      "learning_rate": 0.00018550359363567567,
+      "loss": 0.9933,
+      "step": 469
+    },
+    {
+      "epoch": 0.5309234679469077,
+      "grad_norm": 0.03006352297961712,
+      "learning_rate": 0.00018544194045464886,
+      "loss": 0.9978,
+      "step": 470
+    },
+    {
+      "epoch": 0.5320530923467947,
+      "grad_norm": 0.03152355179190636,
+      "learning_rate": 0.0001853801667368812,
+      "loss": 0.9832,
+      "step": 471
+    },
+    {
+      "epoch": 0.5331827167466817,
+      "grad_norm": 0.02921919897198677,
+      "learning_rate": 0.00018531827256951962,
+      "loss": 0.9178,
+      "step": 472
+    },
+    {
+      "epoch": 0.5343123411465688,
+      "grad_norm": 0.031064407899975777,
+      "learning_rate": 0.00018525625803988104,
+      "loss": 1.0384,
+      "step": 473
+    },
+    {
+      "epoch": 0.5354419655464558,
+      "grad_norm": 0.029859617352485657,
+      "learning_rate": 0.00018519412323545194,
+      "loss": 0.9886,
+      "step": 474
+    },
+    {
+      "epoch": 0.5365715899463428,
+      "grad_norm": 0.030883649364113808,
+      "learning_rate": 0.00018513186824388879,
+      "loss": 1.1247,
+      "step": 475
+    },
+    {
+      "epoch": 0.5377012143462299,
+      "grad_norm": 0.030706819146871567,
+      "learning_rate": 0.00018506949315301742,
+      "loss": 0.9923,
+      "step": 476
+    },
+    {
+      "epoch": 0.5388308387461169,
+      "grad_norm": 0.02973487228155136,
+      "learning_rate": 0.00018500699805083318,
+      "loss": 0.9388,
+      "step": 477
+    },
+    {
+      "epoch": 0.539960463146004,
+      "grad_norm": 0.03165286406874657,
+      "learning_rate": 0.00018494438302550062,
+      "loss": 1.0297,
+      "step": 478
+    },
+    {
+      "epoch": 0.5410900875458909,
+      "grad_norm": 0.0324639268219471,
+      "learning_rate": 0.0001848816481653536,
+      "loss": 1.0399,
+      "step": 479
+    },
+    {
+      "epoch": 0.542219711945778,
+      "grad_norm": 0.03156152740120888,
+      "learning_rate": 0.00018481879355889495,
+      "loss": 0.9528,
+      "step": 480
+    },
+    {
+      "epoch": 0.5433493363456651,
+      "grad_norm": 0.030102282762527466,
+      "learning_rate": 0.00018475581929479646,
+      "loss": 0.9972,
+      "step": 481
+    },
+    {
+      "epoch": 0.5444789607455521,
+      "grad_norm": 0.03062708117067814,
+      "learning_rate": 0.0001846927254618987,
+      "loss": 0.9629,
+      "step": 482
+    },
+    {
+      "epoch": 0.5456085851454392,
+      "grad_norm": 0.02973772957921028,
+      "learning_rate": 0.000184629512149211,
+      "loss": 1.1059,
+      "step": 483
+    },
+    {
+      "epoch": 0.5467382095453261,
+      "grad_norm": 0.030491316691040993,
+      "learning_rate": 0.00018456617944591111,
+      "loss": 1.093,
+      "step": 484
+    },
+    {
+      "epoch": 0.5478678339452132,
+      "grad_norm": 0.029982471838593483,
+      "learning_rate": 0.00018450272744134532,
+      "loss": 1.0719,
+      "step": 485
+    },
+    {
+      "epoch": 0.5489974583451003,
+      "grad_norm": 0.03204856067895889,
+      "learning_rate": 0.00018443915622502822,
+      "loss": 1.0136,
+      "step": 486
+    },
+    {
+      "epoch": 0.5501270827449873,
+      "grad_norm": 0.030183738097548485,
+      "learning_rate": 0.00018437546588664252,
+      "loss": 1.0613,
+      "step": 487
+    },
+    {
+      "epoch": 0.5512567071448743,
+      "grad_norm": 0.03049345500767231,
+      "learning_rate": 0.00018431165651603903,
+      "loss": 0.9428,
+      "step": 488
+    },
+    {
+      "epoch": 0.5523863315447614,
+      "grad_norm": 0.030976206064224243,
+      "learning_rate": 0.00018424772820323644,
+      "loss": 0.9908,
+      "step": 489
+    },
+    {
+      "epoch": 0.5535159559446484,
+      "grad_norm": 0.030059922486543655,
+      "learning_rate": 0.00018418368103842125,
+      "loss": 0.9546,
+      "step": 490
+    },
+    {
+      "epoch": 0.5546455803445355,
+      "grad_norm": 0.029848681762814522,
+      "learning_rate": 0.0001841195151119477,
+      "loss": 1.0269,
+      "step": 491
+    },
+    {
+      "epoch": 0.5557752047444224,
+      "grad_norm": 0.03216058760881424,
+      "learning_rate": 0.00018405523051433743,
+      "loss": 0.9717,
+      "step": 492
+    },
+    {
+      "epoch": 0.5569048291443095,
+      "grad_norm": 0.030524935573339462,
+      "learning_rate": 0.00018399082733627965,
+      "loss": 1.0208,
+      "step": 493
+    },
+    {
+      "epoch": 0.5580344535441966,
+      "grad_norm": 0.03152266517281532,
+      "learning_rate": 0.00018392630566863076,
+      "loss": 1.0353,
+      "step": 494
+    },
+    {
+      "epoch": 0.5591640779440836,
+      "grad_norm": 0.03233015537261963,
+      "learning_rate": 0.00018386166560241434,
+      "loss": 1.1238,
+      "step": 495
+    },
+    {
+      "epoch": 0.5602937023439706,
+      "grad_norm": 0.031183136627078056,
+      "learning_rate": 0.000183796907228821,
+      "loss": 1.0266,
+      "step": 496
+    },
+    {
+      "epoch": 0.5614233267438576,
+      "grad_norm": 0.030228251591324806,
+      "learning_rate": 0.00018373203063920822,
+      "loss": 1.0074,
+      "step": 497
+    },
+    {
+      "epoch": 0.5625529511437447,
+      "grad_norm": 0.031268905848264694,
+      "learning_rate": 0.00018366703592510034,
+      "loss": 1.0106,
+      "step": 498
+    },
+    {
+      "epoch": 0.5636825755436318,
+      "grad_norm": 0.031185952946543694,
+      "learning_rate": 0.0001836019231781883,
+      "loss": 1.0476,
+      "step": 499
+    },
+    {
+      "epoch": 0.5648121999435188,
+      "grad_norm": 0.03026709146797657,
+      "learning_rate": 0.0001835366924903295,
+      "loss": 1.0619,
+      "step": 500
+    },
+    {
+      "epoch": 0.5659418243434058,
+      "grad_norm": 0.029817136004567146,
+      "learning_rate": 0.00018347134395354776,
+      "loss": 1.0016,
+      "step": 501
+    },
+    {
+      "epoch": 0.5670714487432928,
+      "grad_norm": 0.030526304617524147,
+      "learning_rate": 0.00018340587766003323,
+      "loss": 1.0559,
+      "step": 502
+    },
+    {
+      "epoch": 0.5682010731431799,
+      "grad_norm": 0.03136800602078438,
+      "learning_rate": 0.00018334029370214208,
+      "loss": 0.9867,
+      "step": 503
+    },
+    {
+      "epoch": 0.569330697543067,
+      "grad_norm": 0.030273810029029846,
+      "learning_rate": 0.0001832745921723965,
+      "loss": 0.9358,
+      "step": 504
+    },
+    {
+      "epoch": 0.5704603219429539,
+      "grad_norm": 0.02991536259651184,
+      "learning_rate": 0.00018320877316348454,
+      "loss": 0.9964,
+      "step": 505
+    },
+    {
+      "epoch": 0.571589946342841,
+      "grad_norm": 0.031318966299295425,
+      "learning_rate": 0.00018314283676826009,
+      "loss": 0.9946,
+      "step": 506
+    },
+    {
+      "epoch": 0.5727195707427281,
+      "grad_norm": 0.030620397999882698,
+      "learning_rate": 0.00018307678307974241,
+      "loss": 1.0597,
+      "step": 507
+    },
+    {
+      "epoch": 0.5738491951426151,
+      "grad_norm": 0.03023059107363224,
+      "learning_rate": 0.0001830106121911165,
+      "loss": 0.9825,
+      "step": 508
+    },
+    {
+      "epoch": 0.5749788195425021,
+      "grad_norm": 0.03067387081682682,
+      "learning_rate": 0.0001829443241957325,
+      "loss": 0.9863,
+      "step": 509
+    },
+    {
+      "epoch": 0.5761084439423891,
+      "grad_norm": 0.03259598836302757,
+      "learning_rate": 0.00018287791918710587,
+      "loss": 1.0366,
+      "step": 510
+    },
+    {
+      "epoch": 0.5772380683422762,
+      "grad_norm": 0.03081597201526165,
+      "learning_rate": 0.00018281139725891707,
+      "loss": 1.144,
+      "step": 511
+    },
+    {
+      "epoch": 0.5783676927421633,
+      "grad_norm": 0.03100423514842987,
+      "learning_rate": 0.00018274475850501158,
+      "loss": 1.011,
+      "step": 512
+    },
+    {
+      "epoch": 0.5794973171420502,
+      "grad_norm": 0.030796082690358162,
+      "learning_rate": 0.00018267800301939965,
+      "loss": 0.8843,
+      "step": 513
+    },
+    {
+      "epoch": 0.5806269415419373,
+      "grad_norm": 0.030977580696344376,
+      "learning_rate": 0.00018261113089625613,
+      "loss": 1.0606,
+      "step": 514
+    },
+    {
+      "epoch": 0.5817565659418243,
+      "grad_norm": 0.03037908300757408,
+      "learning_rate": 0.0001825441422299206,
+      "loss": 0.9751,
+      "step": 515
+    },
+    {
+      "epoch": 0.5828861903417114,
+      "grad_norm": 0.03079284355044365,
+      "learning_rate": 0.00018247703711489686,
+      "loss": 1.0062,
+      "step": 516
+    },
+    {
+      "epoch": 0.5840158147415985,
+      "grad_norm": 0.031534090638160706,
+      "learning_rate": 0.00018240981564585313,
+      "loss": 0.949,
+      "step": 517
+    },
+    {
+      "epoch": 0.5851454391414854,
+      "grad_norm": 0.03137180209159851,
+      "learning_rate": 0.0001823424779176217,
+      "loss": 1.0799,
+      "step": 518
+    },
+    {
+      "epoch": 0.5862750635413725,
+      "grad_norm": 0.0305685643106699,
+      "learning_rate": 0.00018227502402519893,
+      "loss": 1.0609,
+      "step": 519
+    },
+    {
+      "epoch": 0.5874046879412596,
+      "grad_norm": 0.02950458414852619,
+      "learning_rate": 0.00018220745406374498,
+      "loss": 0.9671,
+      "step": 520
+    },
+    {
+      "epoch": 0.5885343123411466,
+      "grad_norm": 0.030199820175766945,
+      "learning_rate": 0.00018213976812858382,
+      "loss": 1.0684,
+      "step": 521
+    },
+    {
+      "epoch": 0.5896639367410336,
+      "grad_norm": 0.031708989292383194,
+      "learning_rate": 0.00018207196631520297,
+      "loss": 0.9994,
+      "step": 522
+    },
+    {
+      "epoch": 0.5907935611409206,
+      "grad_norm": 0.03120891936123371,
+      "learning_rate": 0.00018200404871925353,
+      "loss": 1.001,
+      "step": 523
+    },
+    {
+      "epoch": 0.5919231855408077,
+      "grad_norm": 0.033152077347040176,
+      "learning_rate": 0.0001819360154365498,
+      "loss": 1.0489,
+      "step": 524
+    },
+    {
+      "epoch": 0.5930528099406948,
+      "grad_norm": 0.03135927394032478,
+      "learning_rate": 0.00018186786656306935,
+      "loss": 1.1065,
+      "step": 525
+    },
+    {
+      "epoch": 0.5941824343405817,
+      "grad_norm": 0.030605459585785866,
+      "learning_rate": 0.0001817996021949529,
+      "loss": 1.0116,
+      "step": 526
+    },
+    {
+      "epoch": 0.5953120587404688,
+      "grad_norm": 0.031958550214767456,
+      "learning_rate": 0.00018173122242850397,
+      "loss": 1.0113,
+      "step": 527
+    },
+    {
+      "epoch": 0.5964416831403558,
+      "grad_norm": 0.033079009503126144,
+      "learning_rate": 0.00018166272736018895,
+      "loss": 0.9531,
+      "step": 528
+    },
+    {
+      "epoch": 0.5975713075402429,
+      "grad_norm": 0.0316440686583519,
+      "learning_rate": 0.00018159411708663684,
+      "loss": 0.9916,
+      "step": 529
+    },
+    {
+      "epoch": 0.5987009319401299,
+      "grad_norm": 0.030489858239889145,
+      "learning_rate": 0.00018152539170463925,
+      "loss": 0.995,
+      "step": 530
+    },
+    {
+      "epoch": 0.5998305563400169,
+      "grad_norm": 0.0322355218231678,
+      "learning_rate": 0.00018145655131115009,
+      "loss": 1.0784,
+      "step": 531
+    },
+    {
+      "epoch": 0.600960180739904,
+      "grad_norm": 0.03130833059549332,
+      "learning_rate": 0.00018138759600328563,
+      "loss": 1.0537,
+      "step": 532
+    },
+    {
+      "epoch": 0.602089805139791,
+      "grad_norm": 0.031001951545476913,
+      "learning_rate": 0.0001813185258783241,
+      "loss": 0.9956,
+      "step": 533
+    },
+    {
+      "epoch": 0.6032194295396781,
+      "grad_norm": 0.03067929483950138,
+      "learning_rate": 0.0001812493410337058,
+      "loss": 1.0148,
+      "step": 534
+    },
+    {
+      "epoch": 0.6043490539395651,
+      "grad_norm": 0.03192298486828804,
+      "learning_rate": 0.00018118004156703296,
+      "loss": 0.9635,
+      "step": 535
+    },
+    {
+      "epoch": 0.6054786783394521,
+      "grad_norm": 0.031253885477781296,
+      "learning_rate": 0.00018111062757606932,
+      "loss": 0.9987,
+      "step": 536
+    },
+    {
+      "epoch": 0.6066083027393392,
+      "grad_norm": 0.031125715002417564,
+      "learning_rate": 0.0001810410991587403,
+      "loss": 0.9915,
+      "step": 537
+    },
+    {
+      "epoch": 0.6077379271392263,
+      "grad_norm": 0.03175501897931099,
+      "learning_rate": 0.00018097145641313272,
+      "loss": 1.0357,
+      "step": 538
+    },
+    {
+      "epoch": 0.6088675515391132,
+      "grad_norm": 0.031910236924886703,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 1.0679,
+      "step": 539
+    },
+    {
+      "epoch": 0.6099971759390003,
+      "grad_norm": 0.03214259445667267,
+      "learning_rate": 0.00018083182833023562,
+      "loss": 1.0173,
+      "step": 540
+    },
+    {
+      "epoch": 0.6111268003388873,
+      "grad_norm": 0.03169810026884079,
+      "learning_rate": 0.00018076184318992558,
+      "loss": 1.0428,
+      "step": 541
+    },
+    {
+      "epoch": 0.6122564247387744,
+      "grad_norm": 0.03129338473081589,
+      "learning_rate": 0.00018069174411529577,
+      "loss": 1.0236,
+      "step": 542
+    },
+    {
+      "epoch": 0.6133860491386613,
+      "grad_norm": 0.03245764225721359,
+      "learning_rate": 0.0001806215312052381,
+      "loss": 1.0081,
+      "step": 543
+    },
+    {
+      "epoch": 0.6145156735385484,
+      "grad_norm": 0.030435949563980103,
+      "learning_rate": 0.0001805512045588051,
+      "loss": 1.0731,
+      "step": 544
+    },
+    {
+      "epoch": 0.6156452979384355,
+      "grad_norm": 0.030730856582522392,
+      "learning_rate": 0.0001804807642752096,
+      "loss": 1.0793,
+      "step": 545
+    },
+    {
+      "epoch": 0.6167749223383225,
+      "grad_norm": 0.02937515825033188,
+      "learning_rate": 0.00018041021045382485,
+      "loss": 1.0123,
+      "step": 546
+    },
+    {
+      "epoch": 0.6179045467382095,
+      "grad_norm": 0.03019302524626255,
+      "learning_rate": 0.0001803395431941843,
+      "loss": 1.0232,
+      "step": 547
+    },
+    {
+      "epoch": 0.6190341711380966,
+      "grad_norm": 0.04123188927769661,
+      "learning_rate": 0.00018026876259598135,
+      "loss": 1.0309,
+      "step": 548
+    },
+    {
+      "epoch": 0.6201637955379836,
+      "grad_norm": 0.03046722523868084,
+      "learning_rate": 0.00018019786875906935,
+      "loss": 0.9721,
+      "step": 549
+    },
+    {
+      "epoch": 0.6212934199378707,
+      "grad_norm": 0.033260468393564224,
+      "learning_rate": 0.00018012686178346142,
+      "loss": 1.0726,
+      "step": 550
+    },
+    {
+      "epoch": 0.6224230443377577,
+      "grad_norm": 0.03144606575369835,
+      "learning_rate": 0.0001800557417693302,
+      "loss": 0.947,
+      "step": 551
+    },
+    {
+      "epoch": 0.6235526687376447,
+      "grad_norm": 0.03095083311200142,
+      "learning_rate": 0.00017998450881700787,
+      "loss": 0.9937,
+      "step": 552
+    },
+    {
+      "epoch": 0.6246822931375318,
+      "grad_norm": 0.03133854269981384,
+      "learning_rate": 0.00017991316302698595,
+      "loss": 0.9502,
+      "step": 553
+    },
+    {
+      "epoch": 0.6258119175374188,
+      "grad_norm": 0.03148304298520088,
+      "learning_rate": 0.00017984170449991506,
+      "loss": 1.1628,
+      "step": 554
+    },
+    {
+      "epoch": 0.6269415419373059,
+      "grad_norm": 0.03164827451109886,
+      "learning_rate": 0.000179770133336605,
+      "loss": 0.8814,
+      "step": 555
+    },
+    {
+      "epoch": 0.6280711663371928,
+      "grad_norm": 0.03083074651658535,
+      "learning_rate": 0.0001796984496380243,
+      "loss": 0.9999,
+      "step": 556
+    },
+    {
+      "epoch": 0.6292007907370799,
+      "grad_norm": 0.03223288804292679,
+      "learning_rate": 0.0001796266535053004,
+      "loss": 1.0819,
+      "step": 557
+    },
+    {
+      "epoch": 0.630330415136967,
+      "grad_norm": 0.03053288348019123,
+      "learning_rate": 0.00017955474503971925,
+      "loss": 1.1077,
+      "step": 558
+    },
+    {
+      "epoch": 0.631460039536854,
+      "grad_norm": 0.03127776086330414,
+      "learning_rate": 0.00017948272434272535,
+      "loss": 1.041,
+      "step": 559
+    },
+    {
+      "epoch": 0.632589663936741,
+      "grad_norm": 0.03209880739450455,
+      "learning_rate": 0.00017941059151592147,
+      "loss": 1.0081,
+      "step": 560
+    },
+    {
+      "epoch": 0.633719288336628,
+      "grad_norm": 0.02959609404206276,
+      "learning_rate": 0.00017933834666106864,
+      "loss": 0.9875,
+      "step": 561
+    },
+    {
+      "epoch": 0.6348489127365151,
+      "grad_norm": 0.03344092145562172,
+      "learning_rate": 0.00017926598988008582,
+      "loss": 0.9677,
+      "step": 562
+    },
+    {
+      "epoch": 0.6359785371364022,
+      "grad_norm": 0.03260407596826553,
+      "learning_rate": 0.00017919352127505,
+      "loss": 1.0449,
+      "step": 563
+    },
+    {
+      "epoch": 0.6371081615362891,
+      "grad_norm": 0.031249945983290672,
+      "learning_rate": 0.0001791209409481958,
+      "loss": 1.0662,
+      "step": 564
+    },
+    {
+      "epoch": 0.6382377859361762,
+      "grad_norm": 0.031923823058605194,
+      "learning_rate": 0.00017904824900191556,
+      "loss": 1.0379,
+      "step": 565
+    },
+    {
+      "epoch": 0.6393674103360633,
+      "grad_norm": 0.030242929235100746,
+      "learning_rate": 0.00017897544553875902,
+      "loss": 1.0257,
+      "step": 566
+    },
+    {
+      "epoch": 0.6404970347359503,
+      "grad_norm": 0.032716382294893265,
+      "learning_rate": 0.00017890253066143324,
+      "loss": 0.9987,
+      "step": 567
+    },
+    {
+      "epoch": 0.6416266591358374,
+      "grad_norm": 0.03140626102685928,
+      "learning_rate": 0.0001788295044728025,
+      "loss": 1.0162,
+      "step": 568
+    },
+    {
+      "epoch": 0.6427562835357243,
+      "grad_norm": 0.029912738129496574,
+      "learning_rate": 0.0001787563670758881,
+      "loss": 1.0318,
+      "step": 569
+    },
+    {
+      "epoch": 0.6438859079356114,
+      "grad_norm": 0.03130066394805908,
+      "learning_rate": 0.0001786831185738682,
+      "loss": 1.0026,
+      "step": 570
+    },
+    {
+      "epoch": 0.6450155323354985,
+      "grad_norm": 0.033079057931900024,
+      "learning_rate": 0.00017860975907007772,
+      "loss": 1.0262,
+      "step": 571
+    },
+    {
+      "epoch": 0.6461451567353855,
+      "grad_norm": 0.03027520515024662,
+      "learning_rate": 0.00017853628866800812,
+      "loss": 1.0075,
+      "step": 572
+    },
+    {
+      "epoch": 0.6472747811352725,
+      "grad_norm": 0.03166157007217407,
+      "learning_rate": 0.00017846270747130742,
+      "loss": 1.0858,
+      "step": 573
+    },
+    {
+      "epoch": 0.6484044055351595,
+      "grad_norm": 0.03081650286912918,
+      "learning_rate": 0.00017838901558377986,
+      "loss": 1.0215,
+      "step": 574
+    },
+    {
+      "epoch": 0.6495340299350466,
+      "grad_norm": 0.03256387263536453,
+      "learning_rate": 0.0001783152131093859,
+      "loss": 0.973,
+      "step": 575
+    },
+    {
+      "epoch": 0.6506636543349337,
+      "grad_norm": 0.030604898929595947,
+      "learning_rate": 0.00017824130015224192,
+      "loss": 1.057,
+      "step": 576
+    },
+    {
+      "epoch": 0.6517932787348206,
+      "grad_norm": 0.030695458874106407,
+      "learning_rate": 0.00017816727681662023,
+      "loss": 1.0804,
+      "step": 577
+    },
+    {
+      "epoch": 0.6529229031347077,
+      "grad_norm": 0.031340498477220535,
+      "learning_rate": 0.0001780931432069488,
+      "loss": 0.9722,
+      "step": 578
+    },
+    {
+      "epoch": 0.6540525275345948,
+      "grad_norm": 0.03206819295883179,
+      "learning_rate": 0.00017801889942781126,
+      "loss": 1.0593,
+      "step": 579
+    },
+    {
+      "epoch": 0.6551821519344818,
+      "grad_norm": 0.030380915850400925,
+      "learning_rate": 0.00017794454558394657,
+      "loss": 0.9263,
+      "step": 580
+    },
+    {
+      "epoch": 0.6563117763343688,
+      "grad_norm": 0.03320132568478584,
+      "learning_rate": 0.00017787008178024905,
+      "loss": 1.0798,
+      "step": 581
+    },
+    {
+      "epoch": 0.6574414007342558,
+      "grad_norm": 0.0311865396797657,
+      "learning_rate": 0.00017779550812176806,
+      "loss": 0.9205,
+      "step": 582
+    },
+    {
+      "epoch": 0.6585710251341429,
+      "grad_norm": 0.032210152596235275,
+      "learning_rate": 0.00017772082471370797,
+      "loss": 1.0411,
+      "step": 583
+    },
+    {
+      "epoch": 0.65970064953403,
+      "grad_norm": 0.03178109973669052,
+      "learning_rate": 0.00017764603166142798,
+      "loss": 1.0502,
+      "step": 584
+    },
+    {
+      "epoch": 0.660830273933917,
+      "grad_norm": 0.0323721244931221,
+      "learning_rate": 0.000177571129070442,
+      "loss": 1.0372,
+      "step": 585
+    },
+    {
+      "epoch": 0.661959898333804,
+      "grad_norm": 0.031241752207279205,
+      "learning_rate": 0.0001774961170464184,
+      "loss": 0.9741,
+      "step": 586
+    },
+    {
+      "epoch": 0.663089522733691,
+      "grad_norm": 0.03263148292899132,
+      "learning_rate": 0.00017742099569518,
+      "loss": 1.0956,
+      "step": 587
+    },
+    {
+      "epoch": 0.6642191471335781,
+      "grad_norm": 0.031760070472955704,
+      "learning_rate": 0.00017734576512270383,
+      "loss": 0.9795,
+      "step": 588
+    },
+    {
+      "epoch": 0.6653487715334652,
+      "grad_norm": 0.03184381127357483,
+      "learning_rate": 0.00017727042543512099,
+      "loss": 0.9054,
+      "step": 589
+    },
+    {
+      "epoch": 0.6664783959333521,
+      "grad_norm": 0.03145081177353859,
+      "learning_rate": 0.00017719497673871653,
+      "loss": 1.0219,
+      "step": 590
+    },
+    {
+      "epoch": 0.6664783959333521,
+      "eval_loss": 1.0118999481201172,
+      "eval_runtime": 547.41,
+      "eval_samples_per_second": 17.871,
+      "eval_steps_per_second": 8.937,
+      "step": 590
+    },
+    {
+      "epoch": 0.6676080203332392,
+      "grad_norm": 0.03253559768199921,
+      "learning_rate": 0.00017711941913992928,
+      "loss": 0.9635,
+      "step": 591
+    },
+    {
+      "epoch": 0.6687376447331262,
+      "grad_norm": 0.03181855380535126,
+      "learning_rate": 0.00017704375274535167,
+      "loss": 0.8852,
+      "step": 592
+    },
+    {
+      "epoch": 0.6698672691330133,
+      "grad_norm": 0.03165988251566887,
+      "learning_rate": 0.0001769679776617297,
+      "loss": 1.0201,
+      "step": 593
+    },
+    {
+      "epoch": 0.6709968935329003,
+      "grad_norm": 0.03077312745153904,
+      "learning_rate": 0.00017689209399596257,
+      "loss": 1.0307,
+      "step": 594
+    },
+    {
+      "epoch": 0.6721265179327873,
+      "grad_norm": 0.032232630997896194,
+      "learning_rate": 0.00017681610185510285,
+      "loss": 1.0121,
+      "step": 595
+    },
+    {
+      "epoch": 0.6732561423326744,
+      "grad_norm": 0.03249699994921684,
+      "learning_rate": 0.0001767400013463559,
+      "loss": 0.9288,
+      "step": 596
+    },
+    {
+      "epoch": 0.6743857667325615,
+      "grad_norm": 0.03216133266687393,
+      "learning_rate": 0.0001766637925770802,
+      "loss": 0.9665,
+      "step": 597
+    },
+    {
+      "epoch": 0.6755153911324484,
+      "grad_norm": 0.03151794150471687,
+      "learning_rate": 0.00017658747565478677,
+      "loss": 1.0497,
+      "step": 598
+    },
+    {
+      "epoch": 0.6766450155323355,
+      "grad_norm": 0.03118024580180645,
+      "learning_rate": 0.00017651105068713935,
+      "loss": 0.9403,
+      "step": 599
+    },
+    {
+      "epoch": 0.6777746399322225,
+      "grad_norm": 0.030804403126239777,
+      "learning_rate": 0.00017643451778195395,
+      "loss": 1.0011,
+      "step": 600
+    },
+    {
+      "epoch": 0.6789042643321096,
+      "grad_norm": 0.03352154418826103,
+      "learning_rate": 0.000176357877047199,
+      "loss": 1.0294,
+      "step": 601
+    },
+    {
+      "epoch": 0.6800338887319967,
+      "grad_norm": 0.03205511346459389,
+      "learning_rate": 0.00017628112859099498,
+      "loss": 1.0487,
+      "step": 602
+    },
+    {
+      "epoch": 0.6811635131318836,
+      "grad_norm": 0.031228026375174522,
+      "learning_rate": 0.00017620427252161433,
+      "loss": 0.9319,
+      "step": 603
+    },
+    {
+      "epoch": 0.6822931375317707,
+      "grad_norm": 0.031973280012607574,
+      "learning_rate": 0.00017612730894748136,
+      "loss": 1.0829,
+      "step": 604
+    },
+    {
+      "epoch": 0.6834227619316577,
+      "grad_norm": 0.03331442177295685,
+      "learning_rate": 0.00017605023797717195,
+      "loss": 1.0669,
+      "step": 605
+    },
+    {
+      "epoch": 0.6845523863315448,
+      "grad_norm": 0.0336139053106308,
+      "learning_rate": 0.00017597305971941358,
+      "loss": 1.0722,
+      "step": 606
+    },
+    {
+      "epoch": 0.6856820107314318,
+      "grad_norm": 0.03086121752858162,
+      "learning_rate": 0.00017589577428308502,
+      "loss": 1.092,
+      "step": 607
+    },
+    {
+      "epoch": 0.6868116351313188,
+      "grad_norm": 0.032204341143369675,
+      "learning_rate": 0.0001758183817772163,
+      "loss": 0.9483,
+      "step": 608
+    },
+    {
+      "epoch": 0.6879412595312059,
+      "grad_norm": 0.03183162584900856,
+      "learning_rate": 0.00017574088231098843,
+      "loss": 1.0029,
+      "step": 609
+    },
+    {
+      "epoch": 0.689070883931093,
+      "grad_norm": 0.031096026301383972,
+      "learning_rate": 0.00017566327599373338,
+      "loss": 1.0094,
+      "step": 610
+    },
+    {
+      "epoch": 0.6902005083309799,
+      "grad_norm": 0.032303981482982635,
+      "learning_rate": 0.0001755855629349338,
+      "loss": 0.976,
+      "step": 611
+    },
+    {
+      "epoch": 0.691330132730867,
+      "grad_norm": 0.03237254545092583,
+      "learning_rate": 0.00017550774324422296,
+      "loss": 0.9472,
+      "step": 612
+    },
+    {
+      "epoch": 0.692459757130754,
+      "grad_norm": 0.03161952272057533,
+      "learning_rate": 0.0001754298170313846,
+      "loss": 0.995,
+      "step": 613
+    },
+    {
+      "epoch": 0.6935893815306411,
+      "grad_norm": 0.032882727682590485,
+      "learning_rate": 0.00017535178440635264,
+      "loss": 0.9078,
+      "step": 614
+    },
+    {
+      "epoch": 0.694719005930528,
+      "grad_norm": 0.030476143583655357,
+      "learning_rate": 0.0001752736454792112,
+      "loss": 0.9488,
+      "step": 615
+    },
+    {
+      "epoch": 0.6958486303304151,
+      "grad_norm": 0.032640308141708374,
+      "learning_rate": 0.00017519540036019428,
+      "loss": 0.9968,
+      "step": 616
+    },
+    {
+      "epoch": 0.6969782547303022,
+      "grad_norm": 0.03207506611943245,
+      "learning_rate": 0.00017511704915968581,
+      "loss": 1.0598,
+      "step": 617
+    },
+    {
+      "epoch": 0.6981078791301892,
+      "grad_norm": 0.04379906877875328,
+      "learning_rate": 0.0001750385919882193,
+      "loss": 1.0801,
+      "step": 618
+    },
+    {
+      "epoch": 0.6992375035300763,
+      "grad_norm": 0.03258811682462692,
+      "learning_rate": 0.00017496002895647775,
+      "loss": 1.0901,
+      "step": 619
+    },
+    {
+      "epoch": 0.7003671279299633,
+      "grad_norm": 0.034346289932727814,
+      "learning_rate": 0.0001748813601752935,
+      "loss": 0.9995,
+      "step": 620
+    },
+    {
+      "epoch": 0.7014967523298503,
+      "grad_norm": 0.0332537442445755,
+      "learning_rate": 0.0001748025857556481,
+      "loss": 1.0382,
+      "step": 621
+    },
+    {
+      "epoch": 0.7026263767297374,
+      "grad_norm": 0.031845249235630035,
+      "learning_rate": 0.0001747237058086722,
+      "loss": 1.1217,
+      "step": 622
+    },
+    {
+      "epoch": 0.7037560011296244,
+      "grad_norm": 0.032574612647295,
+      "learning_rate": 0.00017464472044564512,
+      "loss": 0.8765,
+      "step": 623
+    },
+    {
+      "epoch": 0.7048856255295114,
+      "grad_norm": 0.031227873638272285,
+      "learning_rate": 0.00017456562977799514,
+      "loss": 0.9676,
+      "step": 624
+    },
+    {
+      "epoch": 0.7060152499293985,
+      "grad_norm": 0.032643262296915054,
+      "learning_rate": 0.00017448643391729888,
+      "loss": 0.9842,
+      "step": 625
+    },
+    {
+      "epoch": 0.7071448743292855,
+      "grad_norm": 0.031137650832533836,
+      "learning_rate": 0.00017440713297528154,
+      "loss": 0.9877,
+      "step": 626
+    },
+    {
+      "epoch": 0.7082744987291726,
+      "grad_norm": 0.030961019918322563,
+      "learning_rate": 0.0001743277270638164,
+      "loss": 1.017,
+      "step": 627
+    },
+    {
+      "epoch": 0.7094041231290595,
+      "grad_norm": 0.032677747309207916,
+      "learning_rate": 0.00017424821629492495,
+      "loss": 1.0023,
+      "step": 628
+    },
+    {
+      "epoch": 0.7105337475289466,
+      "grad_norm": 0.032716501504182816,
+      "learning_rate": 0.00017416860078077657,
+      "loss": 0.9893,
+      "step": 629
+    },
+    {
+      "epoch": 0.7116633719288337,
+      "grad_norm": 0.03239135444164276,
+      "learning_rate": 0.0001740888806336884,
+      "loss": 0.9949,
+      "step": 630
+    },
+    {
+      "epoch": 0.7127929963287207,
+      "grad_norm": 0.03276536986231804,
+      "learning_rate": 0.0001740090559661252,
+      "loss": 1.0778,
+      "step": 631
+    },
+    {
+      "epoch": 0.7139226207286078,
+      "grad_norm": 0.030909627676010132,
+      "learning_rate": 0.00017392912689069917,
+      "loss": 1.0098,
+      "step": 632
+    },
+    {
+      "epoch": 0.7150522451284947,
+      "grad_norm": 0.032094262540340424,
+      "learning_rate": 0.00017384909352016975,
+      "loss": 0.9703,
+      "step": 633
+    },
+    {
+      "epoch": 0.7161818695283818,
+      "grad_norm": 0.03388513997197151,
+      "learning_rate": 0.00017376895596744367,
+      "loss": 1.0014,
+      "step": 634
+    },
+    {
+      "epoch": 0.7173114939282689,
+      "grad_norm": 0.03186871111392975,
+      "learning_rate": 0.00017368871434557447,
+      "loss": 1.0076,
+      "step": 635
+    },
+    {
+      "epoch": 0.7184411183281559,
+      "grad_norm": 0.03189585730433464,
+      "learning_rate": 0.00017360836876776256,
+      "loss": 0.9721,
+      "step": 636
+    },
+    {
+      "epoch": 0.7195707427280429,
+      "grad_norm": 0.033433668315410614,
+      "learning_rate": 0.0001735279193473551,
+      "loss": 0.9798,
+      "step": 637
+    },
+    {
+      "epoch": 0.72070036712793,
+      "grad_norm": 0.031073307618498802,
+      "learning_rate": 0.00017344736619784553,
+      "loss": 0.9629,
+      "step": 638
+    },
+    {
+      "epoch": 0.721829991527817,
+      "grad_norm": 0.030326619744300842,
+      "learning_rate": 0.00017336670943287388,
+      "loss": 1.0727,
+      "step": 639
+    },
+    {
+      "epoch": 0.7229596159277041,
+      "grad_norm": 0.03189557045698166,
+      "learning_rate": 0.00017328594916622616,
+      "loss": 1.0175,
+      "step": 640
+    },
+    {
+      "epoch": 0.724089240327591,
+      "grad_norm": 0.03241390734910965,
+      "learning_rate": 0.00017320508551183446,
+      "loss": 1.1313,
+      "step": 641
+    },
+    {
+      "epoch": 0.7252188647274781,
+      "grad_norm": 0.0323265865445137,
+      "learning_rate": 0.0001731241185837768,
+      "loss": 1.0418,
+      "step": 642
+    },
+    {
+      "epoch": 0.7263484891273652,
+      "grad_norm": 0.032344575971364975,
+      "learning_rate": 0.00017304304849627677,
+      "loss": 1.0882,
+      "step": 643
+    },
+    {
+      "epoch": 0.7274781135272522,
+      "grad_norm": 0.032916001975536346,
+      "learning_rate": 0.00017296187536370355,
+      "loss": 0.9596,
+      "step": 644
+    },
+    {
+      "epoch": 0.7286077379271392,
+      "grad_norm": 0.031346168369054794,
+      "learning_rate": 0.00017288059930057166,
+      "loss": 0.9729,
+      "step": 645
+    },
+    {
+      "epoch": 0.7297373623270262,
+      "grad_norm": 0.032094355672597885,
+      "learning_rate": 0.00017279922042154092,
+      "loss": 1.0331,
+      "step": 646
+    },
+    {
+      "epoch": 0.7308669867269133,
+      "grad_norm": 0.03289850801229477,
+      "learning_rate": 0.00017271773884141607,
+      "loss": 1.0411,
+      "step": 647
+    },
+    {
+      "epoch": 0.7319966111268004,
+      "grad_norm": 0.03297988697886467,
+      "learning_rate": 0.0001726361546751468,
+      "loss": 0.9847,
+      "step": 648
+    },
+    {
+      "epoch": 0.7331262355266874,
+      "grad_norm": 0.03368350863456726,
+      "learning_rate": 0.00017255446803782754,
+      "loss": 0.9978,
+      "step": 649
+    },
+    {
+      "epoch": 0.7342558599265744,
+      "grad_norm": 0.03280177712440491,
+      "learning_rate": 0.00017247267904469725,
+      "loss": 1.0363,
+      "step": 650
+    },
+    {
+      "epoch": 0.7353854843264614,
+      "grad_norm": 0.031202217563986778,
+      "learning_rate": 0.00017239078781113926,
+      "loss": 1.025,
+      "step": 651
+    },
+    {
+      "epoch": 0.7365151087263485,
+      "grad_norm": 0.03219619765877724,
+      "learning_rate": 0.00017230879445268124,
+      "loss": 0.9878,
+      "step": 652
+    },
+    {
+      "epoch": 0.7376447331262356,
+      "grad_norm": 0.033005617558956146,
+      "learning_rate": 0.00017222669908499482,
+      "loss": 1.0223,
+      "step": 653
+    },
+    {
+      "epoch": 0.7387743575261225,
+      "grad_norm": 0.03350326791405678,
+      "learning_rate": 0.00017214450182389559,
+      "loss": 1.0173,
+      "step": 654
+    },
+    {
+      "epoch": 0.7399039819260096,
+      "grad_norm": 0.031389541923999786,
+      "learning_rate": 0.00017206220278534286,
+      "loss": 1.0458,
+      "step": 655
+    },
+    {
+      "epoch": 0.7410336063258967,
+      "grad_norm": 0.031541019678115845,
+      "learning_rate": 0.00017197980208543954,
+      "loss": 0.9489,
+      "step": 656
+    },
+    {
+      "epoch": 0.7421632307257837,
+      "grad_norm": 0.03202977776527405,
+      "learning_rate": 0.00017189729984043204,
+      "loss": 1.0364,
+      "step": 657
+    },
+    {
+      "epoch": 0.7432928551256707,
+      "grad_norm": 0.03152487054467201,
+      "learning_rate": 0.00017181469616670984,
+      "loss": 0.9827,
+      "step": 658
+    },
+    {
+      "epoch": 0.7444224795255577,
+      "grad_norm": 0.03225429356098175,
+      "learning_rate": 0.00017173199118080564,
+      "loss": 1.0996,
+      "step": 659
+    },
+    {
+      "epoch": 0.7455521039254448,
+      "grad_norm": 0.03217494860291481,
+      "learning_rate": 0.00017164918499939504,
+      "loss": 0.9355,
+      "step": 660
+    },
+    {
+      "epoch": 0.7466817283253319,
+      "grad_norm": 0.032104648649692535,
+      "learning_rate": 0.00017156627773929644,
+      "loss": 1.0552,
+      "step": 661
+    },
+    {
+      "epoch": 0.7478113527252188,
+      "grad_norm": 0.03186746686697006,
+      "learning_rate": 0.0001714832695174707,
+      "loss": 1.071,
+      "step": 662
+    },
+    {
+      "epoch": 0.7489409771251059,
+      "grad_norm": 0.03182530775666237,
+      "learning_rate": 0.00017140016045102133,
+      "loss": 1.0688,
+      "step": 663
+    },
+    {
+      "epoch": 0.7500706015249929,
+      "grad_norm": 0.03153397887945175,
+      "learning_rate": 0.00017131695065719386,
+      "loss": 0.9624,
+      "step": 664
+    },
+    {
+      "epoch": 0.75120022592488,
+      "grad_norm": 0.03226126730442047,
+      "learning_rate": 0.0001712336402533761,
+      "loss": 1.1134,
+      "step": 665
+    },
+    {
+      "epoch": 0.7523298503247671,
+      "grad_norm": 0.031511638313531876,
+      "learning_rate": 0.00017115022935709778,
+      "loss": 1.0753,
+      "step": 666
+    },
+    {
+      "epoch": 0.753459474724654,
+      "grad_norm": 0.03331499546766281,
+      "learning_rate": 0.00017106671808603027,
+      "loss": 0.9709,
+      "step": 667
+    },
+    {
+      "epoch": 0.7545890991245411,
+      "grad_norm": 0.032829850912094116,
+      "learning_rate": 0.0001709831065579867,
+      "loss": 0.9839,
+      "step": 668
+    },
+    {
+      "epoch": 0.7557187235244281,
+      "grad_norm": 0.032828208059072495,
+      "learning_rate": 0.00017089939489092152,
+      "loss": 0.9924,
+      "step": 669
+    },
+    {
+      "epoch": 0.7568483479243152,
+      "grad_norm": 0.0320126973092556,
+      "learning_rate": 0.00017081558320293055,
+      "loss": 0.9649,
+      "step": 670
+    },
+    {
+      "epoch": 0.7579779723242022,
+      "grad_norm": 0.03252957761287689,
+      "learning_rate": 0.0001707316716122506,
+      "loss": 0.9643,
+      "step": 671
+    },
+    {
+      "epoch": 0.7591075967240892,
+      "grad_norm": 0.032323576509952545,
+      "learning_rate": 0.00017064766023725948,
+      "loss": 0.9962,
+      "step": 672
+    },
+    {
+      "epoch": 0.7602372211239763,
+      "grad_norm": 0.03305547684431076,
+      "learning_rate": 0.00017056354919647583,
+      "loss": 1.0864,
+      "step": 673
+    },
+    {
+      "epoch": 0.7613668455238634,
+      "grad_norm": 0.0321505106985569,
+      "learning_rate": 0.0001704793386085588,
+      "loss": 1.01,
+      "step": 674
+    },
+    {
+      "epoch": 0.7624964699237503,
+      "grad_norm": 0.03243474289774895,
+      "learning_rate": 0.000170395028592308,
+      "loss": 1.1192,
+      "step": 675
+    },
+    {
+      "epoch": 0.7636260943236374,
+      "grad_norm": 0.03369235247373581,
+      "learning_rate": 0.00017031061926666333,
+      "loss": 0.9846,
+      "step": 676
+    },
+    {
+      "epoch": 0.7647557187235244,
+      "grad_norm": 0.03172389790415764,
+      "learning_rate": 0.00017022611075070474,
+      "loss": 1.0406,
+      "step": 677
+    },
+    {
+      "epoch": 0.7658853431234115,
+      "grad_norm": 0.03241589665412903,
+      "learning_rate": 0.00017014150316365216,
+      "loss": 0.9235,
+      "step": 678
+    },
+    {
+      "epoch": 0.7670149675232985,
+      "grad_norm": 0.03271762281656265,
+      "learning_rate": 0.0001700567966248653,
+      "loss": 0.9516,
+      "step": 679
+    },
+    {
+      "epoch": 0.7681445919231855,
+      "grad_norm": 0.032931018620729446,
+      "learning_rate": 0.00016997199125384343,
+      "loss": 1.0315,
+      "step": 680
+    },
+    {
+      "epoch": 0.7692742163230726,
+      "grad_norm": 0.032814498990774155,
+      "learning_rate": 0.00016988708717022522,
+      "loss": 0.941,
+      "step": 681
+    },
+    {
+      "epoch": 0.7704038407229596,
+      "grad_norm": 0.031445201486349106,
+      "learning_rate": 0.00016980208449378866,
+      "loss": 1.0588,
+      "step": 682
+    },
+    {
+      "epoch": 0.7715334651228467,
+      "grad_norm": 0.033274564892053604,
+      "learning_rate": 0.0001697169833444508,
+      "loss": 0.9968,
+      "step": 683
+    },
+    {
+      "epoch": 0.7726630895227337,
+      "grad_norm": 0.03313668072223663,
+      "learning_rate": 0.00016963178384226763,
+      "loss": 1.0308,
+      "step": 684
+    },
+    {
+      "epoch": 0.7737927139226207,
+      "grad_norm": 0.032828278839588165,
+      "learning_rate": 0.00016954648610743384,
+      "loss": 1.0245,
+      "step": 685
+    },
+    {
+      "epoch": 0.7749223383225078,
+      "grad_norm": 0.03268923610448837,
+      "learning_rate": 0.00016946109026028274,
+      "loss": 1.0515,
+      "step": 686
+    },
+    {
+      "epoch": 0.7760519627223949,
+      "grad_norm": 0.03162987902760506,
+      "learning_rate": 0.00016937559642128604,
+      "loss": 0.9649,
+      "step": 687
+    },
+    {
+      "epoch": 0.7771815871222818,
+      "grad_norm": 0.03206837549805641,
+      "learning_rate": 0.0001692900047110537,
+      "loss": 1.0174,
+      "step": 688
+    },
+    {
+      "epoch": 0.7783112115221689,
+      "grad_norm": 0.03194599226117134,
+      "learning_rate": 0.0001692043152503338,
+      "loss": 0.9872,
+      "step": 689
+    },
+    {
+      "epoch": 0.7794408359220559,
+      "grad_norm": 0.032261595129966736,
+      "learning_rate": 0.0001691185281600122,
+      "loss": 1.0046,
+      "step": 690
+    },
+    {
+      "epoch": 0.780570460321943,
+      "grad_norm": 0.032003022730350494,
+      "learning_rate": 0.00016903264356111258,
+      "loss": 1.0223,
+      "step": 691
+    },
+    {
+      "epoch": 0.78170008472183,
+      "grad_norm": 0.03204648569226265,
+      "learning_rate": 0.00016894666157479614,
+      "loss": 0.9402,
+      "step": 692
+    },
+    {
+      "epoch": 0.782829709121717,
+      "grad_norm": 0.03307194262742996,
+      "learning_rate": 0.00016886058232236156,
+      "loss": 0.977,
+      "step": 693
+    },
+    {
+      "epoch": 0.7839593335216041,
+      "grad_norm": 0.03305744007229805,
+      "learning_rate": 0.00016877440592524457,
+      "loss": 1.0158,
+      "step": 694
+    },
+    {
+      "epoch": 0.7850889579214911,
+      "grad_norm": 0.03376347944140434,
+      "learning_rate": 0.0001686881325050181,
+      "loss": 0.9266,
+      "step": 695
+    },
+    {
+      "epoch": 0.7862185823213781,
+      "grad_norm": 0.031977638602256775,
+      "learning_rate": 0.0001686017621833919,
+      "loss": 0.9966,
+      "step": 696
+    },
+    {
+      "epoch": 0.7873482067212652,
+      "grad_norm": 0.02983999252319336,
+      "learning_rate": 0.00016851529508221235,
+      "loss": 1.0418,
+      "step": 697
+    },
+    {
+      "epoch": 0.7884778311211522,
+      "grad_norm": 0.032275013625621796,
+      "learning_rate": 0.00016842873132346252,
+      "loss": 1.0745,
+      "step": 698
+    },
+    {
+      "epoch": 0.7896074555210393,
+      "grad_norm": 0.031730227172374725,
+      "learning_rate": 0.0001683420710292617,
+      "loss": 1.0505,
+      "step": 699
+    },
+    {
+      "epoch": 0.7907370799209263,
+      "grad_norm": 0.03272555023431778,
+      "learning_rate": 0.00016825531432186543,
+      "loss": 0.9826,
+      "step": 700
+    },
+    {
+      "epoch": 0.7918667043208133,
+      "grad_norm": 0.03270925581455231,
+      "learning_rate": 0.00016816846132366523,
+      "loss": 0.9954,
+      "step": 701
+    },
+    {
+      "epoch": 0.7929963287207004,
+      "grad_norm": 0.03216206654906273,
+      "learning_rate": 0.00016808151215718853,
+      "loss": 1.0266,
+      "step": 702
+    },
+    {
+      "epoch": 0.7941259531205874,
+      "grad_norm": 0.035289812833070755,
+      "learning_rate": 0.00016799446694509834,
+      "loss": 0.9776,
+      "step": 703
+    },
+    {
+      "epoch": 0.7952555775204745,
+      "grad_norm": 0.03199274092912674,
+      "learning_rate": 0.00016790732581019321,
+      "loss": 1.1088,
+      "step": 704
+    },
+    {
+      "epoch": 0.7963852019203614,
+      "grad_norm": 0.032748714089393616,
+      "learning_rate": 0.00016782008887540704,
+      "loss": 1.0957,
+      "step": 705
+    },
+    {
+      "epoch": 0.7975148263202485,
+      "grad_norm": 0.03328438848257065,
+      "learning_rate": 0.00016773275626380882,
+      "loss": 1.033,
+      "step": 706
+    },
+    {
+      "epoch": 0.7986444507201356,
+      "grad_norm": 0.031454868614673615,
+      "learning_rate": 0.00016764532809860255,
+      "loss": 0.9854,
+      "step": 707
+    },
+    {
+      "epoch": 0.7997740751200226,
+      "grad_norm": 0.032634053379297256,
+      "learning_rate": 0.00016755780450312705,
+      "loss": 0.9914,
+      "step": 708
+    },
+    {
+      "epoch": 0.8009036995199096,
+      "grad_norm": 0.03341520577669144,
+      "learning_rate": 0.00016747018560085572,
+      "loss": 0.9696,
+      "step": 709
+    },
+    {
+      "epoch": 0.8020333239197966,
+      "grad_norm": 0.03382014483213425,
+      "learning_rate": 0.00016738247151539643,
+      "loss": 1.0987,
+      "step": 710
+    },
+    {
+      "epoch": 0.8031629483196837,
+      "grad_norm": 0.03288474678993225,
+      "learning_rate": 0.00016729466237049137,
+      "loss": 1.0378,
+      "step": 711
+    },
+    {
+      "epoch": 0.8042925727195708,
+      "grad_norm": 0.03332820534706116,
+      "learning_rate": 0.00016720675829001675,
+      "loss": 1.0544,
+      "step": 712
+    },
+    {
+      "epoch": 0.8054221971194577,
+      "grad_norm": 0.033210329711437225,
+      "learning_rate": 0.0001671187593979828,
+      "loss": 0.9805,
+      "step": 713
+    },
+    {
+      "epoch": 0.8065518215193448,
+      "grad_norm": 0.03217016160488129,
+      "learning_rate": 0.00016703066581853345,
+      "loss": 1.0576,
+      "step": 714
+    },
+    {
+      "epoch": 0.8076814459192319,
+      "grad_norm": 0.03476932644844055,
+      "learning_rate": 0.00016694247767594624,
+      "loss": 1.0224,
+      "step": 715
+    },
+    {
+      "epoch": 0.8088110703191189,
+      "grad_norm": 0.03346959874033928,
+      "learning_rate": 0.00016685419509463213,
+      "loss": 1.0332,
+      "step": 716
+    },
+    {
+      "epoch": 0.809940694719006,
+      "grad_norm": 0.03398541361093521,
+      "learning_rate": 0.00016676581819913516,
+      "loss": 0.8649,
+      "step": 717
+    },
+    {
+      "epoch": 0.8110703191188929,
+      "grad_norm": 0.033196430653333664,
+      "learning_rate": 0.0001666773471141327,
+      "loss": 0.9434,
+      "step": 718
+    },
+    {
+      "epoch": 0.81219994351878,
+      "grad_norm": 0.03290561959147453,
+      "learning_rate": 0.00016658878196443476,
+      "loss": 0.9993,
+      "step": 719
+    },
+    {
+      "epoch": 0.8133295679186671,
+      "grad_norm": 0.03241690620779991,
+      "learning_rate": 0.00016650012287498412,
+      "loss": 1.0136,
+      "step": 720
+    },
+    {
+      "epoch": 0.8144591923185541,
+      "grad_norm": 0.03312429040670395,
+      "learning_rate": 0.00016641136997085608,
+      "loss": 1.0292,
+      "step": 721
+    },
+    {
+      "epoch": 0.8155888167184411,
+      "grad_norm": 0.03105839341878891,
+      "learning_rate": 0.0001663225233772584,
+      "loss": 0.9277,
+      "step": 722
+    },
+    {
+      "epoch": 0.8167184411183281,
+      "grad_norm": 0.0325755774974823,
+      "learning_rate": 0.00016623358321953078,
+      "loss": 1.0722,
+      "step": 723
+    },
+    {
+      "epoch": 0.8178480655182152,
+      "grad_norm": 0.033952005207538605,
+      "learning_rate": 0.00016614454962314516,
+      "loss": 1.0253,
+      "step": 724
+    },
+    {
+      "epoch": 0.8189776899181023,
+      "grad_norm": 0.0334470197558403,
+      "learning_rate": 0.00016605542271370513,
+      "loss": 1.0267,
+      "step": 725
+    },
+    {
+      "epoch": 0.8201073143179892,
+      "grad_norm": 0.03237008675932884,
+      "learning_rate": 0.00016596620261694604,
+      "loss": 1.0669,
+      "step": 726
+    },
+    {
+      "epoch": 0.8212369387178763,
+      "grad_norm": 0.03195658326148987,
+      "learning_rate": 0.00016587688945873458,
+      "loss": 0.9879,
+      "step": 727
+    },
+    {
+      "epoch": 0.8223665631177633,
+      "grad_norm": 0.03366916999220848,
+      "learning_rate": 0.0001657874833650688,
+      "loss": 0.9801,
+      "step": 728
+    },
+    {
+      "epoch": 0.8234961875176504,
+      "grad_norm": 0.03287327662110329,
+      "learning_rate": 0.0001656979844620779,
+      "loss": 0.9283,
+      "step": 729
+    },
+    {
+      "epoch": 0.8246258119175374,
+      "grad_norm": 0.03366275876760483,
+      "learning_rate": 0.00016560839287602192,
+      "loss": 1.0678,
+      "step": 730
+    },
+    {
+      "epoch": 0.8257554363174244,
+      "grad_norm": 0.03446445241570473,
+      "learning_rate": 0.00016551870873329167,
+      "loss": 0.9899,
+      "step": 731
+    },
+    {
+      "epoch": 0.8268850607173115,
+      "grad_norm": 0.03468972072005272,
+      "learning_rate": 0.0001654289321604086,
+      "loss": 1.0614,
+      "step": 732
+    },
+    {
+      "epoch": 0.8280146851171986,
+      "grad_norm": 0.03443734720349312,
+      "learning_rate": 0.00016533906328402448,
+      "loss": 1.0321,
+      "step": 733
+    },
+    {
+      "epoch": 0.8291443095170856,
+      "grad_norm": 0.03276367112994194,
+      "learning_rate": 0.0001652491022309213,
+      "loss": 0.9848,
+      "step": 734
+    },
+    {
+      "epoch": 0.8302739339169726,
+      "grad_norm": 0.03289159759879112,
+      "learning_rate": 0.00016515904912801118,
+      "loss": 1.0121,
+      "step": 735
+    },
+    {
+      "epoch": 0.8314035583168596,
+      "grad_norm": 0.034025318920612335,
+      "learning_rate": 0.000165068904102336,
+      "loss": 1.0589,
+      "step": 736
+    },
+    {
+      "epoch": 0.8325331827167467,
+      "grad_norm": 0.03421149030327797,
+      "learning_rate": 0.00016497866728106735,
+      "loss": 1.0138,
+      "step": 737
+    },
+    {
+      "epoch": 0.8336628071166338,
+      "grad_norm": 0.03334156796336174,
+      "learning_rate": 0.0001648883387915063,
+      "loss": 1.0337,
+      "step": 738
+    },
+    {
+      "epoch": 0.8347924315165207,
+      "grad_norm": 0.03213927149772644,
+      "learning_rate": 0.0001647979187610833,
+      "loss": 1.0248,
+      "step": 739
+    },
+    {
+      "epoch": 0.8359220559164078,
+      "grad_norm": 0.03407248482108116,
+      "learning_rate": 0.00016470740731735787,
+      "loss": 0.9995,
+      "step": 740
+    },
+    {
+      "epoch": 0.8370516803162948,
+      "grad_norm": 0.03234965354204178,
+      "learning_rate": 0.00016461680458801858,
+      "loss": 1.0526,
+      "step": 741
+    },
+    {
+      "epoch": 0.8381813047161819,
+      "grad_norm": 0.03325793519616127,
+      "learning_rate": 0.0001645261107008827,
+      "loss": 0.9461,
+      "step": 742
+    },
+    {
+      "epoch": 0.8393109291160689,
+      "grad_norm": 0.034206606447696686,
+      "learning_rate": 0.00016443532578389606,
+      "loss": 0.9095,
+      "step": 743
+    },
+    {
+      "epoch": 0.8404405535159559,
+      "grad_norm": 0.03346103057265282,
+      "learning_rate": 0.00016434444996513305,
+      "loss": 1.0337,
+      "step": 744
+    },
+    {
+      "epoch": 0.841570177915843,
+      "grad_norm": 0.03360540792346001,
+      "learning_rate": 0.0001642534833727962,
+      "loss": 0.9532,
+      "step": 745
+    },
+    {
+      "epoch": 0.84269980231573,
+      "grad_norm": 0.03263968229293823,
+      "learning_rate": 0.0001641624261352161,
+      "loss": 1.0579,
+      "step": 746
+    },
+    {
+      "epoch": 0.843829426715617,
+      "grad_norm": 0.033077508211135864,
+      "learning_rate": 0.0001640712783808513,
+      "loss": 0.9993,
+      "step": 747
+    },
+    {
+      "epoch": 0.8449590511155041,
+      "grad_norm": 0.03186168894171715,
+      "learning_rate": 0.00016398004023828797,
+      "loss": 0.9576,
+      "step": 748
+    },
+    {
+      "epoch": 0.8460886755153911,
+      "grad_norm": 0.032343216240406036,
+      "learning_rate": 0.00016388871183623977,
+      "loss": 1.0693,
+      "step": 749
+    },
+    {
+      "epoch": 0.8472182999152782,
+      "grad_norm": 0.03365077078342438,
+      "learning_rate": 0.00016379729330354774,
+      "loss": 0.9867,
+      "step": 750
+    },
+    {
+      "epoch": 0.8483479243151653,
+      "grad_norm": 0.03302355110645294,
+      "learning_rate": 0.00016370578476918008,
+      "loss": 1.002,
+      "step": 751
+    },
+    {
+      "epoch": 0.8494775487150522,
+      "grad_norm": 0.034719739109277725,
+      "learning_rate": 0.00016361418636223198,
+      "loss": 0.9621,
+      "step": 752
+    },
+    {
+      "epoch": 0.8506071731149393,
+      "grad_norm": 0.03225456923246384,
+      "learning_rate": 0.0001635224982119253,
+      "loss": 1.0285,
+      "step": 753
+    },
+    {
+      "epoch": 0.8517367975148263,
+      "grad_norm": 0.03379584476351738,
+      "learning_rate": 0.0001634307204476087,
+      "loss": 1.0787,
+      "step": 754
+    },
+    {
+      "epoch": 0.8528664219147134,
+      "grad_norm": 0.03374066203832626,
+      "learning_rate": 0.00016333885319875702,
+      "loss": 1.0322,
+      "step": 755
+    },
+    {
+      "epoch": 0.8539960463146004,
+      "grad_norm": 0.03438407927751541,
+      "learning_rate": 0.00016324689659497155,
+      "loss": 1.0204,
+      "step": 756
+    },
+    {
+      "epoch": 0.8551256707144874,
+      "grad_norm": 0.03300711140036583,
+      "learning_rate": 0.00016315485076597957,
+      "loss": 1.0088,
+      "step": 757
+    },
+    {
+      "epoch": 0.8562552951143745,
+      "grad_norm": 0.032439880073070526,
+      "learning_rate": 0.00016306271584163416,
+      "loss": 1.0198,
+      "step": 758
+    },
+    {
+      "epoch": 0.8573849195142615,
+      "grad_norm": 0.0341016985476017,
+      "learning_rate": 0.00016297049195191415,
+      "loss": 1.0242,
+      "step": 759
+    },
+    {
+      "epoch": 0.8585145439141485,
+      "grad_norm": 0.032917320728302,
+      "learning_rate": 0.00016287817922692395,
+      "loss": 1.0012,
+      "step": 760
+    },
+    {
+      "epoch": 0.8596441683140356,
+      "grad_norm": 0.03229722008109093,
+      "learning_rate": 0.00016278577779689314,
+      "loss": 0.9944,
+      "step": 761
+    },
+    {
+      "epoch": 0.8607737927139226,
+      "grad_norm": 0.0344838984310627,
+      "learning_rate": 0.0001626932877921766,
+      "loss": 0.9813,
+      "step": 762
+    },
+    {
+      "epoch": 0.8619034171138097,
+      "grad_norm": 0.033522870391607285,
+      "learning_rate": 0.00016260070934325402,
+      "loss": 1.0256,
+      "step": 763
+    },
+    {
+      "epoch": 0.8630330415136966,
+      "grad_norm": 0.03514671325683594,
+      "learning_rate": 0.00016250804258072997,
+      "loss": 0.9543,
+      "step": 764
+    },
+    {
+      "epoch": 0.8641626659135837,
+      "grad_norm": 0.03211130201816559,
+      "learning_rate": 0.00016241528763533353,
+      "loss": 1.0009,
+      "step": 765
+    },
+    {
+      "epoch": 0.8652922903134708,
+      "grad_norm": 0.033048368990421295,
+      "learning_rate": 0.00016232244463791826,
+      "loss": 1.0042,
+      "step": 766
+    },
+    {
+      "epoch": 0.8664219147133578,
+      "grad_norm": 0.031953100115060806,
+      "learning_rate": 0.00016222951371946192,
+      "loss": 1.0096,
+      "step": 767
+    },
+    {
+      "epoch": 0.8675515391132449,
+      "grad_norm": 0.03293442353606224,
+      "learning_rate": 0.00016213649501106622,
+      "loss": 0.9987,
+      "step": 768
+    },
+    {
+      "epoch": 0.8686811635131318,
+      "grad_norm": 0.033335424959659576,
+      "learning_rate": 0.00016204338864395684,
+      "loss": 1.0035,
+      "step": 769
+    },
+    {
+      "epoch": 0.8698107879130189,
+      "grad_norm": 0.04050195962190628,
+      "learning_rate": 0.00016195019474948299,
+      "loss": 1.0326,
+      "step": 770
+    },
+    {
+      "epoch": 0.870940412312906,
+      "grad_norm": 0.03311360627412796,
+      "learning_rate": 0.00016185691345911755,
+      "loss": 1.0184,
+      "step": 771
+    },
+    {
+      "epoch": 0.872070036712793,
+      "grad_norm": 0.03323720395565033,
+      "learning_rate": 0.0001617635449044565,
+      "loss": 0.9625,
+      "step": 772
+    },
+    {
+      "epoch": 0.87319966111268,
+      "grad_norm": 0.03422234579920769,
+      "learning_rate": 0.00016167008921721902,
+      "loss": 1.0654,
+      "step": 773
+    },
+    {
+      "epoch": 0.8743292855125671,
+      "grad_norm": 0.034163184463977814,
+      "learning_rate": 0.00016157654652924723,
+      "loss": 0.9953,
+      "step": 774
+    },
+    {
+      "epoch": 0.8754589099124541,
+      "grad_norm": 0.03320545703172684,
+      "learning_rate": 0.00016148291697250594,
+      "loss": 0.9766,
+      "step": 775
+    },
+    {
+      "epoch": 0.8765885343123412,
+      "grad_norm": 0.03346817195415497,
+      "learning_rate": 0.0001613892006790825,
+      "loss": 0.9201,
+      "step": 776
+    },
+    {
+      "epoch": 0.8777181587122281,
+      "grad_norm": 0.03284529596567154,
+      "learning_rate": 0.00016129539778118667,
+      "loss": 0.9284,
+      "step": 777
+    },
+    {
+      "epoch": 0.8788477831121152,
+      "grad_norm": 0.032990384846925735,
+      "learning_rate": 0.00016120150841115037,
+      "loss": 1.0058,
+      "step": 778
+    },
+    {
+      "epoch": 0.8799774075120023,
+      "grad_norm": 0.03396923094987869,
+      "learning_rate": 0.0001611075327014275,
+      "loss": 1.0831,
+      "step": 779
+    },
+    {
+      "epoch": 0.8811070319118893,
+      "grad_norm": 0.03224315121769905,
+      "learning_rate": 0.00016101347078459373,
+      "loss": 0.9318,
+      "step": 780
+    },
+    {
+      "epoch": 0.8822366563117763,
+      "grad_norm": 0.031812380999326706,
+      "learning_rate": 0.00016091932279334645,
+      "loss": 1.0566,
+      "step": 781
+    },
+    {
+      "epoch": 0.8833662807116633,
+      "grad_norm": 0.033773597329854965,
+      "learning_rate": 0.00016082508886050437,
+      "loss": 0.9349,
+      "step": 782
+    },
+    {
+      "epoch": 0.8844959051115504,
+      "grad_norm": 0.03375870734453201,
+      "learning_rate": 0.00016073076911900754,
+      "loss": 0.9875,
+      "step": 783
+    },
+    {
+      "epoch": 0.8856255295114375,
+      "grad_norm": 0.03443336486816406,
+      "learning_rate": 0.00016063636370191692,
+      "loss": 1.0604,
+      "step": 784
+    },
+    {
+      "epoch": 0.8867551539113245,
+      "grad_norm": 0.032187797129154205,
+      "learning_rate": 0.0001605418727424145,
+      "loss": 0.986,
+      "step": 785
+    },
+    {
+      "epoch": 0.8878847783112115,
+      "grad_norm": 0.03341427072882652,
+      "learning_rate": 0.00016044729637380284,
+      "loss": 0.9184,
+      "step": 786
+    },
+    {
+      "epoch": 0.8890144027110986,
+      "grad_norm": 0.03245866298675537,
+      "learning_rate": 0.000160352634729505,
+      "loss": 1.1511,
+      "step": 787
+    },
+    {
+      "epoch": 0.8901440271109856,
+      "grad_norm": 0.032569848001003265,
+      "learning_rate": 0.00016025788794306442,
+      "loss": 1.0948,
+      "step": 788
+    },
+    {
+      "epoch": 0.8912736515108727,
+      "grad_norm": 0.03429558128118515,
+      "learning_rate": 0.0001601630561481446,
+      "loss": 0.9379,
+      "step": 789
+    },
+    {
+      "epoch": 0.8924032759107596,
+      "grad_norm": 0.033720601350069046,
+      "learning_rate": 0.00016006813947852893,
+      "loss": 0.9845,
+      "step": 790
+    },
+    {
+      "epoch": 0.8935329003106467,
+      "grad_norm": 0.033525846898555756,
+      "learning_rate": 0.00015997313806812057,
+      "loss": 1.0279,
+      "step": 791
+    },
+    {
+      "epoch": 0.8946625247105338,
+      "grad_norm": 0.03577594459056854,
+      "learning_rate": 0.00015987805205094227,
+      "loss": 0.9654,
+      "step": 792
+    },
+    {
+      "epoch": 0.8957921491104208,
+      "grad_norm": 0.03572090342640877,
+      "learning_rate": 0.00015978288156113604,
+      "loss": 1.0292,
+      "step": 793
+    },
+    {
+      "epoch": 0.8969217735103078,
+      "grad_norm": 0.0330742709338665,
+      "learning_rate": 0.00015968762673296318,
+      "loss": 1.0898,
+      "step": 794
+    },
+    {
+      "epoch": 0.8980513979101948,
+      "grad_norm": 0.03374762088060379,
+      "learning_rate": 0.0001595922877008039,
+      "loss": 1.0368,
+      "step": 795
+    },
+    {
+      "epoch": 0.8991810223100819,
+      "grad_norm": 0.035000476986169815,
+      "learning_rate": 0.00015949686459915715,
+      "loss": 1.0531,
+      "step": 796
+    },
+    {
+      "epoch": 0.900310646709969,
+      "grad_norm": 0.03325015306472778,
+      "learning_rate": 0.00015940135756264062,
+      "loss": 1.0199,
+      "step": 797
+    },
+    {
+      "epoch": 0.901440271109856,
+      "grad_norm": 0.03547768294811249,
+      "learning_rate": 0.0001593057667259902,
+      "loss": 0.9988,
+      "step": 798
+    },
+    {
+      "epoch": 0.902569895509743,
+      "grad_norm": 0.03294992819428444,
+      "learning_rate": 0.0001592100922240603,
+      "loss": 0.9943,
+      "step": 799
+    },
+    {
+      "epoch": 0.90369951990963,
+      "grad_norm": 0.03369821235537529,
+      "learning_rate": 0.00015911433419182305,
+      "loss": 1.0186,
+      "step": 800
+    },
+    {
+      "epoch": 0.9048291443095171,
+      "grad_norm": 0.03317281976342201,
+      "learning_rate": 0.00015901849276436862,
+      "loss": 0.9601,
+      "step": 801
+    },
+    {
+      "epoch": 0.9059587687094042,
+      "grad_norm": 0.036661259829998016,
+      "learning_rate": 0.00015892256807690478,
+      "loss": 1.0847,
+      "step": 802
+    },
+    {
+      "epoch": 0.9070883931092911,
+      "grad_norm": 0.0334974080324173,
+      "learning_rate": 0.00015882656026475672,
+      "loss": 1.0264,
+      "step": 803
+    },
+    {
+      "epoch": 0.9082180175091782,
+      "grad_norm": 0.03364727646112442,
+      "learning_rate": 0.00015873046946336694,
+      "loss": 0.9768,
+      "step": 804
+    },
+    {
+      "epoch": 0.9093476419090653,
+      "grad_norm": 0.03534623235464096,
+      "learning_rate": 0.000158634295808295,
+      "loss": 1.0705,
+      "step": 805
+    },
+    {
+      "epoch": 0.9104772663089523,
+      "grad_norm": 0.032764844596385956,
+      "learning_rate": 0.00015853803943521733,
+      "loss": 0.9543,
+      "step": 806
+    },
+    {
+      "epoch": 0.9116068907088393,
+      "grad_norm": 0.03310185670852661,
+      "learning_rate": 0.00015844170047992712,
+      "loss": 1.0077,
+      "step": 807
+    },
+    {
+      "epoch": 0.9127365151087263,
+      "grad_norm": 0.0327795036137104,
+      "learning_rate": 0.00015834527907833396,
+      "loss": 0.9765,
+      "step": 808
+    },
+    {
+      "epoch": 0.9138661395086134,
+      "grad_norm": 0.03351445123553276,
+      "learning_rate": 0.00015824877536646382,
+      "loss": 1.0634,
+      "step": 809
+    },
+    {
+      "epoch": 0.9149957639085005,
+      "grad_norm": 0.03497536852955818,
+      "learning_rate": 0.00015815218948045878,
+      "loss": 0.9211,
+      "step": 810
+    },
+    {
+      "epoch": 0.9161253883083874,
+      "grad_norm": 0.03262564167380333,
+      "learning_rate": 0.00015805552155657683,
+      "loss": 0.9841,
+      "step": 811
+    },
+    {
+      "epoch": 0.9172550127082745,
+      "grad_norm": 0.03305838629603386,
+      "learning_rate": 0.00015795877173119176,
+      "loss": 0.9968,
+      "step": 812
+    },
+    {
+      "epoch": 0.9183846371081615,
+      "grad_norm": 0.03393985703587532,
+      "learning_rate": 0.00015786194014079274,
+      "loss": 1.0257,
+      "step": 813
+    },
+    {
+      "epoch": 0.9195142615080486,
+      "grad_norm": 0.03377285972237587,
+      "learning_rate": 0.00015776502692198448,
+      "loss": 0.979,
+      "step": 814
+    },
+    {
+      "epoch": 0.9206438859079357,
+      "grad_norm": 0.03390325978398323,
+      "learning_rate": 0.00015766803221148673,
+      "loss": 1.0935,
+      "step": 815
+    },
+    {
+      "epoch": 0.9217735103078226,
+      "grad_norm": 0.034586288034915924,
+      "learning_rate": 0.00015757095614613427,
+      "loss": 1.0286,
+      "step": 816
+    },
+    {
+      "epoch": 0.9229031347077097,
+      "grad_norm": 0.034462425857782364,
+      "learning_rate": 0.00015747379886287655,
+      "loss": 0.9826,
+      "step": 817
+    },
+    {
+      "epoch": 0.9240327591075967,
+      "grad_norm": 0.03412788733839989,
+      "learning_rate": 0.0001573765604987777,
+      "loss": 1.0391,
+      "step": 818
+    },
+    {
+      "epoch": 0.9251623835074838,
+      "grad_norm": 0.03411950543522835,
+      "learning_rate": 0.0001572792411910162,
+      "loss": 1.014,
+      "step": 819
+    },
+    {
+      "epoch": 0.9262920079073708,
+      "grad_norm": 0.03366335481405258,
+      "learning_rate": 0.0001571818410768848,
+      "loss": 1.0191,
+      "step": 820
+    },
+    {
+      "epoch": 0.9274216323072578,
+      "grad_norm": 0.033515483140945435,
+      "learning_rate": 0.00015708436029379004,
+      "loss": 1.0072,
+      "step": 821
+    },
+    {
+      "epoch": 0.9285512567071449,
+      "grad_norm": 0.033421795815229416,
+      "learning_rate": 0.0001569867989792525,
+      "loss": 1.0311,
+      "step": 822
+    },
+    {
+      "epoch": 0.929680881107032,
+      "grad_norm": 0.032961517572402954,
+      "learning_rate": 0.00015688915727090613,
+      "loss": 1.0476,
+      "step": 823
+    },
+    {
+      "epoch": 0.9308105055069189,
+      "grad_norm": 0.03382313251495361,
+      "learning_rate": 0.00015679143530649854,
+      "loss": 0.9863,
+      "step": 824
+    },
+    {
+      "epoch": 0.931940129906806,
+      "grad_norm": 0.03453601896762848,
+      "learning_rate": 0.0001566936332238904,
+      "loss": 0.981,
+      "step": 825
+    },
+    {
+      "epoch": 0.933069754306693,
+      "grad_norm": 0.03426108881831169,
+      "learning_rate": 0.00015659575116105544,
+      "loss": 1.0615,
+      "step": 826
+    },
+    {
+      "epoch": 0.9341993787065801,
+      "grad_norm": 0.03343765065073967,
+      "learning_rate": 0.0001564977892560803,
+      "loss": 1.0745,
+      "step": 827
+    },
+    {
+      "epoch": 0.935329003106467,
+      "grad_norm": 0.03495456650853157,
+      "learning_rate": 0.00015639974764716414,
+      "loss": 0.9985,
+      "step": 828
+    },
+    {
+      "epoch": 0.9364586275063541,
+      "grad_norm": 0.033679116517305374,
+      "learning_rate": 0.0001563016264726186,
+      "loss": 1.0216,
+      "step": 829
+    },
+    {
+      "epoch": 0.9375882519062412,
+      "grad_norm": 0.03362250700592995,
+      "learning_rate": 0.0001562034258708676,
+      "loss": 1.0337,
+      "step": 830
+    },
+    {
+      "epoch": 0.9387178763061282,
+      "grad_norm": 0.034377310425043106,
+      "learning_rate": 0.00015610514598044707,
+      "loss": 1.0583,
+      "step": 831
+    },
+    {
+      "epoch": 0.9398475007060153,
+      "grad_norm": 0.033647313714027405,
+      "learning_rate": 0.00015600678694000487,
+      "loss": 1.0126,
+      "step": 832
+    },
+    {
+      "epoch": 0.9409771251059023,
+      "grad_norm": 0.03457539901137352,
+      "learning_rate": 0.0001559083488883004,
+      "loss": 1.1528,
+      "step": 833
+    },
+    {
+      "epoch": 0.9421067495057893,
+      "grad_norm": 0.03426367789506912,
+      "learning_rate": 0.00015580983196420464,
+      "loss": 0.9055,
+      "step": 834
+    },
+    {
+      "epoch": 0.9432363739056764,
+      "grad_norm": 0.03347745165228844,
+      "learning_rate": 0.0001557112363066998,
+      "loss": 0.9978,
+      "step": 835
+    },
+    {
+      "epoch": 0.9443659983055634,
+      "grad_norm": 0.03355059772729874,
+      "learning_rate": 0.00015561256205487908,
+      "loss": 0.9844,
+      "step": 836
+    },
+    {
+      "epoch": 0.9454956227054504,
+      "grad_norm": 0.032837532460689545,
+      "learning_rate": 0.0001555138093479467,
+      "loss": 0.932,
+      "step": 837
+    },
+    {
+      "epoch": 0.9466252471053375,
+      "grad_norm": 0.03441225364804268,
+      "learning_rate": 0.0001554149783252175,
+      "loss": 0.9975,
+      "step": 838
+    },
+    {
+      "epoch": 0.9477548715052245,
+      "grad_norm": 0.033451907336711884,
+      "learning_rate": 0.00015531606912611674,
+      "loss": 0.9707,
+      "step": 839
+    },
+    {
+      "epoch": 0.9488844959051116,
+      "grad_norm": 0.03538847342133522,
+      "learning_rate": 0.00015521708189018005,
+      "loss": 1.0129,
+      "step": 840
+    },
+    {
+      "epoch": 0.9500141203049985,
+      "grad_norm": 0.033600080758333206,
+      "learning_rate": 0.00015511801675705312,
+      "loss": 1.0403,
+      "step": 841
+    },
+    {
+      "epoch": 0.9511437447048856,
+      "grad_norm": 0.03426308557391167,
+      "learning_rate": 0.00015501887386649155,
+      "loss": 0.9879,
+      "step": 842
+    },
+    {
+      "epoch": 0.9522733691047727,
+      "grad_norm": 0.033120229840278625,
+      "learning_rate": 0.00015491965335836055,
+      "loss": 1.0627,
+      "step": 843
+    },
+    {
+      "epoch": 0.9534029935046597,
+      "grad_norm": 0.0343567430973053,
+      "learning_rate": 0.00015482035537263498,
+      "loss": 1.0308,
+      "step": 844
+    },
+    {
+      "epoch": 0.9545326179045467,
+      "grad_norm": 0.033301327377557755,
+      "learning_rate": 0.00015472098004939888,
+      "loss": 1.0106,
+      "step": 845
+    },
+    {
+      "epoch": 0.9556622423044338,
+      "grad_norm": 0.03342900052666664,
+      "learning_rate": 0.00015462152752884544,
+      "loss": 1.0261,
+      "step": 846
+    },
+    {
+      "epoch": 0.9567918667043208,
+      "grad_norm": 0.032714009284973145,
+      "learning_rate": 0.00015452199795127678,
+      "loss": 0.8953,
+      "step": 847
+    },
+    {
+      "epoch": 0.9579214911042079,
+      "grad_norm": 0.0333135612308979,
+      "learning_rate": 0.00015442239145710364,
+      "loss": 1.0105,
+      "step": 848
+    },
+    {
+      "epoch": 0.9590511155040949,
+      "grad_norm": 0.03534407541155815,
+      "learning_rate": 0.00015432270818684532,
+      "loss": 0.9325,
+      "step": 849
+    },
+    {
+      "epoch": 0.9601807399039819,
+      "grad_norm": 0.03319082036614418,
+      "learning_rate": 0.00015422294828112954,
+      "loss": 0.9187,
+      "step": 850
+    },
+    {
+      "epoch": 0.961310364303869,
+      "grad_norm": 0.03402223438024521,
+      "learning_rate": 0.00015412311188069193,
+      "loss": 0.9523,
+      "step": 851
+    },
+    {
+      "epoch": 0.962439988703756,
+      "grad_norm": 0.038419678807258606,
+      "learning_rate": 0.00015402319912637613,
+      "loss": 1.0135,
+      "step": 852
+    },
+    {
+      "epoch": 0.9635696131036431,
+      "grad_norm": 0.03462392836809158,
+      "learning_rate": 0.00015392321015913357,
+      "loss": 1.0811,
+      "step": 853
+    },
+    {
+      "epoch": 0.96469923750353,
+      "grad_norm": 0.033567875623703,
+      "learning_rate": 0.0001538231451200231,
+      "loss": 1.0052,
+      "step": 854
+    },
+    {
+      "epoch": 0.9658288619034171,
+      "grad_norm": 0.03398734703660011,
+      "learning_rate": 0.00015372300415021091,
+      "loss": 0.9939,
+      "step": 855
+    },
+    {
+      "epoch": 0.9669584863033042,
+      "grad_norm": 0.03315124288201332,
+      "learning_rate": 0.00015362278739097026,
+      "loss": 1.0515,
+      "step": 856
+    },
+    {
+      "epoch": 0.9680881107031912,
+      "grad_norm": 0.03387816995382309,
+      "learning_rate": 0.0001535224949836815,
+      "loss": 1.0906,
+      "step": 857
+    },
+    {
+      "epoch": 0.9692177351030782,
+      "grad_norm": 0.033208638429641724,
+      "learning_rate": 0.00015342212706983153,
+      "loss": 0.9542,
+      "step": 858
+    },
+    {
+      "epoch": 0.9703473595029652,
+      "grad_norm": 0.0338163860142231,
+      "learning_rate": 0.00015332168379101377,
+      "loss": 0.9892,
+      "step": 859
+    },
+    {
+      "epoch": 0.9714769839028523,
+      "grad_norm": 0.033496033400297165,
+      "learning_rate": 0.00015322116528892807,
+      "loss": 1.0253,
+      "step": 860
+    },
+    {
+      "epoch": 0.9726066083027394,
+      "grad_norm": 0.034597545862197876,
+      "learning_rate": 0.00015312057170538035,
+      "loss": 1.0102,
+      "step": 861
+    },
+    {
+      "epoch": 0.9737362327026263,
+      "grad_norm": 0.03476065397262573,
+      "learning_rate": 0.00015301990318228244,
+      "loss": 0.938,
+      "step": 862
+    },
+    {
+      "epoch": 0.9748658571025134,
+      "grad_norm": 0.036271460354328156,
+      "learning_rate": 0.00015291915986165186,
+      "loss": 0.9072,
+      "step": 863
+    },
+    {
+      "epoch": 0.9759954815024005,
+      "grad_norm": 0.032739460468292236,
+      "learning_rate": 0.00015281834188561174,
+      "loss": 0.9955,
+      "step": 864
+    },
+    {
+      "epoch": 0.9771251059022875,
+      "grad_norm": 0.03603595495223999,
+      "learning_rate": 0.0001527174493963905,
+      "loss": 0.978,
+      "step": 865
+    },
+    {
+      "epoch": 0.9782547303021746,
+      "grad_norm": 0.03469686582684517,
+      "learning_rate": 0.00015261648253632156,
+      "loss": 1.0928,
+      "step": 866
+    },
+    {
+      "epoch": 0.9793843547020615,
+      "grad_norm": 0.03487220034003258,
+      "learning_rate": 0.0001525154414478434,
+      "loss": 1.0144,
+      "step": 867
+    },
+    {
+      "epoch": 0.9805139791019486,
+      "grad_norm": 0.03308931365609169,
+      "learning_rate": 0.00015241432627349918,
+      "loss": 0.9912,
+      "step": 868
+    },
+    {
+      "epoch": 0.9816436035018357,
+      "grad_norm": 0.0350349023938179,
+      "learning_rate": 0.00015231313715593662,
+      "loss": 1.0209,
+      "step": 869
+    },
+    {
+      "epoch": 0.9827732279017227,
+      "grad_norm": 0.034897249191999435,
+      "learning_rate": 0.0001522118742379076,
+      "loss": 0.9873,
+      "step": 870
+    },
+    {
+      "epoch": 0.9839028523016097,
+      "grad_norm": 0.03427942842245102,
+      "learning_rate": 0.00015211053766226828,
+      "loss": 0.9497,
+      "step": 871
+    },
+    {
+      "epoch": 0.9850324767014967,
+      "grad_norm": 0.0339798741042614,
+      "learning_rate": 0.00015200912757197868,
+      "loss": 0.9741,
+      "step": 872
+    },
+    {
+      "epoch": 0.9861621011013838,
+      "grad_norm": 0.03557536378502846,
+      "learning_rate": 0.00015190764411010247,
+      "loss": 0.9747,
+      "step": 873
+    },
+    {
+      "epoch": 0.9872917255012709,
+      "grad_norm": 0.036786146461963654,
+      "learning_rate": 0.00015180608741980692,
+      "loss": 1.0296,
+      "step": 874
+    },
+    {
+      "epoch": 0.9884213499011578,
+      "grad_norm": 0.03306087478995323,
+      "learning_rate": 0.00015170445764436252,
+      "loss": 1.0559,
+      "step": 875
+    },
+    {
+      "epoch": 0.9895509743010449,
+      "grad_norm": 0.03436678647994995,
+      "learning_rate": 0.00015160275492714296,
+      "loss": 0.9572,
+      "step": 876
+    },
+    {
+      "epoch": 0.990680598700932,
+      "grad_norm": 0.03426647186279297,
+      "learning_rate": 0.00015150097941162474,
+      "loss": 0.999,
+      "step": 877
+    },
+    {
+      "epoch": 0.991810223100819,
+      "grad_norm": 0.03366367891430855,
+      "learning_rate": 0.00015139913124138715,
+      "loss": 1.0365,
+      "step": 878
+    },
+    {
+      "epoch": 0.992939847500706,
+      "grad_norm": 0.034058500081300735,
+      "learning_rate": 0.00015129721056011185,
+      "loss": 0.9835,
+      "step": 879
+    },
+    {
+      "epoch": 0.994069471900593,
+      "grad_norm": 0.03479884937405586,
+      "learning_rate": 0.00015119521751158296,
+      "loss": 1.0604,
+      "step": 880
+    },
+    {
+      "epoch": 0.9951990963004801,
+      "grad_norm": 0.03426951542496681,
+      "learning_rate": 0.00015109315223968655,
+      "loss": 1.0344,
+      "step": 881
+    },
+    {
+      "epoch": 0.9963287207003672,
+      "grad_norm": 0.034726936370134354,
+      "learning_rate": 0.0001509910148884106,
+      "loss": 0.927,
+      "step": 882
+    },
+    {
+      "epoch": 0.9974583451002542,
+      "grad_norm": 0.03522869199514389,
+      "learning_rate": 0.00015088880560184493,
+      "loss": 1.035,
+      "step": 883
+    },
+    {
+      "epoch": 0.9985879695001412,
+      "grad_norm": 0.03507549315690994,
+      "learning_rate": 0.00015078652452418063,
+      "loss": 0.952,
+      "step": 884
+    },
+    {
+      "epoch": 0.9997175939000282,
+      "grad_norm": 0.03401617333292961,
+      "learning_rate": 0.00015068417179971014,
+      "loss": 1.0006,
+      "step": 885
+    },
+    {
+      "epoch": 0.9997175939000282,
+      "eval_loss": 1.0020042657852173,
+      "eval_runtime": 552.3244,
+      "eval_samples_per_second": 17.712,
+      "eval_steps_per_second": 8.857,
+      "step": 885
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 2655,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 885,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0674035770156646e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}