{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999838657631494, "eval_steps": 500, "global_step": 6197, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00016134236850596966, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 11.8734, "step": 1 }, { "epoch": 0.00032268473701193933, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 12.0235, "step": 2 }, { "epoch": 0.000484027105517909, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 12.5059, "step": 3 }, { "epoch": 0.0006453694740238787, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 12.353, "step": 4 }, { "epoch": 0.0008067118425298483, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 11.9095, "step": 5 }, { "epoch": 0.000968054211035818, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 11.7864, "step": 6 }, { "epoch": 0.0011293965795417876, "grad_norm": 121.9916763305664, "learning_rate": 5.376344086021506e-07, "loss": 12.4286, "step": 7 }, { "epoch": 0.0012907389480477573, "grad_norm": 119.27645874023438, "learning_rate": 1.0752688172043011e-06, "loss": 12.1422, "step": 8 }, { "epoch": 0.001452081316553727, "grad_norm": 118.97579193115234, "learning_rate": 1.6129032258064516e-06, "loss": 12.24, "step": 9 }, { "epoch": 0.0016134236850596966, "grad_norm": 118.97579193115234, "learning_rate": 1.6129032258064516e-06, "loss": 12.033, "step": 10 }, { "epoch": 0.0017747660535656663, "grad_norm": 113.35755157470703, "learning_rate": 2.1505376344086023e-06, "loss": 11.746, "step": 11 }, { "epoch": 0.001936108422071636, "grad_norm": 118.38843536376953, "learning_rate": 2.688172043010753e-06, "loss": 11.9291, "step": 12 }, { "epoch": 0.002097450790577606, "grad_norm": 116.94325256347656, "learning_rate": 3.225806451612903e-06, "loss": 11.7795, "step": 13 }, { "epoch": 0.002258793159083575, "grad_norm": 114.86215209960938, "learning_rate": 3.763440860215054e-06, "loss": 11.5286, "step": 14 }, { "epoch": 0.002420135527589545, "grad_norm": 112.50050354003906, "learning_rate": 4.3010752688172045e-06, "loss": 11.4632, "step": 15 }, { "epoch": 0.0025814778960955146, "grad_norm": 112.45469665527344, "learning_rate": 4.838709677419355e-06, "loss": 11.5083, "step": 16 }, { "epoch": 0.0027428202646014844, "grad_norm": 102.28872680664062, "learning_rate": 5.376344086021506e-06, "loss": 10.6307, "step": 17 }, { "epoch": 0.002904162633107454, "grad_norm": 102.35145568847656, "learning_rate": 5.9139784946236566e-06, "loss": 10.6356, "step": 18 }, { "epoch": 0.003065505001613424, "grad_norm": 99.08859252929688, "learning_rate": 6.451612903225806e-06, "loss": 10.1105, "step": 19 }, { "epoch": 0.003226847370119393, "grad_norm": 99.42597198486328, "learning_rate": 6.989247311827957e-06, "loss": 10.0749, "step": 20 }, { "epoch": 0.003388189738625363, "grad_norm": 96.81177520751953, "learning_rate": 7.526881720430108e-06, "loss": 10.0479, "step": 21 }, { "epoch": 0.0035495321071313327, "grad_norm": 87.56671142578125, "learning_rate": 8.064516129032258e-06, "loss": 9.3494, "step": 22 }, { "epoch": 0.0037108744756373024, "grad_norm": 89.21701049804688, "learning_rate": 8.602150537634409e-06, "loss": 9.0056, "step": 23 }, { "epoch": 0.003872216844143272, "grad_norm": 84.08562469482422, "learning_rate": 9.13978494623656e-06, "loss": 8.6993, "step": 24 }, { "epoch": 0.004033559212649242, "grad_norm": 83.88109588623047, "learning_rate": 9.67741935483871e-06, "loss": 8.5553, "step": 25 }, { "epoch": 0.004194901581155212, "grad_norm": 78.00149536132812, "learning_rate": 1.0215053763440861e-05, "loss": 8.0172, "step": 26 }, { "epoch": 0.004356243949661181, "grad_norm": 76.08988189697266, "learning_rate": 1.0752688172043012e-05, "loss": 7.7614, "step": 27 }, { "epoch": 0.00451758631816715, "grad_norm": 73.39444732666016, "learning_rate": 1.129032258064516e-05, "loss": 7.2832, "step": 28 }, { "epoch": 0.00467892868667312, "grad_norm": 69.82600402832031, "learning_rate": 1.1827956989247313e-05, "loss": 7.169, "step": 29 }, { "epoch": 0.00484027105517909, "grad_norm": 68.02349853515625, "learning_rate": 1.2365591397849464e-05, "loss": 6.8744, "step": 30 }, { "epoch": 0.0050016134236850595, "grad_norm": 68.92740631103516, "learning_rate": 1.2903225806451613e-05, "loss": 6.814, "step": 31 }, { "epoch": 0.005162955792191029, "grad_norm": 62.0346794128418, "learning_rate": 1.3440860215053763e-05, "loss": 6.4574, "step": 32 }, { "epoch": 0.005324298160696999, "grad_norm": 61.29989242553711, "learning_rate": 1.3978494623655914e-05, "loss": 6.2698, "step": 33 }, { "epoch": 0.005485640529202969, "grad_norm": 56.97321701049805, "learning_rate": 1.4516129032258066e-05, "loss": 6.0728, "step": 34 }, { "epoch": 0.0056469828977089385, "grad_norm": 54.352840423583984, "learning_rate": 1.5053763440860215e-05, "loss": 5.9087, "step": 35 }, { "epoch": 0.005808325266214908, "grad_norm": 49.80109786987305, "learning_rate": 1.5591397849462366e-05, "loss": 5.4298, "step": 36 }, { "epoch": 0.005969667634720878, "grad_norm": 48.01213455200195, "learning_rate": 1.6129032258064517e-05, "loss": 5.3382, "step": 37 }, { "epoch": 0.006131010003226848, "grad_norm": 48.508453369140625, "learning_rate": 1.6666666666666667e-05, "loss": 5.5643, "step": 38 }, { "epoch": 0.0062923523717328175, "grad_norm": 46.24903106689453, "learning_rate": 1.7204301075268818e-05, "loss": 4.8385, "step": 39 }, { "epoch": 0.006453694740238786, "grad_norm": 43.764591217041016, "learning_rate": 1.774193548387097e-05, "loss": 4.8629, "step": 40 }, { "epoch": 0.006615037108744756, "grad_norm": 41.214168548583984, "learning_rate": 1.827956989247312e-05, "loss": 4.8244, "step": 41 }, { "epoch": 0.006776379477250726, "grad_norm": 41.70524215698242, "learning_rate": 1.881720430107527e-05, "loss": 4.714, "step": 42 }, { "epoch": 0.006937721845756696, "grad_norm": 41.5999755859375, "learning_rate": 1.935483870967742e-05, "loss": 4.4334, "step": 43 }, { "epoch": 0.007099064214262665, "grad_norm": 38.3800048828125, "learning_rate": 1.989247311827957e-05, "loss": 4.2036, "step": 44 }, { "epoch": 0.007260406582768635, "grad_norm": 34.799400329589844, "learning_rate": 2.0430107526881722e-05, "loss": 4.1716, "step": 45 }, { "epoch": 0.007421748951274605, "grad_norm": 32.226043701171875, "learning_rate": 2.0967741935483873e-05, "loss": 3.908, "step": 46 }, { "epoch": 0.0075830913197805746, "grad_norm": 31.816619873046875, "learning_rate": 2.1505376344086024e-05, "loss": 4.101, "step": 47 }, { "epoch": 0.007744433688286544, "grad_norm": 26.555458068847656, "learning_rate": 2.2043010752688174e-05, "loss": 3.8238, "step": 48 }, { "epoch": 0.007905776056792513, "grad_norm": 26.600265502929688, "learning_rate": 2.258064516129032e-05, "loss": 3.7415, "step": 49 }, { "epoch": 0.008067118425298484, "grad_norm": 25.45236587524414, "learning_rate": 2.3118279569892472e-05, "loss": 3.4973, "step": 50 }, { "epoch": 0.008228460793804453, "grad_norm": 26.451797485351562, "learning_rate": 2.3655913978494626e-05, "loss": 3.7219, "step": 51 }, { "epoch": 0.008389803162310423, "grad_norm": 27.508174896240234, "learning_rate": 2.4193548387096777e-05, "loss": 3.522, "step": 52 }, { "epoch": 0.008551145530816392, "grad_norm": 26.211048126220703, "learning_rate": 2.4731182795698928e-05, "loss": 3.5366, "step": 53 }, { "epoch": 0.008712487899322363, "grad_norm": 27.209714889526367, "learning_rate": 2.5268817204301075e-05, "loss": 3.2962, "step": 54 }, { "epoch": 0.008873830267828332, "grad_norm": 26.76357078552246, "learning_rate": 2.5806451612903226e-05, "loss": 3.7067, "step": 55 }, { "epoch": 0.0090351726363343, "grad_norm": 25.661449432373047, "learning_rate": 2.6344086021505376e-05, "loss": 3.5572, "step": 56 }, { "epoch": 0.009196515004840271, "grad_norm": 25.68983268737793, "learning_rate": 2.6881720430107527e-05, "loss": 3.5089, "step": 57 }, { "epoch": 0.00935785737334624, "grad_norm": 25.6807918548584, "learning_rate": 2.7419354838709678e-05, "loss": 3.4321, "step": 58 }, { "epoch": 0.00951919974185221, "grad_norm": 24.658496856689453, "learning_rate": 2.7956989247311828e-05, "loss": 3.218, "step": 59 }, { "epoch": 0.00968054211035818, "grad_norm": 24.265926361083984, "learning_rate": 2.8494623655913982e-05, "loss": 3.1398, "step": 60 }, { "epoch": 0.00984188447886415, "grad_norm": 23.916343688964844, "learning_rate": 2.9032258064516133e-05, "loss": 3.4459, "step": 61 }, { "epoch": 0.010003226847370119, "grad_norm": 22.103166580200195, "learning_rate": 2.9569892473118284e-05, "loss": 3.1716, "step": 62 }, { "epoch": 0.01016456921587609, "grad_norm": 20.560914993286133, "learning_rate": 3.010752688172043e-05, "loss": 3.0084, "step": 63 }, { "epoch": 0.010325911584382058, "grad_norm": 20.613006591796875, "learning_rate": 3.0645161290322585e-05, "loss": 3.0275, "step": 64 }, { "epoch": 0.010487253952888029, "grad_norm": 17.877843856811523, "learning_rate": 3.118279569892473e-05, "loss": 2.908, "step": 65 }, { "epoch": 0.010648596321393998, "grad_norm": 16.064481735229492, "learning_rate": 3.172043010752688e-05, "loss": 3.1454, "step": 66 }, { "epoch": 0.010809938689899969, "grad_norm": 13.223140716552734, "learning_rate": 3.2258064516129034e-05, "loss": 2.6419, "step": 67 }, { "epoch": 0.010971281058405937, "grad_norm": 10.892476081848145, "learning_rate": 3.279569892473118e-05, "loss": 2.614, "step": 68 }, { "epoch": 0.011132623426911906, "grad_norm": 10.642965316772461, "learning_rate": 3.3333333333333335e-05, "loss": 2.8108, "step": 69 }, { "epoch": 0.011293965795417877, "grad_norm": 7.963952541351318, "learning_rate": 3.387096774193548e-05, "loss": 2.832, "step": 70 }, { "epoch": 0.011455308163923846, "grad_norm": 9.012112617492676, "learning_rate": 3.4408602150537636e-05, "loss": 2.9791, "step": 71 }, { "epoch": 0.011616650532429816, "grad_norm": 10.27229118347168, "learning_rate": 3.494623655913979e-05, "loss": 2.8891, "step": 72 }, { "epoch": 0.011777992900935785, "grad_norm": 7.0006279945373535, "learning_rate": 3.548387096774194e-05, "loss": 2.733, "step": 73 }, { "epoch": 0.011939335269441756, "grad_norm": 9.03067398071289, "learning_rate": 3.602150537634409e-05, "loss": 2.6866, "step": 74 }, { "epoch": 0.012100677637947725, "grad_norm": 8.046092987060547, "learning_rate": 3.655913978494624e-05, "loss": 2.7973, "step": 75 }, { "epoch": 0.012262020006453695, "grad_norm": 8.398710250854492, "learning_rate": 3.7096774193548386e-05, "loss": 2.7915, "step": 76 }, { "epoch": 0.012423362374959664, "grad_norm": 6.430469989776611, "learning_rate": 3.763440860215054e-05, "loss": 2.6369, "step": 77 }, { "epoch": 0.012584704743465635, "grad_norm": 6.532980918884277, "learning_rate": 3.817204301075269e-05, "loss": 2.6752, "step": 78 }, { "epoch": 0.012746047111971604, "grad_norm": 7.386938095092773, "learning_rate": 3.870967741935484e-05, "loss": 2.6912, "step": 79 }, { "epoch": 0.012907389480477573, "grad_norm": 7.101880073547363, "learning_rate": 3.924731182795699e-05, "loss": 2.7584, "step": 80 }, { "epoch": 0.013068731848983543, "grad_norm": 8.289556503295898, "learning_rate": 3.978494623655914e-05, "loss": 2.6468, "step": 81 }, { "epoch": 0.013230074217489512, "grad_norm": 10.011824607849121, "learning_rate": 4.032258064516129e-05, "loss": 2.8707, "step": 82 }, { "epoch": 0.013391416585995483, "grad_norm": 9.494156837463379, "learning_rate": 4.0860215053763444e-05, "loss": 2.7115, "step": 83 }, { "epoch": 0.013552758954501452, "grad_norm": 8.189593315124512, "learning_rate": 4.13978494623656e-05, "loss": 2.833, "step": 84 }, { "epoch": 0.013714101323007422, "grad_norm": 6.510471820831299, "learning_rate": 4.1935483870967746e-05, "loss": 2.7779, "step": 85 }, { "epoch": 0.013875443691513391, "grad_norm": 7.422402858734131, "learning_rate": 4.247311827956989e-05, "loss": 2.7875, "step": 86 }, { "epoch": 0.014036786060019362, "grad_norm": 15.020564079284668, "learning_rate": 4.301075268817205e-05, "loss": 2.7667, "step": 87 }, { "epoch": 0.01419812842852533, "grad_norm": 8.193143844604492, "learning_rate": 4.3548387096774194e-05, "loss": 2.9937, "step": 88 }, { "epoch": 0.014359470797031301, "grad_norm": 9.4781494140625, "learning_rate": 4.408602150537635e-05, "loss": 2.6305, "step": 89 }, { "epoch": 0.01452081316553727, "grad_norm": 10.007437705993652, "learning_rate": 4.4623655913978496e-05, "loss": 2.6819, "step": 90 }, { "epoch": 0.014682155534043239, "grad_norm": 8.336685180664062, "learning_rate": 4.516129032258064e-05, "loss": 2.6643, "step": 91 }, { "epoch": 0.01484349790254921, "grad_norm": 8.228814125061035, "learning_rate": 4.56989247311828e-05, "loss": 2.6335, "step": 92 }, { "epoch": 0.015004840271055178, "grad_norm": 7.648069858551025, "learning_rate": 4.6236559139784944e-05, "loss": 2.8316, "step": 93 }, { "epoch": 0.015166182639561149, "grad_norm": 8.510735511779785, "learning_rate": 4.67741935483871e-05, "loss": 2.894, "step": 94 }, { "epoch": 0.015327525008067118, "grad_norm": 5.819738864898682, "learning_rate": 4.731182795698925e-05, "loss": 2.7905, "step": 95 }, { "epoch": 0.015488867376573089, "grad_norm": 6.688803195953369, "learning_rate": 4.78494623655914e-05, "loss": 2.7318, "step": 96 }, { "epoch": 0.01565020974507906, "grad_norm": 8.233120918273926, "learning_rate": 4.8387096774193554e-05, "loss": 2.5895, "step": 97 }, { "epoch": 0.015811552113585026, "grad_norm": 6.3282670974731445, "learning_rate": 4.89247311827957e-05, "loss": 2.7843, "step": 98 }, { "epoch": 0.015972894482090997, "grad_norm": 6.703812599182129, "learning_rate": 4.9462365591397855e-05, "loss": 2.9789, "step": 99 }, { "epoch": 0.016134236850596968, "grad_norm": 8.786787033081055, "learning_rate": 5e-05, "loss": 2.4158, "step": 100 }, { "epoch": 0.016295579219102935, "grad_norm": 7.690980434417725, "learning_rate": 5.053763440860215e-05, "loss": 2.7758, "step": 101 }, { "epoch": 0.016456921587608905, "grad_norm": 7.197218418121338, "learning_rate": 5.1075268817204304e-05, "loss": 2.5494, "step": 102 }, { "epoch": 0.016618263956114876, "grad_norm": 5.804955005645752, "learning_rate": 5.161290322580645e-05, "loss": 2.7368, "step": 103 }, { "epoch": 0.016779606324620847, "grad_norm": 8.487010955810547, "learning_rate": 5.2150537634408605e-05, "loss": 2.6821, "step": 104 }, { "epoch": 0.016940948693126814, "grad_norm": 9.467545509338379, "learning_rate": 5.268817204301075e-05, "loss": 2.5257, "step": 105 }, { "epoch": 0.017102291061632784, "grad_norm": 8.017814636230469, "learning_rate": 5.32258064516129e-05, "loss": 2.8659, "step": 106 }, { "epoch": 0.017263633430138755, "grad_norm": 8.576655387878418, "learning_rate": 5.3763440860215054e-05, "loss": 2.8219, "step": 107 }, { "epoch": 0.017424975798644726, "grad_norm": 6.280098915100098, "learning_rate": 5.43010752688172e-05, "loss": 2.6667, "step": 108 }, { "epoch": 0.017586318167150693, "grad_norm": 8.505315780639648, "learning_rate": 5.4838709677419355e-05, "loss": 2.8887, "step": 109 }, { "epoch": 0.017747660535656663, "grad_norm": 8.483502388000488, "learning_rate": 5.53763440860215e-05, "loss": 2.7645, "step": 110 }, { "epoch": 0.017909002904162634, "grad_norm": 7.418360233306885, "learning_rate": 5.5913978494623656e-05, "loss": 2.6832, "step": 111 }, { "epoch": 0.0180703452726686, "grad_norm": 9.711334228515625, "learning_rate": 5.645161290322582e-05, "loss": 2.4625, "step": 112 }, { "epoch": 0.01823168764117457, "grad_norm": 7.5127034187316895, "learning_rate": 5.6989247311827965e-05, "loss": 2.7079, "step": 113 }, { "epoch": 0.018393030009680542, "grad_norm": 8.565291404724121, "learning_rate": 5.752688172043011e-05, "loss": 2.6694, "step": 114 }, { "epoch": 0.018554372378186513, "grad_norm": 6.193171501159668, "learning_rate": 5.8064516129032266e-05, "loss": 2.7512, "step": 115 }, { "epoch": 0.01871571474669248, "grad_norm": 10.021088600158691, "learning_rate": 5.860215053763441e-05, "loss": 2.8323, "step": 116 }, { "epoch": 0.01887705711519845, "grad_norm": 9.823052406311035, "learning_rate": 5.913978494623657e-05, "loss": 2.5964, "step": 117 }, { "epoch": 0.01903839948370442, "grad_norm": 8.563424110412598, "learning_rate": 5.9677419354838715e-05, "loss": 2.5162, "step": 118 }, { "epoch": 0.019199741852210392, "grad_norm": 7.295447826385498, "learning_rate": 6.021505376344086e-05, "loss": 2.6978, "step": 119 }, { "epoch": 0.01936108422071636, "grad_norm": 6.0475993156433105, "learning_rate": 6.0752688172043016e-05, "loss": 2.6616, "step": 120 }, { "epoch": 0.01952242658922233, "grad_norm": 7.451548099517822, "learning_rate": 6.129032258064517e-05, "loss": 2.6365, "step": 121 }, { "epoch": 0.0196837689577283, "grad_norm": 8.676538467407227, "learning_rate": 6.182795698924732e-05, "loss": 2.6597, "step": 122 }, { "epoch": 0.01984511132623427, "grad_norm": 7.062118053436279, "learning_rate": 6.236559139784946e-05, "loss": 2.6511, "step": 123 }, { "epoch": 0.020006453694740238, "grad_norm": 7.9792866706848145, "learning_rate": 6.290322580645161e-05, "loss": 2.4457, "step": 124 }, { "epoch": 0.02016779606324621, "grad_norm": 8.919709205627441, "learning_rate": 6.344086021505376e-05, "loss": 2.6601, "step": 125 }, { "epoch": 0.02032913843175218, "grad_norm": 7.2807183265686035, "learning_rate": 6.397849462365592e-05, "loss": 2.6229, "step": 126 }, { "epoch": 0.020490480800258146, "grad_norm": 7.657270431518555, "learning_rate": 6.451612903225807e-05, "loss": 2.6337, "step": 127 }, { "epoch": 0.020651823168764117, "grad_norm": 9.673775672912598, "learning_rate": 6.505376344086021e-05, "loss": 2.6504, "step": 128 }, { "epoch": 0.020813165537270088, "grad_norm": 6.2916059494018555, "learning_rate": 6.559139784946236e-05, "loss": 2.5548, "step": 129 }, { "epoch": 0.020974507905776058, "grad_norm": 6.704552173614502, "learning_rate": 6.612903225806452e-05, "loss": 2.8779, "step": 130 }, { "epoch": 0.021135850274282025, "grad_norm": 7.804670333862305, "learning_rate": 6.666666666666667e-05, "loss": 2.7628, "step": 131 }, { "epoch": 0.021297192642787996, "grad_norm": 7.032751083374023, "learning_rate": 6.720430107526882e-05, "loss": 2.5618, "step": 132 }, { "epoch": 0.021458535011293967, "grad_norm": 6.686706066131592, "learning_rate": 6.774193548387096e-05, "loss": 2.4971, "step": 133 }, { "epoch": 0.021619877379799937, "grad_norm": 7.140244960784912, "learning_rate": 6.827956989247311e-05, "loss": 2.7566, "step": 134 }, { "epoch": 0.021781219748305904, "grad_norm": 6.7180023193359375, "learning_rate": 6.881720430107527e-05, "loss": 2.6035, "step": 135 }, { "epoch": 0.021942562116811875, "grad_norm": 7.563843727111816, "learning_rate": 6.935483870967743e-05, "loss": 2.8565, "step": 136 }, { "epoch": 0.022103904485317846, "grad_norm": 5.411715984344482, "learning_rate": 6.989247311827958e-05, "loss": 2.6123, "step": 137 }, { "epoch": 0.022265246853823813, "grad_norm": 7.0890278816223145, "learning_rate": 7.043010752688173e-05, "loss": 2.6014, "step": 138 }, { "epoch": 0.022426589222329783, "grad_norm": 8.12354850769043, "learning_rate": 7.096774193548388e-05, "loss": 2.5794, "step": 139 }, { "epoch": 0.022587931590835754, "grad_norm": 7.8680100440979, "learning_rate": 7.150537634408602e-05, "loss": 3.1132, "step": 140 }, { "epoch": 0.022749273959341725, "grad_norm": 7.589081287384033, "learning_rate": 7.204301075268818e-05, "loss": 2.6502, "step": 141 }, { "epoch": 0.02291061632784769, "grad_norm": 7.087393283843994, "learning_rate": 7.258064516129033e-05, "loss": 2.4192, "step": 142 }, { "epoch": 0.023071958696353662, "grad_norm": 8.530641555786133, "learning_rate": 7.311827956989248e-05, "loss": 2.7432, "step": 143 }, { "epoch": 0.023233301064859633, "grad_norm": 10.033088684082031, "learning_rate": 7.365591397849463e-05, "loss": 2.3456, "step": 144 }, { "epoch": 0.023394643433365604, "grad_norm": 7.166675567626953, "learning_rate": 7.419354838709677e-05, "loss": 2.4632, "step": 145 }, { "epoch": 0.02355598580187157, "grad_norm": 5.421966552734375, "learning_rate": 7.473118279569893e-05, "loss": 2.5053, "step": 146 }, { "epoch": 0.02371732817037754, "grad_norm": 6.469846725463867, "learning_rate": 7.526881720430108e-05, "loss": 2.6275, "step": 147 }, { "epoch": 0.023878670538883512, "grad_norm": 8.492588996887207, "learning_rate": 7.580645161290323e-05, "loss": 2.4239, "step": 148 }, { "epoch": 0.02404001290738948, "grad_norm": 5.68565034866333, "learning_rate": 7.634408602150538e-05, "loss": 2.5063, "step": 149 }, { "epoch": 0.02420135527589545, "grad_norm": 6.721501350402832, "learning_rate": 7.688172043010752e-05, "loss": 2.6168, "step": 150 }, { "epoch": 0.02436269764440142, "grad_norm": 6.561045169830322, "learning_rate": 7.741935483870968e-05, "loss": 2.5427, "step": 151 }, { "epoch": 0.02452404001290739, "grad_norm": 6.798069000244141, "learning_rate": 7.795698924731183e-05, "loss": 2.5517, "step": 152 }, { "epoch": 0.024685382381413358, "grad_norm": 7.483495712280273, "learning_rate": 7.849462365591398e-05, "loss": 2.61, "step": 153 }, { "epoch": 0.02484672474991933, "grad_norm": 6.829707622528076, "learning_rate": 7.903225806451613e-05, "loss": 2.6721, "step": 154 }, { "epoch": 0.0250080671184253, "grad_norm": 6.352954387664795, "learning_rate": 7.956989247311829e-05, "loss": 2.6246, "step": 155 }, { "epoch": 0.02516940948693127, "grad_norm": 7.2489213943481445, "learning_rate": 8.010752688172043e-05, "loss": 2.6004, "step": 156 }, { "epoch": 0.025330751855437237, "grad_norm": 6.657808780670166, "learning_rate": 8.064516129032258e-05, "loss": 2.6013, "step": 157 }, { "epoch": 0.025492094223943208, "grad_norm": 9.814123153686523, "learning_rate": 8.118279569892473e-05, "loss": 2.5923, "step": 158 }, { "epoch": 0.025653436592449178, "grad_norm": 7.6293158531188965, "learning_rate": 8.172043010752689e-05, "loss": 2.5834, "step": 159 }, { "epoch": 0.025814778960955145, "grad_norm": 6.346309661865234, "learning_rate": 8.225806451612904e-05, "loss": 2.7438, "step": 160 }, { "epoch": 0.025976121329461116, "grad_norm": 6.685229301452637, "learning_rate": 8.27956989247312e-05, "loss": 2.6558, "step": 161 }, { "epoch": 0.026137463697967087, "grad_norm": 5.442112922668457, "learning_rate": 8.333333333333334e-05, "loss": 2.6434, "step": 162 }, { "epoch": 0.026298806066473057, "grad_norm": 7.593375205993652, "learning_rate": 8.387096774193549e-05, "loss": 2.6782, "step": 163 }, { "epoch": 0.026460148434979024, "grad_norm": 7.5932393074035645, "learning_rate": 8.440860215053764e-05, "loss": 2.4522, "step": 164 }, { "epoch": 0.026621490803484995, "grad_norm": 6.538851737976074, "learning_rate": 8.494623655913979e-05, "loss": 2.4436, "step": 165 }, { "epoch": 0.026782833171990966, "grad_norm": 7.262729644775391, "learning_rate": 8.548387096774195e-05, "loss": 2.6256, "step": 166 }, { "epoch": 0.026944175540496936, "grad_norm": 6.7534613609313965, "learning_rate": 8.60215053763441e-05, "loss": 2.8376, "step": 167 }, { "epoch": 0.027105517909002903, "grad_norm": 7.417628765106201, "learning_rate": 8.655913978494624e-05, "loss": 2.7797, "step": 168 }, { "epoch": 0.027266860277508874, "grad_norm": 8.517837524414062, "learning_rate": 8.709677419354839e-05, "loss": 2.743, "step": 169 }, { "epoch": 0.027428202646014845, "grad_norm": 7.477149486541748, "learning_rate": 8.763440860215054e-05, "loss": 2.5709, "step": 170 }, { "epoch": 0.02758954501452081, "grad_norm": 11.132448196411133, "learning_rate": 8.81720430107527e-05, "loss": 2.7476, "step": 171 }, { "epoch": 0.027750887383026782, "grad_norm": 7.404602527618408, "learning_rate": 8.870967741935484e-05, "loss": 2.6269, "step": 172 }, { "epoch": 0.027912229751532753, "grad_norm": 9.709242820739746, "learning_rate": 8.924731182795699e-05, "loss": 2.5831, "step": 173 }, { "epoch": 0.028073572120038724, "grad_norm": 8.573921203613281, "learning_rate": 8.978494623655914e-05, "loss": 2.6558, "step": 174 }, { "epoch": 0.02823491448854469, "grad_norm": 7.5375189781188965, "learning_rate": 9.032258064516129e-05, "loss": 2.7978, "step": 175 }, { "epoch": 0.02839625685705066, "grad_norm": 5.954165935516357, "learning_rate": 9.086021505376345e-05, "loss": 2.5566, "step": 176 }, { "epoch": 0.028557599225556632, "grad_norm": 6.07686710357666, "learning_rate": 9.13978494623656e-05, "loss": 2.5547, "step": 177 }, { "epoch": 0.028718941594062602, "grad_norm": 8.48181438446045, "learning_rate": 9.193548387096774e-05, "loss": 2.6522, "step": 178 }, { "epoch": 0.02888028396256857, "grad_norm": 5.5955424308776855, "learning_rate": 9.247311827956989e-05, "loss": 2.587, "step": 179 }, { "epoch": 0.02904162633107454, "grad_norm": 6.64224100112915, "learning_rate": 9.301075268817204e-05, "loss": 2.464, "step": 180 }, { "epoch": 0.02920296869958051, "grad_norm": 5.745777606964111, "learning_rate": 9.35483870967742e-05, "loss": 2.4542, "step": 181 }, { "epoch": 0.029364311068086478, "grad_norm": 8.085434913635254, "learning_rate": 9.408602150537636e-05, "loss": 2.5606, "step": 182 }, { "epoch": 0.02952565343659245, "grad_norm": 5.5775980949401855, "learning_rate": 9.46236559139785e-05, "loss": 2.5326, "step": 183 }, { "epoch": 0.02968699580509842, "grad_norm": 5.335843563079834, "learning_rate": 9.516129032258065e-05, "loss": 2.7036, "step": 184 }, { "epoch": 0.02984833817360439, "grad_norm": 7.278665542602539, "learning_rate": 9.56989247311828e-05, "loss": 2.4229, "step": 185 }, { "epoch": 0.030009680542110357, "grad_norm": 6.640331268310547, "learning_rate": 9.623655913978496e-05, "loss": 2.6562, "step": 186 }, { "epoch": 0.030171022910616328, "grad_norm": 8.202140808105469, "learning_rate": 9.677419354838711e-05, "loss": 2.6288, "step": 187 }, { "epoch": 0.030332365279122298, "grad_norm": 7.170082092285156, "learning_rate": 9.731182795698925e-05, "loss": 2.4985, "step": 188 }, { "epoch": 0.03049370764762827, "grad_norm": 6.576979160308838, "learning_rate": 9.78494623655914e-05, "loss": 2.4262, "step": 189 }, { "epoch": 0.030655050016134236, "grad_norm": 4.620917320251465, "learning_rate": 9.838709677419355e-05, "loss": 2.4553, "step": 190 }, { "epoch": 0.030816392384640207, "grad_norm": 9.18274211883545, "learning_rate": 9.892473118279571e-05, "loss": 2.4282, "step": 191 }, { "epoch": 0.030977734753146177, "grad_norm": 6.6460676193237305, "learning_rate": 9.946236559139786e-05, "loss": 2.4051, "step": 192 }, { "epoch": 0.031139077121652144, "grad_norm": 6.2056355476379395, "learning_rate": 0.0001, "loss": 2.5354, "step": 193 }, { "epoch": 0.03130041949015812, "grad_norm": 8.653217315673828, "learning_rate": 9.999999317344175e-05, "loss": 2.5603, "step": 194 }, { "epoch": 0.03146176185866408, "grad_norm": 7.770074844360352, "learning_rate": 9.999997269376886e-05, "loss": 2.6073, "step": 195 }, { "epoch": 0.03162310422717005, "grad_norm": 6.948127746582031, "learning_rate": 9.999993856098693e-05, "loss": 2.5165, "step": 196 }, { "epoch": 0.03178444659567602, "grad_norm": 10.14613151550293, "learning_rate": 9.999989077510529e-05, "loss": 2.5069, "step": 197 }, { "epoch": 0.031945788964181994, "grad_norm": 5.593254089355469, "learning_rate": 9.999982933613696e-05, "loss": 2.4338, "step": 198 }, { "epoch": 0.032107131332687965, "grad_norm": 5.369663238525391, "learning_rate": 9.999975424409873e-05, "loss": 2.4927, "step": 199 }, { "epoch": 0.032268473701193935, "grad_norm": 5.942346096038818, "learning_rate": 9.999966549901113e-05, "loss": 2.5401, "step": 200 }, { "epoch": 0.032429816069699906, "grad_norm": 7.377420425415039, "learning_rate": 9.999956310089834e-05, "loss": 2.3675, "step": 201 }, { "epoch": 0.03259115843820587, "grad_norm": 4.923562526702881, "learning_rate": 9.999944704978836e-05, "loss": 2.4815, "step": 202 }, { "epoch": 0.03275250080671184, "grad_norm": 5.832359790802002, "learning_rate": 9.999931734571286e-05, "loss": 2.7102, "step": 203 }, { "epoch": 0.03291384317521781, "grad_norm": 8.515419006347656, "learning_rate": 9.999917398870729e-05, "loss": 2.7848, "step": 204 }, { "epoch": 0.03307518554372378, "grad_norm": 6.8526716232299805, "learning_rate": 9.999901697881076e-05, "loss": 2.4864, "step": 205 }, { "epoch": 0.03323652791222975, "grad_norm": 5.681766033172607, "learning_rate": 9.999884631606615e-05, "loss": 2.6268, "step": 206 }, { "epoch": 0.03339787028073572, "grad_norm": 6.9704718589782715, "learning_rate": 9.999866200052008e-05, "loss": 2.2992, "step": 207 }, { "epoch": 0.03355921264924169, "grad_norm": 6.220224857330322, "learning_rate": 9.999846403222286e-05, "loss": 2.5179, "step": 208 }, { "epoch": 0.033720555017747664, "grad_norm": 5.515377521514893, "learning_rate": 9.999825241122856e-05, "loss": 2.4122, "step": 209 }, { "epoch": 0.03388189738625363, "grad_norm": 9.85246753692627, "learning_rate": 9.999802713759495e-05, "loss": 2.8617, "step": 210 }, { "epoch": 0.0340432397547596, "grad_norm": 8.233046531677246, "learning_rate": 9.999778821138357e-05, "loss": 2.5484, "step": 211 }, { "epoch": 0.03420458212326557, "grad_norm": 7.567295551300049, "learning_rate": 9.999753563265963e-05, "loss": 2.3982, "step": 212 }, { "epoch": 0.03436592449177154, "grad_norm": 6.545775890350342, "learning_rate": 9.999726940149212e-05, "loss": 2.3446, "step": 213 }, { "epoch": 0.03452726686027751, "grad_norm": 6.2413716316223145, "learning_rate": 9.999698951795374e-05, "loss": 2.4994, "step": 214 }, { "epoch": 0.03468860922878348, "grad_norm": 7.410308361053467, "learning_rate": 9.999669598212092e-05, "loss": 2.8418, "step": 215 }, { "epoch": 0.03484995159728945, "grad_norm": 7.597142696380615, "learning_rate": 9.999638879407378e-05, "loss": 2.6682, "step": 216 }, { "epoch": 0.035011293965795415, "grad_norm": 7.289877414703369, "learning_rate": 9.999606795389622e-05, "loss": 2.4565, "step": 217 }, { "epoch": 0.035172636334301385, "grad_norm": 7.676548004150391, "learning_rate": 9.999573346167588e-05, "loss": 2.6301, "step": 218 }, { "epoch": 0.035333978702807356, "grad_norm": 6.4527363777160645, "learning_rate": 9.999538531750405e-05, "loss": 2.5459, "step": 219 }, { "epoch": 0.03549532107131333, "grad_norm": 6.0173726081848145, "learning_rate": 9.999502352147583e-05, "loss": 2.7194, "step": 220 }, { "epoch": 0.0356566634398193, "grad_norm": 6.561513423919678, "learning_rate": 9.999464807368999e-05, "loss": 2.458, "step": 221 }, { "epoch": 0.03581800580832527, "grad_norm": 7.991656303405762, "learning_rate": 9.999425897424906e-05, "loss": 2.904, "step": 222 }, { "epoch": 0.03597934817683124, "grad_norm": 4.952508449554443, "learning_rate": 9.99938562232593e-05, "loss": 2.6497, "step": 223 }, { "epoch": 0.0361406905453372, "grad_norm": 5.483695030212402, "learning_rate": 9.999343982083065e-05, "loss": 2.4902, "step": 224 }, { "epoch": 0.03630203291384317, "grad_norm": 6.592175483703613, "learning_rate": 9.999300976707687e-05, "loss": 2.616, "step": 225 }, { "epoch": 0.03646337528234914, "grad_norm": 8.33538818359375, "learning_rate": 9.999256606211533e-05, "loss": 2.7458, "step": 226 }, { "epoch": 0.036624717650855114, "grad_norm": 6.011215686798096, "learning_rate": 9.999210870606723e-05, "loss": 2.7323, "step": 227 }, { "epoch": 0.036786060019361085, "grad_norm": 9.141729354858398, "learning_rate": 9.999163769905744e-05, "loss": 2.4671, "step": 228 }, { "epoch": 0.036947402387867055, "grad_norm": 9.175477981567383, "learning_rate": 9.999115304121457e-05, "loss": 2.5845, "step": 229 }, { "epoch": 0.037108744756373026, "grad_norm": 10.001459121704102, "learning_rate": 9.9990654732671e-05, "loss": 2.804, "step": 230 }, { "epoch": 0.037270087124878996, "grad_norm": 8.348021507263184, "learning_rate": 9.999014277356276e-05, "loss": 2.6492, "step": 231 }, { "epoch": 0.03743142949338496, "grad_norm": 6.260481357574463, "learning_rate": 9.998961716402965e-05, "loss": 2.4399, "step": 232 }, { "epoch": 0.03759277186189093, "grad_norm": 4.412412643432617, "learning_rate": 9.998907790421522e-05, "loss": 2.5457, "step": 233 }, { "epoch": 0.0377541142303969, "grad_norm": 6.601975440979004, "learning_rate": 9.998852499426668e-05, "loss": 2.2956, "step": 234 }, { "epoch": 0.03791545659890287, "grad_norm": 5.505072116851807, "learning_rate": 9.998795843433503e-05, "loss": 2.4495, "step": 235 }, { "epoch": 0.03807679896740884, "grad_norm": 6.530910491943359, "learning_rate": 9.998737822457498e-05, "loss": 2.3748, "step": 236 }, { "epoch": 0.03823814133591481, "grad_norm": 7.663618087768555, "learning_rate": 9.998678436514497e-05, "loss": 2.4358, "step": 237 }, { "epoch": 0.038399483704420784, "grad_norm": 8.565320014953613, "learning_rate": 9.998617685620714e-05, "loss": 2.68, "step": 238 }, { "epoch": 0.03856082607292675, "grad_norm": 8.876402854919434, "learning_rate": 9.998555569792741e-05, "loss": 2.5005, "step": 239 }, { "epoch": 0.03872216844143272, "grad_norm": 5.009706497192383, "learning_rate": 9.998492089047538e-05, "loss": 2.3758, "step": 240 }, { "epoch": 0.03888351080993869, "grad_norm": 4.652370452880859, "learning_rate": 9.998427243402437e-05, "loss": 2.5858, "step": 241 }, { "epoch": 0.03904485317844466, "grad_norm": 5.316776752471924, "learning_rate": 9.998361032875145e-05, "loss": 2.4996, "step": 242 }, { "epoch": 0.03920619554695063, "grad_norm": 6.4909138679504395, "learning_rate": 9.998293457483745e-05, "loss": 2.5396, "step": 243 }, { "epoch": 0.0393675379154566, "grad_norm": 6.99546480178833, "learning_rate": 9.998224517246689e-05, "loss": 2.6507, "step": 244 }, { "epoch": 0.03952888028396257, "grad_norm": 8.02343463897705, "learning_rate": 9.998154212182797e-05, "loss": 2.5949, "step": 245 }, { "epoch": 0.03969022265246854, "grad_norm": 7.2550950050354, "learning_rate": 9.998082542311273e-05, "loss": 2.5827, "step": 246 }, { "epoch": 0.039851565020974505, "grad_norm": 7.897508144378662, "learning_rate": 9.998009507651684e-05, "loss": 2.4193, "step": 247 }, { "epoch": 0.040012907389480476, "grad_norm": 6.488155364990234, "learning_rate": 9.997935108223972e-05, "loss": 2.5488, "step": 248 }, { "epoch": 0.04017424975798645, "grad_norm": 7.670787334442139, "learning_rate": 9.997859344048455e-05, "loss": 2.4619, "step": 249 }, { "epoch": 0.04033559212649242, "grad_norm": 8.569957733154297, "learning_rate": 9.997782215145821e-05, "loss": 2.4712, "step": 250 }, { "epoch": 0.04049693449499839, "grad_norm": 4.621592998504639, "learning_rate": 9.997703721537131e-05, "loss": 2.601, "step": 251 }, { "epoch": 0.04065827686350436, "grad_norm": 5.4416303634643555, "learning_rate": 9.997623863243817e-05, "loss": 2.1628, "step": 252 }, { "epoch": 0.04081961923201033, "grad_norm": 7.547059059143066, "learning_rate": 9.997542640287687e-05, "loss": 2.3897, "step": 253 }, { "epoch": 0.04098096160051629, "grad_norm": 7.269131183624268, "learning_rate": 9.997460052690918e-05, "loss": 2.5672, "step": 254 }, { "epoch": 0.04114230396902226, "grad_norm": 6.176670551300049, "learning_rate": 9.997376100476063e-05, "loss": 2.4091, "step": 255 }, { "epoch": 0.041303646337528234, "grad_norm": 4.447066783905029, "learning_rate": 9.997290783666049e-05, "loss": 2.4808, "step": 256 }, { "epoch": 0.041464988706034205, "grad_norm": 5.186746597290039, "learning_rate": 9.997204102284167e-05, "loss": 2.5325, "step": 257 }, { "epoch": 0.041626331074540175, "grad_norm": 6.026534080505371, "learning_rate": 9.99711605635409e-05, "loss": 2.2731, "step": 258 }, { "epoch": 0.041787673443046146, "grad_norm": 6.029815196990967, "learning_rate": 9.997026645899859e-05, "loss": 2.4693, "step": 259 }, { "epoch": 0.041949015811552116, "grad_norm": 6.355932235717773, "learning_rate": 9.996935870945891e-05, "loss": 2.6484, "step": 260 }, { "epoch": 0.04211035818005808, "grad_norm": 7.226869106292725, "learning_rate": 9.996843731516969e-05, "loss": 2.4643, "step": 261 }, { "epoch": 0.04227170054856405, "grad_norm": 6.115468502044678, "learning_rate": 9.996750227638257e-05, "loss": 2.3783, "step": 262 }, { "epoch": 0.04243304291707002, "grad_norm": 8.106987953186035, "learning_rate": 9.996655359335282e-05, "loss": 2.3988, "step": 263 }, { "epoch": 0.04259438528557599, "grad_norm": 4.741427421569824, "learning_rate": 9.996559126633957e-05, "loss": 2.4387, "step": 264 }, { "epoch": 0.04275572765408196, "grad_norm": 9.366172790527344, "learning_rate": 9.996461529560553e-05, "loss": 2.3502, "step": 265 }, { "epoch": 0.04291707002258793, "grad_norm": 7.219048023223877, "learning_rate": 9.99636256814172e-05, "loss": 2.5856, "step": 266 }, { "epoch": 0.043078412391093904, "grad_norm": 7.835181713104248, "learning_rate": 9.996262242404484e-05, "loss": 2.6606, "step": 267 }, { "epoch": 0.043239754759599874, "grad_norm": 7.519559860229492, "learning_rate": 9.99616055237624e-05, "loss": 2.8062, "step": 268 }, { "epoch": 0.04340109712810584, "grad_norm": 4.6615376472473145, "learning_rate": 9.996057498084753e-05, "loss": 2.3809, "step": 269 }, { "epoch": 0.04356243949661181, "grad_norm": 5.479961395263672, "learning_rate": 9.995953079558165e-05, "loss": 2.5693, "step": 270 }, { "epoch": 0.04372378186511778, "grad_norm": 5.388136386871338, "learning_rate": 9.99584729682499e-05, "loss": 2.4341, "step": 271 }, { "epoch": 0.04388512423362375, "grad_norm": 7.276689529418945, "learning_rate": 9.99574014991411e-05, "loss": 2.4584, "step": 272 }, { "epoch": 0.04404646660212972, "grad_norm": 4.907256603240967, "learning_rate": 9.995631638854785e-05, "loss": 2.1793, "step": 273 }, { "epoch": 0.04420780897063569, "grad_norm": 3.8604819774627686, "learning_rate": 9.995521763676645e-05, "loss": 2.406, "step": 274 }, { "epoch": 0.04436915133914166, "grad_norm": 5.270939826965332, "learning_rate": 9.995410524409692e-05, "loss": 2.4819, "step": 275 }, { "epoch": 0.044530493707647625, "grad_norm": 6.114327430725098, "learning_rate": 9.995297921084303e-05, "loss": 2.4482, "step": 276 }, { "epoch": 0.044691836076153596, "grad_norm": 5.704726696014404, "learning_rate": 9.995183953731225e-05, "loss": 2.4674, "step": 277 }, { "epoch": 0.04485317844465957, "grad_norm": 5.346158504486084, "learning_rate": 9.995068622381577e-05, "loss": 2.279, "step": 278 }, { "epoch": 0.04501452081316554, "grad_norm": 8.070467948913574, "learning_rate": 9.994951927066853e-05, "loss": 2.5467, "step": 279 }, { "epoch": 0.04517586318167151, "grad_norm": 4.506627082824707, "learning_rate": 9.994833867818917e-05, "loss": 2.5589, "step": 280 }, { "epoch": 0.04533720555017748, "grad_norm": 5.788137912750244, "learning_rate": 9.994714444670007e-05, "loss": 2.4006, "step": 281 }, { "epoch": 0.04549854791868345, "grad_norm": 6.531043529510498, "learning_rate": 9.994593657652733e-05, "loss": 2.4025, "step": 282 }, { "epoch": 0.04565989028718941, "grad_norm": 6.962115287780762, "learning_rate": 9.994471506800079e-05, "loss": 2.5754, "step": 283 }, { "epoch": 0.04582123265569538, "grad_norm": 5.1116180419921875, "learning_rate": 9.994347992145395e-05, "loss": 2.3522, "step": 284 }, { "epoch": 0.045982575024201354, "grad_norm": 8.793933868408203, "learning_rate": 9.994223113722415e-05, "loss": 2.5642, "step": 285 }, { "epoch": 0.046143917392707325, "grad_norm": 6.9725751876831055, "learning_rate": 9.994096871565233e-05, "loss": 2.6067, "step": 286 }, { "epoch": 0.046305259761213295, "grad_norm": 6.1990790367126465, "learning_rate": 9.993969265708323e-05, "loss": 2.498, "step": 287 }, { "epoch": 0.046466602129719266, "grad_norm": 5.293033123016357, "learning_rate": 9.99384029618653e-05, "loss": 2.276, "step": 288 }, { "epoch": 0.046627944498225236, "grad_norm": 6.670391082763672, "learning_rate": 9.99370996303507e-05, "loss": 2.4333, "step": 289 }, { "epoch": 0.04678928686673121, "grad_norm": 7.920434474945068, "learning_rate": 9.993578266289532e-05, "loss": 2.2344, "step": 290 }, { "epoch": 0.04695062923523717, "grad_norm": 5.005353927612305, "learning_rate": 9.993445205985877e-05, "loss": 2.4097, "step": 291 }, { "epoch": 0.04711197160374314, "grad_norm": 4.682715892791748, "learning_rate": 9.99331078216044e-05, "loss": 2.8777, "step": 292 }, { "epoch": 0.04727331397224911, "grad_norm": 6.497574806213379, "learning_rate": 9.993174994849926e-05, "loss": 2.3354, "step": 293 }, { "epoch": 0.04743465634075508, "grad_norm": 5.930582523345947, "learning_rate": 9.993037844091413e-05, "loss": 2.4212, "step": 294 }, { "epoch": 0.04759599870926105, "grad_norm": 5.225839138031006, "learning_rate": 9.992899329922354e-05, "loss": 2.4619, "step": 295 }, { "epoch": 0.047757341077767024, "grad_norm": 7.46838903427124, "learning_rate": 9.99275945238057e-05, "loss": 2.5483, "step": 296 }, { "epoch": 0.047918683446272994, "grad_norm": 8.18468189239502, "learning_rate": 9.992618211504256e-05, "loss": 2.4418, "step": 297 }, { "epoch": 0.04808002581477896, "grad_norm": 6.2516608238220215, "learning_rate": 9.992475607331981e-05, "loss": 2.6307, "step": 298 }, { "epoch": 0.04824136818328493, "grad_norm": 5.004658222198486, "learning_rate": 9.992331639902685e-05, "loss": 2.3118, "step": 299 }, { "epoch": 0.0484027105517909, "grad_norm": 5.056971549987793, "learning_rate": 9.99218630925568e-05, "loss": 2.411, "step": 300 }, { "epoch": 0.04856405292029687, "grad_norm": 7.776693344116211, "learning_rate": 9.992039615430648e-05, "loss": 2.3621, "step": 301 }, { "epoch": 0.04872539528880284, "grad_norm": 4.139708042144775, "learning_rate": 9.991891558467648e-05, "loss": 2.2396, "step": 302 }, { "epoch": 0.04888673765730881, "grad_norm": 8.121435165405273, "learning_rate": 9.991742138407107e-05, "loss": 2.7522, "step": 303 }, { "epoch": 0.04904808002581478, "grad_norm": 6.5305681228637695, "learning_rate": 9.991591355289827e-05, "loss": 2.92, "step": 304 }, { "epoch": 0.049209422394320745, "grad_norm": 7.091409683227539, "learning_rate": 9.99143920915698e-05, "loss": 2.511, "step": 305 }, { "epoch": 0.049370764762826716, "grad_norm": 6.0481486320495605, "learning_rate": 9.991285700050115e-05, "loss": 2.2692, "step": 306 }, { "epoch": 0.04953210713133269, "grad_norm": 5.499849796295166, "learning_rate": 9.991130828011145e-05, "loss": 2.6419, "step": 307 }, { "epoch": 0.04969344949983866, "grad_norm": 6.164821147918701, "learning_rate": 9.990974593082364e-05, "loss": 2.626, "step": 308 }, { "epoch": 0.04985479186834463, "grad_norm": 5.402545928955078, "learning_rate": 9.99081699530643e-05, "loss": 2.3183, "step": 309 }, { "epoch": 0.0500161342368506, "grad_norm": 6.041627407073975, "learning_rate": 9.990658034726379e-05, "loss": 2.5301, "step": 310 }, { "epoch": 0.05017747660535657, "grad_norm": 7.5806050300598145, "learning_rate": 9.990497711385617e-05, "loss": 2.4254, "step": 311 }, { "epoch": 0.05033881897386254, "grad_norm": 7.522918701171875, "learning_rate": 9.990336025327922e-05, "loss": 2.4936, "step": 312 }, { "epoch": 0.0505001613423685, "grad_norm": 6.263611793518066, "learning_rate": 9.990172976597445e-05, "loss": 2.4999, "step": 313 }, { "epoch": 0.050661503710874474, "grad_norm": 4.018587112426758, "learning_rate": 9.990008565238707e-05, "loss": 2.3965, "step": 314 }, { "epoch": 0.050822846079380445, "grad_norm": 4.3108906745910645, "learning_rate": 9.989842791296603e-05, "loss": 2.4771, "step": 315 }, { "epoch": 0.050984188447886415, "grad_norm": 5.848104476928711, "learning_rate": 9.989675654816402e-05, "loss": 2.4651, "step": 316 }, { "epoch": 0.051145530816392386, "grad_norm": 3.9502875804901123, "learning_rate": 9.989507155843738e-05, "loss": 2.2438, "step": 317 }, { "epoch": 0.051306873184898356, "grad_norm": 5.53761625289917, "learning_rate": 9.989337294424627e-05, "loss": 2.3552, "step": 318 }, { "epoch": 0.05146821555340433, "grad_norm": 6.956762313842773, "learning_rate": 9.989166070605447e-05, "loss": 2.5993, "step": 319 }, { "epoch": 0.05162955792191029, "grad_norm": 6.241908073425293, "learning_rate": 9.988993484432957e-05, "loss": 2.5251, "step": 320 }, { "epoch": 0.05179090029041626, "grad_norm": 6.745074272155762, "learning_rate": 9.988819535954281e-05, "loss": 2.4227, "step": 321 }, { "epoch": 0.05195224265892223, "grad_norm": 7.87310791015625, "learning_rate": 9.988644225216918e-05, "loss": 2.2028, "step": 322 }, { "epoch": 0.0521135850274282, "grad_norm": 9.180561065673828, "learning_rate": 9.988467552268741e-05, "loss": 2.453, "step": 323 }, { "epoch": 0.05227492739593417, "grad_norm": 4.864908218383789, "learning_rate": 9.988289517157989e-05, "loss": 2.3643, "step": 324 }, { "epoch": 0.052436269764440144, "grad_norm": 9.718785285949707, "learning_rate": 9.988110119933281e-05, "loss": 2.42, "step": 325 }, { "epoch": 0.052597612132946114, "grad_norm": 8.330720901489258, "learning_rate": 9.9879293606436e-05, "loss": 2.6186, "step": 326 }, { "epoch": 0.05275895450145208, "grad_norm": 5.9520392417907715, "learning_rate": 9.987747239338306e-05, "loss": 2.5718, "step": 327 }, { "epoch": 0.05292029686995805, "grad_norm": 7.140975475311279, "learning_rate": 9.987563756067129e-05, "loss": 2.3265, "step": 328 }, { "epoch": 0.05308163923846402, "grad_norm": 6.457657337188721, "learning_rate": 9.987378910880172e-05, "loss": 2.64, "step": 329 }, { "epoch": 0.05324298160696999, "grad_norm": 3.6005430221557617, "learning_rate": 9.987192703827907e-05, "loss": 2.314, "step": 330 }, { "epoch": 0.05340432397547596, "grad_norm": 3.92282772064209, "learning_rate": 9.987005134961185e-05, "loss": 2.4194, "step": 331 }, { "epoch": 0.05356566634398193, "grad_norm": 4.9452619552612305, "learning_rate": 9.986816204331221e-05, "loss": 2.6117, "step": 332 }, { "epoch": 0.0537270087124879, "grad_norm": 4.207228183746338, "learning_rate": 9.986625911989604e-05, "loss": 2.3704, "step": 333 }, { "epoch": 0.05388835108099387, "grad_norm": 4.267602443695068, "learning_rate": 9.986434257988298e-05, "loss": 2.3398, "step": 334 }, { "epoch": 0.054049693449499836, "grad_norm": 5.442686080932617, "learning_rate": 9.986241242379633e-05, "loss": 2.6067, "step": 335 }, { "epoch": 0.05421103581800581, "grad_norm": 5.375389575958252, "learning_rate": 9.986046865216317e-05, "loss": 2.3361, "step": 336 }, { "epoch": 0.05437237818651178, "grad_norm": 5.796661853790283, "learning_rate": 9.985851126551428e-05, "loss": 2.5738, "step": 337 }, { "epoch": 0.05453372055501775, "grad_norm": 5.076924800872803, "learning_rate": 9.985654026438411e-05, "loss": 2.8978, "step": 338 }, { "epoch": 0.05469506292352372, "grad_norm": 5.532252788543701, "learning_rate": 9.985455564931092e-05, "loss": 2.4838, "step": 339 }, { "epoch": 0.05485640529202969, "grad_norm": 5.251373767852783, "learning_rate": 9.985255742083657e-05, "loss": 2.5022, "step": 340 }, { "epoch": 0.05501774766053566, "grad_norm": 5.868371486663818, "learning_rate": 9.985054557950674e-05, "loss": 2.5384, "step": 341 }, { "epoch": 0.05517909002904162, "grad_norm": 5.244610786437988, "learning_rate": 9.984852012587081e-05, "loss": 2.6677, "step": 342 }, { "epoch": 0.055340432397547594, "grad_norm": 5.554970741271973, "learning_rate": 9.98464810604818e-05, "loss": 2.3778, "step": 343 }, { "epoch": 0.055501774766053565, "grad_norm": 5.125883102416992, "learning_rate": 9.984442838389654e-05, "loss": 2.5587, "step": 344 }, { "epoch": 0.055663117134559535, "grad_norm": 5.0995588302612305, "learning_rate": 9.984236209667553e-05, "loss": 2.4692, "step": 345 }, { "epoch": 0.055824459503065506, "grad_norm": 5.538670539855957, "learning_rate": 9.9840282199383e-05, "loss": 2.5167, "step": 346 }, { "epoch": 0.055985801871571476, "grad_norm": 7.013861179351807, "learning_rate": 9.983818869258687e-05, "loss": 2.4946, "step": 347 }, { "epoch": 0.05614714424007745, "grad_norm": 6.362895965576172, "learning_rate": 9.983608157685882e-05, "loss": 2.3693, "step": 348 }, { "epoch": 0.05630848660858341, "grad_norm": 8.72132396697998, "learning_rate": 9.983396085277421e-05, "loss": 2.5072, "step": 349 }, { "epoch": 0.05646982897708938, "grad_norm": 6.60469913482666, "learning_rate": 9.983182652091214e-05, "loss": 2.5737, "step": 350 }, { "epoch": 0.05663117134559535, "grad_norm": 6.866801738739014, "learning_rate": 9.982967858185542e-05, "loss": 2.2838, "step": 351 }, { "epoch": 0.05679251371410132, "grad_norm": 5.227248668670654, "learning_rate": 9.982751703619055e-05, "loss": 2.1749, "step": 352 }, { "epoch": 0.05695385608260729, "grad_norm": 5.646151065826416, "learning_rate": 9.982534188450778e-05, "loss": 3.0098, "step": 353 }, { "epoch": 0.057115198451113264, "grad_norm": 5.992406845092773, "learning_rate": 9.982315312740107e-05, "loss": 2.3147, "step": 354 }, { "epoch": 0.057276540819619234, "grad_norm": 5.548222064971924, "learning_rate": 9.982095076546807e-05, "loss": 2.3271, "step": 355 }, { "epoch": 0.057437883188125205, "grad_norm": 5.728975296020508, "learning_rate": 9.981873479931018e-05, "loss": 2.4881, "step": 356 }, { "epoch": 0.05759922555663117, "grad_norm": 5.330463409423828, "learning_rate": 9.981650522953248e-05, "loss": 2.653, "step": 357 }, { "epoch": 0.05776056792513714, "grad_norm": 6.157469272613525, "learning_rate": 9.981426205674381e-05, "loss": 2.6039, "step": 358 }, { "epoch": 0.05792191029364311, "grad_norm": 7.181394100189209, "learning_rate": 9.981200528155666e-05, "loss": 2.4105, "step": 359 }, { "epoch": 0.05808325266214908, "grad_norm": 12.247302055358887, "learning_rate": 9.980973490458728e-05, "loss": 2.2891, "step": 360 }, { "epoch": 0.05824459503065505, "grad_norm": 5.332772254943848, "learning_rate": 9.980745092645564e-05, "loss": 2.3454, "step": 361 }, { "epoch": 0.05840593739916102, "grad_norm": 4.680450439453125, "learning_rate": 9.98051533477854e-05, "loss": 2.715, "step": 362 }, { "epoch": 0.05856727976766699, "grad_norm": 4.383055686950684, "learning_rate": 9.980284216920393e-05, "loss": 2.6565, "step": 363 }, { "epoch": 0.058728622136172956, "grad_norm": 8.36205768585205, "learning_rate": 9.980051739134233e-05, "loss": 2.3592, "step": 364 }, { "epoch": 0.05888996450467893, "grad_norm": 11.09957218170166, "learning_rate": 9.979817901483544e-05, "loss": 2.4009, "step": 365 }, { "epoch": 0.0590513068731849, "grad_norm": 4.821877479553223, "learning_rate": 9.979582704032175e-05, "loss": 2.131, "step": 366 }, { "epoch": 0.05921264924169087, "grad_norm": 4.984494686126709, "learning_rate": 9.979346146844351e-05, "loss": 2.4964, "step": 367 }, { "epoch": 0.05937399161019684, "grad_norm": 4.013694763183594, "learning_rate": 9.979108229984663e-05, "loss": 2.3903, "step": 368 }, { "epoch": 0.05953533397870281, "grad_norm": 7.097440242767334, "learning_rate": 9.978868953518084e-05, "loss": 2.4343, "step": 369 }, { "epoch": 0.05969667634720878, "grad_norm": 8.406354904174805, "learning_rate": 9.978628317509947e-05, "loss": 2.7057, "step": 370 }, { "epoch": 0.05985801871571474, "grad_norm": 5.988000869750977, "learning_rate": 9.978386322025961e-05, "loss": 2.4413, "step": 371 }, { "epoch": 0.060019361084220714, "grad_norm": 5.423110008239746, "learning_rate": 9.978142967132207e-05, "loss": 2.403, "step": 372 }, { "epoch": 0.060180703452726685, "grad_norm": 4.012385368347168, "learning_rate": 9.977898252895134e-05, "loss": 2.5389, "step": 373 }, { "epoch": 0.060342045821232655, "grad_norm": 6.347021102905273, "learning_rate": 9.977652179381566e-05, "loss": 2.5845, "step": 374 }, { "epoch": 0.060503388189738626, "grad_norm": 7.557514190673828, "learning_rate": 9.977404746658696e-05, "loss": 2.5214, "step": 375 }, { "epoch": 0.060664730558244596, "grad_norm": 5.463866710662842, "learning_rate": 9.977155954794089e-05, "loss": 2.6464, "step": 376 }, { "epoch": 0.06082607292675057, "grad_norm": 4.601561546325684, "learning_rate": 9.976905803855679e-05, "loss": 2.3969, "step": 377 }, { "epoch": 0.06098741529525654, "grad_norm": 6.171661853790283, "learning_rate": 9.976654293911776e-05, "loss": 2.3105, "step": 378 }, { "epoch": 0.0611487576637625, "grad_norm": 5.989907264709473, "learning_rate": 9.976401425031054e-05, "loss": 2.4348, "step": 379 }, { "epoch": 0.06131010003226847, "grad_norm": 7.300041198730469, "learning_rate": 9.976147197282565e-05, "loss": 2.4743, "step": 380 }, { "epoch": 0.06147144240077444, "grad_norm": 5.653286933898926, "learning_rate": 9.975891610735728e-05, "loss": 2.3747, "step": 381 }, { "epoch": 0.06163278476928041, "grad_norm": 6.075132369995117, "learning_rate": 9.975634665460332e-05, "loss": 2.4115, "step": 382 }, { "epoch": 0.061794127137786384, "grad_norm": 4.353718280792236, "learning_rate": 9.975376361526543e-05, "loss": 2.5368, "step": 383 }, { "epoch": 0.061955469506292354, "grad_norm": 7.838526248931885, "learning_rate": 9.975116699004892e-05, "loss": 2.6441, "step": 384 }, { "epoch": 0.062116811874798325, "grad_norm": 5.915657043457031, "learning_rate": 9.974855677966283e-05, "loss": 2.4787, "step": 385 }, { "epoch": 0.06227815424330429, "grad_norm": 12.826671600341797, "learning_rate": 9.974593298481991e-05, "loss": 2.3556, "step": 386 }, { "epoch": 0.06243949661181026, "grad_norm": 5.417259693145752, "learning_rate": 9.97432956062366e-05, "loss": 2.2902, "step": 387 }, { "epoch": 0.06260083898031624, "grad_norm": 6.576791286468506, "learning_rate": 9.974064464463313e-05, "loss": 2.6462, "step": 388 }, { "epoch": 0.06276218134882221, "grad_norm": 4.490204811096191, "learning_rate": 9.973798010073332e-05, "loss": 2.4747, "step": 389 }, { "epoch": 0.06292352371732816, "grad_norm": 4.300869464874268, "learning_rate": 9.973530197526477e-05, "loss": 2.3842, "step": 390 }, { "epoch": 0.06308486608583413, "grad_norm": 6.780396938323975, "learning_rate": 9.973261026895877e-05, "loss": 2.2796, "step": 391 }, { "epoch": 0.0632462084543401, "grad_norm": 7.646263599395752, "learning_rate": 9.972990498255034e-05, "loss": 2.2162, "step": 392 }, { "epoch": 0.06340755082284608, "grad_norm": 4.344062805175781, "learning_rate": 9.97271861167782e-05, "loss": 2.519, "step": 393 }, { "epoch": 0.06356889319135205, "grad_norm": 5.198149681091309, "learning_rate": 9.972445367238474e-05, "loss": 2.4272, "step": 394 }, { "epoch": 0.06373023555985802, "grad_norm": 7.867514610290527, "learning_rate": 9.972170765011611e-05, "loss": 2.6917, "step": 395 }, { "epoch": 0.06389157792836399, "grad_norm": 6.376491069793701, "learning_rate": 9.971894805072212e-05, "loss": 2.5755, "step": 396 }, { "epoch": 0.06405292029686996, "grad_norm": 6.178385257720947, "learning_rate": 9.971617487495635e-05, "loss": 2.8525, "step": 397 }, { "epoch": 0.06421426266537593, "grad_norm": 5.063424587249756, "learning_rate": 9.971338812357603e-05, "loss": 2.4475, "step": 398 }, { "epoch": 0.0643756050338819, "grad_norm": 5.792573928833008, "learning_rate": 9.971058779734211e-05, "loss": 2.4081, "step": 399 }, { "epoch": 0.06453694740238787, "grad_norm": 5.929740905761719, "learning_rate": 9.970777389701926e-05, "loss": 2.5581, "step": 400 }, { "epoch": 0.06469828977089384, "grad_norm": 5.673580169677734, "learning_rate": 9.970494642337585e-05, "loss": 2.4403, "step": 401 }, { "epoch": 0.06485963213939981, "grad_norm": 10.702649116516113, "learning_rate": 9.970210537718395e-05, "loss": 2.5654, "step": 402 }, { "epoch": 0.06502097450790578, "grad_norm": 4.5370564460754395, "learning_rate": 9.969925075921936e-05, "loss": 2.5331, "step": 403 }, { "epoch": 0.06518231687641174, "grad_norm": 6.114105224609375, "learning_rate": 9.969638257026156e-05, "loss": 2.3586, "step": 404 }, { "epoch": 0.06534365924491771, "grad_norm": 5.066039085388184, "learning_rate": 9.969350081109375e-05, "loss": 2.5332, "step": 405 }, { "epoch": 0.06550500161342368, "grad_norm": 4.686367034912109, "learning_rate": 9.96906054825028e-05, "loss": 2.4109, "step": 406 }, { "epoch": 0.06566634398192965, "grad_norm": 5.096669673919678, "learning_rate": 9.968769658527935e-05, "loss": 2.372, "step": 407 }, { "epoch": 0.06582768635043562, "grad_norm": 5.357354164123535, "learning_rate": 9.968477412021769e-05, "loss": 2.2959, "step": 408 }, { "epoch": 0.06598902871894159, "grad_norm": 7.111669063568115, "learning_rate": 9.968183808811586e-05, "loss": 2.3232, "step": 409 }, { "epoch": 0.06615037108744756, "grad_norm": 6.305822849273682, "learning_rate": 9.967888848977556e-05, "loss": 2.2802, "step": 410 }, { "epoch": 0.06631171345595353, "grad_norm": 5.072010517120361, "learning_rate": 9.96759253260022e-05, "loss": 2.4779, "step": 411 }, { "epoch": 0.0664730558244595, "grad_norm": 5.068328857421875, "learning_rate": 9.967294859760494e-05, "loss": 2.717, "step": 412 }, { "epoch": 0.06663439819296547, "grad_norm": 5.075811862945557, "learning_rate": 9.966995830539658e-05, "loss": 2.499, "step": 413 }, { "epoch": 0.06679574056147145, "grad_norm": 8.059813499450684, "learning_rate": 9.966695445019369e-05, "loss": 2.4381, "step": 414 }, { "epoch": 0.06695708292997742, "grad_norm": 4.285947799682617, "learning_rate": 9.96639370328165e-05, "loss": 2.453, "step": 415 }, { "epoch": 0.06711842529848339, "grad_norm": 4.399743556976318, "learning_rate": 9.966090605408892e-05, "loss": 2.507, "step": 416 }, { "epoch": 0.06727976766698936, "grad_norm": 6.607447147369385, "learning_rate": 9.965786151483867e-05, "loss": 2.2949, "step": 417 }, { "epoch": 0.06744111003549533, "grad_norm": 4.112335205078125, "learning_rate": 9.965480341589701e-05, "loss": 2.4463, "step": 418 }, { "epoch": 0.06760245240400128, "grad_norm": 6.776587963104248, "learning_rate": 9.965173175809906e-05, "loss": 2.5205, "step": 419 }, { "epoch": 0.06776379477250725, "grad_norm": 5.577247619628906, "learning_rate": 9.964864654228353e-05, "loss": 2.2608, "step": 420 }, { "epoch": 0.06792513714101323, "grad_norm": 5.070724010467529, "learning_rate": 9.96455477692929e-05, "loss": 2.3995, "step": 421 }, { "epoch": 0.0680864795095192, "grad_norm": 4.874788284301758, "learning_rate": 9.964243543997331e-05, "loss": 2.3424, "step": 422 }, { "epoch": 0.06824782187802517, "grad_norm": 4.443953514099121, "learning_rate": 9.963930955517464e-05, "loss": 2.3932, "step": 423 }, { "epoch": 0.06840916424653114, "grad_norm": 4.812726020812988, "learning_rate": 9.963617011575046e-05, "loss": 2.5151, "step": 424 }, { "epoch": 0.06857050661503711, "grad_norm": 6.698811054229736, "learning_rate": 9.9633017122558e-05, "loss": 2.4978, "step": 425 }, { "epoch": 0.06873184898354308, "grad_norm": 3.6164677143096924, "learning_rate": 9.962985057645824e-05, "loss": 2.4074, "step": 426 }, { "epoch": 0.06889319135204905, "grad_norm": 5.184870719909668, "learning_rate": 9.962667047831584e-05, "loss": 2.4073, "step": 427 }, { "epoch": 0.06905453372055502, "grad_norm": 5.24127197265625, "learning_rate": 9.962347682899917e-05, "loss": 2.3406, "step": 428 }, { "epoch": 0.06921587608906099, "grad_norm": 6.580834865570068, "learning_rate": 9.962026962938032e-05, "loss": 2.4803, "step": 429 }, { "epoch": 0.06937721845756696, "grad_norm": 5.9285078048706055, "learning_rate": 9.961704888033499e-05, "loss": 2.5044, "step": 430 }, { "epoch": 0.06953856082607293, "grad_norm": 4.945661544799805, "learning_rate": 9.96138145827427e-05, "loss": 2.5625, "step": 431 }, { "epoch": 0.0696999031945789, "grad_norm": 4.674883842468262, "learning_rate": 9.961056673748661e-05, "loss": 2.2854, "step": 432 }, { "epoch": 0.06986124556308487, "grad_norm": 5.121710300445557, "learning_rate": 9.960730534545358e-05, "loss": 2.5345, "step": 433 }, { "epoch": 0.07002258793159083, "grad_norm": 6.434323310852051, "learning_rate": 9.960403040753415e-05, "loss": 2.3152, "step": 434 }, { "epoch": 0.0701839303000968, "grad_norm": 4.598944664001465, "learning_rate": 9.96007419246226e-05, "loss": 2.4321, "step": 435 }, { "epoch": 0.07034527266860277, "grad_norm": 5.111134052276611, "learning_rate": 9.95974398976169e-05, "loss": 2.4378, "step": 436 }, { "epoch": 0.07050661503710874, "grad_norm": 4.676234245300293, "learning_rate": 9.959412432741869e-05, "loss": 2.4166, "step": 437 }, { "epoch": 0.07066795740561471, "grad_norm": 9.32239818572998, "learning_rate": 9.959079521493334e-05, "loss": 2.5017, "step": 438 }, { "epoch": 0.07082929977412068, "grad_norm": 6.225765705108643, "learning_rate": 9.958745256106991e-05, "loss": 2.4031, "step": 439 }, { "epoch": 0.07099064214262665, "grad_norm": 5.544423580169678, "learning_rate": 9.958409636674113e-05, "loss": 2.3907, "step": 440 }, { "epoch": 0.07115198451113262, "grad_norm": 5.300071716308594, "learning_rate": 9.958072663286348e-05, "loss": 2.5297, "step": 441 }, { "epoch": 0.0713133268796386, "grad_norm": 7.311100006103516, "learning_rate": 9.957734336035707e-05, "loss": 2.5866, "step": 442 }, { "epoch": 0.07147466924814457, "grad_norm": 5.8680524826049805, "learning_rate": 9.957394655014579e-05, "loss": 2.2837, "step": 443 }, { "epoch": 0.07163601161665054, "grad_norm": 6.737392902374268, "learning_rate": 9.957053620315715e-05, "loss": 2.2766, "step": 444 }, { "epoch": 0.0717973539851565, "grad_norm": 5.466081619262695, "learning_rate": 9.95671123203224e-05, "loss": 2.1729, "step": 445 }, { "epoch": 0.07195869635366248, "grad_norm": 4.970895290374756, "learning_rate": 9.956367490257645e-05, "loss": 2.4309, "step": 446 }, { "epoch": 0.07212003872216845, "grad_norm": 4.9316253662109375, "learning_rate": 9.956022395085798e-05, "loss": 2.516, "step": 447 }, { "epoch": 0.0722813810906744, "grad_norm": 5.837429046630859, "learning_rate": 9.955675946610924e-05, "loss": 2.4396, "step": 448 }, { "epoch": 0.07244272345918037, "grad_norm": 5.840031147003174, "learning_rate": 9.955328144927633e-05, "loss": 2.5762, "step": 449 }, { "epoch": 0.07260406582768635, "grad_norm": 5.159872055053711, "learning_rate": 9.954978990130892e-05, "loss": 2.6101, "step": 450 }, { "epoch": 0.07276540819619232, "grad_norm": 7.33108377456665, "learning_rate": 9.954628482316042e-05, "loss": 2.4867, "step": 451 }, { "epoch": 0.07292675056469829, "grad_norm": 4.924890041351318, "learning_rate": 9.954276621578795e-05, "loss": 2.3232, "step": 452 }, { "epoch": 0.07308809293320426, "grad_norm": 6.319643497467041, "learning_rate": 9.95392340801523e-05, "loss": 2.2449, "step": 453 }, { "epoch": 0.07324943530171023, "grad_norm": 4.542699337005615, "learning_rate": 9.953568841721797e-05, "loss": 2.2596, "step": 454 }, { "epoch": 0.0734107776702162, "grad_norm": 5.186656475067139, "learning_rate": 9.953212922795314e-05, "loss": 2.5219, "step": 455 }, { "epoch": 0.07357212003872217, "grad_norm": 5.568697929382324, "learning_rate": 9.952855651332968e-05, "loss": 2.2704, "step": 456 }, { "epoch": 0.07373346240722814, "grad_norm": 5.325469493865967, "learning_rate": 9.95249702743232e-05, "loss": 2.3298, "step": 457 }, { "epoch": 0.07389480477573411, "grad_norm": 9.226958274841309, "learning_rate": 9.952137051191292e-05, "loss": 2.4681, "step": 458 }, { "epoch": 0.07405614714424008, "grad_norm": 9.054658889770508, "learning_rate": 9.951775722708184e-05, "loss": 2.4207, "step": 459 }, { "epoch": 0.07421748951274605, "grad_norm": 4.462258338928223, "learning_rate": 9.951413042081659e-05, "loss": 2.1436, "step": 460 }, { "epoch": 0.07437883188125202, "grad_norm": 4.8121209144592285, "learning_rate": 9.951049009410751e-05, "loss": 2.2229, "step": 461 }, { "epoch": 0.07454017424975799, "grad_norm": 5.110342979431152, "learning_rate": 9.950683624794865e-05, "loss": 2.6386, "step": 462 }, { "epoch": 0.07470151661826395, "grad_norm": 5.061398983001709, "learning_rate": 9.950316888333775e-05, "loss": 2.3988, "step": 463 }, { "epoch": 0.07486285898676992, "grad_norm": 6.079970836639404, "learning_rate": 9.949948800127619e-05, "loss": 2.5707, "step": 464 }, { "epoch": 0.07502420135527589, "grad_norm": 5.802469253540039, "learning_rate": 9.949579360276912e-05, "loss": 2.5647, "step": 465 }, { "epoch": 0.07518554372378186, "grad_norm": 4.262973785400391, "learning_rate": 9.949208568882531e-05, "loss": 2.3063, "step": 466 }, { "epoch": 0.07534688609228783, "grad_norm": 7.107011795043945, "learning_rate": 9.948836426045728e-05, "loss": 2.4295, "step": 467 }, { "epoch": 0.0755082284607938, "grad_norm": 5.002132892608643, "learning_rate": 9.948462931868119e-05, "loss": 2.4616, "step": 468 }, { "epoch": 0.07566957082929977, "grad_norm": 4.140540599822998, "learning_rate": 9.948088086451691e-05, "loss": 2.309, "step": 469 }, { "epoch": 0.07583091319780574, "grad_norm": 5.206024169921875, "learning_rate": 9.947711889898802e-05, "loss": 2.3658, "step": 470 }, { "epoch": 0.07599225556631171, "grad_norm": 4.66822624206543, "learning_rate": 9.947334342312176e-05, "loss": 2.3782, "step": 471 }, { "epoch": 0.07615359793481769, "grad_norm": 4.914458274841309, "learning_rate": 9.946955443794908e-05, "loss": 2.3358, "step": 472 }, { "epoch": 0.07631494030332366, "grad_norm": 4.964338779449463, "learning_rate": 9.946575194450458e-05, "loss": 2.549, "step": 473 }, { "epoch": 0.07647628267182963, "grad_norm": 8.110756874084473, "learning_rate": 9.946193594382662e-05, "loss": 2.5279, "step": 474 }, { "epoch": 0.0766376250403356, "grad_norm": 8.146491050720215, "learning_rate": 9.945810643695717e-05, "loss": 2.3447, "step": 475 }, { "epoch": 0.07679896740884157, "grad_norm": 4.413754463195801, "learning_rate": 9.945426342494195e-05, "loss": 2.3938, "step": 476 }, { "epoch": 0.07696030977734754, "grad_norm": 5.769023418426514, "learning_rate": 9.945040690883033e-05, "loss": 2.3792, "step": 477 }, { "epoch": 0.0771216521458535, "grad_norm": 5.316256999969482, "learning_rate": 9.944653688967537e-05, "loss": 2.3402, "step": 478 }, { "epoch": 0.07728299451435947, "grad_norm": 5.797434329986572, "learning_rate": 9.944265336853385e-05, "loss": 2.4458, "step": 479 }, { "epoch": 0.07744433688286544, "grad_norm": 4.082608699798584, "learning_rate": 9.94387563464662e-05, "loss": 2.4178, "step": 480 }, { "epoch": 0.0776056792513714, "grad_norm": 5.853196144104004, "learning_rate": 9.943484582453653e-05, "loss": 2.2416, "step": 481 }, { "epoch": 0.07776702161987738, "grad_norm": 7.1680588722229, "learning_rate": 9.94309218038127e-05, "loss": 2.1606, "step": 482 }, { "epoch": 0.07792836398838335, "grad_norm": 4.1502814292907715, "learning_rate": 9.942698428536616e-05, "loss": 2.4373, "step": 483 }, { "epoch": 0.07808970635688932, "grad_norm": 3.6260924339294434, "learning_rate": 9.942303327027216e-05, "loss": 2.4671, "step": 484 }, { "epoch": 0.07825104872539529, "grad_norm": 4.55759334564209, "learning_rate": 9.941906875960952e-05, "loss": 2.4586, "step": 485 }, { "epoch": 0.07841239109390126, "grad_norm": 4.005107402801514, "learning_rate": 9.941509075446081e-05, "loss": 2.5739, "step": 486 }, { "epoch": 0.07857373346240723, "grad_norm": 4.9014506340026855, "learning_rate": 9.94110992559123e-05, "loss": 2.5555, "step": 487 }, { "epoch": 0.0787350758309132, "grad_norm": 6.068490982055664, "learning_rate": 9.940709426505388e-05, "loss": 2.2641, "step": 488 }, { "epoch": 0.07889641819941917, "grad_norm": 3.5387156009674072, "learning_rate": 9.94030757829792e-05, "loss": 2.422, "step": 489 }, { "epoch": 0.07905776056792514, "grad_norm": 5.17158317565918, "learning_rate": 9.939904381078553e-05, "loss": 2.4981, "step": 490 }, { "epoch": 0.07921910293643111, "grad_norm": 6.625000476837158, "learning_rate": 9.939499834957386e-05, "loss": 2.6917, "step": 491 }, { "epoch": 0.07938044530493708, "grad_norm": 5.97782039642334, "learning_rate": 9.939093940044885e-05, "loss": 2.2068, "step": 492 }, { "epoch": 0.07954178767344304, "grad_norm": 5.146000862121582, "learning_rate": 9.938686696451884e-05, "loss": 2.2952, "step": 493 }, { "epoch": 0.07970313004194901, "grad_norm": 4.428076267242432, "learning_rate": 9.938278104289586e-05, "loss": 2.3511, "step": 494 }, { "epoch": 0.07986447241045498, "grad_norm": 4.798122882843018, "learning_rate": 9.937868163669565e-05, "loss": 2.3543, "step": 495 }, { "epoch": 0.08002581477896095, "grad_norm": 6.988558292388916, "learning_rate": 9.937456874703757e-05, "loss": 2.3155, "step": 496 }, { "epoch": 0.08018715714746692, "grad_norm": 4.669104099273682, "learning_rate": 9.93704423750447e-05, "loss": 2.4429, "step": 497 }, { "epoch": 0.0803484995159729, "grad_norm": 5.921403884887695, "learning_rate": 9.93663025218438e-05, "loss": 2.5554, "step": 498 }, { "epoch": 0.08050984188447886, "grad_norm": 4.644062042236328, "learning_rate": 9.93621491885653e-05, "loss": 2.4043, "step": 499 }, { "epoch": 0.08067118425298483, "grad_norm": 5.473841667175293, "learning_rate": 9.935798237634335e-05, "loss": 2.4431, "step": 500 }, { "epoch": 0.0808325266214908, "grad_norm": 3.7958078384399414, "learning_rate": 9.935380208631572e-05, "loss": 2.4083, "step": 501 }, { "epoch": 0.08099386898999678, "grad_norm": 6.851291179656982, "learning_rate": 9.93496083196239e-05, "loss": 2.4766, "step": 502 }, { "epoch": 0.08115521135850275, "grad_norm": 8.067902565002441, "learning_rate": 9.934540107741304e-05, "loss": 2.3541, "step": 503 }, { "epoch": 0.08131655372700872, "grad_norm": 4.656294822692871, "learning_rate": 9.934118036083199e-05, "loss": 2.3281, "step": 504 }, { "epoch": 0.08147789609551469, "grad_norm": 5.242002487182617, "learning_rate": 9.933694617103327e-05, "loss": 2.3153, "step": 505 }, { "epoch": 0.08163923846402066, "grad_norm": 3.5532069206237793, "learning_rate": 9.933269850917309e-05, "loss": 2.2761, "step": 506 }, { "epoch": 0.08180058083252661, "grad_norm": 3.949692726135254, "learning_rate": 9.932843737641127e-05, "loss": 2.4368, "step": 507 }, { "epoch": 0.08196192320103259, "grad_norm": 3.7503204345703125, "learning_rate": 9.932416277391143e-05, "loss": 2.3481, "step": 508 }, { "epoch": 0.08212326556953856, "grad_norm": 4.7445478439331055, "learning_rate": 9.931987470284077e-05, "loss": 2.4143, "step": 509 }, { "epoch": 0.08228460793804453, "grad_norm": 4.674653053283691, "learning_rate": 9.931557316437021e-05, "loss": 2.4809, "step": 510 }, { "epoch": 0.0824459503065505, "grad_norm": 4.840732097625732, "learning_rate": 9.931125815967434e-05, "loss": 2.469, "step": 511 }, { "epoch": 0.08260729267505647, "grad_norm": 4.673700332641602, "learning_rate": 9.930692968993143e-05, "loss": 2.6427, "step": 512 }, { "epoch": 0.08276863504356244, "grad_norm": 6.143237113952637, "learning_rate": 9.93025877563234e-05, "loss": 2.1935, "step": 513 }, { "epoch": 0.08292997741206841, "grad_norm": 4.747680187225342, "learning_rate": 9.929823236003589e-05, "loss": 2.3936, "step": 514 }, { "epoch": 0.08309131978057438, "grad_norm": 4.849829196929932, "learning_rate": 9.929386350225818e-05, "loss": 2.6919, "step": 515 }, { "epoch": 0.08325266214908035, "grad_norm": 6.0291924476623535, "learning_rate": 9.928948118418326e-05, "loss": 2.3324, "step": 516 }, { "epoch": 0.08341400451758632, "grad_norm": 3.673546075820923, "learning_rate": 9.928508540700774e-05, "loss": 1.9863, "step": 517 }, { "epoch": 0.08357534688609229, "grad_norm": 4.3796706199646, "learning_rate": 9.928067617193199e-05, "loss": 2.3091, "step": 518 }, { "epoch": 0.08373668925459826, "grad_norm": 4.878311634063721, "learning_rate": 9.927625348015996e-05, "loss": 2.4597, "step": 519 }, { "epoch": 0.08389803162310423, "grad_norm": 4.7062482833862305, "learning_rate": 9.927181733289935e-05, "loss": 2.447, "step": 520 }, { "epoch": 0.0840593739916102, "grad_norm": 6.62054967880249, "learning_rate": 9.92673677313615e-05, "loss": 2.2419, "step": 521 }, { "epoch": 0.08422071636011616, "grad_norm": 6.965595722198486, "learning_rate": 9.926290467676141e-05, "loss": 2.4569, "step": 522 }, { "epoch": 0.08438205872862213, "grad_norm": 5.016797065734863, "learning_rate": 9.925842817031781e-05, "loss": 2.5768, "step": 523 }, { "epoch": 0.0845434010971281, "grad_norm": 5.2972941398620605, "learning_rate": 9.925393821325301e-05, "loss": 2.3541, "step": 524 }, { "epoch": 0.08470474346563407, "grad_norm": 3.7381811141967773, "learning_rate": 9.924943480679311e-05, "loss": 2.4922, "step": 525 }, { "epoch": 0.08486608583414004, "grad_norm": 4.882343292236328, "learning_rate": 9.924491795216777e-05, "loss": 2.2613, "step": 526 }, { "epoch": 0.08502742820264601, "grad_norm": 6.2732834815979, "learning_rate": 9.924038765061042e-05, "loss": 2.3453, "step": 527 }, { "epoch": 0.08518877057115198, "grad_norm": 4.926011562347412, "learning_rate": 9.923584390335805e-05, "loss": 2.1763, "step": 528 }, { "epoch": 0.08535011293965795, "grad_norm": 3.1853888034820557, "learning_rate": 9.923128671165145e-05, "loss": 2.166, "step": 529 }, { "epoch": 0.08551145530816393, "grad_norm": 5.0853376388549805, "learning_rate": 9.922671607673499e-05, "loss": 2.3266, "step": 530 }, { "epoch": 0.0856727976766699, "grad_norm": 5.984068870544434, "learning_rate": 9.922213199985673e-05, "loss": 2.4452, "step": 531 }, { "epoch": 0.08583414004517587, "grad_norm": 4.2504143714904785, "learning_rate": 9.921753448226843e-05, "loss": 2.3857, "step": 532 }, { "epoch": 0.08599548241368184, "grad_norm": 4.222536563873291, "learning_rate": 9.921292352522548e-05, "loss": 2.7227, "step": 533 }, { "epoch": 0.08615682478218781, "grad_norm": 6.777401447296143, "learning_rate": 9.920829912998696e-05, "loss": 2.4183, "step": 534 }, { "epoch": 0.08631816715069378, "grad_norm": 4.108943462371826, "learning_rate": 9.920366129781564e-05, "loss": 2.1958, "step": 535 }, { "epoch": 0.08647950951919975, "grad_norm": 6.230100154876709, "learning_rate": 9.919901002997792e-05, "loss": 2.4445, "step": 536 }, { "epoch": 0.0866408518877057, "grad_norm": 7.756218433380127, "learning_rate": 9.919434532774387e-05, "loss": 2.4117, "step": 537 }, { "epoch": 0.08680219425621168, "grad_norm": 5.887117385864258, "learning_rate": 9.918966719238726e-05, "loss": 2.3651, "step": 538 }, { "epoch": 0.08696353662471765, "grad_norm": 5.209339618682861, "learning_rate": 9.918497562518554e-05, "loss": 2.3272, "step": 539 }, { "epoch": 0.08712487899322362, "grad_norm": 4.876028060913086, "learning_rate": 9.918027062741976e-05, "loss": 2.3033, "step": 540 }, { "epoch": 0.08728622136172959, "grad_norm": 4.886802673339844, "learning_rate": 9.917555220037468e-05, "loss": 2.3048, "step": 541 }, { "epoch": 0.08744756373023556, "grad_norm": 4.376396179199219, "learning_rate": 9.917082034533875e-05, "loss": 2.4459, "step": 542 }, { "epoch": 0.08760890609874153, "grad_norm": 4.801101207733154, "learning_rate": 9.916607506360407e-05, "loss": 2.3303, "step": 543 }, { "epoch": 0.0877702484672475, "grad_norm": 4.449361801147461, "learning_rate": 9.916131635646635e-05, "loss": 2.1947, "step": 544 }, { "epoch": 0.08793159083575347, "grad_norm": 6.665609836578369, "learning_rate": 9.915654422522505e-05, "loss": 2.5422, "step": 545 }, { "epoch": 0.08809293320425944, "grad_norm": 6.845566749572754, "learning_rate": 9.915175867118324e-05, "loss": 2.5268, "step": 546 }, { "epoch": 0.08825427557276541, "grad_norm": 6.118831634521484, "learning_rate": 9.914695969564769e-05, "loss": 2.3306, "step": 547 }, { "epoch": 0.08841561794127138, "grad_norm": 6.415126800537109, "learning_rate": 9.914214729992881e-05, "loss": 2.2492, "step": 548 }, { "epoch": 0.08857696030977735, "grad_norm": 4.820186614990234, "learning_rate": 9.913732148534068e-05, "loss": 2.2528, "step": 549 }, { "epoch": 0.08873830267828332, "grad_norm": 5.117327690124512, "learning_rate": 9.913248225320106e-05, "loss": 2.4017, "step": 550 }, { "epoch": 0.08889964504678928, "grad_norm": 5.227145671844482, "learning_rate": 9.912762960483138e-05, "loss": 2.2688, "step": 551 }, { "epoch": 0.08906098741529525, "grad_norm": 5.166123867034912, "learning_rate": 9.912276354155666e-05, "loss": 2.3175, "step": 552 }, { "epoch": 0.08922232978380122, "grad_norm": 7.08079195022583, "learning_rate": 9.911788406470569e-05, "loss": 2.3965, "step": 553 }, { "epoch": 0.08938367215230719, "grad_norm": 3.4850637912750244, "learning_rate": 9.911299117561085e-05, "loss": 2.4604, "step": 554 }, { "epoch": 0.08954501452081316, "grad_norm": 5.370327949523926, "learning_rate": 9.910808487560821e-05, "loss": 2.4105, "step": 555 }, { "epoch": 0.08970635688931913, "grad_norm": 6.220058441162109, "learning_rate": 9.910316516603748e-05, "loss": 2.4818, "step": 556 }, { "epoch": 0.0898676992578251, "grad_norm": 4.778527736663818, "learning_rate": 9.909823204824206e-05, "loss": 2.335, "step": 557 }, { "epoch": 0.09002904162633107, "grad_norm": 5.088039875030518, "learning_rate": 9.9093285523569e-05, "loss": 2.512, "step": 558 }, { "epoch": 0.09019038399483705, "grad_norm": 4.244760513305664, "learning_rate": 9.908832559336902e-05, "loss": 2.5163, "step": 559 }, { "epoch": 0.09035172636334302, "grad_norm": 3.665497303009033, "learning_rate": 9.908335225899647e-05, "loss": 2.288, "step": 560 }, { "epoch": 0.09051306873184899, "grad_norm": 5.0853986740112305, "learning_rate": 9.907836552180938e-05, "loss": 2.3966, "step": 561 }, { "epoch": 0.09067441110035496, "grad_norm": 4.697176933288574, "learning_rate": 9.907336538316944e-05, "loss": 2.3659, "step": 562 }, { "epoch": 0.09083575346886093, "grad_norm": 4.051960468292236, "learning_rate": 9.906835184444203e-05, "loss": 2.5436, "step": 563 }, { "epoch": 0.0909970958373669, "grad_norm": 4.65770149230957, "learning_rate": 9.906332490699613e-05, "loss": 2.2301, "step": 564 }, { "epoch": 0.09115843820587287, "grad_norm": 5.872143745422363, "learning_rate": 9.905828457220442e-05, "loss": 2.2801, "step": 565 }, { "epoch": 0.09131978057437883, "grad_norm": 5.051412105560303, "learning_rate": 9.90532308414432e-05, "loss": 2.4987, "step": 566 }, { "epoch": 0.0914811229428848, "grad_norm": 5.8648271560668945, "learning_rate": 9.904816371609249e-05, "loss": 2.4338, "step": 567 }, { "epoch": 0.09164246531139077, "grad_norm": 5.009479522705078, "learning_rate": 9.90430831975359e-05, "loss": 2.4364, "step": 568 }, { "epoch": 0.09180380767989674, "grad_norm": 5.303811550140381, "learning_rate": 9.903798928716074e-05, "loss": 2.1138, "step": 569 }, { "epoch": 0.09196515004840271, "grad_norm": 6.00145149230957, "learning_rate": 9.903288198635798e-05, "loss": 2.3861, "step": 570 }, { "epoch": 0.09212649241690868, "grad_norm": 4.163204669952393, "learning_rate": 9.902776129652223e-05, "loss": 2.2934, "step": 571 }, { "epoch": 0.09228783478541465, "grad_norm": 4.87972354888916, "learning_rate": 9.902262721905171e-05, "loss": 2.2856, "step": 572 }, { "epoch": 0.09244917715392062, "grad_norm": 5.656421184539795, "learning_rate": 9.901747975534841e-05, "loss": 2.3445, "step": 573 }, { "epoch": 0.09261051952242659, "grad_norm": 6.181389808654785, "learning_rate": 9.901231890681786e-05, "loss": 2.3808, "step": 574 }, { "epoch": 0.09277186189093256, "grad_norm": 4.321932315826416, "learning_rate": 9.900714467486932e-05, "loss": 2.3646, "step": 575 }, { "epoch": 0.09293320425943853, "grad_norm": 4.5859503746032715, "learning_rate": 9.900195706091566e-05, "loss": 2.2895, "step": 576 }, { "epoch": 0.0930945466279445, "grad_norm": 6.927850246429443, "learning_rate": 9.899675606637345e-05, "loss": 2.3984, "step": 577 }, { "epoch": 0.09325588899645047, "grad_norm": 6.717582702636719, "learning_rate": 9.899154169266283e-05, "loss": 2.124, "step": 578 }, { "epoch": 0.09341723136495644, "grad_norm": 7.106042385101318, "learning_rate": 9.898631394120771e-05, "loss": 2.3456, "step": 579 }, { "epoch": 0.09357857373346241, "grad_norm": 7.629501819610596, "learning_rate": 9.898107281343556e-05, "loss": 2.2063, "step": 580 }, { "epoch": 0.09373991610196837, "grad_norm": 4.966427326202393, "learning_rate": 9.897581831077754e-05, "loss": 2.2868, "step": 581 }, { "epoch": 0.09390125847047434, "grad_norm": 5.79547643661499, "learning_rate": 9.897055043466848e-05, "loss": 2.3294, "step": 582 }, { "epoch": 0.09406260083898031, "grad_norm": 4.90958833694458, "learning_rate": 9.896526918654678e-05, "loss": 2.5938, "step": 583 }, { "epoch": 0.09422394320748628, "grad_norm": 4.303000450134277, "learning_rate": 9.895997456785463e-05, "loss": 2.3196, "step": 584 }, { "epoch": 0.09438528557599225, "grad_norm": 4.9478044509887695, "learning_rate": 9.89546665800377e-05, "loss": 2.5005, "step": 585 }, { "epoch": 0.09454662794449822, "grad_norm": 6.02296781539917, "learning_rate": 9.894934522454547e-05, "loss": 2.2884, "step": 586 }, { "epoch": 0.0947079703130042, "grad_norm": 5.697391510009766, "learning_rate": 9.894401050283099e-05, "loss": 2.2631, "step": 587 }, { "epoch": 0.09486931268151017, "grad_norm": 6.0295305252075195, "learning_rate": 9.893866241635096e-05, "loss": 2.4042, "step": 588 }, { "epoch": 0.09503065505001614, "grad_norm": 4.033676624298096, "learning_rate": 9.893330096656574e-05, "loss": 2.5702, "step": 589 }, { "epoch": 0.0951919974185221, "grad_norm": 4.621325492858887, "learning_rate": 9.892792615493934e-05, "loss": 2.4732, "step": 590 }, { "epoch": 0.09535333978702808, "grad_norm": 4.931112766265869, "learning_rate": 9.892253798293942e-05, "loss": 2.3272, "step": 591 }, { "epoch": 0.09551468215553405, "grad_norm": 10.282770156860352, "learning_rate": 9.89171364520373e-05, "loss": 2.4638, "step": 592 }, { "epoch": 0.09567602452404002, "grad_norm": 6.483240604400635, "learning_rate": 9.891172156370792e-05, "loss": 2.3649, "step": 593 }, { "epoch": 0.09583736689254599, "grad_norm": 6.331130027770996, "learning_rate": 9.89062933194299e-05, "loss": 2.359, "step": 594 }, { "epoch": 0.09599870926105195, "grad_norm": 6.046228885650635, "learning_rate": 9.890085172068544e-05, "loss": 2.288, "step": 595 }, { "epoch": 0.09616005162955792, "grad_norm": 4.380414962768555, "learning_rate": 9.88953967689605e-05, "loss": 2.3372, "step": 596 }, { "epoch": 0.09632139399806389, "grad_norm": 4.5679826736450195, "learning_rate": 9.888992846574456e-05, "loss": 2.5387, "step": 597 }, { "epoch": 0.09648273636656986, "grad_norm": 6.251746654510498, "learning_rate": 9.888444681253086e-05, "loss": 2.3858, "step": 598 }, { "epoch": 0.09664407873507583, "grad_norm": 4.814467430114746, "learning_rate": 9.887895181081622e-05, "loss": 2.3785, "step": 599 }, { "epoch": 0.0968054211035818, "grad_norm": 4.712541103363037, "learning_rate": 9.88734434621011e-05, "loss": 2.2192, "step": 600 }, { "epoch": 0.09696676347208777, "grad_norm": 6.289758205413818, "learning_rate": 9.886792176788964e-05, "loss": 2.1972, "step": 601 }, { "epoch": 0.09712810584059374, "grad_norm": 4.202297210693359, "learning_rate": 9.886238672968959e-05, "loss": 2.3075, "step": 602 }, { "epoch": 0.09728944820909971, "grad_norm": 4.969404220581055, "learning_rate": 9.885683834901238e-05, "loss": 2.6972, "step": 603 }, { "epoch": 0.09745079057760568, "grad_norm": 5.10714054107666, "learning_rate": 9.885127662737306e-05, "loss": 2.2584, "step": 604 }, { "epoch": 0.09761213294611165, "grad_norm": 4.227580547332764, "learning_rate": 9.88457015662903e-05, "loss": 2.2547, "step": 605 }, { "epoch": 0.09777347531461762, "grad_norm": 4.349273204803467, "learning_rate": 9.884011316728648e-05, "loss": 2.3444, "step": 606 }, { "epoch": 0.09793481768312359, "grad_norm": 5.646902561187744, "learning_rate": 9.883451143188753e-05, "loss": 2.251, "step": 607 }, { "epoch": 0.09809616005162956, "grad_norm": 4.5992655754089355, "learning_rate": 9.882889636162313e-05, "loss": 2.2226, "step": 608 }, { "epoch": 0.09825750242013553, "grad_norm": 6.841413497924805, "learning_rate": 9.882326795802652e-05, "loss": 2.4206, "step": 609 }, { "epoch": 0.09841884478864149, "grad_norm": 6.097044467926025, "learning_rate": 9.881762622263459e-05, "loss": 2.5558, "step": 610 }, { "epoch": 0.09858018715714746, "grad_norm": 5.023830890655518, "learning_rate": 9.88119711569879e-05, "loss": 2.3713, "step": 611 }, { "epoch": 0.09874152952565343, "grad_norm": 4.367787837982178, "learning_rate": 9.880630276263066e-05, "loss": 2.4648, "step": 612 }, { "epoch": 0.0989028718941594, "grad_norm": 5.2165093421936035, "learning_rate": 9.880062104111064e-05, "loss": 2.1164, "step": 613 }, { "epoch": 0.09906421426266537, "grad_norm": 4.974855422973633, "learning_rate": 9.879492599397935e-05, "loss": 2.4089, "step": 614 }, { "epoch": 0.09922555663117134, "grad_norm": 5.106447696685791, "learning_rate": 9.878921762279185e-05, "loss": 2.3289, "step": 615 }, { "epoch": 0.09938689899967731, "grad_norm": 5.060490608215332, "learning_rate": 9.878349592910692e-05, "loss": 2.2726, "step": 616 }, { "epoch": 0.09954824136818329, "grad_norm": 4.20667839050293, "learning_rate": 9.877776091448694e-05, "loss": 2.2952, "step": 617 }, { "epoch": 0.09970958373668926, "grad_norm": 5.2343525886535645, "learning_rate": 9.87720125804979e-05, "loss": 2.278, "step": 618 }, { "epoch": 0.09987092610519523, "grad_norm": 3.2236578464508057, "learning_rate": 9.876625092870947e-05, "loss": 2.2452, "step": 619 }, { "epoch": 0.1000322684737012, "grad_norm": 5.387293338775635, "learning_rate": 9.876047596069493e-05, "loss": 2.3576, "step": 620 }, { "epoch": 0.10019361084220717, "grad_norm": 6.220782279968262, "learning_rate": 9.875468767803122e-05, "loss": 2.5285, "step": 621 }, { "epoch": 0.10035495321071314, "grad_norm": 5.5559258460998535, "learning_rate": 9.87488860822989e-05, "loss": 2.3215, "step": 622 }, { "epoch": 0.10051629557921911, "grad_norm": 5.513997554779053, "learning_rate": 9.874307117508214e-05, "loss": 2.1272, "step": 623 }, { "epoch": 0.10067763794772508, "grad_norm": 5.5388007164001465, "learning_rate": 9.873724295796881e-05, "loss": 2.5749, "step": 624 }, { "epoch": 0.10083898031623104, "grad_norm": 4.577971935272217, "learning_rate": 9.873140143255036e-05, "loss": 2.4018, "step": 625 }, { "epoch": 0.101000322684737, "grad_norm": 4.911154747009277, "learning_rate": 9.872554660042188e-05, "loss": 2.4153, "step": 626 }, { "epoch": 0.10116166505324298, "grad_norm": 5.559174060821533, "learning_rate": 9.871967846318213e-05, "loss": 2.2745, "step": 627 }, { "epoch": 0.10132300742174895, "grad_norm": 5.209252834320068, "learning_rate": 9.871379702243345e-05, "loss": 2.2871, "step": 628 }, { "epoch": 0.10148434979025492, "grad_norm": 5.395344257354736, "learning_rate": 9.870790227978186e-05, "loss": 2.2349, "step": 629 }, { "epoch": 0.10164569215876089, "grad_norm": 4.204890727996826, "learning_rate": 9.870199423683697e-05, "loss": 2.2719, "step": 630 }, { "epoch": 0.10180703452726686, "grad_norm": 6.293933868408203, "learning_rate": 9.869607289521207e-05, "loss": 2.3921, "step": 631 }, { "epoch": 0.10196837689577283, "grad_norm": 5.962689399719238, "learning_rate": 9.869013825652405e-05, "loss": 2.464, "step": 632 }, { "epoch": 0.1021297192642788, "grad_norm": 5.058944225311279, "learning_rate": 9.868419032239342e-05, "loss": 2.0913, "step": 633 }, { "epoch": 0.10229106163278477, "grad_norm": 5.054147720336914, "learning_rate": 9.867822909444434e-05, "loss": 2.4864, "step": 634 }, { "epoch": 0.10245240400129074, "grad_norm": 5.315404891967773, "learning_rate": 9.867225457430461e-05, "loss": 2.3332, "step": 635 }, { "epoch": 0.10261374636979671, "grad_norm": 5.104187488555908, "learning_rate": 9.866626676360564e-05, "loss": 2.3409, "step": 636 }, { "epoch": 0.10277508873830268, "grad_norm": 3.9855387210845947, "learning_rate": 9.866026566398248e-05, "loss": 2.1252, "step": 637 }, { "epoch": 0.10293643110680865, "grad_norm": 5.759126663208008, "learning_rate": 9.86542512770738e-05, "loss": 2.3079, "step": 638 }, { "epoch": 0.10309777347531461, "grad_norm": 7.466570854187012, "learning_rate": 9.864822360452188e-05, "loss": 2.2168, "step": 639 }, { "epoch": 0.10325911584382058, "grad_norm": 8.343269348144531, "learning_rate": 9.86421826479727e-05, "loss": 1.9994, "step": 640 }, { "epoch": 0.10342045821232655, "grad_norm": 5.27672815322876, "learning_rate": 9.863612840907577e-05, "loss": 2.3302, "step": 641 }, { "epoch": 0.10358180058083252, "grad_norm": 4.862626552581787, "learning_rate": 9.86300608894843e-05, "loss": 2.4057, "step": 642 }, { "epoch": 0.1037431429493385, "grad_norm": 4.641247272491455, "learning_rate": 9.862398009085511e-05, "loss": 2.2996, "step": 643 }, { "epoch": 0.10390448531784446, "grad_norm": 5.97214412689209, "learning_rate": 9.86178860148486e-05, "loss": 2.1299, "step": 644 }, { "epoch": 0.10406582768635043, "grad_norm": 6.091129779815674, "learning_rate": 9.861177866312887e-05, "loss": 2.3089, "step": 645 }, { "epoch": 0.1042271700548564, "grad_norm": 5.053765773773193, "learning_rate": 9.86056580373636e-05, "loss": 2.2826, "step": 646 }, { "epoch": 0.10438851242336238, "grad_norm": 5.303426742553711, "learning_rate": 9.859952413922407e-05, "loss": 2.3649, "step": 647 }, { "epoch": 0.10454985479186835, "grad_norm": 5.3501715660095215, "learning_rate": 9.859337697038526e-05, "loss": 2.2318, "step": 648 }, { "epoch": 0.10471119716037432, "grad_norm": 5.797842979431152, "learning_rate": 9.858721653252571e-05, "loss": 2.3028, "step": 649 }, { "epoch": 0.10487253952888029, "grad_norm": 4.831730365753174, "learning_rate": 9.858104282732759e-05, "loss": 2.3065, "step": 650 }, { "epoch": 0.10503388189738626, "grad_norm": 7.14301061630249, "learning_rate": 9.857485585647675e-05, "loss": 2.5492, "step": 651 }, { "epoch": 0.10519522426589223, "grad_norm": 5.6980061531066895, "learning_rate": 9.856865562166256e-05, "loss": 2.1222, "step": 652 }, { "epoch": 0.1053565666343982, "grad_norm": 4.008930206298828, "learning_rate": 9.856244212457813e-05, "loss": 2.0549, "step": 653 }, { "epoch": 0.10551790900290416, "grad_norm": 5.521589756011963, "learning_rate": 9.855621536692008e-05, "loss": 2.4834, "step": 654 }, { "epoch": 0.10567925137141013, "grad_norm": 4.837742805480957, "learning_rate": 9.854997535038873e-05, "loss": 2.4236, "step": 655 }, { "epoch": 0.1058405937399161, "grad_norm": 4.2191081047058105, "learning_rate": 9.854372207668799e-05, "loss": 2.5142, "step": 656 }, { "epoch": 0.10600193610842207, "grad_norm": 5.554219722747803, "learning_rate": 9.85374555475254e-05, "loss": 2.275, "step": 657 }, { "epoch": 0.10616327847692804, "grad_norm": 5.10724401473999, "learning_rate": 9.85311757646121e-05, "loss": 2.3817, "step": 658 }, { "epoch": 0.10632462084543401, "grad_norm": 4.961881160736084, "learning_rate": 9.852488272966286e-05, "loss": 1.9945, "step": 659 }, { "epoch": 0.10648596321393998, "grad_norm": 5.993134498596191, "learning_rate": 9.85185764443961e-05, "loss": 2.5539, "step": 660 }, { "epoch": 0.10664730558244595, "grad_norm": 4.784736633300781, "learning_rate": 9.85122569105338e-05, "loss": 2.3461, "step": 661 }, { "epoch": 0.10680864795095192, "grad_norm": 7.389451503753662, "learning_rate": 9.850592412980159e-05, "loss": 2.3742, "step": 662 }, { "epoch": 0.10696999031945789, "grad_norm": 4.962615489959717, "learning_rate": 9.849957810392872e-05, "loss": 2.3071, "step": 663 }, { "epoch": 0.10713133268796386, "grad_norm": 5.416285037994385, "learning_rate": 9.849321883464806e-05, "loss": 2.0281, "step": 664 }, { "epoch": 0.10729267505646983, "grad_norm": 4.267657279968262, "learning_rate": 9.848684632369605e-05, "loss": 2.2553, "step": 665 }, { "epoch": 0.1074540174249758, "grad_norm": 5.005043029785156, "learning_rate": 9.848046057281284e-05, "loss": 2.3333, "step": 666 }, { "epoch": 0.10761535979348177, "grad_norm": 5.271666526794434, "learning_rate": 9.847406158374209e-05, "loss": 2.4515, "step": 667 }, { "epoch": 0.10777670216198774, "grad_norm": 5.715791702270508, "learning_rate": 9.846764935823113e-05, "loss": 2.1334, "step": 668 }, { "epoch": 0.1079380445304937, "grad_norm": 7.363461017608643, "learning_rate": 9.846122389803093e-05, "loss": 2.2951, "step": 669 }, { "epoch": 0.10809938689899967, "grad_norm": 4.60572624206543, "learning_rate": 9.845478520489599e-05, "loss": 2.2605, "step": 670 }, { "epoch": 0.10826072926750564, "grad_norm": 4.57849645614624, "learning_rate": 9.844833328058452e-05, "loss": 2.5506, "step": 671 }, { "epoch": 0.10842207163601161, "grad_norm": 5.1598944664001465, "learning_rate": 9.844186812685827e-05, "loss": 2.2983, "step": 672 }, { "epoch": 0.10858341400451758, "grad_norm": 5.090889930725098, "learning_rate": 9.843538974548265e-05, "loss": 2.1432, "step": 673 }, { "epoch": 0.10874475637302355, "grad_norm": 3.7854175567626953, "learning_rate": 9.842889813822665e-05, "loss": 2.4845, "step": 674 }, { "epoch": 0.10890609874152953, "grad_norm": 4.897592067718506, "learning_rate": 9.842239330686287e-05, "loss": 2.33, "step": 675 }, { "epoch": 0.1090674411100355, "grad_norm": 3.634265184402466, "learning_rate": 9.841587525316756e-05, "loss": 2.5112, "step": 676 }, { "epoch": 0.10922878347854147, "grad_norm": 5.599669456481934, "learning_rate": 9.840934397892054e-05, "loss": 2.5141, "step": 677 }, { "epoch": 0.10939012584704744, "grad_norm": 3.9483070373535156, "learning_rate": 9.840279948590528e-05, "loss": 2.2176, "step": 678 }, { "epoch": 0.10955146821555341, "grad_norm": 4.629967212677002, "learning_rate": 9.83962417759088e-05, "loss": 2.2878, "step": 679 }, { "epoch": 0.10971281058405938, "grad_norm": 4.446599006652832, "learning_rate": 9.838967085072177e-05, "loss": 2.2885, "step": 680 }, { "epoch": 0.10987415295256535, "grad_norm": 5.427147388458252, "learning_rate": 9.838308671213847e-05, "loss": 1.9743, "step": 681 }, { "epoch": 0.11003549532107132, "grad_norm": 5.741102695465088, "learning_rate": 9.83764893619568e-05, "loss": 2.2042, "step": 682 }, { "epoch": 0.11019683768957728, "grad_norm": 5.572746276855469, "learning_rate": 9.836987880197822e-05, "loss": 2.2499, "step": 683 }, { "epoch": 0.11035818005808325, "grad_norm": 4.086371421813965, "learning_rate": 9.836325503400781e-05, "loss": 2.4301, "step": 684 }, { "epoch": 0.11051952242658922, "grad_norm": 6.250503063201904, "learning_rate": 9.83566180598543e-05, "loss": 2.1814, "step": 685 }, { "epoch": 0.11068086479509519, "grad_norm": 5.577382564544678, "learning_rate": 9.834996788133002e-05, "loss": 2.1989, "step": 686 }, { "epoch": 0.11084220716360116, "grad_norm": 5.428224563598633, "learning_rate": 9.834330450025082e-05, "loss": 2.1685, "step": 687 }, { "epoch": 0.11100354953210713, "grad_norm": 4.101722717285156, "learning_rate": 9.833662791843627e-05, "loss": 2.2781, "step": 688 }, { "epoch": 0.1111648919006131, "grad_norm": 6.424073219299316, "learning_rate": 9.832993813770947e-05, "loss": 2.2648, "step": 689 }, { "epoch": 0.11132623426911907, "grad_norm": 4.855088233947754, "learning_rate": 9.832323515989717e-05, "loss": 2.1524, "step": 690 }, { "epoch": 0.11148757663762504, "grad_norm": 7.551122665405273, "learning_rate": 9.831651898682968e-05, "loss": 2.0951, "step": 691 }, { "epoch": 0.11164891900613101, "grad_norm": 4.96258020401001, "learning_rate": 9.830978962034093e-05, "loss": 2.3655, "step": 692 }, { "epoch": 0.11181026137463698, "grad_norm": 5.927750587463379, "learning_rate": 9.830304706226847e-05, "loss": 2.0535, "step": 693 }, { "epoch": 0.11197160374314295, "grad_norm": 4.051743507385254, "learning_rate": 9.829629131445342e-05, "loss": 2.3049, "step": 694 }, { "epoch": 0.11213294611164892, "grad_norm": 4.13598108291626, "learning_rate": 9.828952237874055e-05, "loss": 2.3451, "step": 695 }, { "epoch": 0.1122942884801549, "grad_norm": 5.234704494476318, "learning_rate": 9.828274025697817e-05, "loss": 2.3477, "step": 696 }, { "epoch": 0.11245563084866086, "grad_norm": 3.897089719772339, "learning_rate": 9.827594495101823e-05, "loss": 2.0933, "step": 697 }, { "epoch": 0.11261697321716682, "grad_norm": 4.481931686401367, "learning_rate": 9.826913646271631e-05, "loss": 2.3839, "step": 698 }, { "epoch": 0.11277831558567279, "grad_norm": 4.5691986083984375, "learning_rate": 9.826231479393148e-05, "loss": 2.362, "step": 699 }, { "epoch": 0.11293965795417876, "grad_norm": 3.9855291843414307, "learning_rate": 9.825547994652655e-05, "loss": 2.4411, "step": 700 }, { "epoch": 0.11310100032268473, "grad_norm": 4.106456756591797, "learning_rate": 9.824863192236784e-05, "loss": 2.2892, "step": 701 }, { "epoch": 0.1132623426911907, "grad_norm": 4.03942346572876, "learning_rate": 9.824177072332526e-05, "loss": 2.3969, "step": 702 }, { "epoch": 0.11342368505969667, "grad_norm": 5.276364803314209, "learning_rate": 9.823489635127236e-05, "loss": 2.1279, "step": 703 }, { "epoch": 0.11358502742820265, "grad_norm": 3.4048545360565186, "learning_rate": 9.822800880808628e-05, "loss": 2.3158, "step": 704 }, { "epoch": 0.11374636979670862, "grad_norm": 4.882122039794922, "learning_rate": 9.822110809564774e-05, "loss": 2.2801, "step": 705 }, { "epoch": 0.11390771216521459, "grad_norm": 4.997471332550049, "learning_rate": 9.821419421584107e-05, "loss": 2.0774, "step": 706 }, { "epoch": 0.11406905453372056, "grad_norm": 5.2324090003967285, "learning_rate": 9.82072671705542e-05, "loss": 2.1501, "step": 707 }, { "epoch": 0.11423039690222653, "grad_norm": 4.585089683532715, "learning_rate": 9.820032696167863e-05, "loss": 2.3784, "step": 708 }, { "epoch": 0.1143917392707325, "grad_norm": 4.679159641265869, "learning_rate": 9.819337359110945e-05, "loss": 2.0806, "step": 709 }, { "epoch": 0.11455308163923847, "grad_norm": 5.738368511199951, "learning_rate": 9.81864070607454e-05, "loss": 2.3261, "step": 710 }, { "epoch": 0.11471442400774444, "grad_norm": 5.211796283721924, "learning_rate": 9.817942737248878e-05, "loss": 2.1125, "step": 711 }, { "epoch": 0.11487576637625041, "grad_norm": 6.789905548095703, "learning_rate": 9.817243452824545e-05, "loss": 2.3248, "step": 712 }, { "epoch": 0.11503710874475637, "grad_norm": 6.678147315979004, "learning_rate": 9.81654285299249e-05, "loss": 2.1509, "step": 713 }, { "epoch": 0.11519845111326234, "grad_norm": 3.517925500869751, "learning_rate": 9.815840937944022e-05, "loss": 2.512, "step": 714 }, { "epoch": 0.11535979348176831, "grad_norm": 6.970942497253418, "learning_rate": 9.815137707870805e-05, "loss": 2.0529, "step": 715 }, { "epoch": 0.11552113585027428, "grad_norm": 6.183144569396973, "learning_rate": 9.814433162964868e-05, "loss": 2.4695, "step": 716 }, { "epoch": 0.11568247821878025, "grad_norm": 4.125637054443359, "learning_rate": 9.813727303418594e-05, "loss": 2.2679, "step": 717 }, { "epoch": 0.11584382058728622, "grad_norm": 7.293137073516846, "learning_rate": 9.813020129424726e-05, "loss": 2.2164, "step": 718 }, { "epoch": 0.11600516295579219, "grad_norm": 4.501054286956787, "learning_rate": 9.812311641176366e-05, "loss": 2.2489, "step": 719 }, { "epoch": 0.11616650532429816, "grad_norm": 4.464605331420898, "learning_rate": 9.811601838866979e-05, "loss": 2.0857, "step": 720 }, { "epoch": 0.11632784769280413, "grad_norm": 3.8551461696624756, "learning_rate": 9.810890722690381e-05, "loss": 2.1855, "step": 721 }, { "epoch": 0.1164891900613101, "grad_norm": 6.24691104888916, "learning_rate": 9.810178292840753e-05, "loss": 2.0357, "step": 722 }, { "epoch": 0.11665053242981607, "grad_norm": 6.160900115966797, "learning_rate": 9.809464549512633e-05, "loss": 2.1341, "step": 723 }, { "epoch": 0.11681187479832204, "grad_norm": 5.17367696762085, "learning_rate": 9.808749492900918e-05, "loss": 2.1418, "step": 724 }, { "epoch": 0.11697321716682801, "grad_norm": 7.1499786376953125, "learning_rate": 9.808033123200859e-05, "loss": 2.2524, "step": 725 }, { "epoch": 0.11713455953533398, "grad_norm": 5.599761486053467, "learning_rate": 9.807315440608076e-05, "loss": 2.243, "step": 726 }, { "epoch": 0.11729590190383996, "grad_norm": 4.063814163208008, "learning_rate": 9.806596445318537e-05, "loss": 2.0576, "step": 727 }, { "epoch": 0.11745724427234591, "grad_norm": 4.227038860321045, "learning_rate": 9.805876137528571e-05, "loss": 2.2421, "step": 728 }, { "epoch": 0.11761858664085188, "grad_norm": 5.211657524108887, "learning_rate": 9.805154517434871e-05, "loss": 2.5321, "step": 729 }, { "epoch": 0.11777992900935785, "grad_norm": 4.9539031982421875, "learning_rate": 9.804431585234483e-05, "loss": 2.5082, "step": 730 }, { "epoch": 0.11794127137786382, "grad_norm": 4.826414108276367, "learning_rate": 9.803707341124812e-05, "loss": 2.186, "step": 731 }, { "epoch": 0.1181026137463698, "grad_norm": 5.114047527313232, "learning_rate": 9.802981785303621e-05, "loss": 2.2141, "step": 732 }, { "epoch": 0.11826395611487577, "grad_norm": 6.016683578491211, "learning_rate": 9.802254917969032e-05, "loss": 2.2963, "step": 733 }, { "epoch": 0.11842529848338174, "grad_norm": 7.989525318145752, "learning_rate": 9.801526739319528e-05, "loss": 2.2502, "step": 734 }, { "epoch": 0.1185866408518877, "grad_norm": 4.647066593170166, "learning_rate": 9.800797249553943e-05, "loss": 2.2818, "step": 735 }, { "epoch": 0.11874798322039368, "grad_norm": 4.587214469909668, "learning_rate": 9.800066448871477e-05, "loss": 2.6027, "step": 736 }, { "epoch": 0.11890932558889965, "grad_norm": 4.695929527282715, "learning_rate": 9.799334337471681e-05, "loss": 1.9805, "step": 737 }, { "epoch": 0.11907066795740562, "grad_norm": 5.0293707847595215, "learning_rate": 9.798600915554468e-05, "loss": 2.3194, "step": 738 }, { "epoch": 0.11923201032591159, "grad_norm": 5.731796741485596, "learning_rate": 9.79786618332011e-05, "loss": 2.1848, "step": 739 }, { "epoch": 0.11939335269441756, "grad_norm": 5.218835353851318, "learning_rate": 9.79713014096923e-05, "loss": 2.0952, "step": 740 }, { "epoch": 0.11955469506292353, "grad_norm": 4.538522720336914, "learning_rate": 9.79639278870282e-05, "loss": 1.9968, "step": 741 }, { "epoch": 0.11971603743142949, "grad_norm": 5.75065803527832, "learning_rate": 9.795654126722217e-05, "loss": 2.2524, "step": 742 }, { "epoch": 0.11987737979993546, "grad_norm": 4.09686803817749, "learning_rate": 9.794914155229124e-05, "loss": 2.0196, "step": 743 }, { "epoch": 0.12003872216844143, "grad_norm": 3.8854429721832275, "learning_rate": 9.794172874425602e-05, "loss": 2.1736, "step": 744 }, { "epoch": 0.1202000645369474, "grad_norm": 4.692524433135986, "learning_rate": 9.793430284514062e-05, "loss": 2.2808, "step": 745 }, { "epoch": 0.12036140690545337, "grad_norm": 4.936270236968994, "learning_rate": 9.792686385697282e-05, "loss": 2.1992, "step": 746 }, { "epoch": 0.12052274927395934, "grad_norm": 4.312668323516846, "learning_rate": 9.79194117817839e-05, "loss": 2.1661, "step": 747 }, { "epoch": 0.12068409164246531, "grad_norm": 6.014287948608398, "learning_rate": 9.791194662160874e-05, "loss": 2.1115, "step": 748 }, { "epoch": 0.12084543401097128, "grad_norm": 4.760694980621338, "learning_rate": 9.79044683784858e-05, "loss": 2.0568, "step": 749 }, { "epoch": 0.12100677637947725, "grad_norm": 4.866275310516357, "learning_rate": 9.78969770544571e-05, "loss": 2.0202, "step": 750 }, { "epoch": 0.12116811874798322, "grad_norm": 5.796637535095215, "learning_rate": 9.788947265156827e-05, "loss": 2.2872, "step": 751 }, { "epoch": 0.12132946111648919, "grad_norm": 5.722478866577148, "learning_rate": 9.788195517186845e-05, "loss": 2.2423, "step": 752 }, { "epoch": 0.12149080348499516, "grad_norm": 6.461528778076172, "learning_rate": 9.787442461741037e-05, "loss": 2.3383, "step": 753 }, { "epoch": 0.12165214585350113, "grad_norm": 4.787562370300293, "learning_rate": 9.786688099025037e-05, "loss": 2.3742, "step": 754 }, { "epoch": 0.1218134882220071, "grad_norm": 5.212571144104004, "learning_rate": 9.78593242924483e-05, "loss": 2.2332, "step": 755 }, { "epoch": 0.12197483059051308, "grad_norm": 5.459593296051025, "learning_rate": 9.785175452606762e-05, "loss": 2.0847, "step": 756 }, { "epoch": 0.12213617295901903, "grad_norm": 7.659170627593994, "learning_rate": 9.784417169317539e-05, "loss": 2.2558, "step": 757 }, { "epoch": 0.122297515327525, "grad_norm": 7.307011127471924, "learning_rate": 9.783657579584213e-05, "loss": 2.3545, "step": 758 }, { "epoch": 0.12245885769603097, "grad_norm": 5.335236549377441, "learning_rate": 9.782896683614204e-05, "loss": 2.0521, "step": 759 }, { "epoch": 0.12262020006453694, "grad_norm": 4.387326717376709, "learning_rate": 9.782134481615281e-05, "loss": 2.3457, "step": 760 }, { "epoch": 0.12278154243304291, "grad_norm": 4.305408000946045, "learning_rate": 9.781370973795576e-05, "loss": 2.1857, "step": 761 }, { "epoch": 0.12294288480154889, "grad_norm": 5.678192615509033, "learning_rate": 9.780606160363572e-05, "loss": 2.3804, "step": 762 }, { "epoch": 0.12310422717005486, "grad_norm": 5.5075178146362305, "learning_rate": 9.779840041528109e-05, "loss": 2.1147, "step": 763 }, { "epoch": 0.12326556953856083, "grad_norm": 6.328161239624023, "learning_rate": 9.77907261749839e-05, "loss": 2.4973, "step": 764 }, { "epoch": 0.1234269119070668, "grad_norm": 4.356446743011475, "learning_rate": 9.778303888483965e-05, "loss": 2.2695, "step": 765 }, { "epoch": 0.12358825427557277, "grad_norm": 4.555844783782959, "learning_rate": 9.777533854694747e-05, "loss": 2.5355, "step": 766 }, { "epoch": 0.12374959664407874, "grad_norm": 5.903046131134033, "learning_rate": 9.776762516341003e-05, "loss": 2.0878, "step": 767 }, { "epoch": 0.12391093901258471, "grad_norm": 6.699967384338379, "learning_rate": 9.775989873633357e-05, "loss": 2.1891, "step": 768 }, { "epoch": 0.12407228138109068, "grad_norm": 4.067835330963135, "learning_rate": 9.775215926782788e-05, "loss": 2.1916, "step": 769 }, { "epoch": 0.12423362374959665, "grad_norm": 5.333219528198242, "learning_rate": 9.774440676000631e-05, "loss": 2.2094, "step": 770 }, { "epoch": 0.12439496611810262, "grad_norm": 3.9629952907562256, "learning_rate": 9.773664121498579e-05, "loss": 2.6525, "step": 771 }, { "epoch": 0.12455630848660858, "grad_norm": 5.467307090759277, "learning_rate": 9.772886263488679e-05, "loss": 2.3546, "step": 772 }, { "epoch": 0.12471765085511455, "grad_norm": 5.498898029327393, "learning_rate": 9.772107102183336e-05, "loss": 2.4372, "step": 773 }, { "epoch": 0.12487899322362052, "grad_norm": 4.510770320892334, "learning_rate": 9.771326637795308e-05, "loss": 2.2908, "step": 774 }, { "epoch": 0.1250403355921265, "grad_norm": 5.672365188598633, "learning_rate": 9.770544870537711e-05, "loss": 2.288, "step": 775 }, { "epoch": 0.12520167796063247, "grad_norm": 5.459904193878174, "learning_rate": 9.769761800624016e-05, "loss": 2.1396, "step": 776 }, { "epoch": 0.12536302032913843, "grad_norm": 3.5810322761535645, "learning_rate": 9.768977428268051e-05, "loss": 2.2168, "step": 777 }, { "epoch": 0.12552436269764441, "grad_norm": 4.152242183685303, "learning_rate": 9.768191753683998e-05, "loss": 2.214, "step": 778 }, { "epoch": 0.12568570506615037, "grad_norm": 6.909939765930176, "learning_rate": 9.767404777086393e-05, "loss": 2.1607, "step": 779 }, { "epoch": 0.12584704743465633, "grad_norm": 6.329325199127197, "learning_rate": 9.766616498690133e-05, "loss": 2.4425, "step": 780 }, { "epoch": 0.1260083898031623, "grad_norm": 3.946126937866211, "learning_rate": 9.765826918710466e-05, "loss": 2.1719, "step": 781 }, { "epoch": 0.12616973217166827, "grad_norm": 4.491634368896484, "learning_rate": 9.765036037362996e-05, "loss": 2.2366, "step": 782 }, { "epoch": 0.12633107454017425, "grad_norm": 4.341719150543213, "learning_rate": 9.764243854863682e-05, "loss": 2.2991, "step": 783 }, { "epoch": 0.1264924169086802, "grad_norm": 6.946076393127441, "learning_rate": 9.763450371428841e-05, "loss": 2.2794, "step": 784 }, { "epoch": 0.1266537592771862, "grad_norm": 5.051512241363525, "learning_rate": 9.762655587275142e-05, "loss": 2.0915, "step": 785 }, { "epoch": 0.12681510164569215, "grad_norm": 3.9053547382354736, "learning_rate": 9.761859502619612e-05, "loss": 2.3005, "step": 786 }, { "epoch": 0.12697644401419814, "grad_norm": 5.447360515594482, "learning_rate": 9.761062117679632e-05, "loss": 2.1376, "step": 787 }, { "epoch": 0.1271377863827041, "grad_norm": 4.656970977783203, "learning_rate": 9.760263432672936e-05, "loss": 2.0274, "step": 788 }, { "epoch": 0.12729912875121008, "grad_norm": 3.9776928424835205, "learning_rate": 9.759463447817616e-05, "loss": 2.3201, "step": 789 }, { "epoch": 0.12746047111971603, "grad_norm": 4.438282489776611, "learning_rate": 9.758662163332118e-05, "loss": 2.2339, "step": 790 }, { "epoch": 0.12762181348822202, "grad_norm": 4.395122051239014, "learning_rate": 9.75785957943524e-05, "loss": 2.3393, "step": 791 }, { "epoch": 0.12778315585672798, "grad_norm": 5.506349563598633, "learning_rate": 9.75705569634614e-05, "loss": 2.1605, "step": 792 }, { "epoch": 0.12794449822523396, "grad_norm": 5.949671268463135, "learning_rate": 9.756250514284328e-05, "loss": 2.3274, "step": 793 }, { "epoch": 0.12810584059373992, "grad_norm": 4.912148475646973, "learning_rate": 9.755444033469669e-05, "loss": 2.1012, "step": 794 }, { "epoch": 0.12826718296224587, "grad_norm": 7.435208320617676, "learning_rate": 9.754636254122381e-05, "loss": 2.3464, "step": 795 }, { "epoch": 0.12842852533075186, "grad_norm": 4.349523067474365, "learning_rate": 9.75382717646304e-05, "loss": 2.0887, "step": 796 }, { "epoch": 0.12858986769925781, "grad_norm": 4.330012321472168, "learning_rate": 9.753016800712573e-05, "loss": 2.2709, "step": 797 }, { "epoch": 0.1287512100677638, "grad_norm": 5.999225616455078, "learning_rate": 9.752205127092265e-05, "loss": 1.959, "step": 798 }, { "epoch": 0.12891255243626976, "grad_norm": 5.202185153961182, "learning_rate": 9.751392155823752e-05, "loss": 2.1121, "step": 799 }, { "epoch": 0.12907389480477574, "grad_norm": 4.725948333740234, "learning_rate": 9.750577887129027e-05, "loss": 2.1956, "step": 800 }, { "epoch": 0.1292352371732817, "grad_norm": 6.285621643066406, "learning_rate": 9.749762321230433e-05, "loss": 2.4139, "step": 801 }, { "epoch": 0.12939657954178768, "grad_norm": 4.640841007232666, "learning_rate": 9.748945458350673e-05, "loss": 2.06, "step": 802 }, { "epoch": 0.12955792191029364, "grad_norm": 3.547497510910034, "learning_rate": 9.748127298712803e-05, "loss": 2.2075, "step": 803 }, { "epoch": 0.12971926427879962, "grad_norm": 4.867875099182129, "learning_rate": 9.747307842540229e-05, "loss": 2.2949, "step": 804 }, { "epoch": 0.12988060664730558, "grad_norm": 5.806981086730957, "learning_rate": 9.746487090056713e-05, "loss": 2.2565, "step": 805 }, { "epoch": 0.13004194901581156, "grad_norm": 4.769914150238037, "learning_rate": 9.745665041486374e-05, "loss": 2.1801, "step": 806 }, { "epoch": 0.13020329138431752, "grad_norm": 4.5631890296936035, "learning_rate": 9.744841697053681e-05, "loss": 2.0623, "step": 807 }, { "epoch": 0.13036463375282348, "grad_norm": 4.284159183502197, "learning_rate": 9.744017056983459e-05, "loss": 2.2616, "step": 808 }, { "epoch": 0.13052597612132946, "grad_norm": 4.25748348236084, "learning_rate": 9.743191121500887e-05, "loss": 2.0961, "step": 809 }, { "epoch": 0.13068731848983542, "grad_norm": 5.114495277404785, "learning_rate": 9.742363890831494e-05, "loss": 2.3635, "step": 810 }, { "epoch": 0.1308486608583414, "grad_norm": 4.6019721031188965, "learning_rate": 9.741535365201168e-05, "loss": 2.3813, "step": 811 }, { "epoch": 0.13101000322684736, "grad_norm": 4.860962390899658, "learning_rate": 9.740705544836146e-05, "loss": 2.0449, "step": 812 }, { "epoch": 0.13117134559535334, "grad_norm": 3.8722214698791504, "learning_rate": 9.739874429963023e-05, "loss": 2.1397, "step": 813 }, { "epoch": 0.1313326879638593, "grad_norm": 5.776529312133789, "learning_rate": 9.739042020808746e-05, "loss": 2.1399, "step": 814 }, { "epoch": 0.13149403033236529, "grad_norm": 5.5494232177734375, "learning_rate": 9.73820831760061e-05, "loss": 2.3765, "step": 815 }, { "epoch": 0.13165537270087124, "grad_norm": 5.315171241760254, "learning_rate": 9.737373320566272e-05, "loss": 2.1391, "step": 816 }, { "epoch": 0.13181671506937723, "grad_norm": 4.855938911437988, "learning_rate": 9.736537029933738e-05, "loss": 2.1547, "step": 817 }, { "epoch": 0.13197805743788318, "grad_norm": 6.183468341827393, "learning_rate": 9.735699445931365e-05, "loss": 2.1848, "step": 818 }, { "epoch": 0.13213939980638917, "grad_norm": 7.368840217590332, "learning_rate": 9.734860568787868e-05, "loss": 1.9264, "step": 819 }, { "epoch": 0.13230074217489513, "grad_norm": 7.0486741065979, "learning_rate": 9.734020398732311e-05, "loss": 2.4843, "step": 820 }, { "epoch": 0.1324620845434011, "grad_norm": 6.184607028961182, "learning_rate": 9.733178935994115e-05, "loss": 2.1397, "step": 821 }, { "epoch": 0.13262342691190707, "grad_norm": 4.931450366973877, "learning_rate": 9.73233618080305e-05, "loss": 2.1738, "step": 822 }, { "epoch": 0.13278476928041302, "grad_norm": 4.700014114379883, "learning_rate": 9.73149213338924e-05, "loss": 1.9448, "step": 823 }, { "epoch": 0.132946111648919, "grad_norm": 4.283225059509277, "learning_rate": 9.730646793983165e-05, "loss": 2.2092, "step": 824 }, { "epoch": 0.13310745401742496, "grad_norm": 5.251535892486572, "learning_rate": 9.729800162815652e-05, "loss": 2.2563, "step": 825 }, { "epoch": 0.13326879638593095, "grad_norm": 4.738597393035889, "learning_rate": 9.728952240117888e-05, "loss": 2.2449, "step": 826 }, { "epoch": 0.1334301387544369, "grad_norm": 3.9104974269866943, "learning_rate": 9.728103026121407e-05, "loss": 2.1812, "step": 827 }, { "epoch": 0.1335914811229429, "grad_norm": 5.504037857055664, "learning_rate": 9.727252521058097e-05, "loss": 2.378, "step": 828 }, { "epoch": 0.13375282349144885, "grad_norm": 5.019674777984619, "learning_rate": 9.726400725160198e-05, "loss": 2.162, "step": 829 }, { "epoch": 0.13391416585995483, "grad_norm": 4.628396511077881, "learning_rate": 9.725547638660305e-05, "loss": 2.2211, "step": 830 }, { "epoch": 0.1340755082284608, "grad_norm": 4.508102893829346, "learning_rate": 9.724693261791364e-05, "loss": 2.0691, "step": 831 }, { "epoch": 0.13423685059696677, "grad_norm": 4.798797607421875, "learning_rate": 9.723837594786672e-05, "loss": 2.0428, "step": 832 }, { "epoch": 0.13439819296547273, "grad_norm": 4.994052886962891, "learning_rate": 9.722980637879879e-05, "loss": 2.328, "step": 833 }, { "epoch": 0.1345595353339787, "grad_norm": 6.477321624755859, "learning_rate": 9.722122391304988e-05, "loss": 2.5859, "step": 834 }, { "epoch": 0.13472087770248467, "grad_norm": 4.700345993041992, "learning_rate": 9.721262855296357e-05, "loss": 2.0088, "step": 835 }, { "epoch": 0.13488222007099065, "grad_norm": 5.024048805236816, "learning_rate": 9.72040203008869e-05, "loss": 2.4683, "step": 836 }, { "epoch": 0.1350435624394966, "grad_norm": 4.9408979415893555, "learning_rate": 9.719539915917043e-05, "loss": 2.2342, "step": 837 }, { "epoch": 0.13520490480800257, "grad_norm": 4.616140842437744, "learning_rate": 9.718676513016832e-05, "loss": 2.1205, "step": 838 }, { "epoch": 0.13536624717650855, "grad_norm": 5.232850551605225, "learning_rate": 9.717811821623817e-05, "loss": 2.1548, "step": 839 }, { "epoch": 0.1355275895450145, "grad_norm": 5.0068583488464355, "learning_rate": 9.716945841974115e-05, "loss": 2.0153, "step": 840 }, { "epoch": 0.1356889319135205, "grad_norm": 5.102565288543701, "learning_rate": 9.716078574304189e-05, "loss": 2.1854, "step": 841 }, { "epoch": 0.13585027428202645, "grad_norm": 5.298509120941162, "learning_rate": 9.715210018850859e-05, "loss": 2.1734, "step": 842 }, { "epoch": 0.13601161665053244, "grad_norm": 5.077696800231934, "learning_rate": 9.714340175851297e-05, "loss": 2.0086, "step": 843 }, { "epoch": 0.1361729590190384, "grad_norm": 4.419206619262695, "learning_rate": 9.713469045543022e-05, "loss": 1.9763, "step": 844 }, { "epoch": 0.13633430138754438, "grad_norm": 5.292459011077881, "learning_rate": 9.712596628163906e-05, "loss": 2.1624, "step": 845 }, { "epoch": 0.13649564375605033, "grad_norm": 5.468176364898682, "learning_rate": 9.711722923952173e-05, "loss": 2.0919, "step": 846 }, { "epoch": 0.13665698612455632, "grad_norm": 4.132210731506348, "learning_rate": 9.710847933146403e-05, "loss": 2.2685, "step": 847 }, { "epoch": 0.13681832849306227, "grad_norm": 4.604116916656494, "learning_rate": 9.709971655985518e-05, "loss": 2.3708, "step": 848 }, { "epoch": 0.13697967086156826, "grad_norm": 4.225286960601807, "learning_rate": 9.709094092708799e-05, "loss": 2.3804, "step": 849 }, { "epoch": 0.13714101323007422, "grad_norm": 7.442013740539551, "learning_rate": 9.708215243555875e-05, "loss": 2.2357, "step": 850 }, { "epoch": 0.1373023555985802, "grad_norm": 6.1611127853393555, "learning_rate": 9.707335108766726e-05, "loss": 2.1841, "step": 851 }, { "epoch": 0.13746369796708616, "grad_norm": 5.973389148712158, "learning_rate": 9.706453688581684e-05, "loss": 2.0254, "step": 852 }, { "epoch": 0.1376250403355921, "grad_norm": 5.30540657043457, "learning_rate": 9.705570983241432e-05, "loss": 2.277, "step": 853 }, { "epoch": 0.1377863827040981, "grad_norm": 5.778249740600586, "learning_rate": 9.704686992987005e-05, "loss": 2.2288, "step": 854 }, { "epoch": 0.13794772507260405, "grad_norm": 6.66129207611084, "learning_rate": 9.703801718059783e-05, "loss": 2.2849, "step": 855 }, { "epoch": 0.13810906744111004, "grad_norm": 5.594348907470703, "learning_rate": 9.702915158701506e-05, "loss": 2.1925, "step": 856 }, { "epoch": 0.138270409809616, "grad_norm": 4.9399614334106445, "learning_rate": 9.702027315154257e-05, "loss": 2.1171, "step": 857 }, { "epoch": 0.13843175217812198, "grad_norm": 6.186155796051025, "learning_rate": 9.701138187660473e-05, "loss": 2.3412, "step": 858 }, { "epoch": 0.13859309454662794, "grad_norm": 7.687668800354004, "learning_rate": 9.700247776462943e-05, "loss": 2.2036, "step": 859 }, { "epoch": 0.13875443691513392, "grad_norm": 10.154778480529785, "learning_rate": 9.699356081804803e-05, "loss": 2.5356, "step": 860 }, { "epoch": 0.13891577928363988, "grad_norm": 9.834787368774414, "learning_rate": 9.698463103929542e-05, "loss": 2.1638, "step": 861 }, { "epoch": 0.13907712165214586, "grad_norm": 6.674441337585449, "learning_rate": 9.697568843081e-05, "loss": 2.0882, "step": 862 }, { "epoch": 0.13923846402065182, "grad_norm": 4.7841315269470215, "learning_rate": 9.696673299503361e-05, "loss": 2.0868, "step": 863 }, { "epoch": 0.1393998063891578, "grad_norm": 5.529294013977051, "learning_rate": 9.695776473441169e-05, "loss": 2.1338, "step": 864 }, { "epoch": 0.13956114875766376, "grad_norm": 5.372847557067871, "learning_rate": 9.694878365139313e-05, "loss": 2.2776, "step": 865 }, { "epoch": 0.13972249112616975, "grad_norm": 5.5793352127075195, "learning_rate": 9.693978974843032e-05, "loss": 2.3289, "step": 866 }, { "epoch": 0.1398838334946757, "grad_norm": 4.251262664794922, "learning_rate": 9.693078302797914e-05, "loss": 2.059, "step": 867 }, { "epoch": 0.14004517586318166, "grad_norm": 5.220300674438477, "learning_rate": 9.6921763492499e-05, "loss": 2.5555, "step": 868 }, { "epoch": 0.14020651823168764, "grad_norm": 5.264746189117432, "learning_rate": 9.691273114445278e-05, "loss": 2.3829, "step": 869 }, { "epoch": 0.1403678606001936, "grad_norm": 4.5468974113464355, "learning_rate": 9.69036859863069e-05, "loss": 2.1567, "step": 870 }, { "epoch": 0.14052920296869958, "grad_norm": 6.331204414367676, "learning_rate": 9.689462802053125e-05, "loss": 2.1218, "step": 871 }, { "epoch": 0.14069054533720554, "grad_norm": 7.563364505767822, "learning_rate": 9.688555724959918e-05, "loss": 2.279, "step": 872 }, { "epoch": 0.14085188770571153, "grad_norm": 5.659290790557861, "learning_rate": 9.687647367598762e-05, "loss": 2.4341, "step": 873 }, { "epoch": 0.14101323007421748, "grad_norm": 4.670441150665283, "learning_rate": 9.686737730217695e-05, "loss": 2.3573, "step": 874 }, { "epoch": 0.14117457244272347, "grad_norm": 3.848391056060791, "learning_rate": 9.685826813065102e-05, "loss": 2.368, "step": 875 }, { "epoch": 0.14133591481122942, "grad_norm": 4.092101573944092, "learning_rate": 9.684914616389721e-05, "loss": 2.124, "step": 876 }, { "epoch": 0.1414972571797354, "grad_norm": 4.652859687805176, "learning_rate": 9.684001140440639e-05, "loss": 2.1059, "step": 877 }, { "epoch": 0.14165859954824137, "grad_norm": 5.927852630615234, "learning_rate": 9.683086385467293e-05, "loss": 2.2456, "step": 878 }, { "epoch": 0.14181994191674735, "grad_norm": 7.353157997131348, "learning_rate": 9.682170351719465e-05, "loss": 2.3135, "step": 879 }, { "epoch": 0.1419812842852533, "grad_norm": 4.424384593963623, "learning_rate": 9.681253039447294e-05, "loss": 2.2472, "step": 880 }, { "epoch": 0.1421426266537593, "grad_norm": 8.861557006835938, "learning_rate": 9.680334448901258e-05, "loss": 2.0083, "step": 881 }, { "epoch": 0.14230396902226525, "grad_norm": 5.569779872894287, "learning_rate": 9.679414580332194e-05, "loss": 2.1191, "step": 882 }, { "epoch": 0.1424653113907712, "grad_norm": 6.622554779052734, "learning_rate": 9.67849343399128e-05, "loss": 2.4547, "step": 883 }, { "epoch": 0.1426266537592772, "grad_norm": 6.614085674285889, "learning_rate": 9.67757101013005e-05, "loss": 2.1192, "step": 884 }, { "epoch": 0.14278799612778315, "grad_norm": 4.401320934295654, "learning_rate": 9.676647309000379e-05, "loss": 2.0062, "step": 885 }, { "epoch": 0.14294933849628913, "grad_norm": 4.417647361755371, "learning_rate": 9.6757223308545e-05, "loss": 2.3803, "step": 886 }, { "epoch": 0.1431106808647951, "grad_norm": 6.003297328948975, "learning_rate": 9.674796075944985e-05, "loss": 2.3687, "step": 887 }, { "epoch": 0.14327202323330107, "grad_norm": 5.431515693664551, "learning_rate": 9.673868544524762e-05, "loss": 2.4209, "step": 888 }, { "epoch": 0.14343336560180703, "grad_norm": 3.7381820678710938, "learning_rate": 9.672939736847103e-05, "loss": 2.1941, "step": 889 }, { "epoch": 0.143594707970313, "grad_norm": 5.933220386505127, "learning_rate": 9.672009653165632e-05, "loss": 2.1986, "step": 890 }, { "epoch": 0.14375605033881897, "grad_norm": 3.514514684677124, "learning_rate": 9.67107829373432e-05, "loss": 1.9916, "step": 891 }, { "epoch": 0.14391739270732495, "grad_norm": 5.188986301422119, "learning_rate": 9.670145658807485e-05, "loss": 2.1056, "step": 892 }, { "epoch": 0.1440787350758309, "grad_norm": 4.180763244628906, "learning_rate": 9.669211748639795e-05, "loss": 2.2095, "step": 893 }, { "epoch": 0.1442400774443369, "grad_norm": 3.691096544265747, "learning_rate": 9.668276563486266e-05, "loss": 2.35, "step": 894 }, { "epoch": 0.14440141981284285, "grad_norm": 4.197265148162842, "learning_rate": 9.667340103602261e-05, "loss": 2.1548, "step": 895 }, { "epoch": 0.1445627621813488, "grad_norm": 4.846789360046387, "learning_rate": 9.666402369243492e-05, "loss": 2.2217, "step": 896 }, { "epoch": 0.1447241045498548, "grad_norm": 3.7879931926727295, "learning_rate": 9.665463360666021e-05, "loss": 2.2129, "step": 897 }, { "epoch": 0.14488544691836075, "grad_norm": 6.429107666015625, "learning_rate": 9.664523078126253e-05, "loss": 2.1699, "step": 898 }, { "epoch": 0.14504678928686673, "grad_norm": 5.464096546173096, "learning_rate": 9.663581521880945e-05, "loss": 2.094, "step": 899 }, { "epoch": 0.1452081316553727, "grad_norm": 5.067931175231934, "learning_rate": 9.6626386921872e-05, "loss": 2.3702, "step": 900 }, { "epoch": 0.14536947402387868, "grad_norm": 5.132601261138916, "learning_rate": 9.661694589302471e-05, "loss": 2.0874, "step": 901 }, { "epoch": 0.14553081639238463, "grad_norm": 5.8880391120910645, "learning_rate": 9.660749213484555e-05, "loss": 2.1288, "step": 902 }, { "epoch": 0.14569215876089062, "grad_norm": 4.455813884735107, "learning_rate": 9.6598025649916e-05, "loss": 2.197, "step": 903 }, { "epoch": 0.14585350112939657, "grad_norm": 4.309800624847412, "learning_rate": 9.658854644082098e-05, "loss": 2.3049, "step": 904 }, { "epoch": 0.14601484349790256, "grad_norm": 6.354419708251953, "learning_rate": 9.657905451014893e-05, "loss": 2.048, "step": 905 }, { "epoch": 0.14617618586640851, "grad_norm": 4.877176761627197, "learning_rate": 9.656954986049171e-05, "loss": 2.2906, "step": 906 }, { "epoch": 0.1463375282349145, "grad_norm": 4.866892337799072, "learning_rate": 9.656003249444471e-05, "loss": 2.2998, "step": 907 }, { "epoch": 0.14649887060342046, "grad_norm": 3.9700560569763184, "learning_rate": 9.655050241460675e-05, "loss": 2.3659, "step": 908 }, { "epoch": 0.14666021297192644, "grad_norm": 5.312385082244873, "learning_rate": 9.654095962358014e-05, "loss": 2.1026, "step": 909 }, { "epoch": 0.1468215553404324, "grad_norm": 5.85587215423584, "learning_rate": 9.653140412397064e-05, "loss": 2.0669, "step": 910 }, { "epoch": 0.14698289770893835, "grad_norm": 5.001843452453613, "learning_rate": 9.652183591838752e-05, "loss": 2.2449, "step": 911 }, { "epoch": 0.14714424007744434, "grad_norm": 4.770449638366699, "learning_rate": 9.65122550094435e-05, "loss": 2.1298, "step": 912 }, { "epoch": 0.1473055824459503, "grad_norm": 6.339503288269043, "learning_rate": 9.650266139975474e-05, "loss": 2.2039, "step": 913 }, { "epoch": 0.14746692481445628, "grad_norm": 5.899519920349121, "learning_rate": 9.649305509194092e-05, "loss": 2.0838, "step": 914 }, { "epoch": 0.14762826718296224, "grad_norm": 6.260024547576904, "learning_rate": 9.648343608862515e-05, "loss": 2.2062, "step": 915 }, { "epoch": 0.14778960955146822, "grad_norm": 4.567520618438721, "learning_rate": 9.647380439243399e-05, "loss": 2.1741, "step": 916 }, { "epoch": 0.14795095191997418, "grad_norm": 4.420521259307861, "learning_rate": 9.646416000599754e-05, "loss": 2.3293, "step": 917 }, { "epoch": 0.14811229428848016, "grad_norm": 4.748103618621826, "learning_rate": 9.64545029319493e-05, "loss": 2.1784, "step": 918 }, { "epoch": 0.14827363665698612, "grad_norm": 6.980162620544434, "learning_rate": 9.644483317292623e-05, "loss": 2.1, "step": 919 }, { "epoch": 0.1484349790254921, "grad_norm": 5.445437431335449, "learning_rate": 9.643515073156881e-05, "loss": 2.2574, "step": 920 }, { "epoch": 0.14859632139399806, "grad_norm": 6.650381088256836, "learning_rate": 9.642545561052095e-05, "loss": 2.0731, "step": 921 }, { "epoch": 0.14875766376250404, "grad_norm": 3.9137048721313477, "learning_rate": 9.641574781242999e-05, "loss": 2.1378, "step": 922 }, { "epoch": 0.14891900613101, "grad_norm": 6.624279499053955, "learning_rate": 9.640602733994679e-05, "loss": 2.2704, "step": 923 }, { "epoch": 0.14908034849951599, "grad_norm": 4.790276050567627, "learning_rate": 9.639629419572565e-05, "loss": 2.2818, "step": 924 }, { "epoch": 0.14924169086802194, "grad_norm": 4.834343433380127, "learning_rate": 9.638654838242429e-05, "loss": 2.266, "step": 925 }, { "epoch": 0.1494030332365279, "grad_norm": 5.247141361236572, "learning_rate": 9.637678990270396e-05, "loss": 2.2551, "step": 926 }, { "epoch": 0.14956437560503388, "grad_norm": 5.176708698272705, "learning_rate": 9.636701875922933e-05, "loss": 2.2663, "step": 927 }, { "epoch": 0.14972571797353984, "grad_norm": 5.154667377471924, "learning_rate": 9.635723495466851e-05, "loss": 2.3111, "step": 928 }, { "epoch": 0.14988706034204582, "grad_norm": 3.496387004852295, "learning_rate": 9.63474384916931e-05, "loss": 2.2975, "step": 929 }, { "epoch": 0.15004840271055178, "grad_norm": 6.210982799530029, "learning_rate": 9.633762937297814e-05, "loss": 2.1714, "step": 930 }, { "epoch": 0.15020974507905777, "grad_norm": 4.316873550415039, "learning_rate": 9.632780760120215e-05, "loss": 2.1158, "step": 931 }, { "epoch": 0.15037108744756372, "grad_norm": 4.4215874671936035, "learning_rate": 9.631797317904707e-05, "loss": 2.2234, "step": 932 }, { "epoch": 0.1505324298160697, "grad_norm": 6.841871738433838, "learning_rate": 9.630812610919832e-05, "loss": 2.1703, "step": 933 }, { "epoch": 0.15069377218457566, "grad_norm": 4.790676116943359, "learning_rate": 9.629826639434475e-05, "loss": 2.0918, "step": 934 }, { "epoch": 0.15085511455308165, "grad_norm": 5.36135721206665, "learning_rate": 9.628839403717868e-05, "loss": 2.002, "step": 935 }, { "epoch": 0.1510164569215876, "grad_norm": 4.241684436798096, "learning_rate": 9.627850904039588e-05, "loss": 2.2189, "step": 936 }, { "epoch": 0.1511777992900936, "grad_norm": 3.8772034645080566, "learning_rate": 9.626861140669558e-05, "loss": 2.3375, "step": 937 }, { "epoch": 0.15133914165859955, "grad_norm": 6.061500549316406, "learning_rate": 9.625870113878044e-05, "loss": 2.1103, "step": 938 }, { "epoch": 0.15150048402710553, "grad_norm": 4.246634006500244, "learning_rate": 9.624877823935659e-05, "loss": 2.2886, "step": 939 }, { "epoch": 0.1516618263956115, "grad_norm": 4.799205780029297, "learning_rate": 9.623884271113359e-05, "loss": 2.2289, "step": 940 }, { "epoch": 0.15182316876411744, "grad_norm": 4.209903240203857, "learning_rate": 9.622889455682446e-05, "loss": 2.114, "step": 941 }, { "epoch": 0.15198451113262343, "grad_norm": 4.7471604347229, "learning_rate": 9.621893377914567e-05, "loss": 2.4034, "step": 942 }, { "epoch": 0.15214585350112939, "grad_norm": 4.916505813598633, "learning_rate": 9.620896038081713e-05, "loss": 2.1852, "step": 943 }, { "epoch": 0.15230719586963537, "grad_norm": 5.238925933837891, "learning_rate": 9.619897436456221e-05, "loss": 2.2174, "step": 944 }, { "epoch": 0.15246853823814133, "grad_norm": 4.896865367889404, "learning_rate": 9.61889757331077e-05, "loss": 2.0628, "step": 945 }, { "epoch": 0.1526298806066473, "grad_norm": 4.1599884033203125, "learning_rate": 9.617896448918386e-05, "loss": 2.1821, "step": 946 }, { "epoch": 0.15279122297515327, "grad_norm": 5.072725772857666, "learning_rate": 9.616894063552438e-05, "loss": 2.3195, "step": 947 }, { "epoch": 0.15295256534365925, "grad_norm": 4.6819071769714355, "learning_rate": 9.615890417486639e-05, "loss": 2.1734, "step": 948 }, { "epoch": 0.1531139077121652, "grad_norm": 4.185474872589111, "learning_rate": 9.614885510995047e-05, "loss": 2.0745, "step": 949 }, { "epoch": 0.1532752500806712, "grad_norm": 4.95952844619751, "learning_rate": 9.613879344352066e-05, "loss": 2.4051, "step": 950 }, { "epoch": 0.15343659244917715, "grad_norm": 5.198223114013672, "learning_rate": 9.61287191783244e-05, "loss": 2.4239, "step": 951 }, { "epoch": 0.15359793481768313, "grad_norm": 7.1581220626831055, "learning_rate": 9.611863231711261e-05, "loss": 2.254, "step": 952 }, { "epoch": 0.1537592771861891, "grad_norm": 5.759435653686523, "learning_rate": 9.610853286263963e-05, "loss": 2.1716, "step": 953 }, { "epoch": 0.15392061955469508, "grad_norm": 5.360166549682617, "learning_rate": 9.609842081766321e-05, "loss": 2.0779, "step": 954 }, { "epoch": 0.15408196192320103, "grad_norm": 5.1558146476745605, "learning_rate": 9.608829618494462e-05, "loss": 2.4345, "step": 955 }, { "epoch": 0.154243304291707, "grad_norm": 6.02689266204834, "learning_rate": 9.607815896724846e-05, "loss": 2.2129, "step": 956 }, { "epoch": 0.15440464666021297, "grad_norm": 5.901403427124023, "learning_rate": 9.606800916734286e-05, "loss": 2.1403, "step": 957 }, { "epoch": 0.15456598902871893, "grad_norm": 5.751607894897461, "learning_rate": 9.605784678799934e-05, "loss": 2.28, "step": 958 }, { "epoch": 0.15472733139722492, "grad_norm": 4.285133361816406, "learning_rate": 9.604767183199287e-05, "loss": 2.0941, "step": 959 }, { "epoch": 0.15488867376573087, "grad_norm": 4.502032279968262, "learning_rate": 9.603748430210183e-05, "loss": 2.2289, "step": 960 }, { "epoch": 0.15505001613423686, "grad_norm": 4.904419422149658, "learning_rate": 9.602728420110806e-05, "loss": 2.0168, "step": 961 }, { "epoch": 0.1552113585027428, "grad_norm": 4.548860549926758, "learning_rate": 9.601707153179682e-05, "loss": 2.139, "step": 962 }, { "epoch": 0.1553727008712488, "grad_norm": 4.90744686126709, "learning_rate": 9.600684629695682e-05, "loss": 2.0714, "step": 963 }, { "epoch": 0.15553404323975475, "grad_norm": 4.9324235916137695, "learning_rate": 9.599660849938016e-05, "loss": 2.038, "step": 964 }, { "epoch": 0.15569538560826074, "grad_norm": 6.045592784881592, "learning_rate": 9.598635814186241e-05, "loss": 1.9441, "step": 965 }, { "epoch": 0.1558567279767667, "grad_norm": 4.217911243438721, "learning_rate": 9.597609522720257e-05, "loss": 2.0476, "step": 966 }, { "epoch": 0.15601807034527268, "grad_norm": 7.424112319946289, "learning_rate": 9.596581975820303e-05, "loss": 2.3635, "step": 967 }, { "epoch": 0.15617941271377864, "grad_norm": 3.862766742706299, "learning_rate": 9.595553173766965e-05, "loss": 2.0969, "step": 968 }, { "epoch": 0.15634075508228462, "grad_norm": 6.841447830200195, "learning_rate": 9.59452311684117e-05, "loss": 2.4406, "step": 969 }, { "epoch": 0.15650209745079058, "grad_norm": 5.120818138122559, "learning_rate": 9.593491805324189e-05, "loss": 1.978, "step": 970 }, { "epoch": 0.15666343981929653, "grad_norm": 4.624609470367432, "learning_rate": 9.59245923949763e-05, "loss": 2.2101, "step": 971 }, { "epoch": 0.15682478218780252, "grad_norm": 5.428524971008301, "learning_rate": 9.591425419643452e-05, "loss": 2.3038, "step": 972 }, { "epoch": 0.15698612455630848, "grad_norm": 5.182520866394043, "learning_rate": 9.590390346043951e-05, "loss": 2.06, "step": 973 }, { "epoch": 0.15714746692481446, "grad_norm": 4.55204963684082, "learning_rate": 9.589354018981767e-05, "loss": 2.2339, "step": 974 }, { "epoch": 0.15730880929332042, "grad_norm": 4.689589023590088, "learning_rate": 9.58831643873988e-05, "loss": 2.0099, "step": 975 }, { "epoch": 0.1574701516618264, "grad_norm": 4.149195194244385, "learning_rate": 9.587277605601617e-05, "loss": 2.1516, "step": 976 }, { "epoch": 0.15763149403033236, "grad_norm": 4.985833644866943, "learning_rate": 9.58623751985064e-05, "loss": 2.2491, "step": 977 }, { "epoch": 0.15779283639883834, "grad_norm": 5.3002800941467285, "learning_rate": 9.585196181770963e-05, "loss": 2.4263, "step": 978 }, { "epoch": 0.1579541787673443, "grad_norm": 6.448243618011475, "learning_rate": 9.584153591646932e-05, "loss": 2.2362, "step": 979 }, { "epoch": 0.15811552113585028, "grad_norm": 7.038323879241943, "learning_rate": 9.583109749763239e-05, "loss": 2.3127, "step": 980 }, { "epoch": 0.15827686350435624, "grad_norm": 5.628559112548828, "learning_rate": 9.582064656404921e-05, "loss": 2.2545, "step": 981 }, { "epoch": 0.15843820587286223, "grad_norm": 4.454617023468018, "learning_rate": 9.58101831185735e-05, "loss": 2.1904, "step": 982 }, { "epoch": 0.15859954824136818, "grad_norm": 5.519943714141846, "learning_rate": 9.579970716406245e-05, "loss": 2.2977, "step": 983 }, { "epoch": 0.15876089060987417, "grad_norm": 5.4137349128723145, "learning_rate": 9.578921870337667e-05, "loss": 1.9913, "step": 984 }, { "epoch": 0.15892223297838012, "grad_norm": 5.2817583084106445, "learning_rate": 9.577871773938011e-05, "loss": 2.0741, "step": 985 }, { "epoch": 0.15908357534688608, "grad_norm": 4.221789836883545, "learning_rate": 9.576820427494025e-05, "loss": 2.3467, "step": 986 }, { "epoch": 0.15924491771539206, "grad_norm": 5.049351692199707, "learning_rate": 9.575767831292788e-05, "loss": 2.0332, "step": 987 }, { "epoch": 0.15940626008389802, "grad_norm": 5.472332000732422, "learning_rate": 9.574713985621725e-05, "loss": 2.2033, "step": 988 }, { "epoch": 0.159567602452404, "grad_norm": 4.4902849197387695, "learning_rate": 9.573658890768602e-05, "loss": 1.872, "step": 989 }, { "epoch": 0.15972894482090996, "grad_norm": 6.237600326538086, "learning_rate": 9.572602547021526e-05, "loss": 2.4649, "step": 990 }, { "epoch": 0.15989028718941595, "grad_norm": 4.866677761077881, "learning_rate": 9.571544954668945e-05, "loss": 2.2058, "step": 991 }, { "epoch": 0.1600516295579219, "grad_norm": 4.447527885437012, "learning_rate": 9.570486113999646e-05, "loss": 2.2357, "step": 992 }, { "epoch": 0.1602129719264279, "grad_norm": 5.310793876647949, "learning_rate": 9.569426025302759e-05, "loss": 2.3725, "step": 993 }, { "epoch": 0.16037431429493385, "grad_norm": 5.911520481109619, "learning_rate": 9.568364688867757e-05, "loss": 2.1938, "step": 994 }, { "epoch": 0.16053565666343983, "grad_norm": 4.786045551300049, "learning_rate": 9.567302104984446e-05, "loss": 2.1026, "step": 995 }, { "epoch": 0.1606969990319458, "grad_norm": 3.7093453407287598, "learning_rate": 9.566238273942982e-05, "loss": 2.1269, "step": 996 }, { "epoch": 0.16085834140045177, "grad_norm": 5.790320873260498, "learning_rate": 9.565173196033855e-05, "loss": 2.136, "step": 997 }, { "epoch": 0.16101968376895773, "grad_norm": 7.40656852722168, "learning_rate": 9.564106871547899e-05, "loss": 2.1995, "step": 998 }, { "epoch": 0.16118102613746368, "grad_norm": 4.74056339263916, "learning_rate": 9.563039300776287e-05, "loss": 2.0488, "step": 999 }, { "epoch": 0.16134236850596967, "grad_norm": 6.514090538024902, "learning_rate": 9.56197048401053e-05, "loss": 2.2703, "step": 1000 }, { "epoch": 0.16150371087447563, "grad_norm": 6.964737892150879, "learning_rate": 9.560900421542483e-05, "loss": 1.9261, "step": 1001 }, { "epoch": 0.1616650532429816, "grad_norm": 5.097550868988037, "learning_rate": 9.55982911366434e-05, "loss": 2.4466, "step": 1002 }, { "epoch": 0.16182639561148757, "grad_norm": 5.489377021789551, "learning_rate": 9.558756560668636e-05, "loss": 2.5064, "step": 1003 }, { "epoch": 0.16198773797999355, "grad_norm": 4.444252967834473, "learning_rate": 9.557682762848244e-05, "loss": 2.2717, "step": 1004 }, { "epoch": 0.1621490803484995, "grad_norm": 5.357511043548584, "learning_rate": 9.556607720496376e-05, "loss": 2.1742, "step": 1005 }, { "epoch": 0.1623104227170055, "grad_norm": 5.26129150390625, "learning_rate": 9.555531433906587e-05, "loss": 2.2445, "step": 1006 }, { "epoch": 0.16247176508551145, "grad_norm": 3.4206292629241943, "learning_rate": 9.55445390337277e-05, "loss": 2.3338, "step": 1007 }, { "epoch": 0.16263310745401743, "grad_norm": 4.6428446769714355, "learning_rate": 9.55337512918916e-05, "loss": 2.0477, "step": 1008 }, { "epoch": 0.1627944498225234, "grad_norm": 4.148924350738525, "learning_rate": 9.552295111650326e-05, "loss": 2.3308, "step": 1009 }, { "epoch": 0.16295579219102937, "grad_norm": 4.782018661499023, "learning_rate": 9.551213851051183e-05, "loss": 2.0843, "step": 1010 }, { "epoch": 0.16311713455953533, "grad_norm": 5.063188076019287, "learning_rate": 9.550131347686981e-05, "loss": 2.3427, "step": 1011 }, { "epoch": 0.16327847692804132, "grad_norm": 3.833597421646118, "learning_rate": 9.549047601853311e-05, "loss": 2.0908, "step": 1012 }, { "epoch": 0.16343981929654727, "grad_norm": 4.0963215827941895, "learning_rate": 9.547962613846105e-05, "loss": 2.2118, "step": 1013 }, { "epoch": 0.16360116166505323, "grad_norm": 5.363045692443848, "learning_rate": 9.54687638396163e-05, "loss": 2.2259, "step": 1014 }, { "epoch": 0.16376250403355921, "grad_norm": 6.040219783782959, "learning_rate": 9.545788912496496e-05, "loss": 2.058, "step": 1015 }, { "epoch": 0.16392384640206517, "grad_norm": 5.767908096313477, "learning_rate": 9.54470019974765e-05, "loss": 1.9915, "step": 1016 }, { "epoch": 0.16408518877057116, "grad_norm": 5.240634441375732, "learning_rate": 9.543610246012377e-05, "loss": 2.3411, "step": 1017 }, { "epoch": 0.1642465311390771, "grad_norm": 4.714745998382568, "learning_rate": 9.542519051588305e-05, "loss": 2.1866, "step": 1018 }, { "epoch": 0.1644078735075831, "grad_norm": 4.932572364807129, "learning_rate": 9.541426616773396e-05, "loss": 2.1218, "step": 1019 }, { "epoch": 0.16456921587608905, "grad_norm": 6.019719123840332, "learning_rate": 9.540332941865953e-05, "loss": 2.289, "step": 1020 }, { "epoch": 0.16473055824459504, "grad_norm": 5.029490947723389, "learning_rate": 9.539238027164619e-05, "loss": 1.8102, "step": 1021 }, { "epoch": 0.164891900613101, "grad_norm": 6.065807342529297, "learning_rate": 9.538141872968371e-05, "loss": 2.3399, "step": 1022 }, { "epoch": 0.16505324298160698, "grad_norm": 6.009767055511475, "learning_rate": 9.537044479576531e-05, "loss": 2.0795, "step": 1023 }, { "epoch": 0.16521458535011294, "grad_norm": 5.45097541809082, "learning_rate": 9.535945847288754e-05, "loss": 2.2277, "step": 1024 }, { "epoch": 0.16537592771861892, "grad_norm": 4.281061172485352, "learning_rate": 9.534845976405035e-05, "loss": 2.1457, "step": 1025 }, { "epoch": 0.16553727008712488, "grad_norm": 5.280691146850586, "learning_rate": 9.533744867225707e-05, "loss": 2.2294, "step": 1026 }, { "epoch": 0.16569861245563086, "grad_norm": 5.329169750213623, "learning_rate": 9.532642520051442e-05, "loss": 2.1606, "step": 1027 }, { "epoch": 0.16585995482413682, "grad_norm": 4.205093860626221, "learning_rate": 9.53153893518325e-05, "loss": 2.1819, "step": 1028 }, { "epoch": 0.16602129719264277, "grad_norm": 4.498367786407471, "learning_rate": 9.530434112922477e-05, "loss": 2.2947, "step": 1029 }, { "epoch": 0.16618263956114876, "grad_norm": 3.871359348297119, "learning_rate": 9.52932805357081e-05, "loss": 2.3221, "step": 1030 }, { "epoch": 0.16634398192965472, "grad_norm": 4.586424350738525, "learning_rate": 9.528220757430272e-05, "loss": 2.2141, "step": 1031 }, { "epoch": 0.1665053242981607, "grad_norm": 4.1155009269714355, "learning_rate": 9.527112224803223e-05, "loss": 1.9745, "step": 1032 }, { "epoch": 0.16666666666666666, "grad_norm": 3.8675432205200195, "learning_rate": 9.52600245599236e-05, "loss": 2.2398, "step": 1033 }, { "epoch": 0.16682800903517264, "grad_norm": 4.718024253845215, "learning_rate": 9.524891451300721e-05, "loss": 2.1174, "step": 1034 }, { "epoch": 0.1669893514036786, "grad_norm": 4.081422805786133, "learning_rate": 9.523779211031682e-05, "loss": 2.0654, "step": 1035 }, { "epoch": 0.16715069377218458, "grad_norm": 5.289209842681885, "learning_rate": 9.522665735488949e-05, "loss": 2.1711, "step": 1036 }, { "epoch": 0.16731203614069054, "grad_norm": 4.412351608276367, "learning_rate": 9.521551024976573e-05, "loss": 2.0829, "step": 1037 }, { "epoch": 0.16747337850919652, "grad_norm": 5.0605363845825195, "learning_rate": 9.520435079798939e-05, "loss": 2.0377, "step": 1038 }, { "epoch": 0.16763472087770248, "grad_norm": 4.3810811042785645, "learning_rate": 9.51931790026077e-05, "loss": 1.9752, "step": 1039 }, { "epoch": 0.16779606324620847, "grad_norm": 5.456241130828857, "learning_rate": 9.518199486667123e-05, "loss": 2.257, "step": 1040 }, { "epoch": 0.16795740561471442, "grad_norm": 4.840457439422607, "learning_rate": 9.517079839323398e-05, "loss": 2.3499, "step": 1041 }, { "epoch": 0.1681187479832204, "grad_norm": 5.18924617767334, "learning_rate": 9.515958958535326e-05, "loss": 2.1178, "step": 1042 }, { "epoch": 0.16828009035172636, "grad_norm": 5.006588935852051, "learning_rate": 9.514836844608982e-05, "loss": 2.0868, "step": 1043 }, { "epoch": 0.16844143272023232, "grad_norm": 5.704937934875488, "learning_rate": 9.513713497850767e-05, "loss": 2.0938, "step": 1044 }, { "epoch": 0.1686027750887383, "grad_norm": 4.047709941864014, "learning_rate": 9.512588918567429e-05, "loss": 2.0518, "step": 1045 }, { "epoch": 0.16876411745724426, "grad_norm": 4.593513488769531, "learning_rate": 9.511463107066045e-05, "loss": 2.1238, "step": 1046 }, { "epoch": 0.16892545982575025, "grad_norm": 6.242722511291504, "learning_rate": 9.510336063654034e-05, "loss": 2.0898, "step": 1047 }, { "epoch": 0.1690868021942562, "grad_norm": 4.933840274810791, "learning_rate": 9.509207788639147e-05, "loss": 2.0248, "step": 1048 }, { "epoch": 0.1692481445627622, "grad_norm": 4.743569850921631, "learning_rate": 9.508078282329478e-05, "loss": 2.2515, "step": 1049 }, { "epoch": 0.16940948693126814, "grad_norm": 4.313886642456055, "learning_rate": 9.506947545033447e-05, "loss": 2.1975, "step": 1050 }, { "epoch": 0.16957082929977413, "grad_norm": 5.031490325927734, "learning_rate": 9.505815577059817e-05, "loss": 2.1413, "step": 1051 }, { "epoch": 0.16973217166828009, "grad_norm": 6.796832084655762, "learning_rate": 9.50468237871769e-05, "loss": 2.0446, "step": 1052 }, { "epoch": 0.16989351403678607, "grad_norm": 4.098788738250732, "learning_rate": 9.503547950316494e-05, "loss": 2.507, "step": 1053 }, { "epoch": 0.17005485640529203, "grad_norm": 4.281834125518799, "learning_rate": 9.502412292166004e-05, "loss": 2.1102, "step": 1054 }, { "epoch": 0.170216198773798, "grad_norm": 5.966123104095459, "learning_rate": 9.501275404576323e-05, "loss": 2.2269, "step": 1055 }, { "epoch": 0.17037754114230397, "grad_norm": 4.702624320983887, "learning_rate": 9.500137287857889e-05, "loss": 2.001, "step": 1056 }, { "epoch": 0.17053888351080995, "grad_norm": 6.717199325561523, "learning_rate": 9.498997942321483e-05, "loss": 2.2389, "step": 1057 }, { "epoch": 0.1707002258793159, "grad_norm": 5.48018741607666, "learning_rate": 9.497857368278218e-05, "loss": 2.1897, "step": 1058 }, { "epoch": 0.17086156824782187, "grad_norm": 5.843608379364014, "learning_rate": 9.496715566039538e-05, "loss": 2.1196, "step": 1059 }, { "epoch": 0.17102291061632785, "grad_norm": 4.224806785583496, "learning_rate": 9.495572535917229e-05, "loss": 2.2262, "step": 1060 }, { "epoch": 0.1711842529848338, "grad_norm": 4.144807815551758, "learning_rate": 9.494428278223409e-05, "loss": 1.8662, "step": 1061 }, { "epoch": 0.1713455953533398, "grad_norm": 4.138956069946289, "learning_rate": 9.493282793270531e-05, "loss": 2.0499, "step": 1062 }, { "epoch": 0.17150693772184575, "grad_norm": 4.989041328430176, "learning_rate": 9.492136081371384e-05, "loss": 2.3913, "step": 1063 }, { "epoch": 0.17166828009035173, "grad_norm": 4.749313831329346, "learning_rate": 9.490988142839091e-05, "loss": 2.1736, "step": 1064 }, { "epoch": 0.1718296224588577, "grad_norm": 5.544332504272461, "learning_rate": 9.489838977987114e-05, "loss": 2.1189, "step": 1065 }, { "epoch": 0.17199096482736367, "grad_norm": 4.9664626121521, "learning_rate": 9.488688587129242e-05, "loss": 2.3713, "step": 1066 }, { "epoch": 0.17215230719586963, "grad_norm": 5.020138740539551, "learning_rate": 9.487536970579606e-05, "loss": 2.177, "step": 1067 }, { "epoch": 0.17231364956437561, "grad_norm": 3.4289164543151855, "learning_rate": 9.48638412865267e-05, "loss": 2.0738, "step": 1068 }, { "epoch": 0.17247499193288157, "grad_norm": 5.5560221672058105, "learning_rate": 9.48523006166323e-05, "loss": 2.151, "step": 1069 }, { "epoch": 0.17263633430138756, "grad_norm": 6.141683101654053, "learning_rate": 9.48407476992642e-05, "loss": 2.8241, "step": 1070 }, { "epoch": 0.1727976766698935, "grad_norm": 4.250937461853027, "learning_rate": 9.482918253757705e-05, "loss": 2.1673, "step": 1071 }, { "epoch": 0.1729590190383995, "grad_norm": 4.556088924407959, "learning_rate": 9.481760513472885e-05, "loss": 2.2115, "step": 1072 }, { "epoch": 0.17312036140690545, "grad_norm": 4.4660773277282715, "learning_rate": 9.480601549388097e-05, "loss": 2.1846, "step": 1073 }, { "epoch": 0.1732817037754114, "grad_norm": 4.742733955383301, "learning_rate": 9.479441361819811e-05, "loss": 2.2282, "step": 1074 }, { "epoch": 0.1734430461439174, "grad_norm": 5.606072425842285, "learning_rate": 9.47827995108483e-05, "loss": 1.9925, "step": 1075 }, { "epoch": 0.17360438851242335, "grad_norm": 6.388113021850586, "learning_rate": 9.47711731750029e-05, "loss": 2.2225, "step": 1076 }, { "epoch": 0.17376573088092934, "grad_norm": 4.076513290405273, "learning_rate": 9.475953461383664e-05, "loss": 1.8734, "step": 1077 }, { "epoch": 0.1739270732494353, "grad_norm": 4.824914455413818, "learning_rate": 9.474788383052756e-05, "loss": 2.1996, "step": 1078 }, { "epoch": 0.17408841561794128, "grad_norm": 6.446959018707275, "learning_rate": 9.473622082825707e-05, "loss": 2.2974, "step": 1079 }, { "epoch": 0.17424975798644723, "grad_norm": 4.997390270233154, "learning_rate": 9.47245456102099e-05, "loss": 2.3537, "step": 1080 }, { "epoch": 0.17441110035495322, "grad_norm": 6.039966583251953, "learning_rate": 9.471285817957407e-05, "loss": 2.2296, "step": 1081 }, { "epoch": 0.17457244272345918, "grad_norm": 4.892731189727783, "learning_rate": 9.4701158539541e-05, "loss": 2.1276, "step": 1082 }, { "epoch": 0.17473378509196516, "grad_norm": 4.1530442237854, "learning_rate": 9.468944669330545e-05, "loss": 2.0831, "step": 1083 }, { "epoch": 0.17489512746047112, "grad_norm": 4.372376441955566, "learning_rate": 9.467772264406545e-05, "loss": 2.1864, "step": 1084 }, { "epoch": 0.1750564698289771, "grad_norm": 3.467975616455078, "learning_rate": 9.46659863950224e-05, "loss": 2.2786, "step": 1085 }, { "epoch": 0.17521781219748306, "grad_norm": 4.543912410736084, "learning_rate": 9.465423794938104e-05, "loss": 2.0789, "step": 1086 }, { "epoch": 0.17537915456598901, "grad_norm": 5.803583145141602, "learning_rate": 9.464247731034943e-05, "loss": 2.1541, "step": 1087 }, { "epoch": 0.175540496934495, "grad_norm": 5.195874214172363, "learning_rate": 9.463070448113893e-05, "loss": 2.3098, "step": 1088 }, { "epoch": 0.17570183930300096, "grad_norm": 4.555354118347168, "learning_rate": 9.461891946496428e-05, "loss": 2.0214, "step": 1089 }, { "epoch": 0.17586318167150694, "grad_norm": 7.192603588104248, "learning_rate": 9.460712226504353e-05, "loss": 2.0761, "step": 1090 }, { "epoch": 0.1760245240400129, "grad_norm": 4.054807186126709, "learning_rate": 9.459531288459803e-05, "loss": 2.1649, "step": 1091 }, { "epoch": 0.17618586640851888, "grad_norm": 4.142963409423828, "learning_rate": 9.458349132685249e-05, "loss": 2.1988, "step": 1092 }, { "epoch": 0.17634720877702484, "grad_norm": 4.702714920043945, "learning_rate": 9.457165759503493e-05, "loss": 2.1305, "step": 1093 }, { "epoch": 0.17650855114553082, "grad_norm": 3.626248836517334, "learning_rate": 9.455981169237668e-05, "loss": 2.0853, "step": 1094 }, { "epoch": 0.17666989351403678, "grad_norm": 4.734299659729004, "learning_rate": 9.454795362211244e-05, "loss": 2.1626, "step": 1095 }, { "epoch": 0.17683123588254276, "grad_norm": 5.6444783210754395, "learning_rate": 9.45360833874802e-05, "loss": 2.2003, "step": 1096 }, { "epoch": 0.17699257825104872, "grad_norm": 4.0697407722473145, "learning_rate": 9.452420099172124e-05, "loss": 2.2748, "step": 1097 }, { "epoch": 0.1771539206195547, "grad_norm": 4.217678070068359, "learning_rate": 9.451230643808023e-05, "loss": 2.1378, "step": 1098 }, { "epoch": 0.17731526298806066, "grad_norm": 4.668602466583252, "learning_rate": 9.450039972980509e-05, "loss": 2.2964, "step": 1099 }, { "epoch": 0.17747660535656665, "grad_norm": 4.176945686340332, "learning_rate": 9.448848087014712e-05, "loss": 2.1765, "step": 1100 }, { "epoch": 0.1776379477250726, "grad_norm": 4.385271072387695, "learning_rate": 9.447654986236092e-05, "loss": 2.1224, "step": 1101 }, { "epoch": 0.17779929009357856, "grad_norm": 3.5701985359191895, "learning_rate": 9.446460670970436e-05, "loss": 2.0128, "step": 1102 }, { "epoch": 0.17796063246208454, "grad_norm": 4.596713542938232, "learning_rate": 9.44526514154387e-05, "loss": 2.1401, "step": 1103 }, { "epoch": 0.1781219748305905, "grad_norm": 5.741366863250732, "learning_rate": 9.444068398282848e-05, "loss": 2.3957, "step": 1104 }, { "epoch": 0.17828331719909649, "grad_norm": 4.24362850189209, "learning_rate": 9.442870441514154e-05, "loss": 2.2815, "step": 1105 }, { "epoch": 0.17844465956760244, "grad_norm": 3.9866597652435303, "learning_rate": 9.441671271564906e-05, "loss": 2.0728, "step": 1106 }, { "epoch": 0.17860600193610843, "grad_norm": 3.9969146251678467, "learning_rate": 9.440470888762552e-05, "loss": 2.3408, "step": 1107 }, { "epoch": 0.17876734430461438, "grad_norm": 4.508840084075928, "learning_rate": 9.439269293434868e-05, "loss": 2.0563, "step": 1108 }, { "epoch": 0.17892868667312037, "grad_norm": 5.449844837188721, "learning_rate": 9.438066485909969e-05, "loss": 2.2244, "step": 1109 }, { "epoch": 0.17909002904162633, "grad_norm": 4.140756130218506, "learning_rate": 9.436862466516294e-05, "loss": 1.9053, "step": 1110 }, { "epoch": 0.1792513714101323, "grad_norm": 4.184217929840088, "learning_rate": 9.435657235582616e-05, "loss": 2.1351, "step": 1111 }, { "epoch": 0.17941271377863827, "grad_norm": 3.90503191947937, "learning_rate": 9.43445079343804e-05, "loss": 2.278, "step": 1112 }, { "epoch": 0.17957405614714425, "grad_norm": 3.791114568710327, "learning_rate": 9.433243140411996e-05, "loss": 1.9327, "step": 1113 }, { "epoch": 0.1797353985156502, "grad_norm": 4.18504524230957, "learning_rate": 9.432034276834252e-05, "loss": 1.948, "step": 1114 }, { "epoch": 0.1798967408841562, "grad_norm": 4.193762302398682, "learning_rate": 9.4308242030349e-05, "loss": 1.9719, "step": 1115 }, { "epoch": 0.18005808325266215, "grad_norm": 4.792637825012207, "learning_rate": 9.429612919344368e-05, "loss": 2.0218, "step": 1116 }, { "epoch": 0.1802194256211681, "grad_norm": 6.854551792144775, "learning_rate": 9.428400426093413e-05, "loss": 2.1162, "step": 1117 }, { "epoch": 0.1803807679896741, "grad_norm": 4.02646017074585, "learning_rate": 9.427186723613117e-05, "loss": 2.0172, "step": 1118 }, { "epoch": 0.18054211035818005, "grad_norm": 4.791799068450928, "learning_rate": 9.425971812234901e-05, "loss": 2.0845, "step": 1119 }, { "epoch": 0.18070345272668603, "grad_norm": 4.092919826507568, "learning_rate": 9.424755692290507e-05, "loss": 2.3227, "step": 1120 }, { "epoch": 0.180864795095192, "grad_norm": 5.270394325256348, "learning_rate": 9.423538364112017e-05, "loss": 2.1156, "step": 1121 }, { "epoch": 0.18102613746369797, "grad_norm": 3.8187763690948486, "learning_rate": 9.422319828031832e-05, "loss": 2.4983, "step": 1122 }, { "epoch": 0.18118747983220393, "grad_norm": 4.516146659851074, "learning_rate": 9.421100084382693e-05, "loss": 1.8625, "step": 1123 }, { "epoch": 0.1813488222007099, "grad_norm": 5.56035041809082, "learning_rate": 9.419879133497663e-05, "loss": 2.1817, "step": 1124 }, { "epoch": 0.18151016456921587, "grad_norm": 4.905139923095703, "learning_rate": 9.418656975710136e-05, "loss": 1.8659, "step": 1125 }, { "epoch": 0.18167150693772185, "grad_norm": 5.2623724937438965, "learning_rate": 9.417433611353842e-05, "loss": 2.1334, "step": 1126 }, { "epoch": 0.1818328493062278, "grad_norm": 4.874891757965088, "learning_rate": 9.416209040762833e-05, "loss": 2.2453, "step": 1127 }, { "epoch": 0.1819941916747338, "grad_norm": 4.088444232940674, "learning_rate": 9.414983264271493e-05, "loss": 1.947, "step": 1128 }, { "epoch": 0.18215553404323975, "grad_norm": 5.268361568450928, "learning_rate": 9.413756282214537e-05, "loss": 2.017, "step": 1129 }, { "epoch": 0.18231687641174574, "grad_norm": 4.026107311248779, "learning_rate": 9.412528094927007e-05, "loss": 2.1563, "step": 1130 }, { "epoch": 0.1824782187802517, "grad_norm": 4.318005084991455, "learning_rate": 9.411298702744274e-05, "loss": 2.0524, "step": 1131 }, { "epoch": 0.18263956114875765, "grad_norm": 4.455986022949219, "learning_rate": 9.410068106002036e-05, "loss": 2.0646, "step": 1132 }, { "epoch": 0.18280090351726364, "grad_norm": 4.1529035568237305, "learning_rate": 9.408836305036328e-05, "loss": 2.2399, "step": 1133 }, { "epoch": 0.1829622458857696, "grad_norm": 4.354226112365723, "learning_rate": 9.407603300183507e-05, "loss": 2.1329, "step": 1134 }, { "epoch": 0.18312358825427558, "grad_norm": 4.02735710144043, "learning_rate": 9.406369091780257e-05, "loss": 2.2201, "step": 1135 }, { "epoch": 0.18328493062278153, "grad_norm": 5.632328510284424, "learning_rate": 9.405133680163598e-05, "loss": 2.1951, "step": 1136 }, { "epoch": 0.18344627299128752, "grad_norm": 5.8479838371276855, "learning_rate": 9.40389706567087e-05, "loss": 2.2726, "step": 1137 }, { "epoch": 0.18360761535979347, "grad_norm": 5.934751033782959, "learning_rate": 9.402659248639749e-05, "loss": 2.161, "step": 1138 }, { "epoch": 0.18376895772829946, "grad_norm": 3.6951584815979004, "learning_rate": 9.401420229408236e-05, "loss": 2.1799, "step": 1139 }, { "epoch": 0.18393030009680542, "grad_norm": 3.9756414890289307, "learning_rate": 9.40018000831466e-05, "loss": 1.9636, "step": 1140 }, { "epoch": 0.1840916424653114, "grad_norm": 4.538594722747803, "learning_rate": 9.398938585697678e-05, "loss": 2.0378, "step": 1141 }, { "epoch": 0.18425298483381736, "grad_norm": 4.996496677398682, "learning_rate": 9.397695961896275e-05, "loss": 2.0256, "step": 1142 }, { "epoch": 0.18441432720232334, "grad_norm": 6.864620685577393, "learning_rate": 9.396452137249769e-05, "loss": 2.2353, "step": 1143 }, { "epoch": 0.1845756695708293, "grad_norm": 5.642723083496094, "learning_rate": 9.395207112097797e-05, "loss": 2.1499, "step": 1144 }, { "epoch": 0.18473701193933528, "grad_norm": 5.145697593688965, "learning_rate": 9.393960886780329e-05, "loss": 2.3424, "step": 1145 }, { "epoch": 0.18489835430784124, "grad_norm": 4.262571334838867, "learning_rate": 9.392713461637665e-05, "loss": 2.4949, "step": 1146 }, { "epoch": 0.1850596966763472, "grad_norm": 5.42431116104126, "learning_rate": 9.391464837010428e-05, "loss": 2.4643, "step": 1147 }, { "epoch": 0.18522103904485318, "grad_norm": 5.535486221313477, "learning_rate": 9.390215013239569e-05, "loss": 1.8437, "step": 1148 }, { "epoch": 0.18538238141335914, "grad_norm": 4.056037425994873, "learning_rate": 9.38896399066637e-05, "loss": 2.2023, "step": 1149 }, { "epoch": 0.18554372378186512, "grad_norm": 4.414604187011719, "learning_rate": 9.387711769632439e-05, "loss": 2.1182, "step": 1150 }, { "epoch": 0.18570506615037108, "grad_norm": 3.6318039894104004, "learning_rate": 9.386458350479707e-05, "loss": 1.9319, "step": 1151 }, { "epoch": 0.18586640851887706, "grad_norm": 4.180037498474121, "learning_rate": 9.385203733550438e-05, "loss": 1.9944, "step": 1152 }, { "epoch": 0.18602775088738302, "grad_norm": 4.404853820800781, "learning_rate": 9.38394791918722e-05, "loss": 2.4426, "step": 1153 }, { "epoch": 0.186189093255889, "grad_norm": 4.873829364776611, "learning_rate": 9.382690907732968e-05, "loss": 2.1799, "step": 1154 }, { "epoch": 0.18635043562439496, "grad_norm": 5.15049934387207, "learning_rate": 9.381432699530925e-05, "loss": 2.2582, "step": 1155 }, { "epoch": 0.18651177799290095, "grad_norm": 4.201823711395264, "learning_rate": 9.380173294924662e-05, "loss": 2.1879, "step": 1156 }, { "epoch": 0.1866731203614069, "grad_norm": 3.9464304447174072, "learning_rate": 9.378912694258073e-05, "loss": 2.0987, "step": 1157 }, { "epoch": 0.1868344627299129, "grad_norm": 6.513865947723389, "learning_rate": 9.377650897875379e-05, "loss": 2.3973, "step": 1158 }, { "epoch": 0.18699580509841884, "grad_norm": 4.84373664855957, "learning_rate": 9.376387906121132e-05, "loss": 2.0951, "step": 1159 }, { "epoch": 0.18715714746692483, "grad_norm": 6.011229515075684, "learning_rate": 9.375123719340206e-05, "loss": 2.0646, "step": 1160 }, { "epoch": 0.18731848983543078, "grad_norm": 4.3001837730407715, "learning_rate": 9.373858337877803e-05, "loss": 2.047, "step": 1161 }, { "epoch": 0.18747983220393674, "grad_norm": 6.261403560638428, "learning_rate": 9.372591762079452e-05, "loss": 2.0702, "step": 1162 }, { "epoch": 0.18764117457244273, "grad_norm": 4.611581325531006, "learning_rate": 9.371323992291006e-05, "loss": 2.4375, "step": 1163 }, { "epoch": 0.18780251694094868, "grad_norm": 3.757481575012207, "learning_rate": 9.370055028858647e-05, "loss": 2.2058, "step": 1164 }, { "epoch": 0.18796385930945467, "grad_norm": 3.8578453063964844, "learning_rate": 9.368784872128878e-05, "loss": 2.0362, "step": 1165 }, { "epoch": 0.18812520167796062, "grad_norm": 5.547084808349609, "learning_rate": 9.367513522448531e-05, "loss": 2.2821, "step": 1166 }, { "epoch": 0.1882865440464666, "grad_norm": 6.220110893249512, "learning_rate": 9.366240980164767e-05, "loss": 2.2423, "step": 1167 }, { "epoch": 0.18844788641497257, "grad_norm": 7.139183044433594, "learning_rate": 9.364967245625067e-05, "loss": 2.2226, "step": 1168 }, { "epoch": 0.18860922878347855, "grad_norm": 6.001524925231934, "learning_rate": 9.363692319177241e-05, "loss": 2.1597, "step": 1169 }, { "epoch": 0.1887705711519845, "grad_norm": 4.458691596984863, "learning_rate": 9.362416201169425e-05, "loss": 1.9492, "step": 1170 }, { "epoch": 0.1889319135204905, "grad_norm": 3.713860511779785, "learning_rate": 9.361138891950073e-05, "loss": 2.0342, "step": 1171 }, { "epoch": 0.18909325588899645, "grad_norm": 5.495253086090088, "learning_rate": 9.359860391867975e-05, "loss": 2.3154, "step": 1172 }, { "epoch": 0.18925459825750243, "grad_norm": 5.796265125274658, "learning_rate": 9.35858070127224e-05, "loss": 2.3185, "step": 1173 }, { "epoch": 0.1894159406260084, "grad_norm": 4.879892826080322, "learning_rate": 9.357299820512304e-05, "loss": 2.1154, "step": 1174 }, { "epoch": 0.18957728299451435, "grad_norm": 4.1518659591674805, "learning_rate": 9.356017749937925e-05, "loss": 2.3264, "step": 1175 }, { "epoch": 0.18973862536302033, "grad_norm": 5.204110145568848, "learning_rate": 9.354734489899191e-05, "loss": 2.3976, "step": 1176 }, { "epoch": 0.1898999677315263, "grad_norm": 3.671693801879883, "learning_rate": 9.35345004074651e-05, "loss": 2.1576, "step": 1177 }, { "epoch": 0.19006131010003227, "grad_norm": 4.392444133758545, "learning_rate": 9.352164402830618e-05, "loss": 2.3819, "step": 1178 }, { "epoch": 0.19022265246853823, "grad_norm": 5.3956193923950195, "learning_rate": 9.350877576502573e-05, "loss": 2.2044, "step": 1179 }, { "epoch": 0.1903839948370442, "grad_norm": 4.536787509918213, "learning_rate": 9.34958956211376e-05, "loss": 1.9926, "step": 1180 }, { "epoch": 0.19054533720555017, "grad_norm": 7.908175945281982, "learning_rate": 9.348300360015885e-05, "loss": 2.3019, "step": 1181 }, { "epoch": 0.19070667957405615, "grad_norm": 5.751628875732422, "learning_rate": 9.347009970560984e-05, "loss": 2.2324, "step": 1182 }, { "epoch": 0.1908680219425621, "grad_norm": 3.7884788513183594, "learning_rate": 9.345718394101411e-05, "loss": 2.0753, "step": 1183 }, { "epoch": 0.1910293643110681, "grad_norm": 5.273193359375, "learning_rate": 9.344425630989848e-05, "loss": 2.2719, "step": 1184 }, { "epoch": 0.19119070667957405, "grad_norm": 3.5896897315979004, "learning_rate": 9.343131681579301e-05, "loss": 2.202, "step": 1185 }, { "epoch": 0.19135204904808004, "grad_norm": 4.108644485473633, "learning_rate": 9.341836546223095e-05, "loss": 2.0847, "step": 1186 }, { "epoch": 0.191513391416586, "grad_norm": 4.217356204986572, "learning_rate": 9.340540225274887e-05, "loss": 2.2478, "step": 1187 }, { "epoch": 0.19167473378509198, "grad_norm": 6.304388046264648, "learning_rate": 9.339242719088651e-05, "loss": 2.1341, "step": 1188 }, { "epoch": 0.19183607615359793, "grad_norm": 5.135892391204834, "learning_rate": 9.337944028018688e-05, "loss": 1.9682, "step": 1189 }, { "epoch": 0.1919974185221039, "grad_norm": 4.361384391784668, "learning_rate": 9.336644152419622e-05, "loss": 2.0092, "step": 1190 }, { "epoch": 0.19215876089060988, "grad_norm": 4.100388526916504, "learning_rate": 9.335343092646399e-05, "loss": 1.9264, "step": 1191 }, { "epoch": 0.19232010325911583, "grad_norm": 5.539300918579102, "learning_rate": 9.334040849054289e-05, "loss": 2.0711, "step": 1192 }, { "epoch": 0.19248144562762182, "grad_norm": 3.903207540512085, "learning_rate": 9.332737421998886e-05, "loss": 2.1269, "step": 1193 }, { "epoch": 0.19264278799612777, "grad_norm": 4.74282169342041, "learning_rate": 9.331432811836108e-05, "loss": 2.1357, "step": 1194 }, { "epoch": 0.19280413036463376, "grad_norm": 4.196990013122559, "learning_rate": 9.330127018922194e-05, "loss": 2.1716, "step": 1195 }, { "epoch": 0.19296547273313971, "grad_norm": 5.20350980758667, "learning_rate": 9.328820043613707e-05, "loss": 2.2362, "step": 1196 }, { "epoch": 0.1931268151016457, "grad_norm": 5.489796161651611, "learning_rate": 9.327511886267532e-05, "loss": 2.2222, "step": 1197 }, { "epoch": 0.19328815747015166, "grad_norm": 6.44775915145874, "learning_rate": 9.32620254724088e-05, "loss": 2.1585, "step": 1198 }, { "epoch": 0.19344949983865764, "grad_norm": 5.332704544067383, "learning_rate": 9.324892026891279e-05, "loss": 2.1961, "step": 1199 }, { "epoch": 0.1936108422071636, "grad_norm": 5.2855095863342285, "learning_rate": 9.323580325576584e-05, "loss": 2.1933, "step": 1200 }, { "epoch": 0.19377218457566958, "grad_norm": 4.12716007232666, "learning_rate": 9.322267443654972e-05, "loss": 1.9773, "step": 1201 }, { "epoch": 0.19393352694417554, "grad_norm": 5.293431758880615, "learning_rate": 9.320953381484943e-05, "loss": 2.0362, "step": 1202 }, { "epoch": 0.19409486931268152, "grad_norm": 5.689045429229736, "learning_rate": 9.319638139425313e-05, "loss": 2.5079, "step": 1203 }, { "epoch": 0.19425621168118748, "grad_norm": 5.536917686462402, "learning_rate": 9.318321717835228e-05, "loss": 2.2431, "step": 1204 }, { "epoch": 0.19441755404969344, "grad_norm": 6.199563980102539, "learning_rate": 9.317004117074154e-05, "loss": 2.0321, "step": 1205 }, { "epoch": 0.19457889641819942, "grad_norm": 5.072630405426025, "learning_rate": 9.315685337501876e-05, "loss": 2.2442, "step": 1206 }, { "epoch": 0.19474023878670538, "grad_norm": 4.364281177520752, "learning_rate": 9.314365379478506e-05, "loss": 2.16, "step": 1207 }, { "epoch": 0.19490158115521136, "grad_norm": 4.033108711242676, "learning_rate": 9.313044243364473e-05, "loss": 2.335, "step": 1208 }, { "epoch": 0.19506292352371732, "grad_norm": 4.639063358306885, "learning_rate": 9.311721929520527e-05, "loss": 2.1587, "step": 1209 }, { "epoch": 0.1952242658922233, "grad_norm": 4.656102180480957, "learning_rate": 9.310398438307746e-05, "loss": 2.081, "step": 1210 }, { "epoch": 0.19538560826072926, "grad_norm": 4.852657318115234, "learning_rate": 9.309073770087524e-05, "loss": 1.9645, "step": 1211 }, { "epoch": 0.19554695062923524, "grad_norm": 4.050775527954102, "learning_rate": 9.30774792522158e-05, "loss": 2.1736, "step": 1212 }, { "epoch": 0.1957082929977412, "grad_norm": 3.999640703201294, "learning_rate": 9.306420904071948e-05, "loss": 2.1352, "step": 1213 }, { "epoch": 0.19586963536624719, "grad_norm": 5.520906448364258, "learning_rate": 9.305092707000992e-05, "loss": 2.4808, "step": 1214 }, { "epoch": 0.19603097773475314, "grad_norm": 5.296695232391357, "learning_rate": 9.30376333437139e-05, "loss": 1.9308, "step": 1215 }, { "epoch": 0.19619232010325913, "grad_norm": 3.6606791019439697, "learning_rate": 9.302432786546142e-05, "loss": 2.127, "step": 1216 }, { "epoch": 0.19635366247176508, "grad_norm": 5.4831862449646, "learning_rate": 9.301101063888575e-05, "loss": 2.2932, "step": 1217 }, { "epoch": 0.19651500484027107, "grad_norm": 5.197719097137451, "learning_rate": 9.29976816676233e-05, "loss": 2.1172, "step": 1218 }, { "epoch": 0.19667634720877702, "grad_norm": 4.50950813293457, "learning_rate": 9.29843409553137e-05, "loss": 2.3063, "step": 1219 }, { "epoch": 0.19683768957728298, "grad_norm": 5.969484329223633, "learning_rate": 9.297098850559982e-05, "loss": 2.2218, "step": 1220 }, { "epoch": 0.19699903194578897, "grad_norm": 7.5558295249938965, "learning_rate": 9.295762432212767e-05, "loss": 2.0204, "step": 1221 }, { "epoch": 0.19716037431429492, "grad_norm": 4.77316951751709, "learning_rate": 9.294424840854654e-05, "loss": 1.8611, "step": 1222 }, { "epoch": 0.1973217166828009, "grad_norm": 4.685102939605713, "learning_rate": 9.29308607685089e-05, "loss": 1.8307, "step": 1223 }, { "epoch": 0.19748305905130686, "grad_norm": 4.8697099685668945, "learning_rate": 9.291746140567036e-05, "loss": 1.977, "step": 1224 }, { "epoch": 0.19764440141981285, "grad_norm": 5.616896629333496, "learning_rate": 9.290405032368983e-05, "loss": 2.0616, "step": 1225 }, { "epoch": 0.1978057437883188, "grad_norm": 5.136362552642822, "learning_rate": 9.289062752622934e-05, "loss": 2.06, "step": 1226 }, { "epoch": 0.1979670861568248, "grad_norm": 4.789569854736328, "learning_rate": 9.287719301695418e-05, "loss": 2.1118, "step": 1227 }, { "epoch": 0.19812842852533075, "grad_norm": 4.089688301086426, "learning_rate": 9.286374679953279e-05, "loss": 1.865, "step": 1228 }, { "epoch": 0.19828977089383673, "grad_norm": 3.7538259029388428, "learning_rate": 9.28502888776368e-05, "loss": 1.9576, "step": 1229 }, { "epoch": 0.1984511132623427, "grad_norm": 3.620793342590332, "learning_rate": 9.283681925494111e-05, "loss": 2.0583, "step": 1230 }, { "epoch": 0.19861245563084867, "grad_norm": 5.347387790679932, "learning_rate": 9.282333793512375e-05, "loss": 2.4643, "step": 1231 }, { "epoch": 0.19877379799935463, "grad_norm": 4.361889839172363, "learning_rate": 9.280984492186594e-05, "loss": 2.2125, "step": 1232 }, { "epoch": 0.1989351403678606, "grad_norm": 5.193058013916016, "learning_rate": 9.279634021885212e-05, "loss": 2.2039, "step": 1233 }, { "epoch": 0.19909648273636657, "grad_norm": 6.2152886390686035, "learning_rate": 9.278282382976995e-05, "loss": 2.2702, "step": 1234 }, { "epoch": 0.19925782510487253, "grad_norm": 4.003117561340332, "learning_rate": 9.276929575831021e-05, "loss": 2.1436, "step": 1235 }, { "epoch": 0.1994191674733785, "grad_norm": 4.4928879737854, "learning_rate": 9.27557560081669e-05, "loss": 2.1452, "step": 1236 }, { "epoch": 0.19958050984188447, "grad_norm": 5.384605407714844, "learning_rate": 9.274220458303727e-05, "loss": 2.1992, "step": 1237 }, { "epoch": 0.19974185221039045, "grad_norm": 4.7580246925354, "learning_rate": 9.272864148662163e-05, "loss": 2.0621, "step": 1238 }, { "epoch": 0.1999031945788964, "grad_norm": 4.127257823944092, "learning_rate": 9.271506672262362e-05, "loss": 2.2737, "step": 1239 }, { "epoch": 0.2000645369474024, "grad_norm": 4.529548645019531, "learning_rate": 9.270148029474994e-05, "loss": 2.0339, "step": 1240 }, { "epoch": 0.20022587931590835, "grad_norm": 6.915964603424072, "learning_rate": 9.268788220671056e-05, "loss": 2.1123, "step": 1241 }, { "epoch": 0.20038722168441434, "grad_norm": 4.442663192749023, "learning_rate": 9.267427246221863e-05, "loss": 2.2201, "step": 1242 }, { "epoch": 0.2005485640529203, "grad_norm": 3.644043207168579, "learning_rate": 9.26606510649904e-05, "loss": 2.0024, "step": 1243 }, { "epoch": 0.20070990642142628, "grad_norm": 5.306570053100586, "learning_rate": 9.264701801874539e-05, "loss": 2.1486, "step": 1244 }, { "epoch": 0.20087124878993223, "grad_norm": 4.061726093292236, "learning_rate": 9.263337332720629e-05, "loss": 2.1827, "step": 1245 }, { "epoch": 0.20103259115843822, "grad_norm": 4.937375545501709, "learning_rate": 9.261971699409893e-05, "loss": 2.1337, "step": 1246 }, { "epoch": 0.20119393352694417, "grad_norm": 6.167418956756592, "learning_rate": 9.260604902315233e-05, "loss": 2.0779, "step": 1247 }, { "epoch": 0.20135527589545016, "grad_norm": 3.956418752670288, "learning_rate": 9.259236941809873e-05, "loss": 2.0061, "step": 1248 }, { "epoch": 0.20151661826395612, "grad_norm": 4.3427252769470215, "learning_rate": 9.257867818267348e-05, "loss": 2.1665, "step": 1249 }, { "epoch": 0.20167796063246207, "grad_norm": 4.450231075286865, "learning_rate": 9.256497532061515e-05, "loss": 2.4017, "step": 1250 }, { "epoch": 0.20183930300096806, "grad_norm": 4.20451021194458, "learning_rate": 9.25512608356655e-05, "loss": 1.9089, "step": 1251 }, { "epoch": 0.202000645369474, "grad_norm": 4.522383689880371, "learning_rate": 9.253753473156943e-05, "loss": 2.0234, "step": 1252 }, { "epoch": 0.20216198773798, "grad_norm": 4.063665390014648, "learning_rate": 9.252379701207499e-05, "loss": 2.1226, "step": 1253 }, { "epoch": 0.20232333010648595, "grad_norm": 4.9527740478515625, "learning_rate": 9.251004768093348e-05, "loss": 2.3531, "step": 1254 }, { "epoch": 0.20248467247499194, "grad_norm": 5.102415561676025, "learning_rate": 9.249628674189927e-05, "loss": 2.2128, "step": 1255 }, { "epoch": 0.2026460148434979, "grad_norm": 5.253018379211426, "learning_rate": 9.248251419873002e-05, "loss": 2.1547, "step": 1256 }, { "epoch": 0.20280735721200388, "grad_norm": 4.89177131652832, "learning_rate": 9.246873005518644e-05, "loss": 2.1247, "step": 1257 }, { "epoch": 0.20296869958050984, "grad_norm": 4.267810821533203, "learning_rate": 9.245493431503249e-05, "loss": 2.1507, "step": 1258 }, { "epoch": 0.20313004194901582, "grad_norm": 4.486478328704834, "learning_rate": 9.244112698203524e-05, "loss": 1.9206, "step": 1259 }, { "epoch": 0.20329138431752178, "grad_norm": 3.4107930660247803, "learning_rate": 9.242730805996499e-05, "loss": 1.9769, "step": 1260 }, { "epoch": 0.20345272668602776, "grad_norm": 4.6431565284729, "learning_rate": 9.241347755259514e-05, "loss": 2.3648, "step": 1261 }, { "epoch": 0.20361406905453372, "grad_norm": 5.335633277893066, "learning_rate": 9.239963546370227e-05, "loss": 2.3202, "step": 1262 }, { "epoch": 0.2037754114230397, "grad_norm": 5.694500923156738, "learning_rate": 9.238578179706616e-05, "loss": 1.994, "step": 1263 }, { "epoch": 0.20393675379154566, "grad_norm": 5.830473899841309, "learning_rate": 9.237191655646972e-05, "loss": 2.0949, "step": 1264 }, { "epoch": 0.20409809616005162, "grad_norm": 4.1669487953186035, "learning_rate": 9.235803974569901e-05, "loss": 2.1442, "step": 1265 }, { "epoch": 0.2042594385285576, "grad_norm": 4.0471930503845215, "learning_rate": 9.234415136854328e-05, "loss": 2.0706, "step": 1266 }, { "epoch": 0.20442078089706356, "grad_norm": 5.178165912628174, "learning_rate": 9.23302514287949e-05, "loss": 2.2316, "step": 1267 }, { "epoch": 0.20458212326556954, "grad_norm": 3.4137814044952393, "learning_rate": 9.231633993024944e-05, "loss": 2.3389, "step": 1268 }, { "epoch": 0.2047434656340755, "grad_norm": 5.766005992889404, "learning_rate": 9.230241687670561e-05, "loss": 2.3266, "step": 1269 }, { "epoch": 0.20490480800258148, "grad_norm": 5.218629837036133, "learning_rate": 9.228848227196528e-05, "loss": 2.3806, "step": 1270 }, { "epoch": 0.20506615037108744, "grad_norm": 4.675017833709717, "learning_rate": 9.227453611983341e-05, "loss": 2.3105, "step": 1271 }, { "epoch": 0.20522749273959343, "grad_norm": 4.16404390335083, "learning_rate": 9.226057842411823e-05, "loss": 2.105, "step": 1272 }, { "epoch": 0.20538883510809938, "grad_norm": 4.810925006866455, "learning_rate": 9.224660918863104e-05, "loss": 2.0151, "step": 1273 }, { "epoch": 0.20555017747660537, "grad_norm": 4.716014862060547, "learning_rate": 9.22326284171863e-05, "loss": 1.9447, "step": 1274 }, { "epoch": 0.20571151984511132, "grad_norm": 3.6420786380767822, "learning_rate": 9.221863611360164e-05, "loss": 2.0919, "step": 1275 }, { "epoch": 0.2058728622136173, "grad_norm": 4.337446689605713, "learning_rate": 9.220463228169785e-05, "loss": 2.0707, "step": 1276 }, { "epoch": 0.20603420458212326, "grad_norm": 3.9118850231170654, "learning_rate": 9.219061692529882e-05, "loss": 2.1869, "step": 1277 }, { "epoch": 0.20619554695062922, "grad_norm": 6.766507148742676, "learning_rate": 9.217659004823162e-05, "loss": 2.1102, "step": 1278 }, { "epoch": 0.2063568893191352, "grad_norm": 4.498598575592041, "learning_rate": 9.216255165432648e-05, "loss": 2.1412, "step": 1279 }, { "epoch": 0.20651823168764116, "grad_norm": 5.993344306945801, "learning_rate": 9.214850174741677e-05, "loss": 2.0669, "step": 1280 }, { "epoch": 0.20667957405614715, "grad_norm": 5.468132495880127, "learning_rate": 9.213444033133893e-05, "loss": 2.1463, "step": 1281 }, { "epoch": 0.2068409164246531, "grad_norm": 4.580942630767822, "learning_rate": 9.212036740993266e-05, "loss": 2.1803, "step": 1282 }, { "epoch": 0.2070022587931591, "grad_norm": 4.758857250213623, "learning_rate": 9.210628298704072e-05, "loss": 2.1023, "step": 1283 }, { "epoch": 0.20716360116166505, "grad_norm": 5.4867072105407715, "learning_rate": 9.209218706650902e-05, "loss": 2.3216, "step": 1284 }, { "epoch": 0.20732494353017103, "grad_norm": 3.8598763942718506, "learning_rate": 9.207807965218668e-05, "loss": 2.4166, "step": 1285 }, { "epoch": 0.207486285898677, "grad_norm": 7.519045352935791, "learning_rate": 9.206396074792585e-05, "loss": 2.1097, "step": 1286 }, { "epoch": 0.20764762826718297, "grad_norm": 5.47310733795166, "learning_rate": 9.204983035758187e-05, "loss": 1.8827, "step": 1287 }, { "epoch": 0.20780897063568893, "grad_norm": 5.6422505378723145, "learning_rate": 9.203568848501327e-05, "loss": 2.1591, "step": 1288 }, { "epoch": 0.2079703130041949, "grad_norm": 4.1179938316345215, "learning_rate": 9.202153513408162e-05, "loss": 1.9835, "step": 1289 }, { "epoch": 0.20813165537270087, "grad_norm": 5.341106414794922, "learning_rate": 9.200737030865168e-05, "loss": 2.3916, "step": 1290 }, { "epoch": 0.20829299774120685, "grad_norm": 5.809384346008301, "learning_rate": 9.199319401259131e-05, "loss": 1.9919, "step": 1291 }, { "epoch": 0.2084543401097128, "grad_norm": 5.8908514976501465, "learning_rate": 9.197900624977156e-05, "loss": 2.2108, "step": 1292 }, { "epoch": 0.20861568247821877, "grad_norm": 5.788384437561035, "learning_rate": 9.196480702406653e-05, "loss": 2.1998, "step": 1293 }, { "epoch": 0.20877702484672475, "grad_norm": 8.431769371032715, "learning_rate": 9.195059633935352e-05, "loss": 2.5768, "step": 1294 }, { "epoch": 0.2089383672152307, "grad_norm": 4.767300605773926, "learning_rate": 9.193637419951294e-05, "loss": 1.9114, "step": 1295 }, { "epoch": 0.2090997095837367, "grad_norm": 4.139121055603027, "learning_rate": 9.19221406084283e-05, "loss": 2.0196, "step": 1296 }, { "epoch": 0.20926105195224265, "grad_norm": 4.451340675354004, "learning_rate": 9.190789556998627e-05, "loss": 1.9757, "step": 1297 }, { "epoch": 0.20942239432074863, "grad_norm": 5.227597713470459, "learning_rate": 9.189363908807663e-05, "loss": 2.0621, "step": 1298 }, { "epoch": 0.2095837366892546, "grad_norm": 5.614994049072266, "learning_rate": 9.187937116659229e-05, "loss": 2.3018, "step": 1299 }, { "epoch": 0.20974507905776058, "grad_norm": 9.214066505432129, "learning_rate": 9.186509180942928e-05, "loss": 2.4219, "step": 1300 }, { "epoch": 0.20990642142626653, "grad_norm": 5.049530982971191, "learning_rate": 9.185080102048675e-05, "loss": 2.4699, "step": 1301 }, { "epoch": 0.21006776379477252, "grad_norm": 4.416045188903809, "learning_rate": 9.1836498803667e-05, "loss": 2.1083, "step": 1302 }, { "epoch": 0.21022910616327847, "grad_norm": 4.00837516784668, "learning_rate": 9.182218516287539e-05, "loss": 1.8756, "step": 1303 }, { "epoch": 0.21039044853178446, "grad_norm": 4.334080696105957, "learning_rate": 9.180786010202045e-05, "loss": 2.4178, "step": 1304 }, { "epoch": 0.21055179090029041, "grad_norm": 3.3530056476593018, "learning_rate": 9.179352362501384e-05, "loss": 2.25, "step": 1305 }, { "epoch": 0.2107131332687964, "grad_norm": 5.390597820281982, "learning_rate": 9.177917573577026e-05, "loss": 2.2067, "step": 1306 }, { "epoch": 0.21087447563730236, "grad_norm": 6.295938491821289, "learning_rate": 9.176481643820762e-05, "loss": 2.1304, "step": 1307 }, { "epoch": 0.2110358180058083, "grad_norm": 4.074714660644531, "learning_rate": 9.17504457362469e-05, "loss": 2.2887, "step": 1308 }, { "epoch": 0.2111971603743143, "grad_norm": 4.701354026794434, "learning_rate": 9.173606363381219e-05, "loss": 2.4589, "step": 1309 }, { "epoch": 0.21135850274282025, "grad_norm": 5.782101154327393, "learning_rate": 9.172167013483068e-05, "loss": 2.0267, "step": 1310 }, { "epoch": 0.21151984511132624, "grad_norm": 4.437861919403076, "learning_rate": 9.170726524323273e-05, "loss": 2.1448, "step": 1311 }, { "epoch": 0.2116811874798322, "grad_norm": 5.867735385894775, "learning_rate": 9.169284896295174e-05, "loss": 2.0455, "step": 1312 }, { "epoch": 0.21184252984833818, "grad_norm": 3.6233768463134766, "learning_rate": 9.167842129792428e-05, "loss": 2.3327, "step": 1313 }, { "epoch": 0.21200387221684414, "grad_norm": 7.002150535583496, "learning_rate": 9.166398225208999e-05, "loss": 2.0155, "step": 1314 }, { "epoch": 0.21216521458535012, "grad_norm": 4.855229377746582, "learning_rate": 9.164953182939162e-05, "loss": 2.0364, "step": 1315 }, { "epoch": 0.21232655695385608, "grad_norm": 5.568299293518066, "learning_rate": 9.163507003377506e-05, "loss": 2.155, "step": 1316 }, { "epoch": 0.21248789932236206, "grad_norm": 5.044248580932617, "learning_rate": 9.162059686918924e-05, "loss": 2.2754, "step": 1317 }, { "epoch": 0.21264924169086802, "grad_norm": 6.40256929397583, "learning_rate": 9.160611233958629e-05, "loss": 2.1125, "step": 1318 }, { "epoch": 0.212810584059374, "grad_norm": 4.975361347198486, "learning_rate": 9.159161644892135e-05, "loss": 2.1204, "step": 1319 }, { "epoch": 0.21297192642787996, "grad_norm": 4.8289031982421875, "learning_rate": 9.157710920115273e-05, "loss": 2.0861, "step": 1320 }, { "epoch": 0.21313326879638594, "grad_norm": 3.993028163909912, "learning_rate": 9.156259060024177e-05, "loss": 2.1801, "step": 1321 }, { "epoch": 0.2132946111648919, "grad_norm": 5.233036994934082, "learning_rate": 9.1548060650153e-05, "loss": 2.1827, "step": 1322 }, { "epoch": 0.21345595353339786, "grad_norm": 4.861788749694824, "learning_rate": 9.153351935485397e-05, "loss": 2.155, "step": 1323 }, { "epoch": 0.21361729590190384, "grad_norm": 4.867989540100098, "learning_rate": 9.151896671831538e-05, "loss": 2.3657, "step": 1324 }, { "epoch": 0.2137786382704098, "grad_norm": 5.0348334312438965, "learning_rate": 9.1504402744511e-05, "loss": 2.1548, "step": 1325 }, { "epoch": 0.21393998063891578, "grad_norm": 4.9925994873046875, "learning_rate": 9.14898274374177e-05, "loss": 1.9949, "step": 1326 }, { "epoch": 0.21410132300742174, "grad_norm": 4.181024551391602, "learning_rate": 9.147524080101544e-05, "loss": 2.0769, "step": 1327 }, { "epoch": 0.21426266537592772, "grad_norm": 4.714298248291016, "learning_rate": 9.14606428392873e-05, "loss": 2.1353, "step": 1328 }, { "epoch": 0.21442400774443368, "grad_norm": 4.4157938957214355, "learning_rate": 9.144603355621941e-05, "loss": 2.2024, "step": 1329 }, { "epoch": 0.21458535011293967, "grad_norm": 3.2527670860290527, "learning_rate": 9.143141295580104e-05, "loss": 1.9403, "step": 1330 }, { "epoch": 0.21474669248144562, "grad_norm": 3.534852981567383, "learning_rate": 9.14167810420245e-05, "loss": 2.0353, "step": 1331 }, { "epoch": 0.2149080348499516, "grad_norm": 4.721923828125, "learning_rate": 9.140213781888524e-05, "loss": 2.0659, "step": 1332 }, { "epoch": 0.21506937721845756, "grad_norm": 4.635398864746094, "learning_rate": 9.138748329038177e-05, "loss": 2.2004, "step": 1333 }, { "epoch": 0.21523071958696355, "grad_norm": 5.3266520500183105, "learning_rate": 9.137281746051565e-05, "loss": 2.8019, "step": 1334 }, { "epoch": 0.2153920619554695, "grad_norm": 4.811481952667236, "learning_rate": 9.135814033329162e-05, "loss": 2.2637, "step": 1335 }, { "epoch": 0.2155534043239755, "grad_norm": 4.6001458168029785, "learning_rate": 9.134345191271742e-05, "loss": 2.2032, "step": 1336 }, { "epoch": 0.21571474669248145, "grad_norm": 4.366300106048584, "learning_rate": 9.13287522028039e-05, "loss": 1.971, "step": 1337 }, { "epoch": 0.2158760890609874, "grad_norm": 4.729623794555664, "learning_rate": 9.131404120756502e-05, "loss": 1.9098, "step": 1338 }, { "epoch": 0.2160374314294934, "grad_norm": 3.6516096591949463, "learning_rate": 9.129931893101778e-05, "loss": 2.0485, "step": 1339 }, { "epoch": 0.21619877379799934, "grad_norm": 3.8406763076782227, "learning_rate": 9.12845853771823e-05, "loss": 2.0777, "step": 1340 }, { "epoch": 0.21636011616650533, "grad_norm": 4.912896633148193, "learning_rate": 9.126984055008172e-05, "loss": 2.0741, "step": 1341 }, { "epoch": 0.21652145853501129, "grad_norm": 4.041453838348389, "learning_rate": 9.125508445374233e-05, "loss": 2.1465, "step": 1342 }, { "epoch": 0.21668280090351727, "grad_norm": 4.528754234313965, "learning_rate": 9.124031709219346e-05, "loss": 2.2577, "step": 1343 }, { "epoch": 0.21684414327202323, "grad_norm": 4.499135971069336, "learning_rate": 9.122553846946751e-05, "loss": 2.2312, "step": 1344 }, { "epoch": 0.2170054856405292, "grad_norm": 5.06335973739624, "learning_rate": 9.121074858959997e-05, "loss": 1.9082, "step": 1345 }, { "epoch": 0.21716682800903517, "grad_norm": 4.791558742523193, "learning_rate": 9.119594745662941e-05, "loss": 2.006, "step": 1346 }, { "epoch": 0.21732817037754115, "grad_norm": 4.483156204223633, "learning_rate": 9.118113507459743e-05, "loss": 2.1619, "step": 1347 }, { "epoch": 0.2174895127460471, "grad_norm": 4.789119243621826, "learning_rate": 9.116631144754877e-05, "loss": 2.2854, "step": 1348 }, { "epoch": 0.2176508551145531, "grad_norm": 5.603588104248047, "learning_rate": 9.115147657953118e-05, "loss": 2.3163, "step": 1349 }, { "epoch": 0.21781219748305905, "grad_norm": 4.183100700378418, "learning_rate": 9.113663047459553e-05, "loss": 2.0677, "step": 1350 }, { "epoch": 0.21797353985156503, "grad_norm": 4.857205390930176, "learning_rate": 9.11217731367957e-05, "loss": 2.3799, "step": 1351 }, { "epoch": 0.218134882220071, "grad_norm": 3.7694199085235596, "learning_rate": 9.110690457018868e-05, "loss": 1.9902, "step": 1352 }, { "epoch": 0.21829622458857695, "grad_norm": 4.458879470825195, "learning_rate": 9.109202477883453e-05, "loss": 1.9408, "step": 1353 }, { "epoch": 0.21845756695708293, "grad_norm": 4.134251117706299, "learning_rate": 9.107713376679634e-05, "loss": 2.1774, "step": 1354 }, { "epoch": 0.2186189093255889, "grad_norm": 4.250706195831299, "learning_rate": 9.10622315381403e-05, "loss": 2.1894, "step": 1355 }, { "epoch": 0.21878025169409487, "grad_norm": 4.276532173156738, "learning_rate": 9.104731809693563e-05, "loss": 2.0367, "step": 1356 }, { "epoch": 0.21894159406260083, "grad_norm": 5.151341915130615, "learning_rate": 9.103239344725465e-05, "loss": 2.1474, "step": 1357 }, { "epoch": 0.21910293643110682, "grad_norm": 4.042210102081299, "learning_rate": 9.10174575931727e-05, "loss": 2.1367, "step": 1358 }, { "epoch": 0.21926427879961277, "grad_norm": 4.783393859863281, "learning_rate": 9.100251053876822e-05, "loss": 1.93, "step": 1359 }, { "epoch": 0.21942562116811876, "grad_norm": 4.063416957855225, "learning_rate": 9.098755228812268e-05, "loss": 1.977, "step": 1360 }, { "epoch": 0.2195869635366247, "grad_norm": 6.108659744262695, "learning_rate": 9.097258284532061e-05, "loss": 2.1322, "step": 1361 }, { "epoch": 0.2197483059051307, "grad_norm": 4.696874141693115, "learning_rate": 9.09576022144496e-05, "loss": 1.9444, "step": 1362 }, { "epoch": 0.21990964827363665, "grad_norm": 4.419808387756348, "learning_rate": 9.094261039960027e-05, "loss": 2.1688, "step": 1363 }, { "epoch": 0.22007099064214264, "grad_norm": 4.2608442306518555, "learning_rate": 9.092760740486639e-05, "loss": 2.2771, "step": 1364 }, { "epoch": 0.2202323330106486, "grad_norm": 4.203035354614258, "learning_rate": 9.091259323434465e-05, "loss": 2.0907, "step": 1365 }, { "epoch": 0.22039367537915455, "grad_norm": 5.264461040496826, "learning_rate": 9.089756789213488e-05, "loss": 2.1593, "step": 1366 }, { "epoch": 0.22055501774766054, "grad_norm": 4.781998634338379, "learning_rate": 9.088253138233993e-05, "loss": 2.1517, "step": 1367 }, { "epoch": 0.2207163601161665, "grad_norm": 4.442568778991699, "learning_rate": 9.08674837090657e-05, "loss": 2.0565, "step": 1368 }, { "epoch": 0.22087770248467248, "grad_norm": 4.468411922454834, "learning_rate": 9.085242487642116e-05, "loss": 2.2194, "step": 1369 }, { "epoch": 0.22103904485317843, "grad_norm": 4.508581161499023, "learning_rate": 9.083735488851828e-05, "loss": 1.977, "step": 1370 }, { "epoch": 0.22120038722168442, "grad_norm": 4.347281455993652, "learning_rate": 9.082227374947214e-05, "loss": 2.312, "step": 1371 }, { "epoch": 0.22136172959019038, "grad_norm": 5.029078483581543, "learning_rate": 9.08071814634008e-05, "loss": 2.2434, "step": 1372 }, { "epoch": 0.22152307195869636, "grad_norm": 5.283694267272949, "learning_rate": 9.079207803442542e-05, "loss": 2.3074, "step": 1373 }, { "epoch": 0.22168441432720232, "grad_norm": 6.727270126342773, "learning_rate": 9.077696346667015e-05, "loss": 2.2142, "step": 1374 }, { "epoch": 0.2218457566957083, "grad_norm": 4.587703227996826, "learning_rate": 9.076183776426224e-05, "loss": 2.2143, "step": 1375 }, { "epoch": 0.22200709906421426, "grad_norm": 5.426577568054199, "learning_rate": 9.074670093133193e-05, "loss": 2.2577, "step": 1376 }, { "epoch": 0.22216844143272024, "grad_norm": 4.498953819274902, "learning_rate": 9.073155297201252e-05, "loss": 2.0291, "step": 1377 }, { "epoch": 0.2223297838012262, "grad_norm": 4.776580333709717, "learning_rate": 9.071639389044036e-05, "loss": 2.0842, "step": 1378 }, { "epoch": 0.22249112616973218, "grad_norm": 4.266298294067383, "learning_rate": 9.070122369075481e-05, "loss": 2.2213, "step": 1379 }, { "epoch": 0.22265246853823814, "grad_norm": 4.398402214050293, "learning_rate": 9.068604237709828e-05, "loss": 1.9287, "step": 1380 }, { "epoch": 0.2228138109067441, "grad_norm": 5.507858753204346, "learning_rate": 9.067084995361623e-05, "loss": 1.8418, "step": 1381 }, { "epoch": 0.22297515327525008, "grad_norm": 4.711968421936035, "learning_rate": 9.065564642445711e-05, "loss": 2.0282, "step": 1382 }, { "epoch": 0.22313649564375604, "grad_norm": 4.0508575439453125, "learning_rate": 9.064043179377249e-05, "loss": 2.3358, "step": 1383 }, { "epoch": 0.22329783801226202, "grad_norm": 4.1921467781066895, "learning_rate": 9.062520606571682e-05, "loss": 1.908, "step": 1384 }, { "epoch": 0.22345918038076798, "grad_norm": 4.855215549468994, "learning_rate": 9.060996924444776e-05, "loss": 2.2436, "step": 1385 }, { "epoch": 0.22362052274927396, "grad_norm": 3.554455518722534, "learning_rate": 9.059472133412587e-05, "loss": 2.0395, "step": 1386 }, { "epoch": 0.22378186511777992, "grad_norm": 5.087851047515869, "learning_rate": 9.05794623389148e-05, "loss": 2.0126, "step": 1387 }, { "epoch": 0.2239432074862859, "grad_norm": 5.585208415985107, "learning_rate": 9.056419226298117e-05, "loss": 1.9502, "step": 1388 }, { "epoch": 0.22410454985479186, "grad_norm": 4.503779411315918, "learning_rate": 9.054891111049468e-05, "loss": 2.0783, "step": 1389 }, { "epoch": 0.22426589222329785, "grad_norm": 4.334665298461914, "learning_rate": 9.053361888562807e-05, "loss": 2.0684, "step": 1390 }, { "epoch": 0.2244272345918038, "grad_norm": 5.303945541381836, "learning_rate": 9.051831559255704e-05, "loss": 2.1628, "step": 1391 }, { "epoch": 0.2245885769603098, "grad_norm": 5.3270063400268555, "learning_rate": 9.050300123546033e-05, "loss": 2.048, "step": 1392 }, { "epoch": 0.22474991932881574, "grad_norm": 5.550933837890625, "learning_rate": 9.048767581851973e-05, "loss": 2.3384, "step": 1393 }, { "epoch": 0.22491126169732173, "grad_norm": 6.187167167663574, "learning_rate": 9.047233934592005e-05, "loss": 2.0726, "step": 1394 }, { "epoch": 0.22507260406582769, "grad_norm": 4.272796630859375, "learning_rate": 9.045699182184909e-05, "loss": 2.0404, "step": 1395 }, { "epoch": 0.22523394643433364, "grad_norm": 4.191316604614258, "learning_rate": 9.044163325049766e-05, "loss": 2.1977, "step": 1396 }, { "epoch": 0.22539528880283963, "grad_norm": 4.133779525756836, "learning_rate": 9.042626363605964e-05, "loss": 2.0868, "step": 1397 }, { "epoch": 0.22555663117134558, "grad_norm": 5.127355575561523, "learning_rate": 9.041088298273186e-05, "loss": 1.9973, "step": 1398 }, { "epoch": 0.22571797353985157, "grad_norm": 4.5121073722839355, "learning_rate": 9.039549129471423e-05, "loss": 2.324, "step": 1399 }, { "epoch": 0.22587931590835753, "grad_norm": 4.350682258605957, "learning_rate": 9.038008857620963e-05, "loss": 2.159, "step": 1400 }, { "epoch": 0.2260406582768635, "grad_norm": 4.376583576202393, "learning_rate": 9.036467483142394e-05, "loss": 2.1031, "step": 1401 }, { "epoch": 0.22620200064536947, "grad_norm": 4.949342727661133, "learning_rate": 9.034925006456611e-05, "loss": 2.3473, "step": 1402 }, { "epoch": 0.22636334301387545, "grad_norm": 5.014150142669678, "learning_rate": 9.033381427984803e-05, "loss": 2.3026, "step": 1403 }, { "epoch": 0.2265246853823814, "grad_norm": 6.434081554412842, "learning_rate": 9.031836748148465e-05, "loss": 2.0081, "step": 1404 }, { "epoch": 0.2266860277508874, "grad_norm": 6.950235843658447, "learning_rate": 9.030290967369392e-05, "loss": 2.3194, "step": 1405 }, { "epoch": 0.22684737011939335, "grad_norm": 6.151453971862793, "learning_rate": 9.028744086069674e-05, "loss": 2.1002, "step": 1406 }, { "epoch": 0.22700871248789933, "grad_norm": 7.216166973114014, "learning_rate": 9.027196104671712e-05, "loss": 2.1609, "step": 1407 }, { "epoch": 0.2271700548564053, "grad_norm": 3.784708023071289, "learning_rate": 9.025647023598196e-05, "loss": 2.3474, "step": 1408 }, { "epoch": 0.22733139722491127, "grad_norm": 5.03326416015625, "learning_rate": 9.024096843272124e-05, "loss": 2.1629, "step": 1409 }, { "epoch": 0.22749273959341723, "grad_norm": 4.098717212677002, "learning_rate": 9.022545564116793e-05, "loss": 1.9903, "step": 1410 }, { "epoch": 0.2276540819619232, "grad_norm": 5.1551361083984375, "learning_rate": 9.020993186555796e-05, "loss": 2.1439, "step": 1411 }, { "epoch": 0.22781542433042917, "grad_norm": 5.542691707611084, "learning_rate": 9.019439711013031e-05, "loss": 2.0417, "step": 1412 }, { "epoch": 0.22797676669893513, "grad_norm": 4.643231391906738, "learning_rate": 9.017885137912694e-05, "loss": 2.0455, "step": 1413 }, { "epoch": 0.2281381090674411, "grad_norm": 4.5204267501831055, "learning_rate": 9.016329467679281e-05, "loss": 2.1043, "step": 1414 }, { "epoch": 0.22829945143594707, "grad_norm": 4.192257881164551, "learning_rate": 9.014772700737584e-05, "loss": 2.1372, "step": 1415 }, { "epoch": 0.22846079380445306, "grad_norm": 4.215038776397705, "learning_rate": 9.013214837512697e-05, "loss": 2.002, "step": 1416 }, { "epoch": 0.228622136172959, "grad_norm": 7.06200647354126, "learning_rate": 9.011655878430019e-05, "loss": 2.2708, "step": 1417 }, { "epoch": 0.228783478541465, "grad_norm": 4.816001892089844, "learning_rate": 9.010095823915237e-05, "loss": 2.0151, "step": 1418 }, { "epoch": 0.22894482090997095, "grad_norm": 5.204929828643799, "learning_rate": 9.008534674394348e-05, "loss": 1.9334, "step": 1419 }, { "epoch": 0.22910616327847694, "grad_norm": 4.745861530303955, "learning_rate": 9.006972430293639e-05, "loss": 2.2435, "step": 1420 }, { "epoch": 0.2292675056469829, "grad_norm": 5.817446708679199, "learning_rate": 9.005409092039703e-05, "loss": 1.959, "step": 1421 }, { "epoch": 0.22942884801548888, "grad_norm": 4.074014186859131, "learning_rate": 9.003844660059428e-05, "loss": 1.9885, "step": 1422 }, { "epoch": 0.22959019038399484, "grad_norm": 4.622272491455078, "learning_rate": 9.00227913478e-05, "loss": 2.1142, "step": 1423 }, { "epoch": 0.22975153275250082, "grad_norm": 4.587558269500732, "learning_rate": 9.000712516628907e-05, "loss": 2.1376, "step": 1424 }, { "epoch": 0.22991287512100678, "grad_norm": 4.234425067901611, "learning_rate": 8.999144806033932e-05, "loss": 1.9972, "step": 1425 }, { "epoch": 0.23007421748951273, "grad_norm": 4.4645280838012695, "learning_rate": 8.997576003423159e-05, "loss": 1.9112, "step": 1426 }, { "epoch": 0.23023555985801872, "grad_norm": 4.017051696777344, "learning_rate": 8.996006109224968e-05, "loss": 2.0565, "step": 1427 }, { "epoch": 0.23039690222652467, "grad_norm": 5.7819132804870605, "learning_rate": 8.994435123868038e-05, "loss": 2.3573, "step": 1428 }, { "epoch": 0.23055824459503066, "grad_norm": 4.956873416900635, "learning_rate": 8.992863047781345e-05, "loss": 2.1731, "step": 1429 }, { "epoch": 0.23071958696353662, "grad_norm": 4.781367301940918, "learning_rate": 8.991289881394167e-05, "loss": 2.189, "step": 1430 }, { "epoch": 0.2308809293320426, "grad_norm": 4.809574604034424, "learning_rate": 8.989715625136072e-05, "loss": 2.0249, "step": 1431 }, { "epoch": 0.23104227170054856, "grad_norm": 5.7442755699157715, "learning_rate": 8.988140279436934e-05, "loss": 2.0393, "step": 1432 }, { "epoch": 0.23120361406905454, "grad_norm": 5.626896858215332, "learning_rate": 8.986563844726918e-05, "loss": 2.0689, "step": 1433 }, { "epoch": 0.2313649564375605, "grad_norm": 5.195352077484131, "learning_rate": 8.984986321436491e-05, "loss": 1.996, "step": 1434 }, { "epoch": 0.23152629880606648, "grad_norm": 4.860560417175293, "learning_rate": 8.983407709996414e-05, "loss": 2.1372, "step": 1435 }, { "epoch": 0.23168764117457244, "grad_norm": 3.8607332706451416, "learning_rate": 8.981828010837745e-05, "loss": 1.8863, "step": 1436 }, { "epoch": 0.23184898354307842, "grad_norm": 3.993809223175049, "learning_rate": 8.980247224391843e-05, "loss": 1.9653, "step": 1437 }, { "epoch": 0.23201032591158438, "grad_norm": 4.640903949737549, "learning_rate": 8.978665351090358e-05, "loss": 2.0526, "step": 1438 }, { "epoch": 0.23217166828009037, "grad_norm": 7.137829303741455, "learning_rate": 8.977082391365243e-05, "loss": 2.0915, "step": 1439 }, { "epoch": 0.23233301064859632, "grad_norm": 4.12993860244751, "learning_rate": 8.975498345648745e-05, "loss": 2.1223, "step": 1440 }, { "epoch": 0.23249435301710228, "grad_norm": 3.8784449100494385, "learning_rate": 8.973913214373404e-05, "loss": 2.2804, "step": 1441 }, { "epoch": 0.23265569538560826, "grad_norm": 4.640244007110596, "learning_rate": 8.972326997972062e-05, "loss": 2.1302, "step": 1442 }, { "epoch": 0.23281703775411422, "grad_norm": 5.461720943450928, "learning_rate": 8.970739696877854e-05, "loss": 1.8691, "step": 1443 }, { "epoch": 0.2329783801226202, "grad_norm": 3.66225004196167, "learning_rate": 8.969151311524214e-05, "loss": 2.1667, "step": 1444 }, { "epoch": 0.23313972249112616, "grad_norm": 5.224299430847168, "learning_rate": 8.967561842344867e-05, "loss": 2.0891, "step": 1445 }, { "epoch": 0.23330106485963215, "grad_norm": 4.650608062744141, "learning_rate": 8.96597128977384e-05, "loss": 2.1573, "step": 1446 }, { "epoch": 0.2334624072281381, "grad_norm": 5.109067916870117, "learning_rate": 8.964379654245452e-05, "loss": 1.9191, "step": 1447 }, { "epoch": 0.2336237495966441, "grad_norm": 8.680831909179688, "learning_rate": 8.962786936194318e-05, "loss": 2.0666, "step": 1448 }, { "epoch": 0.23378509196515004, "grad_norm": 4.50174617767334, "learning_rate": 8.96119313605535e-05, "loss": 2.0207, "step": 1449 }, { "epoch": 0.23394643433365603, "grad_norm": 3.517400026321411, "learning_rate": 8.959598254263754e-05, "loss": 1.9692, "step": 1450 }, { "epoch": 0.23410777670216198, "grad_norm": 4.062027454376221, "learning_rate": 8.958002291255035e-05, "loss": 2.2276, "step": 1451 }, { "epoch": 0.23426911907066797, "grad_norm": 4.07244873046875, "learning_rate": 8.956405247464987e-05, "loss": 2.1075, "step": 1452 }, { "epoch": 0.23443046143917393, "grad_norm": 3.9593074321746826, "learning_rate": 8.954807123329704e-05, "loss": 2.1666, "step": 1453 }, { "epoch": 0.2345918038076799, "grad_norm": 4.045650005340576, "learning_rate": 8.953207919285573e-05, "loss": 2.0366, "step": 1454 }, { "epoch": 0.23475314617618587, "grad_norm": 3.8713786602020264, "learning_rate": 8.951607635769275e-05, "loss": 2.041, "step": 1455 }, { "epoch": 0.23491448854469182, "grad_norm": 3.8852434158325195, "learning_rate": 8.95000627321779e-05, "loss": 2.0126, "step": 1456 }, { "epoch": 0.2350758309131978, "grad_norm": 4.7137908935546875, "learning_rate": 8.948403832068389e-05, "loss": 2.0859, "step": 1457 }, { "epoch": 0.23523717328170377, "grad_norm": 5.0109333992004395, "learning_rate": 8.946800312758638e-05, "loss": 2.4074, "step": 1458 }, { "epoch": 0.23539851565020975, "grad_norm": 4.378200054168701, "learning_rate": 8.945195715726396e-05, "loss": 2.3619, "step": 1459 }, { "epoch": 0.2355598580187157, "grad_norm": 4.874209403991699, "learning_rate": 8.943590041409822e-05, "loss": 1.8734, "step": 1460 }, { "epoch": 0.2357212003872217, "grad_norm": 3.297186851501465, "learning_rate": 8.94198329024736e-05, "loss": 2.1214, "step": 1461 }, { "epoch": 0.23588254275572765, "grad_norm": 3.798933744430542, "learning_rate": 8.940375462677757e-05, "loss": 2.1703, "step": 1462 }, { "epoch": 0.23604388512423363, "grad_norm": 3.941425085067749, "learning_rate": 8.93876655914005e-05, "loss": 1.998, "step": 1463 }, { "epoch": 0.2362052274927396, "grad_norm": 4.085728645324707, "learning_rate": 8.937156580073569e-05, "loss": 2.0896, "step": 1464 }, { "epoch": 0.23636656986124557, "grad_norm": 4.89539909362793, "learning_rate": 8.935545525917937e-05, "loss": 2.005, "step": 1465 }, { "epoch": 0.23652791222975153, "grad_norm": 5.450769901275635, "learning_rate": 8.933933397113075e-05, "loss": 2.1447, "step": 1466 }, { "epoch": 0.23668925459825751, "grad_norm": 4.819671630859375, "learning_rate": 8.932320194099194e-05, "loss": 2.2535, "step": 1467 }, { "epoch": 0.23685059696676347, "grad_norm": 6.937513828277588, "learning_rate": 8.930705917316797e-05, "loss": 2.2115, "step": 1468 }, { "epoch": 0.23701193933526943, "grad_norm": 6.169703483581543, "learning_rate": 8.929090567206685e-05, "loss": 1.9505, "step": 1469 }, { "epoch": 0.2371732817037754, "grad_norm": 4.902163982391357, "learning_rate": 8.927474144209947e-05, "loss": 2.0367, "step": 1470 }, { "epoch": 0.23733462407228137, "grad_norm": 6.251935958862305, "learning_rate": 8.92585664876797e-05, "loss": 2.0549, "step": 1471 }, { "epoch": 0.23749596644078735, "grad_norm": 4.960795879364014, "learning_rate": 8.924238081322427e-05, "loss": 1.9314, "step": 1472 }, { "epoch": 0.2376573088092933, "grad_norm": 4.389493465423584, "learning_rate": 8.922618442315291e-05, "loss": 2.1619, "step": 1473 }, { "epoch": 0.2378186511777993, "grad_norm": 3.819166421890259, "learning_rate": 8.920997732188823e-05, "loss": 2.1415, "step": 1474 }, { "epoch": 0.23797999354630525, "grad_norm": 4.902157783508301, "learning_rate": 8.919375951385579e-05, "loss": 2.1946, "step": 1475 }, { "epoch": 0.23814133591481124, "grad_norm": 4.391149997711182, "learning_rate": 8.917753100348405e-05, "loss": 1.8695, "step": 1476 }, { "epoch": 0.2383026782833172, "grad_norm": 4.471001625061035, "learning_rate": 8.916129179520442e-05, "loss": 2.2006, "step": 1477 }, { "epoch": 0.23846402065182318, "grad_norm": 3.5523102283477783, "learning_rate": 8.914504189345119e-05, "loss": 2.0731, "step": 1478 }, { "epoch": 0.23862536302032913, "grad_norm": 3.7575554847717285, "learning_rate": 8.912878130266162e-05, "loss": 2.3713, "step": 1479 }, { "epoch": 0.23878670538883512, "grad_norm": 4.426853179931641, "learning_rate": 8.911251002727588e-05, "loss": 1.8838, "step": 1480 }, { "epoch": 0.23894804775734108, "grad_norm": 4.108395576477051, "learning_rate": 8.909622807173698e-05, "loss": 2.2149, "step": 1481 }, { "epoch": 0.23910939012584706, "grad_norm": 4.647747993469238, "learning_rate": 8.907993544049098e-05, "loss": 2.0565, "step": 1482 }, { "epoch": 0.23927073249435302, "grad_norm": 3.7445273399353027, "learning_rate": 8.906363213798674e-05, "loss": 2.2598, "step": 1483 }, { "epoch": 0.23943207486285897, "grad_norm": 4.759930610656738, "learning_rate": 8.904731816867609e-05, "loss": 2.1619, "step": 1484 }, { "epoch": 0.23959341723136496, "grad_norm": 6.480990886688232, "learning_rate": 8.903099353701376e-05, "loss": 2.2404, "step": 1485 }, { "epoch": 0.23975475959987091, "grad_norm": 4.645752429962158, "learning_rate": 8.90146582474574e-05, "loss": 2.1223, "step": 1486 }, { "epoch": 0.2399161019683769, "grad_norm": 4.475593566894531, "learning_rate": 8.899831230446754e-05, "loss": 2.1774, "step": 1487 }, { "epoch": 0.24007744433688286, "grad_norm": 3.9450621604919434, "learning_rate": 8.898195571250768e-05, "loss": 2.2844, "step": 1488 }, { "epoch": 0.24023878670538884, "grad_norm": 4.216766357421875, "learning_rate": 8.896558847604414e-05, "loss": 2.1326, "step": 1489 }, { "epoch": 0.2404001290738948, "grad_norm": 4.920373916625977, "learning_rate": 8.894921059954622e-05, "loss": 1.9147, "step": 1490 }, { "epoch": 0.24056147144240078, "grad_norm": 5.657355785369873, "learning_rate": 8.893282208748612e-05, "loss": 1.9171, "step": 1491 }, { "epoch": 0.24072281381090674, "grad_norm": 3.456425666809082, "learning_rate": 8.891642294433891e-05, "loss": 2.079, "step": 1492 }, { "epoch": 0.24088415617941272, "grad_norm": 4.492105007171631, "learning_rate": 8.890001317458257e-05, "loss": 2.1169, "step": 1493 }, { "epoch": 0.24104549854791868, "grad_norm": 5.125545501708984, "learning_rate": 8.888359278269798e-05, "loss": 2.0139, "step": 1494 }, { "epoch": 0.24120684091642466, "grad_norm": 4.284847736358643, "learning_rate": 8.886716177316895e-05, "loss": 1.9375, "step": 1495 }, { "epoch": 0.24136818328493062, "grad_norm": 4.1635212898254395, "learning_rate": 8.885072015048217e-05, "loss": 2.2006, "step": 1496 }, { "epoch": 0.2415295256534366, "grad_norm": 5.857314586639404, "learning_rate": 8.883426791912723e-05, "loss": 2.0958, "step": 1497 }, { "epoch": 0.24169086802194256, "grad_norm": 4.1824259757995605, "learning_rate": 8.88178050835966e-05, "loss": 2.3473, "step": 1498 }, { "epoch": 0.24185221039044852, "grad_norm": 4.814222812652588, "learning_rate": 8.88013316483857e-05, "loss": 2.2205, "step": 1499 }, { "epoch": 0.2420135527589545, "grad_norm": 6.895967960357666, "learning_rate": 8.878484761799273e-05, "loss": 2.5106, "step": 1500 }, { "epoch": 0.24217489512746046, "grad_norm": 4.180379390716553, "learning_rate": 8.876835299691891e-05, "loss": 2.0276, "step": 1501 }, { "epoch": 0.24233623749596644, "grad_norm": 4.665975570678711, "learning_rate": 8.875184778966829e-05, "loss": 1.888, "step": 1502 }, { "epoch": 0.2424975798644724, "grad_norm": 4.229085922241211, "learning_rate": 8.873533200074784e-05, "loss": 2.3136, "step": 1503 }, { "epoch": 0.24265892223297839, "grad_norm": 5.273557186126709, "learning_rate": 8.871880563466736e-05, "loss": 2.1664, "step": 1504 }, { "epoch": 0.24282026460148434, "grad_norm": 3.392350196838379, "learning_rate": 8.870226869593961e-05, "loss": 2.0867, "step": 1505 }, { "epoch": 0.24298160696999033, "grad_norm": 4.525039196014404, "learning_rate": 8.86857211890802e-05, "loss": 1.8333, "step": 1506 }, { "epoch": 0.24314294933849628, "grad_norm": 3.7613155841827393, "learning_rate": 8.86691631186076e-05, "loss": 2.1106, "step": 1507 }, { "epoch": 0.24330429170700227, "grad_norm": 5.067121982574463, "learning_rate": 8.865259448904324e-05, "loss": 2.0235, "step": 1508 }, { "epoch": 0.24346563407550822, "grad_norm": 3.9605300426483154, "learning_rate": 8.863601530491137e-05, "loss": 2.0636, "step": 1509 }, { "epoch": 0.2436269764440142, "grad_norm": 4.152029037475586, "learning_rate": 8.861942557073912e-05, "loss": 2.231, "step": 1510 }, { "epoch": 0.24378831881252017, "grad_norm": 4.132868766784668, "learning_rate": 8.860282529105657e-05, "loss": 2.0088, "step": 1511 }, { "epoch": 0.24394966118102615, "grad_norm": 5.009464740753174, "learning_rate": 8.858621447039657e-05, "loss": 2.2279, "step": 1512 }, { "epoch": 0.2441110035495321, "grad_norm": 3.118217706680298, "learning_rate": 8.856959311329495e-05, "loss": 2.0143, "step": 1513 }, { "epoch": 0.24427234591803806, "grad_norm": 5.521666526794434, "learning_rate": 8.855296122429038e-05, "loss": 1.9785, "step": 1514 }, { "epoch": 0.24443368828654405, "grad_norm": 6.3549485206604, "learning_rate": 8.853631880792436e-05, "loss": 2.0396, "step": 1515 }, { "epoch": 0.24459503065505, "grad_norm": 3.5283477306365967, "learning_rate": 8.851966586874138e-05, "loss": 2.0881, "step": 1516 }, { "epoch": 0.244756373023556, "grad_norm": 4.201773643493652, "learning_rate": 8.850300241128866e-05, "loss": 2.0129, "step": 1517 }, { "epoch": 0.24491771539206195, "grad_norm": 5.038053512573242, "learning_rate": 8.848632844011639e-05, "loss": 2.2061, "step": 1518 }, { "epoch": 0.24507905776056793, "grad_norm": 4.533333778381348, "learning_rate": 8.846964395977762e-05, "loss": 1.9826, "step": 1519 }, { "epoch": 0.2452404001290739, "grad_norm": 4.284814357757568, "learning_rate": 8.845294897482822e-05, "loss": 2.1562, "step": 1520 }, { "epoch": 0.24540174249757987, "grad_norm": 5.211300849914551, "learning_rate": 8.843624348982698e-05, "loss": 2.3218, "step": 1521 }, { "epoch": 0.24556308486608583, "grad_norm": 6.453266620635986, "learning_rate": 8.841952750933554e-05, "loss": 2.0731, "step": 1522 }, { "epoch": 0.2457244272345918, "grad_norm": 4.951452255249023, "learning_rate": 8.84028010379184e-05, "loss": 2.3241, "step": 1523 }, { "epoch": 0.24588576960309777, "grad_norm": 5.738493919372559, "learning_rate": 8.838606408014292e-05, "loss": 2.0992, "step": 1524 }, { "epoch": 0.24604711197160375, "grad_norm": 4.673913478851318, "learning_rate": 8.836931664057935e-05, "loss": 2.332, "step": 1525 }, { "epoch": 0.2462084543401097, "grad_norm": 3.725369930267334, "learning_rate": 8.835255872380078e-05, "loss": 2.0166, "step": 1526 }, { "epoch": 0.2463697967086157, "grad_norm": 4.07654333114624, "learning_rate": 8.833579033438316e-05, "loss": 2.0895, "step": 1527 }, { "epoch": 0.24653113907712165, "grad_norm": 3.596895933151245, "learning_rate": 8.831901147690532e-05, "loss": 1.9721, "step": 1528 }, { "epoch": 0.2466924814456276, "grad_norm": 5.355297565460205, "learning_rate": 8.83022221559489e-05, "loss": 2.257, "step": 1529 }, { "epoch": 0.2468538238141336, "grad_norm": 4.837414741516113, "learning_rate": 8.828542237609846e-05, "loss": 2.064, "step": 1530 }, { "epoch": 0.24701516618263955, "grad_norm": 5.691197395324707, "learning_rate": 8.82686121419414e-05, "loss": 2.2296, "step": 1531 }, { "epoch": 0.24717650855114554, "grad_norm": 6.874438762664795, "learning_rate": 8.825179145806794e-05, "loss": 1.9951, "step": 1532 }, { "epoch": 0.2473378509196515, "grad_norm": 5.810023784637451, "learning_rate": 8.823496032907116e-05, "loss": 2.2515, "step": 1533 }, { "epoch": 0.24749919328815748, "grad_norm": 6.254134178161621, "learning_rate": 8.821811875954704e-05, "loss": 2.3287, "step": 1534 }, { "epoch": 0.24766053565666343, "grad_norm": 4.213715076446533, "learning_rate": 8.820126675409435e-05, "loss": 2.0134, "step": 1535 }, { "epoch": 0.24782187802516942, "grad_norm": 8.564623832702637, "learning_rate": 8.818440431731476e-05, "loss": 2.1856, "step": 1536 }, { "epoch": 0.24798322039367537, "grad_norm": 6.99215030670166, "learning_rate": 8.816753145381276e-05, "loss": 2.3262, "step": 1537 }, { "epoch": 0.24814456276218136, "grad_norm": 5.778186321258545, "learning_rate": 8.815064816819569e-05, "loss": 1.9285, "step": 1538 }, { "epoch": 0.24830590513068732, "grad_norm": 3.7695446014404297, "learning_rate": 8.813375446507373e-05, "loss": 2.2911, "step": 1539 }, { "epoch": 0.2484672474991933, "grad_norm": 3.9019341468811035, "learning_rate": 8.811685034905993e-05, "loss": 1.9976, "step": 1540 }, { "epoch": 0.24862858986769926, "grad_norm": 4.127923965454102, "learning_rate": 8.809993582477016e-05, "loss": 2.4042, "step": 1541 }, { "epoch": 0.24878993223620524, "grad_norm": 6.135165214538574, "learning_rate": 8.808301089682315e-05, "loss": 2.1746, "step": 1542 }, { "epoch": 0.2489512746047112, "grad_norm": 4.654680252075195, "learning_rate": 8.806607556984044e-05, "loss": 2.1586, "step": 1543 }, { "epoch": 0.24911261697321715, "grad_norm": 4.3369550704956055, "learning_rate": 8.804912984844645e-05, "loss": 2.0705, "step": 1544 }, { "epoch": 0.24927395934172314, "grad_norm": 3.8217198848724365, "learning_rate": 8.80321737372684e-05, "loss": 1.9911, "step": 1545 }, { "epoch": 0.2494353017102291, "grad_norm": 4.708687782287598, "learning_rate": 8.801520724093638e-05, "loss": 2.1225, "step": 1546 }, { "epoch": 0.24959664407873508, "grad_norm": 3.5129055976867676, "learning_rate": 8.79982303640833e-05, "loss": 2.0926, "step": 1547 }, { "epoch": 0.24975798644724104, "grad_norm": 4.3811540603637695, "learning_rate": 8.79812431113449e-05, "loss": 2.1391, "step": 1548 }, { "epoch": 0.24991932881574702, "grad_norm": 6.042469501495361, "learning_rate": 8.796424548735974e-05, "loss": 2.0243, "step": 1549 }, { "epoch": 0.250080671184253, "grad_norm": 3.6431429386138916, "learning_rate": 8.794723749676927e-05, "loss": 2.1022, "step": 1550 }, { "epoch": 0.25024201355275894, "grad_norm": 5.046748638153076, "learning_rate": 8.793021914421771e-05, "loss": 2.1472, "step": 1551 }, { "epoch": 0.25040335592126495, "grad_norm": 5.241067886352539, "learning_rate": 8.791319043435214e-05, "loss": 2.1424, "step": 1552 }, { "epoch": 0.2505646982897709, "grad_norm": 4.297329425811768, "learning_rate": 8.789615137182244e-05, "loss": 2.029, "step": 1553 }, { "epoch": 0.25072604065827686, "grad_norm": 5.346684455871582, "learning_rate": 8.787910196128134e-05, "loss": 2.1146, "step": 1554 }, { "epoch": 0.2508873830267828, "grad_norm": 5.6350016593933105, "learning_rate": 8.78620422073844e-05, "loss": 2.3859, "step": 1555 }, { "epoch": 0.25104872539528883, "grad_norm": 5.491115570068359, "learning_rate": 8.784497211479001e-05, "loss": 2.1892, "step": 1556 }, { "epoch": 0.2512100677637948, "grad_norm": 4.584228515625, "learning_rate": 8.782789168815937e-05, "loss": 2.0268, "step": 1557 }, { "epoch": 0.25137141013230074, "grad_norm": 7.174254417419434, "learning_rate": 8.781080093215645e-05, "loss": 2.1958, "step": 1558 }, { "epoch": 0.2515327525008067, "grad_norm": 6.113037586212158, "learning_rate": 8.779369985144816e-05, "loss": 1.9857, "step": 1559 }, { "epoch": 0.25169409486931266, "grad_norm": 4.866761207580566, "learning_rate": 8.77765884507041e-05, "loss": 2.0729, "step": 1560 }, { "epoch": 0.25185543723781867, "grad_norm": 5.130221843719482, "learning_rate": 8.775946673459681e-05, "loss": 2.0158, "step": 1561 }, { "epoch": 0.2520167796063246, "grad_norm": 4.172994136810303, "learning_rate": 8.774233470780154e-05, "loss": 2.114, "step": 1562 }, { "epoch": 0.2521781219748306, "grad_norm": 4.171538352966309, "learning_rate": 8.772519237499642e-05, "loss": 1.9266, "step": 1563 }, { "epoch": 0.25233946434333654, "grad_norm": 6.2082929611206055, "learning_rate": 8.770803974086237e-05, "loss": 2.2176, "step": 1564 }, { "epoch": 0.25250080671184255, "grad_norm": 5.585021495819092, "learning_rate": 8.769087681008311e-05, "loss": 2.1608, "step": 1565 }, { "epoch": 0.2526621490803485, "grad_norm": 4.160464763641357, "learning_rate": 8.767370358734522e-05, "loss": 2.0601, "step": 1566 }, { "epoch": 0.25282349144885446, "grad_norm": 5.117087364196777, "learning_rate": 8.765652007733805e-05, "loss": 2.1567, "step": 1567 }, { "epoch": 0.2529848338173604, "grad_norm": 4.678501129150391, "learning_rate": 8.763932628475378e-05, "loss": 2.1355, "step": 1568 }, { "epoch": 0.25314617618586643, "grad_norm": 3.966777801513672, "learning_rate": 8.762212221428736e-05, "loss": 2.1416, "step": 1569 }, { "epoch": 0.2533075185543724, "grad_norm": 5.832237243652344, "learning_rate": 8.760490787063659e-05, "loss": 2.0834, "step": 1570 }, { "epoch": 0.25346886092287835, "grad_norm": 9.335321426391602, "learning_rate": 8.758768325850206e-05, "loss": 1.8407, "step": 1571 }, { "epoch": 0.2536302032913843, "grad_norm": 4.441195487976074, "learning_rate": 8.757044838258715e-05, "loss": 1.8863, "step": 1572 }, { "epoch": 0.25379154565989026, "grad_norm": 5.355289936065674, "learning_rate": 8.755320324759808e-05, "loss": 2.0587, "step": 1573 }, { "epoch": 0.2539528880283963, "grad_norm": 4.481767654418945, "learning_rate": 8.753594785824383e-05, "loss": 1.917, "step": 1574 }, { "epoch": 0.25411423039690223, "grad_norm": 4.7891387939453125, "learning_rate": 8.75186822192362e-05, "loss": 1.9565, "step": 1575 }, { "epoch": 0.2542755727654082, "grad_norm": 5.2641921043396, "learning_rate": 8.750140633528978e-05, "loss": 2.3175, "step": 1576 }, { "epoch": 0.25443691513391414, "grad_norm": 4.2695441246032715, "learning_rate": 8.748412021112197e-05, "loss": 1.8745, "step": 1577 }, { "epoch": 0.25459825750242016, "grad_norm": 4.857851982116699, "learning_rate": 8.746682385145295e-05, "loss": 1.8918, "step": 1578 }, { "epoch": 0.2547595998709261, "grad_norm": 4.307529926300049, "learning_rate": 8.744951726100573e-05, "loss": 2.0401, "step": 1579 }, { "epoch": 0.25492094223943207, "grad_norm": 3.3972277641296387, "learning_rate": 8.743220044450604e-05, "loss": 2.0949, "step": 1580 }, { "epoch": 0.255082284607938, "grad_norm": 3.8061370849609375, "learning_rate": 8.741487340668251e-05, "loss": 1.8395, "step": 1581 }, { "epoch": 0.25524362697644404, "grad_norm": 4.64457893371582, "learning_rate": 8.739753615226644e-05, "loss": 2.0093, "step": 1582 }, { "epoch": 0.25540496934495, "grad_norm": 3.7927417755126953, "learning_rate": 8.738018868599205e-05, "loss": 2.2221, "step": 1583 }, { "epoch": 0.25556631171345595, "grad_norm": 4.176934242248535, "learning_rate": 8.736283101259621e-05, "loss": 2.2713, "step": 1584 }, { "epoch": 0.2557276540819619, "grad_norm": 4.73375940322876, "learning_rate": 8.734546313681869e-05, "loss": 2.1128, "step": 1585 }, { "epoch": 0.2558889964504679, "grad_norm": 3.8222851753234863, "learning_rate": 8.732808506340199e-05, "loss": 2.1663, "step": 1586 }, { "epoch": 0.2560503388189739, "grad_norm": 5.634716510772705, "learning_rate": 8.731069679709141e-05, "loss": 2.0878, "step": 1587 }, { "epoch": 0.25621168118747983, "grad_norm": 4.924232006072998, "learning_rate": 8.729329834263503e-05, "loss": 1.9505, "step": 1588 }, { "epoch": 0.2563730235559858, "grad_norm": 5.2137837409973145, "learning_rate": 8.72758897047837e-05, "loss": 1.916, "step": 1589 }, { "epoch": 0.25653436592449175, "grad_norm": 4.130535125732422, "learning_rate": 8.725847088829108e-05, "loss": 1.9296, "step": 1590 }, { "epoch": 0.25669570829299776, "grad_norm": 3.3059771060943604, "learning_rate": 8.724104189791359e-05, "loss": 1.9605, "step": 1591 }, { "epoch": 0.2568570506615037, "grad_norm": 4.49686336517334, "learning_rate": 8.722360273841044e-05, "loss": 2.0861, "step": 1592 }, { "epoch": 0.2570183930300097, "grad_norm": 3.792386054992676, "learning_rate": 8.720615341454357e-05, "loss": 2.1605, "step": 1593 }, { "epoch": 0.25717973539851563, "grad_norm": 5.434024333953857, "learning_rate": 8.718869393107778e-05, "loss": 2.1165, "step": 1594 }, { "epoch": 0.25734107776702164, "grad_norm": 4.544057369232178, "learning_rate": 8.717122429278055e-05, "loss": 2.2117, "step": 1595 }, { "epoch": 0.2575024201355276, "grad_norm": 4.096789836883545, "learning_rate": 8.715374450442223e-05, "loss": 2.1793, "step": 1596 }, { "epoch": 0.25766376250403356, "grad_norm": 4.786748886108398, "learning_rate": 8.713625457077585e-05, "loss": 1.9036, "step": 1597 }, { "epoch": 0.2578251048725395, "grad_norm": 5.406977653503418, "learning_rate": 8.711875449661728e-05, "loss": 2.1494, "step": 1598 }, { "epoch": 0.2579864472410455, "grad_norm": 3.7704203128814697, "learning_rate": 8.710124428672513e-05, "loss": 2.3003, "step": 1599 }, { "epoch": 0.2581477896095515, "grad_norm": 4.686526298522949, "learning_rate": 8.708372394588076e-05, "loss": 2.1823, "step": 1600 }, { "epoch": 0.25830913197805744, "grad_norm": 3.888341188430786, "learning_rate": 8.706619347886831e-05, "loss": 2.0202, "step": 1601 }, { "epoch": 0.2584704743465634, "grad_norm": 4.793549537658691, "learning_rate": 8.704865289047473e-05, "loss": 2.0186, "step": 1602 }, { "epoch": 0.25863181671506935, "grad_norm": 4.393258094787598, "learning_rate": 8.703110218548964e-05, "loss": 2.0682, "step": 1603 }, { "epoch": 0.25879315908357536, "grad_norm": 5.088795185089111, "learning_rate": 8.701354136870552e-05, "loss": 1.816, "step": 1604 }, { "epoch": 0.2589545014520813, "grad_norm": 4.1309356689453125, "learning_rate": 8.699597044491756e-05, "loss": 1.9896, "step": 1605 }, { "epoch": 0.2591158438205873, "grad_norm": 4.061094760894775, "learning_rate": 8.69783894189237e-05, "loss": 1.9654, "step": 1606 }, { "epoch": 0.25927718618909323, "grad_norm": 4.1488542556762695, "learning_rate": 8.696079829552468e-05, "loss": 2.1103, "step": 1607 }, { "epoch": 0.25943852855759925, "grad_norm": 3.9607887268066406, "learning_rate": 8.694319707952394e-05, "loss": 2.4623, "step": 1608 }, { "epoch": 0.2595998709261052, "grad_norm": 3.6128292083740234, "learning_rate": 8.692558577572774e-05, "loss": 1.9496, "step": 1609 }, { "epoch": 0.25976121329461116, "grad_norm": 4.164627552032471, "learning_rate": 8.690796438894504e-05, "loss": 2.0055, "step": 1610 }, { "epoch": 0.2599225556631171, "grad_norm": 4.3762335777282715, "learning_rate": 8.689033292398759e-05, "loss": 2.0936, "step": 1611 }, { "epoch": 0.26008389803162313, "grad_norm": 4.450123310089111, "learning_rate": 8.687269138566988e-05, "loss": 1.9501, "step": 1612 }, { "epoch": 0.2602452404001291, "grad_norm": 4.878677845001221, "learning_rate": 8.685503977880916e-05, "loss": 1.9101, "step": 1613 }, { "epoch": 0.26040658276863504, "grad_norm": 3.2923424243927, "learning_rate": 8.683737810822539e-05, "loss": 1.981, "step": 1614 }, { "epoch": 0.260567925137141, "grad_norm": 4.8123908042907715, "learning_rate": 8.681970637874132e-05, "loss": 2.1394, "step": 1615 }, { "epoch": 0.26072926750564696, "grad_norm": 3.266145944595337, "learning_rate": 8.680202459518244e-05, "loss": 1.806, "step": 1616 }, { "epoch": 0.26089060987415297, "grad_norm": 5.465076923370361, "learning_rate": 8.678433276237698e-05, "loss": 2.3119, "step": 1617 }, { "epoch": 0.2610519522426589, "grad_norm": 4.121337890625, "learning_rate": 8.676663088515591e-05, "loss": 1.9882, "step": 1618 }, { "epoch": 0.2612132946111649, "grad_norm": 3.6586122512817383, "learning_rate": 8.674891896835293e-05, "loss": 2.076, "step": 1619 }, { "epoch": 0.26137463697967084, "grad_norm": 4.997232437133789, "learning_rate": 8.673119701680452e-05, "loss": 1.8912, "step": 1620 }, { "epoch": 0.26153597934817685, "grad_norm": 7.334474563598633, "learning_rate": 8.671346503534988e-05, "loss": 2.1024, "step": 1621 }, { "epoch": 0.2616973217166828, "grad_norm": 3.8910131454467773, "learning_rate": 8.669572302883094e-05, "loss": 1.9326, "step": 1622 }, { "epoch": 0.26185866408518876, "grad_norm": 4.191247940063477, "learning_rate": 8.667797100209234e-05, "loss": 1.7807, "step": 1623 }, { "epoch": 0.2620200064536947, "grad_norm": 5.892439365386963, "learning_rate": 8.666020895998153e-05, "loss": 2.1133, "step": 1624 }, { "epoch": 0.26218134882220073, "grad_norm": 3.86570405960083, "learning_rate": 8.664243690734865e-05, "loss": 1.8862, "step": 1625 }, { "epoch": 0.2623426911907067, "grad_norm": 3.905013084411621, "learning_rate": 8.662465484904656e-05, "loss": 2.1019, "step": 1626 }, { "epoch": 0.26250403355921265, "grad_norm": 4.451311111450195, "learning_rate": 8.66068627899309e-05, "loss": 1.981, "step": 1627 }, { "epoch": 0.2626653759277186, "grad_norm": 5.009078502655029, "learning_rate": 8.658906073485998e-05, "loss": 2.1871, "step": 1628 }, { "epoch": 0.2628267182962246, "grad_norm": 5.540653228759766, "learning_rate": 8.657124868869489e-05, "loss": 2.0475, "step": 1629 }, { "epoch": 0.26298806066473057, "grad_norm": 5.579919338226318, "learning_rate": 8.655342665629943e-05, "loss": 2.2816, "step": 1630 }, { "epoch": 0.26314940303323653, "grad_norm": 4.403649806976318, "learning_rate": 8.653559464254008e-05, "loss": 2.0569, "step": 1631 }, { "epoch": 0.2633107454017425, "grad_norm": 3.9714784622192383, "learning_rate": 8.651775265228617e-05, "loss": 2.0732, "step": 1632 }, { "epoch": 0.26347208777024844, "grad_norm": 5.194519996643066, "learning_rate": 8.649990069040961e-05, "loss": 1.9404, "step": 1633 }, { "epoch": 0.26363343013875445, "grad_norm": 4.312530040740967, "learning_rate": 8.648203876178514e-05, "loss": 2.0072, "step": 1634 }, { "epoch": 0.2637947725072604, "grad_norm": 4.333287715911865, "learning_rate": 8.646416687129013e-05, "loss": 2.1602, "step": 1635 }, { "epoch": 0.26395611487576637, "grad_norm": 4.85657262802124, "learning_rate": 8.644628502380479e-05, "loss": 2.0286, "step": 1636 }, { "epoch": 0.2641174572442723, "grad_norm": 4.16322135925293, "learning_rate": 8.642839322421192e-05, "loss": 2.2035, "step": 1637 }, { "epoch": 0.26427879961277834, "grad_norm": 3.8348686695098877, "learning_rate": 8.641049147739713e-05, "loss": 1.9357, "step": 1638 }, { "epoch": 0.2644401419812843, "grad_norm": 5.0198774337768555, "learning_rate": 8.63925797882487e-05, "loss": 2.0692, "step": 1639 }, { "epoch": 0.26460148434979025, "grad_norm": 4.261548042297363, "learning_rate": 8.637465816165763e-05, "loss": 1.8413, "step": 1640 }, { "epoch": 0.2647628267182962, "grad_norm": 4.39458703994751, "learning_rate": 8.635672660251765e-05, "loss": 2.0827, "step": 1641 }, { "epoch": 0.2649241690868022, "grad_norm": 4.479990005493164, "learning_rate": 8.63387851157252e-05, "loss": 1.9944, "step": 1642 }, { "epoch": 0.2650855114553082, "grad_norm": 5.603365421295166, "learning_rate": 8.632083370617941e-05, "loss": 2.3669, "step": 1643 }, { "epoch": 0.26524685382381413, "grad_norm": 4.003785610198975, "learning_rate": 8.630287237878214e-05, "loss": 2.1475, "step": 1644 }, { "epoch": 0.2654081961923201, "grad_norm": 4.122474670410156, "learning_rate": 8.628490113843797e-05, "loss": 1.9272, "step": 1645 }, { "epoch": 0.26556953856082605, "grad_norm": 4.912667751312256, "learning_rate": 8.626691999005414e-05, "loss": 2.0954, "step": 1646 }, { "epoch": 0.26573088092933206, "grad_norm": 5.25667667388916, "learning_rate": 8.624892893854062e-05, "loss": 2.0019, "step": 1647 }, { "epoch": 0.265892223297838, "grad_norm": 6.928044319152832, "learning_rate": 8.623092798881012e-05, "loss": 2.2042, "step": 1648 }, { "epoch": 0.26605356566634397, "grad_norm": 5.584704875946045, "learning_rate": 8.6212917145778e-05, "loss": 2.2149, "step": 1649 }, { "epoch": 0.26621490803484993, "grad_norm": 4.9668121337890625, "learning_rate": 8.619489641436236e-05, "loss": 2.192, "step": 1650 }, { "epoch": 0.26637625040335594, "grad_norm": 4.284021377563477, "learning_rate": 8.617686579948397e-05, "loss": 2.0923, "step": 1651 }, { "epoch": 0.2665375927718619, "grad_norm": 3.5677449703216553, "learning_rate": 8.61588253060663e-05, "loss": 2.0471, "step": 1652 }, { "epoch": 0.26669893514036785, "grad_norm": 5.858703136444092, "learning_rate": 8.614077493903553e-05, "loss": 2.2517, "step": 1653 }, { "epoch": 0.2668602775088738, "grad_norm": 6.740094184875488, "learning_rate": 8.612271470332057e-05, "loss": 2.2777, "step": 1654 }, { "epoch": 0.2670216198773798, "grad_norm": 6.313733100891113, "learning_rate": 8.610464460385296e-05, "loss": 1.8602, "step": 1655 }, { "epoch": 0.2671829622458858, "grad_norm": 4.579122066497803, "learning_rate": 8.608656464556699e-05, "loss": 1.9912, "step": 1656 }, { "epoch": 0.26734430461439174, "grad_norm": 4.619999885559082, "learning_rate": 8.606847483339957e-05, "loss": 2.1888, "step": 1657 }, { "epoch": 0.2675056469828977, "grad_norm": 5.256847858428955, "learning_rate": 8.605037517229037e-05, "loss": 2.135, "step": 1658 }, { "epoch": 0.2676669893514037, "grad_norm": 5.907052040100098, "learning_rate": 8.603226566718174e-05, "loss": 2.1339, "step": 1659 }, { "epoch": 0.26782833171990966, "grad_norm": 4.06528902053833, "learning_rate": 8.601414632301869e-05, "loss": 2.0343, "step": 1660 }, { "epoch": 0.2679896740884156, "grad_norm": 3.8804450035095215, "learning_rate": 8.599601714474894e-05, "loss": 2.0802, "step": 1661 }, { "epoch": 0.2681510164569216, "grad_norm": 4.992329120635986, "learning_rate": 8.597787813732286e-05, "loss": 1.9986, "step": 1662 }, { "epoch": 0.26831235882542753, "grad_norm": 4.483761787414551, "learning_rate": 8.595972930569356e-05, "loss": 2.1974, "step": 1663 }, { "epoch": 0.26847370119393354, "grad_norm": 4.82814359664917, "learning_rate": 8.594157065481679e-05, "loss": 1.8907, "step": 1664 }, { "epoch": 0.2686350435624395, "grad_norm": 4.251171112060547, "learning_rate": 8.592340218965099e-05, "loss": 2.3435, "step": 1665 }, { "epoch": 0.26879638593094546, "grad_norm": 4.5502166748046875, "learning_rate": 8.590522391515729e-05, "loss": 2.0974, "step": 1666 }, { "epoch": 0.2689577282994514, "grad_norm": 4.192358016967773, "learning_rate": 8.588703583629948e-05, "loss": 2.2086, "step": 1667 }, { "epoch": 0.2691190706679574, "grad_norm": 4.075906753540039, "learning_rate": 8.586883795804406e-05, "loss": 2.0219, "step": 1668 }, { "epoch": 0.2692804130364634, "grad_norm": 5.171660900115967, "learning_rate": 8.585063028536016e-05, "loss": 2.0953, "step": 1669 }, { "epoch": 0.26944175540496934, "grad_norm": 3.467822313308716, "learning_rate": 8.583241282321963e-05, "loss": 2.4836, "step": 1670 }, { "epoch": 0.2696030977734753, "grad_norm": 3.6750383377075195, "learning_rate": 8.581418557659695e-05, "loss": 1.9712, "step": 1671 }, { "epoch": 0.2697644401419813, "grad_norm": 5.118645191192627, "learning_rate": 8.579594855046933e-05, "loss": 2.0233, "step": 1672 }, { "epoch": 0.26992578251048727, "grad_norm": 4.578837871551514, "learning_rate": 8.577770174981658e-05, "loss": 2.3076, "step": 1673 }, { "epoch": 0.2700871248789932, "grad_norm": 4.3101487159729, "learning_rate": 8.575944517962125e-05, "loss": 2.1367, "step": 1674 }, { "epoch": 0.2702484672474992, "grad_norm": 3.4827797412872314, "learning_rate": 8.574117884486847e-05, "loss": 1.9562, "step": 1675 }, { "epoch": 0.27040980961600514, "grad_norm": 4.5507683753967285, "learning_rate": 8.572290275054613e-05, "loss": 2.0804, "step": 1676 }, { "epoch": 0.27057115198451115, "grad_norm": 3.2505276203155518, "learning_rate": 8.570461690164474e-05, "loss": 1.7681, "step": 1677 }, { "epoch": 0.2707324943530171, "grad_norm": 4.711097717285156, "learning_rate": 8.568632130315745e-05, "loss": 2.3152, "step": 1678 }, { "epoch": 0.27089383672152306, "grad_norm": 3.9545562267303467, "learning_rate": 8.566801596008013e-05, "loss": 2.1795, "step": 1679 }, { "epoch": 0.271055179090029, "grad_norm": 4.976853370666504, "learning_rate": 8.564970087741126e-05, "loss": 1.9317, "step": 1680 }, { "epoch": 0.27121652145853503, "grad_norm": 4.758996963500977, "learning_rate": 8.5631376060152e-05, "loss": 2.2757, "step": 1681 }, { "epoch": 0.271377863827041, "grad_norm": 4.466953277587891, "learning_rate": 8.561304151330617e-05, "loss": 1.7344, "step": 1682 }, { "epoch": 0.27153920619554694, "grad_norm": 6.375881195068359, "learning_rate": 8.559469724188027e-05, "loss": 2.0868, "step": 1683 }, { "epoch": 0.2717005485640529, "grad_norm": 3.204301357269287, "learning_rate": 8.55763432508834e-05, "loss": 2.1115, "step": 1684 }, { "epoch": 0.2718618909325589, "grad_norm": 5.237433910369873, "learning_rate": 8.555797954532733e-05, "loss": 1.9778, "step": 1685 }, { "epoch": 0.27202323330106487, "grad_norm": 3.665327310562134, "learning_rate": 8.553960613022652e-05, "loss": 2.0409, "step": 1686 }, { "epoch": 0.2721845756695708, "grad_norm": 4.775506019592285, "learning_rate": 8.552122301059806e-05, "loss": 2.071, "step": 1687 }, { "epoch": 0.2723459180380768, "grad_norm": 3.734705686569214, "learning_rate": 8.550283019146167e-05, "loss": 2.0403, "step": 1688 }, { "epoch": 0.2725072604065828, "grad_norm": 6.832278251647949, "learning_rate": 8.548442767783975e-05, "loss": 1.9087, "step": 1689 }, { "epoch": 0.27266860277508875, "grad_norm": 7.069394111633301, "learning_rate": 8.546601547475734e-05, "loss": 2.4925, "step": 1690 }, { "epoch": 0.2728299451435947, "grad_norm": 4.048940658569336, "learning_rate": 8.54475935872421e-05, "loss": 2.0747, "step": 1691 }, { "epoch": 0.27299128751210067, "grad_norm": 4.290622234344482, "learning_rate": 8.542916202032436e-05, "loss": 2.2706, "step": 1692 }, { "epoch": 0.2731526298806066, "grad_norm": 3.9478087425231934, "learning_rate": 8.541072077903709e-05, "loss": 2.2192, "step": 1693 }, { "epoch": 0.27331397224911264, "grad_norm": 4.618847370147705, "learning_rate": 8.53922698684159e-05, "loss": 2.232, "step": 1694 }, { "epoch": 0.2734753146176186, "grad_norm": 3.8600399494171143, "learning_rate": 8.537380929349903e-05, "loss": 2.0081, "step": 1695 }, { "epoch": 0.27363665698612455, "grad_norm": 5.539452075958252, "learning_rate": 8.535533905932738e-05, "loss": 2.3145, "step": 1696 }, { "epoch": 0.2737979993546305, "grad_norm": 5.374900817871094, "learning_rate": 8.533685917094447e-05, "loss": 1.9882, "step": 1697 }, { "epoch": 0.2739593417231365, "grad_norm": 3.646822929382324, "learning_rate": 8.531836963339645e-05, "loss": 2.0992, "step": 1698 }, { "epoch": 0.2741206840916425, "grad_norm": 5.130327224731445, "learning_rate": 8.529987045173213e-05, "loss": 2.0718, "step": 1699 }, { "epoch": 0.27428202646014843, "grad_norm": 4.452453136444092, "learning_rate": 8.528136163100295e-05, "loss": 2.2012, "step": 1700 }, { "epoch": 0.2744433688286544, "grad_norm": 3.7064199447631836, "learning_rate": 8.526284317626294e-05, "loss": 2.1222, "step": 1701 }, { "epoch": 0.2746047111971604, "grad_norm": 4.690736293792725, "learning_rate": 8.52443150925688e-05, "loss": 2.171, "step": 1702 }, { "epoch": 0.27476605356566636, "grad_norm": 3.434121608734131, "learning_rate": 8.52257773849799e-05, "loss": 2.1656, "step": 1703 }, { "epoch": 0.2749273959341723, "grad_norm": 3.749788284301758, "learning_rate": 8.520723005855813e-05, "loss": 2.0383, "step": 1704 }, { "epoch": 0.27508873830267827, "grad_norm": 4.258902072906494, "learning_rate": 8.518867311836808e-05, "loss": 2.0491, "step": 1705 }, { "epoch": 0.2752500806711842, "grad_norm": 4.509291648864746, "learning_rate": 8.517010656947696e-05, "loss": 2.1671, "step": 1706 }, { "epoch": 0.27541142303969024, "grad_norm": 4.518990516662598, "learning_rate": 8.515153041695459e-05, "loss": 2.111, "step": 1707 }, { "epoch": 0.2755727654081962, "grad_norm": 6.002044200897217, "learning_rate": 8.513294466587342e-05, "loss": 1.9611, "step": 1708 }, { "epoch": 0.27573410777670215, "grad_norm": 5.309200286865234, "learning_rate": 8.511434932130855e-05, "loss": 1.8995, "step": 1709 }, { "epoch": 0.2758954501452081, "grad_norm": 2.978005886077881, "learning_rate": 8.50957443883376e-05, "loss": 2.0993, "step": 1710 }, { "epoch": 0.2760567925137141, "grad_norm": 4.166509628295898, "learning_rate": 8.507712987204094e-05, "loss": 2.0326, "step": 1711 }, { "epoch": 0.2762181348822201, "grad_norm": 4.222262859344482, "learning_rate": 8.505850577750145e-05, "loss": 1.9066, "step": 1712 }, { "epoch": 0.27637947725072604, "grad_norm": 3.854065418243408, "learning_rate": 8.503987210980471e-05, "loss": 2.0014, "step": 1713 }, { "epoch": 0.276540819619232, "grad_norm": 4.566579341888428, "learning_rate": 8.502122887403883e-05, "loss": 2.0936, "step": 1714 }, { "epoch": 0.276702161987738, "grad_norm": 5.428273677825928, "learning_rate": 8.50025760752946e-05, "loss": 2.0519, "step": 1715 }, { "epoch": 0.27686350435624396, "grad_norm": 6.084582805633545, "learning_rate": 8.498391371866538e-05, "loss": 2.0133, "step": 1716 }, { "epoch": 0.2770248467247499, "grad_norm": 7.042144775390625, "learning_rate": 8.496524180924718e-05, "loss": 2.174, "step": 1717 }, { "epoch": 0.2771861890932559, "grad_norm": 3.311413288116455, "learning_rate": 8.494656035213857e-05, "loss": 2.0763, "step": 1718 }, { "epoch": 0.27734753146176183, "grad_norm": 4.330287456512451, "learning_rate": 8.492786935244078e-05, "loss": 2.0849, "step": 1719 }, { "epoch": 0.27750887383026784, "grad_norm": 3.794973611831665, "learning_rate": 8.490916881525759e-05, "loss": 2.2467, "step": 1720 }, { "epoch": 0.2776702161987738, "grad_norm": 3.6375041007995605, "learning_rate": 8.489045874569544e-05, "loss": 2.0532, "step": 1721 }, { "epoch": 0.27783155856727976, "grad_norm": 4.892178058624268, "learning_rate": 8.487173914886331e-05, "loss": 2.3658, "step": 1722 }, { "epoch": 0.2779929009357857, "grad_norm": 4.804330348968506, "learning_rate": 8.485301002987284e-05, "loss": 2.0655, "step": 1723 }, { "epoch": 0.2781542433042917, "grad_norm": 5.6723432540893555, "learning_rate": 8.483427139383826e-05, "loss": 1.9983, "step": 1724 }, { "epoch": 0.2783155856727977, "grad_norm": 4.15791130065918, "learning_rate": 8.481552324587636e-05, "loss": 2.0655, "step": 1725 }, { "epoch": 0.27847692804130364, "grad_norm": 3.6118197441101074, "learning_rate": 8.479676559110656e-05, "loss": 2.0098, "step": 1726 }, { "epoch": 0.2786382704098096, "grad_norm": 4.086574554443359, "learning_rate": 8.477799843465088e-05, "loss": 1.9056, "step": 1727 }, { "epoch": 0.2787996127783156, "grad_norm": 3.7872986793518066, "learning_rate": 8.475922178163392e-05, "loss": 2.1815, "step": 1728 }, { "epoch": 0.27896095514682157, "grad_norm": 4.921558856964111, "learning_rate": 8.474043563718285e-05, "loss": 1.8451, "step": 1729 }, { "epoch": 0.2791222975153275, "grad_norm": 4.6860833168029785, "learning_rate": 8.47216400064275e-05, "loss": 1.5947, "step": 1730 }, { "epoch": 0.2792836398838335, "grad_norm": 4.63102912902832, "learning_rate": 8.470283489450022e-05, "loss": 2.1791, "step": 1731 }, { "epoch": 0.2794449822523395, "grad_norm": 4.210811614990234, "learning_rate": 8.468402030653597e-05, "loss": 1.9916, "step": 1732 }, { "epoch": 0.27960632462084545, "grad_norm": 4.811304569244385, "learning_rate": 8.466519624767235e-05, "loss": 1.7945, "step": 1733 }, { "epoch": 0.2797676669893514, "grad_norm": 5.427934646606445, "learning_rate": 8.464636272304945e-05, "loss": 1.9227, "step": 1734 }, { "epoch": 0.27992900935785736, "grad_norm": 4.605364799499512, "learning_rate": 8.462751973781003e-05, "loss": 1.9805, "step": 1735 }, { "epoch": 0.2800903517263633, "grad_norm": 4.8616862297058105, "learning_rate": 8.460866729709937e-05, "loss": 1.9491, "step": 1736 }, { "epoch": 0.28025169409486933, "grad_norm": 4.779373645782471, "learning_rate": 8.458980540606541e-05, "loss": 1.9869, "step": 1737 }, { "epoch": 0.2804130364633753, "grad_norm": 3.774815559387207, "learning_rate": 8.457093406985857e-05, "loss": 2.0055, "step": 1738 }, { "epoch": 0.28057437883188124, "grad_norm": 5.13038444519043, "learning_rate": 8.455205329363193e-05, "loss": 1.8714, "step": 1739 }, { "epoch": 0.2807357212003872, "grad_norm": 4.071251392364502, "learning_rate": 8.453316308254111e-05, "loss": 2.0485, "step": 1740 }, { "epoch": 0.2808970635688932, "grad_norm": 4.834817409515381, "learning_rate": 8.451426344174433e-05, "loss": 2.0874, "step": 1741 }, { "epoch": 0.28105840593739917, "grad_norm": 4.914193630218506, "learning_rate": 8.449535437640234e-05, "loss": 2.0483, "step": 1742 }, { "epoch": 0.2812197483059051, "grad_norm": 8.055091857910156, "learning_rate": 8.44764358916785e-05, "loss": 2.4413, "step": 1743 }, { "epoch": 0.2813810906744111, "grad_norm": 4.535501956939697, "learning_rate": 8.445750799273877e-05, "loss": 2.0976, "step": 1744 }, { "epoch": 0.2815424330429171, "grad_norm": 9.10206127166748, "learning_rate": 8.44385706847516e-05, "loss": 2.1505, "step": 1745 }, { "epoch": 0.28170377541142305, "grad_norm": 4.140308380126953, "learning_rate": 8.44196239728881e-05, "loss": 1.9627, "step": 1746 }, { "epoch": 0.281865117779929, "grad_norm": 6.006581783294678, "learning_rate": 8.440066786232186e-05, "loss": 1.8477, "step": 1747 }, { "epoch": 0.28202646014843497, "grad_norm": 4.237965106964111, "learning_rate": 8.43817023582291e-05, "loss": 2.0056, "step": 1748 }, { "epoch": 0.2821878025169409, "grad_norm": 5.994650363922119, "learning_rate": 8.436272746578859e-05, "loss": 1.9832, "step": 1749 }, { "epoch": 0.28234914488544693, "grad_norm": 5.219415187835693, "learning_rate": 8.434374319018165e-05, "loss": 2.0885, "step": 1750 }, { "epoch": 0.2825104872539529, "grad_norm": 5.175772190093994, "learning_rate": 8.432474953659219e-05, "loss": 2.3823, "step": 1751 }, { "epoch": 0.28267182962245885, "grad_norm": 6.690397262573242, "learning_rate": 8.430574651020664e-05, "loss": 1.9578, "step": 1752 }, { "epoch": 0.2828331719909648, "grad_norm": 4.6421284675598145, "learning_rate": 8.428673411621401e-05, "loss": 2.0015, "step": 1753 }, { "epoch": 0.2829945143594708, "grad_norm": 3.951106548309326, "learning_rate": 8.426771235980587e-05, "loss": 2.1639, "step": 1754 }, { "epoch": 0.2831558567279768, "grad_norm": 4.136624813079834, "learning_rate": 8.424868124617636e-05, "loss": 2.0264, "step": 1755 }, { "epoch": 0.28331719909648273, "grad_norm": 5.540128707885742, "learning_rate": 8.422964078052213e-05, "loss": 2.165, "step": 1756 }, { "epoch": 0.2834785414649887, "grad_norm": 3.8847310543060303, "learning_rate": 8.421059096804244e-05, "loss": 2.0028, "step": 1757 }, { "epoch": 0.2836398838334947, "grad_norm": 3.25927472114563, "learning_rate": 8.419153181393909e-05, "loss": 2.08, "step": 1758 }, { "epoch": 0.28380122620200066, "grad_norm": 4.559014797210693, "learning_rate": 8.417246332341637e-05, "loss": 2.0188, "step": 1759 }, { "epoch": 0.2839625685705066, "grad_norm": 5.007056713104248, "learning_rate": 8.41533855016812e-05, "loss": 2.2219, "step": 1760 }, { "epoch": 0.28412391093901257, "grad_norm": 5.803976058959961, "learning_rate": 8.413429835394302e-05, "loss": 2.1245, "step": 1761 }, { "epoch": 0.2842852533075186, "grad_norm": 3.300891876220703, "learning_rate": 8.411520188541379e-05, "loss": 2.117, "step": 1762 }, { "epoch": 0.28444659567602454, "grad_norm": 3.984173536300659, "learning_rate": 8.409609610130804e-05, "loss": 1.945, "step": 1763 }, { "epoch": 0.2846079380445305, "grad_norm": 8.194157600402832, "learning_rate": 8.407698100684284e-05, "loss": 2.1773, "step": 1764 }, { "epoch": 0.28476928041303645, "grad_norm": 5.650846481323242, "learning_rate": 8.405785660723783e-05, "loss": 2.1189, "step": 1765 }, { "epoch": 0.2849306227815424, "grad_norm": 3.9818012714385986, "learning_rate": 8.403872290771513e-05, "loss": 1.8912, "step": 1766 }, { "epoch": 0.2850919651500484, "grad_norm": 6.198014259338379, "learning_rate": 8.401957991349945e-05, "loss": 2.1052, "step": 1767 }, { "epoch": 0.2852533075185544, "grad_norm": 4.0621161460876465, "learning_rate": 8.400042762981799e-05, "loss": 2.1539, "step": 1768 }, { "epoch": 0.28541464988706033, "grad_norm": 5.210166931152344, "learning_rate": 8.398126606190056e-05, "loss": 2.1784, "step": 1769 }, { "epoch": 0.2855759922555663, "grad_norm": 5.423094749450684, "learning_rate": 8.396209521497942e-05, "loss": 1.9488, "step": 1770 }, { "epoch": 0.2857373346240723, "grad_norm": 5.281177520751953, "learning_rate": 8.394291509428945e-05, "loss": 2.3128, "step": 1771 }, { "epoch": 0.28589867699257826, "grad_norm": 3.8413915634155273, "learning_rate": 8.3923725705068e-05, "loss": 2.0873, "step": 1772 }, { "epoch": 0.2860600193610842, "grad_norm": 4.192770481109619, "learning_rate": 8.390452705255495e-05, "loss": 1.9095, "step": 1773 }, { "epoch": 0.2862213617295902, "grad_norm": 4.927295684814453, "learning_rate": 8.388531914199275e-05, "loss": 1.8837, "step": 1774 }, { "epoch": 0.2863827040980962, "grad_norm": 3.406172037124634, "learning_rate": 8.386610197862636e-05, "loss": 2.0226, "step": 1775 }, { "epoch": 0.28654404646660214, "grad_norm": 5.010184288024902, "learning_rate": 8.384687556770326e-05, "loss": 1.9072, "step": 1776 }, { "epoch": 0.2867053888351081, "grad_norm": 4.786250591278076, "learning_rate": 8.382763991447344e-05, "loss": 2.0567, "step": 1777 }, { "epoch": 0.28686673120361406, "grad_norm": 5.376852035522461, "learning_rate": 8.380839502418945e-05, "loss": 2.1561, "step": 1778 }, { "epoch": 0.28702807357212, "grad_norm": 5.463832378387451, "learning_rate": 8.378914090210634e-05, "loss": 2.2231, "step": 1779 }, { "epoch": 0.287189415940626, "grad_norm": 3.5489230155944824, "learning_rate": 8.37698775534817e-05, "loss": 2.2164, "step": 1780 }, { "epoch": 0.287350758309132, "grad_norm": 5.189237117767334, "learning_rate": 8.375060498357561e-05, "loss": 2.1313, "step": 1781 }, { "epoch": 0.28751210067763794, "grad_norm": 3.619138479232788, "learning_rate": 8.373132319765066e-05, "loss": 2.2953, "step": 1782 }, { "epoch": 0.2876734430461439, "grad_norm": 4.930722236633301, "learning_rate": 8.371203220097202e-05, "loss": 2.4205, "step": 1783 }, { "epoch": 0.2878347854146499, "grad_norm": 4.0688323974609375, "learning_rate": 8.369273199880731e-05, "loss": 2.1226, "step": 1784 }, { "epoch": 0.28799612778315586, "grad_norm": 3.675079584121704, "learning_rate": 8.367342259642672e-05, "loss": 2.0162, "step": 1785 }, { "epoch": 0.2881574701516618, "grad_norm": 4.427017688751221, "learning_rate": 8.365410399910288e-05, "loss": 2.3975, "step": 1786 }, { "epoch": 0.2883188125201678, "grad_norm": 3.828359603881836, "learning_rate": 8.363477621211099e-05, "loss": 2.1091, "step": 1787 }, { "epoch": 0.2884801548886738, "grad_norm": 3.7655129432678223, "learning_rate": 8.361543924072873e-05, "loss": 1.8609, "step": 1788 }, { "epoch": 0.28864149725717975, "grad_norm": 4.367695331573486, "learning_rate": 8.359609309023632e-05, "loss": 2.1469, "step": 1789 }, { "epoch": 0.2888028396256857, "grad_norm": 4.422239303588867, "learning_rate": 8.357673776591643e-05, "loss": 1.9042, "step": 1790 }, { "epoch": 0.28896418199419166, "grad_norm": 3.3966128826141357, "learning_rate": 8.355737327305433e-05, "loss": 2.1162, "step": 1791 }, { "epoch": 0.2891255243626976, "grad_norm": 5.572344779968262, "learning_rate": 8.353799961693767e-05, "loss": 2.0833, "step": 1792 }, { "epoch": 0.28928686673120363, "grad_norm": 6.192033290863037, "learning_rate": 8.351861680285668e-05, "loss": 2.2833, "step": 1793 }, { "epoch": 0.2894482090997096, "grad_norm": 3.767331600189209, "learning_rate": 8.34992248361041e-05, "loss": 2.1159, "step": 1794 }, { "epoch": 0.28960955146821554, "grad_norm": 3.587618589401245, "learning_rate": 8.347982372197514e-05, "loss": 2.023, "step": 1795 }, { "epoch": 0.2897708938367215, "grad_norm": 3.6493425369262695, "learning_rate": 8.346041346576751e-05, "loss": 2.3199, "step": 1796 }, { "epoch": 0.2899322362052275, "grad_norm": 4.177402973175049, "learning_rate": 8.344099407278141e-05, "loss": 1.7962, "step": 1797 }, { "epoch": 0.29009357857373347, "grad_norm": 4.965822219848633, "learning_rate": 8.342156554831955e-05, "loss": 1.8738, "step": 1798 }, { "epoch": 0.2902549209422394, "grad_norm": 4.516384124755859, "learning_rate": 8.340212789768712e-05, "loss": 2.319, "step": 1799 }, { "epoch": 0.2904162633107454, "grad_norm": 4.535025119781494, "learning_rate": 8.338268112619183e-05, "loss": 2.0098, "step": 1800 }, { "epoch": 0.2905776056792514, "grad_norm": 3.820551633834839, "learning_rate": 8.336322523914385e-05, "loss": 1.9986, "step": 1801 }, { "epoch": 0.29073894804775735, "grad_norm": 3.7913529872894287, "learning_rate": 8.334376024185584e-05, "loss": 2.091, "step": 1802 }, { "epoch": 0.2909002904162633, "grad_norm": 4.389832496643066, "learning_rate": 8.332428613964298e-05, "loss": 2.1236, "step": 1803 }, { "epoch": 0.29106163278476926, "grad_norm": 5.472838401794434, "learning_rate": 8.33048029378229e-05, "loss": 1.8827, "step": 1804 }, { "epoch": 0.2912229751532753, "grad_norm": 4.006913661956787, "learning_rate": 8.328531064171572e-05, "loss": 1.9958, "step": 1805 }, { "epoch": 0.29138431752178123, "grad_norm": 5.290530681610107, "learning_rate": 8.326580925664406e-05, "loss": 1.8631, "step": 1806 }, { "epoch": 0.2915456598902872, "grad_norm": 3.8902676105499268, "learning_rate": 8.324629878793303e-05, "loss": 2.2186, "step": 1807 }, { "epoch": 0.29170700225879315, "grad_norm": 5.456364154815674, "learning_rate": 8.322677924091018e-05, "loss": 2.1408, "step": 1808 }, { "epoch": 0.2918683446272991, "grad_norm": 4.808485984802246, "learning_rate": 8.320725062090557e-05, "loss": 1.6919, "step": 1809 }, { "epoch": 0.2920296869958051, "grad_norm": 4.256877899169922, "learning_rate": 8.318771293325174e-05, "loss": 2.2299, "step": 1810 }, { "epoch": 0.2921910293643111, "grad_norm": 4.910990238189697, "learning_rate": 8.316816618328367e-05, "loss": 2.223, "step": 1811 }, { "epoch": 0.29235237173281703, "grad_norm": 3.7441246509552, "learning_rate": 8.314861037633889e-05, "loss": 1.9777, "step": 1812 }, { "epoch": 0.292513714101323, "grad_norm": 4.108473777770996, "learning_rate": 8.312904551775731e-05, "loss": 2.0013, "step": 1813 }, { "epoch": 0.292675056469829, "grad_norm": 4.527835369110107, "learning_rate": 8.310947161288136e-05, "loss": 2.033, "step": 1814 }, { "epoch": 0.29283639883833495, "grad_norm": 4.592922687530518, "learning_rate": 8.308988866705596e-05, "loss": 2.0957, "step": 1815 }, { "epoch": 0.2929977412068409, "grad_norm": 4.6102800369262695, "learning_rate": 8.307029668562847e-05, "loss": 1.9809, "step": 1816 }, { "epoch": 0.29315908357534687, "grad_norm": 5.48193883895874, "learning_rate": 8.30506956739487e-05, "loss": 1.9832, "step": 1817 }, { "epoch": 0.2933204259438529, "grad_norm": 5.1117682456970215, "learning_rate": 8.303108563736894e-05, "loss": 1.9945, "step": 1818 }, { "epoch": 0.29348176831235884, "grad_norm": 4.609536647796631, "learning_rate": 8.3011466581244e-05, "loss": 1.8112, "step": 1819 }, { "epoch": 0.2936431106808648, "grad_norm": 4.317946910858154, "learning_rate": 8.299183851093108e-05, "loss": 1.9708, "step": 1820 }, { "epoch": 0.29380445304937075, "grad_norm": 4.565094947814941, "learning_rate": 8.297220143178986e-05, "loss": 2.0382, "step": 1821 }, { "epoch": 0.2939657954178767, "grad_norm": 3.6840734481811523, "learning_rate": 8.295255534918248e-05, "loss": 1.9289, "step": 1822 }, { "epoch": 0.2941271377863827, "grad_norm": 3.8791749477386475, "learning_rate": 8.293290026847356e-05, "loss": 1.9768, "step": 1823 }, { "epoch": 0.2942884801548887, "grad_norm": 3.9318950176239014, "learning_rate": 8.291323619503018e-05, "loss": 1.9546, "step": 1824 }, { "epoch": 0.29444982252339463, "grad_norm": 4.0623884201049805, "learning_rate": 8.289356313422182e-05, "loss": 1.8619, "step": 1825 }, { "epoch": 0.2946111648919006, "grad_norm": 3.451681137084961, "learning_rate": 8.287388109142046e-05, "loss": 1.8466, "step": 1826 }, { "epoch": 0.2947725072604066, "grad_norm": 3.832487106323242, "learning_rate": 8.285419007200055e-05, "loss": 2.1728, "step": 1827 }, { "epoch": 0.29493384962891256, "grad_norm": 5.565883159637451, "learning_rate": 8.283449008133894e-05, "loss": 2.1671, "step": 1828 }, { "epoch": 0.2950951919974185, "grad_norm": 5.599283218383789, "learning_rate": 8.281478112481497e-05, "loss": 2.007, "step": 1829 }, { "epoch": 0.2952565343659245, "grad_norm": 4.877832889556885, "learning_rate": 8.279506320781041e-05, "loss": 2.3093, "step": 1830 }, { "epoch": 0.2954178767344305, "grad_norm": 4.901605606079102, "learning_rate": 8.277533633570948e-05, "loss": 1.7875, "step": 1831 }, { "epoch": 0.29557921910293644, "grad_norm": 4.114346981048584, "learning_rate": 8.275560051389884e-05, "loss": 2.053, "step": 1832 }, { "epoch": 0.2957405614714424, "grad_norm": 4.338045120239258, "learning_rate": 8.273585574776758e-05, "loss": 1.9956, "step": 1833 }, { "epoch": 0.29590190383994835, "grad_norm": 4.81793212890625, "learning_rate": 8.27161020427073e-05, "loss": 1.901, "step": 1834 }, { "epoch": 0.29606324620845437, "grad_norm": 4.17056941986084, "learning_rate": 8.269633940411196e-05, "loss": 1.873, "step": 1835 }, { "epoch": 0.2962245885769603, "grad_norm": 5.027039527893066, "learning_rate": 8.267656783737801e-05, "loss": 2.2233, "step": 1836 }, { "epoch": 0.2963859309454663, "grad_norm": 5.195655345916748, "learning_rate": 8.26567873479043e-05, "loss": 2.1567, "step": 1837 }, { "epoch": 0.29654727331397224, "grad_norm": 4.910747051239014, "learning_rate": 8.263699794109215e-05, "loss": 1.8349, "step": 1838 }, { "epoch": 0.2967086156824782, "grad_norm": 3.8523197174072266, "learning_rate": 8.261719962234529e-05, "loss": 2.0515, "step": 1839 }, { "epoch": 0.2968699580509842, "grad_norm": 4.3005757331848145, "learning_rate": 8.259739239706991e-05, "loss": 1.9005, "step": 1840 }, { "epoch": 0.29703130041949016, "grad_norm": 3.722055673599243, "learning_rate": 8.257757627067459e-05, "loss": 2.0472, "step": 1841 }, { "epoch": 0.2971926427879961, "grad_norm": 3.796337604522705, "learning_rate": 8.255775124857042e-05, "loss": 1.927, "step": 1842 }, { "epoch": 0.2973539851565021, "grad_norm": 4.589344024658203, "learning_rate": 8.253791733617082e-05, "loss": 1.9603, "step": 1843 }, { "epoch": 0.2975153275250081, "grad_norm": 4.884768962860107, "learning_rate": 8.251807453889171e-05, "loss": 2.164, "step": 1844 }, { "epoch": 0.29767666989351405, "grad_norm": 6.825287342071533, "learning_rate": 8.249822286215139e-05, "loss": 2.1274, "step": 1845 }, { "epoch": 0.29783801226202, "grad_norm": 4.882968902587891, "learning_rate": 8.247836231137061e-05, "loss": 2.2944, "step": 1846 }, { "epoch": 0.29799935463052596, "grad_norm": 3.7253918647766113, "learning_rate": 8.245849289197253e-05, "loss": 2.0048, "step": 1847 }, { "epoch": 0.29816069699903197, "grad_norm": 5.805208206176758, "learning_rate": 8.243861460938278e-05, "loss": 2.0602, "step": 1848 }, { "epoch": 0.29832203936753793, "grad_norm": 7.223762035369873, "learning_rate": 8.241872746902935e-05, "loss": 2.3192, "step": 1849 }, { "epoch": 0.2984833817360439, "grad_norm": 5.2446208000183105, "learning_rate": 8.239883147634263e-05, "loss": 2.0628, "step": 1850 }, { "epoch": 0.29864472410454984, "grad_norm": 7.644248962402344, "learning_rate": 8.23789266367555e-05, "loss": 2.1512, "step": 1851 }, { "epoch": 0.2988060664730558, "grad_norm": 4.67316198348999, "learning_rate": 8.235901295570324e-05, "loss": 2.0144, "step": 1852 }, { "epoch": 0.2989674088415618, "grad_norm": 4.881799697875977, "learning_rate": 8.23390904386235e-05, "loss": 1.8919, "step": 1853 }, { "epoch": 0.29912875121006777, "grad_norm": 5.2973480224609375, "learning_rate": 8.231915909095637e-05, "loss": 2.0731, "step": 1854 }, { "epoch": 0.2992900935785737, "grad_norm": 5.177144527435303, "learning_rate": 8.229921891814436e-05, "loss": 1.953, "step": 1855 }, { "epoch": 0.2994514359470797, "grad_norm": 4.5609893798828125, "learning_rate": 8.227926992563237e-05, "loss": 1.8789, "step": 1856 }, { "epoch": 0.2996127783155857, "grad_norm": 6.363867282867432, "learning_rate": 8.225931211886772e-05, "loss": 2.0935, "step": 1857 }, { "epoch": 0.29977412068409165, "grad_norm": 4.509897708892822, "learning_rate": 8.223934550330015e-05, "loss": 2.0326, "step": 1858 }, { "epoch": 0.2999354630525976, "grad_norm": 3.1676878929138184, "learning_rate": 8.221937008438178e-05, "loss": 2.1461, "step": 1859 }, { "epoch": 0.30009680542110356, "grad_norm": 3.9433722496032715, "learning_rate": 8.219938586756712e-05, "loss": 2.073, "step": 1860 }, { "epoch": 0.3002581477896096, "grad_norm": 4.704596996307373, "learning_rate": 8.217939285831316e-05, "loss": 2.3441, "step": 1861 }, { "epoch": 0.30041949015811553, "grad_norm": 4.053814888000488, "learning_rate": 8.21593910620792e-05, "loss": 2.0403, "step": 1862 }, { "epoch": 0.3005808325266215, "grad_norm": 5.388185024261475, "learning_rate": 8.213938048432697e-05, "loss": 1.9212, "step": 1863 }, { "epoch": 0.30074217489512745, "grad_norm": 3.7790966033935547, "learning_rate": 8.211936113052063e-05, "loss": 2.034, "step": 1864 }, { "epoch": 0.30090351726363346, "grad_norm": 3.5199756622314453, "learning_rate": 8.20993330061267e-05, "loss": 2.0374, "step": 1865 }, { "epoch": 0.3010648596321394, "grad_norm": 5.511442184448242, "learning_rate": 8.207929611661411e-05, "loss": 2.457, "step": 1866 }, { "epoch": 0.30122620200064537, "grad_norm": 4.7877655029296875, "learning_rate": 8.205925046745419e-05, "loss": 2.0208, "step": 1867 }, { "epoch": 0.30138754436915133, "grad_norm": 4.138000965118408, "learning_rate": 8.203919606412063e-05, "loss": 1.9117, "step": 1868 }, { "epoch": 0.3015488867376573, "grad_norm": 5.232863426208496, "learning_rate": 8.201913291208954e-05, "loss": 1.958, "step": 1869 }, { "epoch": 0.3017102291061633, "grad_norm": 4.581702709197998, "learning_rate": 8.199906101683941e-05, "loss": 2.271, "step": 1870 }, { "epoch": 0.30187157147466925, "grad_norm": 4.821216583251953, "learning_rate": 8.19789803838511e-05, "loss": 1.9112, "step": 1871 }, { "epoch": 0.3020329138431752, "grad_norm": 6.259906768798828, "learning_rate": 8.195889101860793e-05, "loss": 2.0971, "step": 1872 }, { "epoch": 0.30219425621168117, "grad_norm": 3.859330892562866, "learning_rate": 8.19387929265955e-05, "loss": 2.0736, "step": 1873 }, { "epoch": 0.3023555985801872, "grad_norm": 3.745858907699585, "learning_rate": 8.191868611330184e-05, "loss": 2.0505, "step": 1874 }, { "epoch": 0.30251694094869314, "grad_norm": 4.091365814208984, "learning_rate": 8.189857058421741e-05, "loss": 2.2169, "step": 1875 }, { "epoch": 0.3026782833171991, "grad_norm": 4.674230098724365, "learning_rate": 8.187844634483496e-05, "loss": 2.0723, "step": 1876 }, { "epoch": 0.30283962568570505, "grad_norm": 4.745269775390625, "learning_rate": 8.185831340064967e-05, "loss": 2.1235, "step": 1877 }, { "epoch": 0.30300096805421106, "grad_norm": 6.834924221038818, "learning_rate": 8.18381717571591e-05, "loss": 2.2466, "step": 1878 }, { "epoch": 0.303162310422717, "grad_norm": 5.590103626251221, "learning_rate": 8.181802141986317e-05, "loss": 2.1019, "step": 1879 }, { "epoch": 0.303323652791223, "grad_norm": 4.22111701965332, "learning_rate": 8.179786239426417e-05, "loss": 2.0257, "step": 1880 }, { "epoch": 0.30348499515972893, "grad_norm": 3.378553867340088, "learning_rate": 8.177769468586677e-05, "loss": 1.9401, "step": 1881 }, { "epoch": 0.3036463375282349, "grad_norm": 4.3551506996154785, "learning_rate": 8.175751830017801e-05, "loss": 2.0098, "step": 1882 }, { "epoch": 0.3038076798967409, "grad_norm": 6.12330961227417, "learning_rate": 8.173733324270733e-05, "loss": 1.6995, "step": 1883 }, { "epoch": 0.30396902226524686, "grad_norm": 3.480078935623169, "learning_rate": 8.171713951896647e-05, "loss": 2.1602, "step": 1884 }, { "epoch": 0.3041303646337528, "grad_norm": 5.214806079864502, "learning_rate": 8.169693713446959e-05, "loss": 2.0116, "step": 1885 }, { "epoch": 0.30429170700225877, "grad_norm": 3.7905850410461426, "learning_rate": 8.167672609473323e-05, "loss": 2.0998, "step": 1886 }, { "epoch": 0.3044530493707648, "grad_norm": 4.413736820220947, "learning_rate": 8.16565064052762e-05, "loss": 1.8602, "step": 1887 }, { "epoch": 0.30461439173927074, "grad_norm": 3.732191801071167, "learning_rate": 8.16362780716198e-05, "loss": 1.9877, "step": 1888 }, { "epoch": 0.3047757341077767, "grad_norm": 7.122950077056885, "learning_rate": 8.161604109928757e-05, "loss": 2.2097, "step": 1889 }, { "epoch": 0.30493707647628265, "grad_norm": 4.7883806228637695, "learning_rate": 8.159579549380552e-05, "loss": 1.9594, "step": 1890 }, { "epoch": 0.30509841884478867, "grad_norm": 3.7021067142486572, "learning_rate": 8.157554126070191e-05, "loss": 2.0678, "step": 1891 }, { "epoch": 0.3052597612132946, "grad_norm": 4.850696086883545, "learning_rate": 8.155527840550746e-05, "loss": 2.0929, "step": 1892 }, { "epoch": 0.3054211035818006, "grad_norm": 4.618776798248291, "learning_rate": 8.153500693375515e-05, "loss": 1.9542, "step": 1893 }, { "epoch": 0.30558244595030654, "grad_norm": 5.683887481689453, "learning_rate": 8.151472685098036e-05, "loss": 1.956, "step": 1894 }, { "epoch": 0.3057437883188125, "grad_norm": 5.511989593505859, "learning_rate": 8.149443816272083e-05, "loss": 2.067, "step": 1895 }, { "epoch": 0.3059051306873185, "grad_norm": 4.221396446228027, "learning_rate": 8.147414087451664e-05, "loss": 1.8161, "step": 1896 }, { "epoch": 0.30606647305582446, "grad_norm": 4.867199420928955, "learning_rate": 8.14538349919102e-05, "loss": 2.18, "step": 1897 }, { "epoch": 0.3062278154243304, "grad_norm": 5.007112979888916, "learning_rate": 8.14335205204463e-05, "loss": 2.2441, "step": 1898 }, { "epoch": 0.3063891577928364, "grad_norm": 3.9696736335754395, "learning_rate": 8.141319746567204e-05, "loss": 1.874, "step": 1899 }, { "epoch": 0.3065505001613424, "grad_norm": 5.212623119354248, "learning_rate": 8.139286583313689e-05, "loss": 1.9406, "step": 1900 }, { "epoch": 0.30671184252984834, "grad_norm": 4.380959987640381, "learning_rate": 8.137252562839265e-05, "loss": 1.8606, "step": 1901 }, { "epoch": 0.3068731848983543, "grad_norm": 3.6407463550567627, "learning_rate": 8.135217685699345e-05, "loss": 2.0841, "step": 1902 }, { "epoch": 0.30703452726686026, "grad_norm": 4.7653608322143555, "learning_rate": 8.133181952449582e-05, "loss": 2.106, "step": 1903 }, { "epoch": 0.30719586963536627, "grad_norm": 5.136886119842529, "learning_rate": 8.131145363645851e-05, "loss": 2.1021, "step": 1904 }, { "epoch": 0.3073572120038722, "grad_norm": 3.9044461250305176, "learning_rate": 8.129107919844274e-05, "loss": 2.0675, "step": 1905 }, { "epoch": 0.3075185543723782, "grad_norm": 3.826951265335083, "learning_rate": 8.127069621601198e-05, "loss": 1.9938, "step": 1906 }, { "epoch": 0.30767989674088414, "grad_norm": 4.828694820404053, "learning_rate": 8.125030469473202e-05, "loss": 2.0543, "step": 1907 }, { "epoch": 0.30784123910939015, "grad_norm": 5.609941005706787, "learning_rate": 8.122990464017109e-05, "loss": 2.1488, "step": 1908 }, { "epoch": 0.3080025814778961, "grad_norm": 4.673035144805908, "learning_rate": 8.12094960578996e-05, "loss": 1.8555, "step": 1909 }, { "epoch": 0.30816392384640207, "grad_norm": 3.8414700031280518, "learning_rate": 8.118907895349039e-05, "loss": 2.1498, "step": 1910 }, { "epoch": 0.308325266214908, "grad_norm": 4.503723621368408, "learning_rate": 8.116865333251864e-05, "loss": 2.1516, "step": 1911 }, { "epoch": 0.308486608583414, "grad_norm": 3.872340440750122, "learning_rate": 8.114821920056177e-05, "loss": 2.0245, "step": 1912 }, { "epoch": 0.30864795095192, "grad_norm": 4.699495315551758, "learning_rate": 8.112777656319959e-05, "loss": 2.2828, "step": 1913 }, { "epoch": 0.30880929332042595, "grad_norm": 5.339008331298828, "learning_rate": 8.110732542601423e-05, "loss": 1.9724, "step": 1914 }, { "epoch": 0.3089706356889319, "grad_norm": 4.341615676879883, "learning_rate": 8.10868657945901e-05, "loss": 2.0989, "step": 1915 }, { "epoch": 0.30913197805743786, "grad_norm": 5.45268440246582, "learning_rate": 8.106639767451396e-05, "loss": 2.2122, "step": 1916 }, { "epoch": 0.3092933204259439, "grad_norm": 6.427046298980713, "learning_rate": 8.104592107137489e-05, "loss": 2.0737, "step": 1917 }, { "epoch": 0.30945466279444983, "grad_norm": 4.466939926147461, "learning_rate": 8.102543599076427e-05, "loss": 2.2812, "step": 1918 }, { "epoch": 0.3096160051629558, "grad_norm": 3.7878451347351074, "learning_rate": 8.100494243827582e-05, "loss": 2.1585, "step": 1919 }, { "epoch": 0.30977734753146174, "grad_norm": 5.668677806854248, "learning_rate": 8.098444041950553e-05, "loss": 1.9682, "step": 1920 }, { "epoch": 0.30993868989996776, "grad_norm": 4.185689926147461, "learning_rate": 8.096392994005177e-05, "loss": 2.0743, "step": 1921 }, { "epoch": 0.3101000322684737, "grad_norm": 7.4727463722229, "learning_rate": 8.094341100551512e-05, "loss": 2.4402, "step": 1922 }, { "epoch": 0.31026137463697967, "grad_norm": 4.48516845703125, "learning_rate": 8.092288362149858e-05, "loss": 2.38, "step": 1923 }, { "epoch": 0.3104227170054856, "grad_norm": 4.385977745056152, "learning_rate": 8.09023477936074e-05, "loss": 2.1088, "step": 1924 }, { "epoch": 0.3105840593739916, "grad_norm": 3.730656385421753, "learning_rate": 8.088180352744911e-05, "loss": 2.1832, "step": 1925 }, { "epoch": 0.3107454017424976, "grad_norm": 5.2465972900390625, "learning_rate": 8.08612508286336e-05, "loss": 2.1811, "step": 1926 }, { "epoch": 0.31090674411100355, "grad_norm": 4.33252477645874, "learning_rate": 8.084068970277305e-05, "loss": 2.1834, "step": 1927 }, { "epoch": 0.3110680864795095, "grad_norm": 7.046680450439453, "learning_rate": 8.082012015548188e-05, "loss": 2.371, "step": 1928 }, { "epoch": 0.31122942884801547, "grad_norm": 4.966737270355225, "learning_rate": 8.07995421923769e-05, "loss": 2.0372, "step": 1929 }, { "epoch": 0.3113907712165215, "grad_norm": 3.923862934112549, "learning_rate": 8.077895581907718e-05, "loss": 2.218, "step": 1930 }, { "epoch": 0.31155211358502743, "grad_norm": 4.700870990753174, "learning_rate": 8.075836104120407e-05, "loss": 2.0869, "step": 1931 }, { "epoch": 0.3117134559535334, "grad_norm": 4.45346736907959, "learning_rate": 8.073775786438122e-05, "loss": 2.0875, "step": 1932 }, { "epoch": 0.31187479832203935, "grad_norm": 5.010293483734131, "learning_rate": 8.071714629423459e-05, "loss": 2.1765, "step": 1933 }, { "epoch": 0.31203614069054536, "grad_norm": 3.4277050495147705, "learning_rate": 8.069652633639241e-05, "loss": 1.8646, "step": 1934 }, { "epoch": 0.3121974830590513, "grad_norm": 4.554781913757324, "learning_rate": 8.067589799648523e-05, "loss": 1.6562, "step": 1935 }, { "epoch": 0.3123588254275573, "grad_norm": 4.1686296463012695, "learning_rate": 8.065526128014587e-05, "loss": 2.0406, "step": 1936 }, { "epoch": 0.31252016779606323, "grad_norm": 4.863027572631836, "learning_rate": 8.063461619300943e-05, "loss": 2.0048, "step": 1937 }, { "epoch": 0.31268151016456924, "grad_norm": 4.528186321258545, "learning_rate": 8.061396274071333e-05, "loss": 1.8828, "step": 1938 }, { "epoch": 0.3128428525330752, "grad_norm": 3.636806011199951, "learning_rate": 8.059330092889723e-05, "loss": 2.135, "step": 1939 }, { "epoch": 0.31300419490158116, "grad_norm": 5.246001720428467, "learning_rate": 8.057263076320309e-05, "loss": 2.1713, "step": 1940 }, { "epoch": 0.3131655372700871, "grad_norm": 4.590300559997559, "learning_rate": 8.055195224927517e-05, "loss": 1.9298, "step": 1941 }, { "epoch": 0.31332687963859307, "grad_norm": 4.156294822692871, "learning_rate": 8.053126539275997e-05, "loss": 2.1276, "step": 1942 }, { "epoch": 0.3134882220070991, "grad_norm": 7.552245140075684, "learning_rate": 8.051057019930631e-05, "loss": 2.1637, "step": 1943 }, { "epoch": 0.31364956437560504, "grad_norm": 3.9365739822387695, "learning_rate": 8.048986667456527e-05, "loss": 2.2709, "step": 1944 }, { "epoch": 0.313810906744111, "grad_norm": 5.453755855560303, "learning_rate": 8.046915482419018e-05, "loss": 2.1096, "step": 1945 }, { "epoch": 0.31397224911261695, "grad_norm": 3.7501649856567383, "learning_rate": 8.044843465383669e-05, "loss": 2.0991, "step": 1946 }, { "epoch": 0.31413359148112296, "grad_norm": 3.9636921882629395, "learning_rate": 8.042770616916269e-05, "loss": 2.0603, "step": 1947 }, { "epoch": 0.3142949338496289, "grad_norm": 5.861011505126953, "learning_rate": 8.040696937582832e-05, "loss": 1.9778, "step": 1948 }, { "epoch": 0.3144562762181349, "grad_norm": 5.550604820251465, "learning_rate": 8.038622427949607e-05, "loss": 2.007, "step": 1949 }, { "epoch": 0.31461761858664083, "grad_norm": 5.225160121917725, "learning_rate": 8.036547088583062e-05, "loss": 2.3065, "step": 1950 }, { "epoch": 0.31477896095514685, "grad_norm": 4.004058361053467, "learning_rate": 8.034470920049892e-05, "loss": 2.1883, "step": 1951 }, { "epoch": 0.3149403033236528, "grad_norm": 4.266819477081299, "learning_rate": 8.032393922917024e-05, "loss": 1.8595, "step": 1952 }, { "epoch": 0.31510164569215876, "grad_norm": 4.176856994628906, "learning_rate": 8.030316097751606e-05, "loss": 1.904, "step": 1953 }, { "epoch": 0.3152629880606647, "grad_norm": 3.6861655712127686, "learning_rate": 8.028237445121013e-05, "loss": 2.0782, "step": 1954 }, { "epoch": 0.3154243304291707, "grad_norm": 4.144218444824219, "learning_rate": 8.026157965592849e-05, "loss": 1.9113, "step": 1955 }, { "epoch": 0.3155856727976767, "grad_norm": 4.056292533874512, "learning_rate": 8.024077659734938e-05, "loss": 1.9954, "step": 1956 }, { "epoch": 0.31574701516618264, "grad_norm": 4.331222057342529, "learning_rate": 8.021996528115335e-05, "loss": 2.058, "step": 1957 }, { "epoch": 0.3159083575346886, "grad_norm": 5.137801647186279, "learning_rate": 8.01991457130232e-05, "loss": 2.0594, "step": 1958 }, { "epoch": 0.31606969990319456, "grad_norm": 3.8450074195861816, "learning_rate": 8.017831789864394e-05, "loss": 1.9862, "step": 1959 }, { "epoch": 0.31623104227170057, "grad_norm": 3.9215176105499268, "learning_rate": 8.01574818437029e-05, "loss": 1.8986, "step": 1960 }, { "epoch": 0.3163923846402065, "grad_norm": 4.650374412536621, "learning_rate": 8.013663755388958e-05, "loss": 1.9499, "step": 1961 }, { "epoch": 0.3165537270087125, "grad_norm": 4.070428848266602, "learning_rate": 8.01157850348958e-05, "loss": 2.1298, "step": 1962 }, { "epoch": 0.31671506937721844, "grad_norm": 4.45579719543457, "learning_rate": 8.009492429241559e-05, "loss": 2.3318, "step": 1963 }, { "epoch": 0.31687641174572445, "grad_norm": 5.952184677124023, "learning_rate": 8.007405533214524e-05, "loss": 2.2048, "step": 1964 }, { "epoch": 0.3170377541142304, "grad_norm": 4.4508376121521, "learning_rate": 8.005317815978326e-05, "loss": 2.0861, "step": 1965 }, { "epoch": 0.31719909648273636, "grad_norm": 7.377951622009277, "learning_rate": 8.003229278103043e-05, "loss": 2.0022, "step": 1966 }, { "epoch": 0.3173604388512423, "grad_norm": 5.454758644104004, "learning_rate": 8.001139920158976e-05, "loss": 1.9503, "step": 1967 }, { "epoch": 0.31752178121974833, "grad_norm": 5.345936298370361, "learning_rate": 7.999049742716649e-05, "loss": 2.0027, "step": 1968 }, { "epoch": 0.3176831235882543, "grad_norm": 5.304137229919434, "learning_rate": 7.996958746346812e-05, "loss": 2.0196, "step": 1969 }, { "epoch": 0.31784446595676025, "grad_norm": 8.837878227233887, "learning_rate": 7.994866931620438e-05, "loss": 2.1589, "step": 1970 }, { "epoch": 0.3180058083252662, "grad_norm": 4.453640460968018, "learning_rate": 7.99277429910872e-05, "loss": 1.9992, "step": 1971 }, { "epoch": 0.31816715069377216, "grad_norm": 5.355186462402344, "learning_rate": 7.99068084938308e-05, "loss": 2.0244, "step": 1972 }, { "epoch": 0.3183284930622782, "grad_norm": 4.449441432952881, "learning_rate": 7.988586583015156e-05, "loss": 2.0835, "step": 1973 }, { "epoch": 0.31848983543078413, "grad_norm": 4.989075660705566, "learning_rate": 7.986491500576818e-05, "loss": 2.3297, "step": 1974 }, { "epoch": 0.3186511777992901, "grad_norm": 4.702456951141357, "learning_rate": 7.984395602640153e-05, "loss": 1.8845, "step": 1975 }, { "epoch": 0.31881252016779604, "grad_norm": 4.21317195892334, "learning_rate": 7.982298889777471e-05, "loss": 1.8683, "step": 1976 }, { "epoch": 0.31897386253630206, "grad_norm": 3.82844877243042, "learning_rate": 7.980201362561305e-05, "loss": 2.1639, "step": 1977 }, { "epoch": 0.319135204904808, "grad_norm": 5.126898765563965, "learning_rate": 7.978103021564412e-05, "loss": 1.8773, "step": 1978 }, { "epoch": 0.31929654727331397, "grad_norm": 5.022282600402832, "learning_rate": 7.976003867359767e-05, "loss": 1.818, "step": 1979 }, { "epoch": 0.3194578896418199, "grad_norm": 5.216141700744629, "learning_rate": 7.973903900520574e-05, "loss": 1.9978, "step": 1980 }, { "epoch": 0.31961923201032594, "grad_norm": 7.017202377319336, "learning_rate": 7.971803121620251e-05, "loss": 1.9971, "step": 1981 }, { "epoch": 0.3197805743788319, "grad_norm": 4.33177375793457, "learning_rate": 7.969701531232445e-05, "loss": 2.0671, "step": 1982 }, { "epoch": 0.31994191674733785, "grad_norm": 3.8065788745880127, "learning_rate": 7.967599129931019e-05, "loss": 1.9905, "step": 1983 }, { "epoch": 0.3201032591158438, "grad_norm": 4.233896255493164, "learning_rate": 7.96549591829006e-05, "loss": 1.9748, "step": 1984 }, { "epoch": 0.32026460148434976, "grad_norm": 4.714889049530029, "learning_rate": 7.963391896883874e-05, "loss": 1.9152, "step": 1985 }, { "epoch": 0.3204259438528558, "grad_norm": 5.847426891326904, "learning_rate": 7.961287066286994e-05, "loss": 2.0224, "step": 1986 }, { "epoch": 0.32058728622136173, "grad_norm": 3.1339242458343506, "learning_rate": 7.959181427074167e-05, "loss": 2.0242, "step": 1987 }, { "epoch": 0.3207486285898677, "grad_norm": 4.087879180908203, "learning_rate": 7.957074979820365e-05, "loss": 2.1799, "step": 1988 }, { "epoch": 0.32090997095837365, "grad_norm": 4.901279926300049, "learning_rate": 7.954967725100779e-05, "loss": 2.2795, "step": 1989 }, { "epoch": 0.32107131332687966, "grad_norm": 4.170873165130615, "learning_rate": 7.95285966349082e-05, "loss": 2.1406, "step": 1990 }, { "epoch": 0.3212326556953856, "grad_norm": 3.6736488342285156, "learning_rate": 7.950750795566123e-05, "loss": 1.9033, "step": 1991 }, { "epoch": 0.3213939980638916, "grad_norm": 4.261050224304199, "learning_rate": 7.948641121902537e-05, "loss": 2.2996, "step": 1992 }, { "epoch": 0.32155534043239753, "grad_norm": 4.296965599060059, "learning_rate": 7.946530643076138e-05, "loss": 2.2059, "step": 1993 }, { "epoch": 0.32171668280090354, "grad_norm": 3.860304594039917, "learning_rate": 7.944419359663213e-05, "loss": 1.8987, "step": 1994 }, { "epoch": 0.3218780251694095, "grad_norm": 4.054232120513916, "learning_rate": 7.94230727224028e-05, "loss": 1.8697, "step": 1995 }, { "epoch": 0.32203936753791546, "grad_norm": 4.921481132507324, "learning_rate": 7.940194381384066e-05, "loss": 2.0686, "step": 1996 }, { "epoch": 0.3222007099064214, "grad_norm": 3.2399144172668457, "learning_rate": 7.938080687671524e-05, "loss": 1.9011, "step": 1997 }, { "epoch": 0.32236205227492737, "grad_norm": 3.696622848510742, "learning_rate": 7.935966191679824e-05, "loss": 2.0503, "step": 1998 }, { "epoch": 0.3225233946434334, "grad_norm": 4.244028568267822, "learning_rate": 7.933850893986354e-05, "loss": 1.9405, "step": 1999 }, { "epoch": 0.32268473701193934, "grad_norm": 7.01930570602417, "learning_rate": 7.931734795168724e-05, "loss": 1.8827, "step": 2000 }, { "epoch": 0.3228460793804453, "grad_norm": 4.2505645751953125, "learning_rate": 7.92961789580476e-05, "loss": 1.9412, "step": 2001 }, { "epoch": 0.32300742174895125, "grad_norm": 3.3729472160339355, "learning_rate": 7.927500196472506e-05, "loss": 2.2648, "step": 2002 }, { "epoch": 0.32316876411745726, "grad_norm": 4.692650318145752, "learning_rate": 7.925381697750229e-05, "loss": 2.4044, "step": 2003 }, { "epoch": 0.3233301064859632, "grad_norm": 3.2933013439178467, "learning_rate": 7.923262400216408e-05, "loss": 2.0314, "step": 2004 }, { "epoch": 0.3234914488544692, "grad_norm": 6.476714134216309, "learning_rate": 7.921142304449745e-05, "loss": 1.7739, "step": 2005 }, { "epoch": 0.32365279122297513, "grad_norm": 5.154987812042236, "learning_rate": 7.91902141102916e-05, "loss": 2.0001, "step": 2006 }, { "epoch": 0.32381413359148115, "grad_norm": 3.7503819465637207, "learning_rate": 7.916899720533786e-05, "loss": 2.227, "step": 2007 }, { "epoch": 0.3239754759599871, "grad_norm": 4.138298511505127, "learning_rate": 7.914777233542978e-05, "loss": 1.9531, "step": 2008 }, { "epoch": 0.32413681832849306, "grad_norm": 3.416266918182373, "learning_rate": 7.912653950636306e-05, "loss": 2.1842, "step": 2009 }, { "epoch": 0.324298160696999, "grad_norm": 4.819826126098633, "learning_rate": 7.91052987239356e-05, "loss": 2.0731, "step": 2010 }, { "epoch": 0.32445950306550503, "grad_norm": 4.643051624298096, "learning_rate": 7.908404999394746e-05, "loss": 2.1121, "step": 2011 }, { "epoch": 0.324620845434011, "grad_norm": 3.803864002227783, "learning_rate": 7.906279332220087e-05, "loss": 1.831, "step": 2012 }, { "epoch": 0.32478218780251694, "grad_norm": 5.006372928619385, "learning_rate": 7.904152871450022e-05, "loss": 1.8421, "step": 2013 }, { "epoch": 0.3249435301710229, "grad_norm": 4.55983829498291, "learning_rate": 7.902025617665205e-05, "loss": 2.1396, "step": 2014 }, { "epoch": 0.32510487253952886, "grad_norm": 5.165456771850586, "learning_rate": 7.899897571446513e-05, "loss": 2.0341, "step": 2015 }, { "epoch": 0.32526621490803487, "grad_norm": 4.067203998565674, "learning_rate": 7.897768733375033e-05, "loss": 1.9253, "step": 2016 }, { "epoch": 0.3254275572765408, "grad_norm": 3.1523919105529785, "learning_rate": 7.895639104032071e-05, "loss": 2.0588, "step": 2017 }, { "epoch": 0.3255888996450468, "grad_norm": 3.850659132003784, "learning_rate": 7.893508683999149e-05, "loss": 1.8334, "step": 2018 }, { "epoch": 0.32575024201355274, "grad_norm": 4.036777496337891, "learning_rate": 7.891377473858002e-05, "loss": 1.9177, "step": 2019 }, { "epoch": 0.32591158438205875, "grad_norm": 6.105804443359375, "learning_rate": 7.889245474190588e-05, "loss": 2.1875, "step": 2020 }, { "epoch": 0.3260729267505647, "grad_norm": 4.579841613769531, "learning_rate": 7.88711268557907e-05, "loss": 1.841, "step": 2021 }, { "epoch": 0.32623426911907066, "grad_norm": 3.5988316535949707, "learning_rate": 7.884979108605837e-05, "loss": 2.1727, "step": 2022 }, { "epoch": 0.3263956114875766, "grad_norm": 3.7694644927978516, "learning_rate": 7.882844743853484e-05, "loss": 2.1283, "step": 2023 }, { "epoch": 0.32655695385608263, "grad_norm": 4.465296745300293, "learning_rate": 7.880709591904832e-05, "loss": 1.9834, "step": 2024 }, { "epoch": 0.3267182962245886, "grad_norm": 3.46445631980896, "learning_rate": 7.878573653342904e-05, "loss": 2.144, "step": 2025 }, { "epoch": 0.32687963859309455, "grad_norm": 4.885611057281494, "learning_rate": 7.876436928750947e-05, "loss": 1.9757, "step": 2026 }, { "epoch": 0.3270409809616005, "grad_norm": 4.627254962921143, "learning_rate": 7.874299418712421e-05, "loss": 1.8614, "step": 2027 }, { "epoch": 0.32720232333010646, "grad_norm": 3.813387632369995, "learning_rate": 7.872161123810999e-05, "loss": 2.1079, "step": 2028 }, { "epoch": 0.32736366569861247, "grad_norm": 4.278948783874512, "learning_rate": 7.870022044630569e-05, "loss": 2.0231, "step": 2029 }, { "epoch": 0.32752500806711843, "grad_norm": 4.185835361480713, "learning_rate": 7.86788218175523e-05, "loss": 2.0688, "step": 2030 }, { "epoch": 0.3276863504356244, "grad_norm": 4.226234436035156, "learning_rate": 7.865741535769303e-05, "loss": 1.9921, "step": 2031 }, { "epoch": 0.32784769280413034, "grad_norm": 3.9076058864593506, "learning_rate": 7.863600107257314e-05, "loss": 2.0349, "step": 2032 }, { "epoch": 0.32800903517263635, "grad_norm": 3.57064151763916, "learning_rate": 7.861457896804007e-05, "loss": 1.9627, "step": 2033 }, { "epoch": 0.3281703775411423, "grad_norm": 4.979307174682617, "learning_rate": 7.859314904994339e-05, "loss": 2.4244, "step": 2034 }, { "epoch": 0.32833171990964827, "grad_norm": 4.064205169677734, "learning_rate": 7.857171132413483e-05, "loss": 2.1526, "step": 2035 }, { "epoch": 0.3284930622781542, "grad_norm": 3.6024906635284424, "learning_rate": 7.855026579646818e-05, "loss": 1.9607, "step": 2036 }, { "epoch": 0.32865440464666024, "grad_norm": 5.789567947387695, "learning_rate": 7.852881247279944e-05, "loss": 2.086, "step": 2037 }, { "epoch": 0.3288157470151662, "grad_norm": 4.560086727142334, "learning_rate": 7.85073513589867e-05, "loss": 1.9153, "step": 2038 }, { "epoch": 0.32897708938367215, "grad_norm": 4.217399597167969, "learning_rate": 7.848588246089017e-05, "loss": 2.1856, "step": 2039 }, { "epoch": 0.3291384317521781, "grad_norm": 5.792709827423096, "learning_rate": 7.84644057843722e-05, "loss": 2.0268, "step": 2040 }, { "epoch": 0.3292997741206841, "grad_norm": 4.474367141723633, "learning_rate": 7.844292133529727e-05, "loss": 2.0528, "step": 2041 }, { "epoch": 0.3294611164891901, "grad_norm": 3.6288115978240967, "learning_rate": 7.842142911953197e-05, "loss": 1.9576, "step": 2042 }, { "epoch": 0.32962245885769603, "grad_norm": 3.5075695514678955, "learning_rate": 7.839992914294499e-05, "loss": 1.9709, "step": 2043 }, { "epoch": 0.329783801226202, "grad_norm": 3.612318277359009, "learning_rate": 7.837842141140721e-05, "loss": 1.9535, "step": 2044 }, { "epoch": 0.32994514359470795, "grad_norm": 3.621208667755127, "learning_rate": 7.835690593079156e-05, "loss": 2.074, "step": 2045 }, { "epoch": 0.33010648596321396, "grad_norm": 5.363526821136475, "learning_rate": 7.833538270697309e-05, "loss": 1.8701, "step": 2046 }, { "epoch": 0.3302678283317199, "grad_norm": 4.517481803894043, "learning_rate": 7.831385174582901e-05, "loss": 1.9816, "step": 2047 }, { "epoch": 0.33042917070022587, "grad_norm": 5.250185489654541, "learning_rate": 7.829231305323858e-05, "loss": 2.1113, "step": 2048 }, { "epoch": 0.33059051306873183, "grad_norm": 3.945758581161499, "learning_rate": 7.827076663508326e-05, "loss": 1.8457, "step": 2049 }, { "epoch": 0.33075185543723784, "grad_norm": 4.452152729034424, "learning_rate": 7.82492124972465e-05, "loss": 2.061, "step": 2050 }, { "epoch": 0.3309131978057438, "grad_norm": 3.809438467025757, "learning_rate": 7.822765064561397e-05, "loss": 2.1061, "step": 2051 }, { "epoch": 0.33107454017424975, "grad_norm": 3.6477572917938232, "learning_rate": 7.820608108607339e-05, "loss": 1.9717, "step": 2052 }, { "epoch": 0.3312358825427557, "grad_norm": 5.503085613250732, "learning_rate": 7.818450382451457e-05, "loss": 2.0585, "step": 2053 }, { "epoch": 0.3313972249112617, "grad_norm": 3.4868550300598145, "learning_rate": 7.816291886682947e-05, "loss": 2.0033, "step": 2054 }, { "epoch": 0.3315585672797677, "grad_norm": 4.83896541595459, "learning_rate": 7.814132621891215e-05, "loss": 2.079, "step": 2055 }, { "epoch": 0.33171990964827364, "grad_norm": 4.1418890953063965, "learning_rate": 7.81197258866587e-05, "loss": 1.8944, "step": 2056 }, { "epoch": 0.3318812520167796, "grad_norm": 4.504819393157959, "learning_rate": 7.809811787596739e-05, "loss": 2.1623, "step": 2057 }, { "epoch": 0.33204259438528555, "grad_norm": 5.678425312042236, "learning_rate": 7.807650219273853e-05, "loss": 2.2039, "step": 2058 }, { "epoch": 0.33220393675379156, "grad_norm": 3.4308807849884033, "learning_rate": 7.805487884287457e-05, "loss": 2.099, "step": 2059 }, { "epoch": 0.3323652791222975, "grad_norm": 4.822578430175781, "learning_rate": 7.803324783228004e-05, "loss": 2.0879, "step": 2060 }, { "epoch": 0.3325266214908035, "grad_norm": 3.984004020690918, "learning_rate": 7.801160916686152e-05, "loss": 1.9915, "step": 2061 }, { "epoch": 0.33268796385930943, "grad_norm": 4.755062103271484, "learning_rate": 7.798996285252773e-05, "loss": 1.9483, "step": 2062 }, { "epoch": 0.33284930622781544, "grad_norm": 3.7323362827301025, "learning_rate": 7.796830889518949e-05, "loss": 1.9571, "step": 2063 }, { "epoch": 0.3330106485963214, "grad_norm": 4.8012213706970215, "learning_rate": 7.794664730075964e-05, "loss": 1.9506, "step": 2064 }, { "epoch": 0.33317199096482736, "grad_norm": 4.999101638793945, "learning_rate": 7.792497807515317e-05, "loss": 2.1258, "step": 2065 }, { "epoch": 0.3333333333333333, "grad_norm": 6.21721076965332, "learning_rate": 7.790330122428711e-05, "loss": 1.9964, "step": 2066 }, { "epoch": 0.3334946757018393, "grad_norm": 3.7509355545043945, "learning_rate": 7.788161675408061e-05, "loss": 1.9618, "step": 2067 }, { "epoch": 0.3336560180703453, "grad_norm": 5.406489372253418, "learning_rate": 7.785992467045487e-05, "loss": 1.9184, "step": 2068 }, { "epoch": 0.33381736043885124, "grad_norm": 4.889528274536133, "learning_rate": 7.783822497933321e-05, "loss": 2.2079, "step": 2069 }, { "epoch": 0.3339787028073572, "grad_norm": 5.023280143737793, "learning_rate": 7.781651768664095e-05, "loss": 1.9985, "step": 2070 }, { "epoch": 0.33414004517586315, "grad_norm": 4.156134128570557, "learning_rate": 7.779480279830557e-05, "loss": 1.9568, "step": 2071 }, { "epoch": 0.33430138754436917, "grad_norm": 4.915197849273682, "learning_rate": 7.777308032025657e-05, "loss": 1.9985, "step": 2072 }, { "epoch": 0.3344627299128751, "grad_norm": 4.4993157386779785, "learning_rate": 7.775135025842554e-05, "loss": 2.2353, "step": 2073 }, { "epoch": 0.3346240722813811, "grad_norm": 4.201979160308838, "learning_rate": 7.772961261874615e-05, "loss": 2.0364, "step": 2074 }, { "epoch": 0.33478541464988704, "grad_norm": 5.750626087188721, "learning_rate": 7.770786740715414e-05, "loss": 1.9875, "step": 2075 }, { "epoch": 0.33494675701839305, "grad_norm": 4.650112152099609, "learning_rate": 7.768611462958728e-05, "loss": 1.8518, "step": 2076 }, { "epoch": 0.335108099386899, "grad_norm": 4.810061454772949, "learning_rate": 7.766435429198546e-05, "loss": 1.9889, "step": 2077 }, { "epoch": 0.33526944175540496, "grad_norm": 4.015198707580566, "learning_rate": 7.76425864002906e-05, "loss": 2.1328, "step": 2078 }, { "epoch": 0.3354307841239109, "grad_norm": 4.309876918792725, "learning_rate": 7.762081096044668e-05, "loss": 1.9792, "step": 2079 }, { "epoch": 0.33559212649241693, "grad_norm": 4.62183141708374, "learning_rate": 7.759902797839979e-05, "loss": 1.8456, "step": 2080 }, { "epoch": 0.3357534688609229, "grad_norm": 5.688324451446533, "learning_rate": 7.757723746009799e-05, "loss": 2.2172, "step": 2081 }, { "epoch": 0.33591481122942884, "grad_norm": 3.3715908527374268, "learning_rate": 7.755543941149149e-05, "loss": 2.1086, "step": 2082 }, { "epoch": 0.3360761535979348, "grad_norm": 4.781265735626221, "learning_rate": 7.753363383853249e-05, "loss": 2.058, "step": 2083 }, { "epoch": 0.3362374959664408, "grad_norm": 5.9148783683776855, "learning_rate": 7.751182074717527e-05, "loss": 1.6179, "step": 2084 }, { "epoch": 0.33639883833494677, "grad_norm": 5.1858720779418945, "learning_rate": 7.74900001433762e-05, "loss": 1.9247, "step": 2085 }, { "epoch": 0.3365601807034527, "grad_norm": 5.318194389343262, "learning_rate": 7.746817203309362e-05, "loss": 1.8551, "step": 2086 }, { "epoch": 0.3367215230719587, "grad_norm": 4.078609943389893, "learning_rate": 7.744633642228798e-05, "loss": 1.9068, "step": 2087 }, { "epoch": 0.33688286544046464, "grad_norm": 3.140766143798828, "learning_rate": 7.742449331692177e-05, "loss": 2.0437, "step": 2088 }, { "epoch": 0.33704420780897065, "grad_norm": 6.085255146026611, "learning_rate": 7.740264272295953e-05, "loss": 2.4435, "step": 2089 }, { "epoch": 0.3372055501774766, "grad_norm": 3.4636056423187256, "learning_rate": 7.73807846463678e-05, "loss": 2.0657, "step": 2090 }, { "epoch": 0.33736689254598257, "grad_norm": 4.229785919189453, "learning_rate": 7.735891909311524e-05, "loss": 2.193, "step": 2091 }, { "epoch": 0.3375282349144885, "grad_norm": 4.070128440856934, "learning_rate": 7.733704606917247e-05, "loss": 2.0491, "step": 2092 }, { "epoch": 0.33768957728299454, "grad_norm": 3.961073398590088, "learning_rate": 7.73151655805122e-05, "loss": 1.8331, "step": 2093 }, { "epoch": 0.3378509196515005, "grad_norm": 5.665839672088623, "learning_rate": 7.729327763310919e-05, "loss": 2.0938, "step": 2094 }, { "epoch": 0.33801226202000645, "grad_norm": 3.713109254837036, "learning_rate": 7.727138223294019e-05, "loss": 2.1567, "step": 2095 }, { "epoch": 0.3381736043885124, "grad_norm": 5.792564392089844, "learning_rate": 7.724947938598401e-05, "loss": 2.0109, "step": 2096 }, { "epoch": 0.3383349467570184, "grad_norm": 4.827317237854004, "learning_rate": 7.722756909822151e-05, "loss": 1.9553, "step": 2097 }, { "epoch": 0.3384962891255244, "grad_norm": 4.299433708190918, "learning_rate": 7.720565137563554e-05, "loss": 1.9985, "step": 2098 }, { "epoch": 0.33865763149403033, "grad_norm": 4.724922180175781, "learning_rate": 7.718372622421101e-05, "loss": 2.0317, "step": 2099 }, { "epoch": 0.3388189738625363, "grad_norm": 4.753623962402344, "learning_rate": 7.716179364993486e-05, "loss": 1.8099, "step": 2100 }, { "epoch": 0.33898031623104224, "grad_norm": 6.869202136993408, "learning_rate": 7.713985365879606e-05, "loss": 1.955, "step": 2101 }, { "epoch": 0.33914165859954826, "grad_norm": 5.462575912475586, "learning_rate": 7.711790625678559e-05, "loss": 1.798, "step": 2102 }, { "epoch": 0.3393030009680542, "grad_norm": 7.04573392868042, "learning_rate": 7.709595144989643e-05, "loss": 2.1349, "step": 2103 }, { "epoch": 0.33946434333656017, "grad_norm": 5.625308990478516, "learning_rate": 7.707398924412365e-05, "loss": 2.0368, "step": 2104 }, { "epoch": 0.3396256857050661, "grad_norm": 4.275908946990967, "learning_rate": 7.705201964546429e-05, "loss": 1.9984, "step": 2105 }, { "epoch": 0.33978702807357214, "grad_norm": 5.542468547821045, "learning_rate": 7.70300426599174e-05, "loss": 2.0229, "step": 2106 }, { "epoch": 0.3399483704420781, "grad_norm": 3.9115285873413086, "learning_rate": 7.70080582934841e-05, "loss": 1.965, "step": 2107 }, { "epoch": 0.34010971281058405, "grad_norm": 4.481706619262695, "learning_rate": 7.698606655216745e-05, "loss": 1.9295, "step": 2108 }, { "epoch": 0.34027105517909, "grad_norm": 4.542694091796875, "learning_rate": 7.69640674419726e-05, "loss": 2.0746, "step": 2109 }, { "epoch": 0.340432397547596, "grad_norm": 3.7146451473236084, "learning_rate": 7.694206096890666e-05, "loss": 1.7149, "step": 2110 }, { "epoch": 0.340593739916102, "grad_norm": 5.7919602394104, "learning_rate": 7.692004713897878e-05, "loss": 1.8816, "step": 2111 }, { "epoch": 0.34075508228460794, "grad_norm": 5.574487686157227, "learning_rate": 7.689802595820013e-05, "loss": 2.1589, "step": 2112 }, { "epoch": 0.3409164246531139, "grad_norm": 4.516279220581055, "learning_rate": 7.68759974325838e-05, "loss": 2.1939, "step": 2113 }, { "epoch": 0.3410777670216199, "grad_norm": 3.7355384826660156, "learning_rate": 7.685396156814502e-05, "loss": 2.0675, "step": 2114 }, { "epoch": 0.34123910939012586, "grad_norm": 4.5356035232543945, "learning_rate": 7.683191837090092e-05, "loss": 1.8799, "step": 2115 }, { "epoch": 0.3414004517586318, "grad_norm": 3.9328882694244385, "learning_rate": 7.680986784687065e-05, "loss": 2.1635, "step": 2116 }, { "epoch": 0.3415617941271378, "grad_norm": 4.58782434463501, "learning_rate": 7.678781000207542e-05, "loss": 2.163, "step": 2117 }, { "epoch": 0.34172313649564373, "grad_norm": 3.2896623611450195, "learning_rate": 7.676574484253837e-05, "loss": 2.2619, "step": 2118 }, { "epoch": 0.34188447886414974, "grad_norm": 4.790579319000244, "learning_rate": 7.674367237428466e-05, "loss": 1.8289, "step": 2119 }, { "epoch": 0.3420458212326557, "grad_norm": 5.787225723266602, "learning_rate": 7.672159260334148e-05, "loss": 2.2768, "step": 2120 }, { "epoch": 0.34220716360116166, "grad_norm": 3.641658067703247, "learning_rate": 7.669950553573795e-05, "loss": 1.8741, "step": 2121 }, { "epoch": 0.3423685059696676, "grad_norm": 3.926100254058838, "learning_rate": 7.667741117750522e-05, "loss": 2.0661, "step": 2122 }, { "epoch": 0.3425298483381736, "grad_norm": 4.437743663787842, "learning_rate": 7.665530953467643e-05, "loss": 2.1468, "step": 2123 }, { "epoch": 0.3426911907066796, "grad_norm": 5.147749423980713, "learning_rate": 7.663320061328673e-05, "loss": 1.9784, "step": 2124 }, { "epoch": 0.34285253307518554, "grad_norm": 7.403375148773193, "learning_rate": 7.661108441937321e-05, "loss": 2.1373, "step": 2125 }, { "epoch": 0.3430138754436915, "grad_norm": 4.105249881744385, "learning_rate": 7.658896095897498e-05, "loss": 1.8298, "step": 2126 }, { "epoch": 0.3431752178121975, "grad_norm": 4.507058620452881, "learning_rate": 7.656683023813311e-05, "loss": 2.0232, "step": 2127 }, { "epoch": 0.34333656018070347, "grad_norm": 4.855741500854492, "learning_rate": 7.654469226289067e-05, "loss": 2.0097, "step": 2128 }, { "epoch": 0.3434979025492094, "grad_norm": 6.300288677215576, "learning_rate": 7.652254703929273e-05, "loss": 2.1869, "step": 2129 }, { "epoch": 0.3436592449177154, "grad_norm": 3.5406997203826904, "learning_rate": 7.650039457338628e-05, "loss": 2.0674, "step": 2130 }, { "epoch": 0.34382058728622134, "grad_norm": 3.7372398376464844, "learning_rate": 7.647823487122034e-05, "loss": 2.083, "step": 2131 }, { "epoch": 0.34398192965472735, "grad_norm": 3.858139753341675, "learning_rate": 7.645606793884592e-05, "loss": 2.0578, "step": 2132 }, { "epoch": 0.3441432720232333, "grad_norm": 4.632990837097168, "learning_rate": 7.643389378231592e-05, "loss": 2.0941, "step": 2133 }, { "epoch": 0.34430461439173926, "grad_norm": 3.7732186317443848, "learning_rate": 7.64117124076853e-05, "loss": 2.1238, "step": 2134 }, { "epoch": 0.3444659567602452, "grad_norm": 4.680307865142822, "learning_rate": 7.638952382101094e-05, "loss": 2.0591, "step": 2135 }, { "epoch": 0.34462729912875123, "grad_norm": 4.768545627593994, "learning_rate": 7.636732802835174e-05, "loss": 2.0028, "step": 2136 }, { "epoch": 0.3447886414972572, "grad_norm": 3.762096643447876, "learning_rate": 7.63451250357685e-05, "loss": 1.9523, "step": 2137 }, { "epoch": 0.34494998386576314, "grad_norm": 5.83837366104126, "learning_rate": 7.632291484932403e-05, "loss": 2.1163, "step": 2138 }, { "epoch": 0.3451113262342691, "grad_norm": 5.010060787200928, "learning_rate": 7.63006974750831e-05, "loss": 2.0617, "step": 2139 }, { "epoch": 0.3452726686027751, "grad_norm": 4.351550579071045, "learning_rate": 7.627847291911243e-05, "loss": 2.0207, "step": 2140 }, { "epoch": 0.34543401097128107, "grad_norm": 5.060973644256592, "learning_rate": 7.625624118748074e-05, "loss": 2.1295, "step": 2141 }, { "epoch": 0.345595353339787, "grad_norm": 4.052987575531006, "learning_rate": 7.623400228625863e-05, "loss": 2.0639, "step": 2142 }, { "epoch": 0.345756695708293, "grad_norm": 4.111716270446777, "learning_rate": 7.621175622151873e-05, "loss": 1.9512, "step": 2143 }, { "epoch": 0.345918038076799, "grad_norm": 4.8807783126831055, "learning_rate": 7.618950299933562e-05, "loss": 2.2095, "step": 2144 }, { "epoch": 0.34607938044530495, "grad_norm": 4.844640254974365, "learning_rate": 7.61672426257858e-05, "loss": 2.1281, "step": 2145 }, { "epoch": 0.3462407228138109, "grad_norm": 4.600811004638672, "learning_rate": 7.614497510694774e-05, "loss": 2.0428, "step": 2146 }, { "epoch": 0.34640206518231687, "grad_norm": 3.9224560260772705, "learning_rate": 7.612270044890184e-05, "loss": 2.0553, "step": 2147 }, { "epoch": 0.3465634075508228, "grad_norm": 3.4691977500915527, "learning_rate": 7.610041865773049e-05, "loss": 2.063, "step": 2148 }, { "epoch": 0.34672474991932883, "grad_norm": 4.167876720428467, "learning_rate": 7.607812973951802e-05, "loss": 1.8786, "step": 2149 }, { "epoch": 0.3468860922878348, "grad_norm": 4.364349842071533, "learning_rate": 7.605583370035069e-05, "loss": 2.2173, "step": 2150 }, { "epoch": 0.34704743465634075, "grad_norm": 3.3503668308258057, "learning_rate": 7.603353054631667e-05, "loss": 2.1548, "step": 2151 }, { "epoch": 0.3472087770248467, "grad_norm": 4.5009331703186035, "learning_rate": 7.601122028350617e-05, "loss": 2.246, "step": 2152 }, { "epoch": 0.3473701193933527, "grad_norm": 5.919765472412109, "learning_rate": 7.598890291801124e-05, "loss": 2.2103, "step": 2153 }, { "epoch": 0.3475314617618587, "grad_norm": 3.6515698432922363, "learning_rate": 7.596657845592594e-05, "loss": 2.0917, "step": 2154 }, { "epoch": 0.34769280413036463, "grad_norm": 4.1915693283081055, "learning_rate": 7.59442469033462e-05, "loss": 2.2508, "step": 2155 }, { "epoch": 0.3478541464988706, "grad_norm": 3.4758102893829346, "learning_rate": 7.592190826636997e-05, "loss": 2.1832, "step": 2156 }, { "epoch": 0.3480154888673766, "grad_norm": 5.346054553985596, "learning_rate": 7.589956255109705e-05, "loss": 2.0014, "step": 2157 }, { "epoch": 0.34817683123588256, "grad_norm": 6.513914585113525, "learning_rate": 7.587720976362927e-05, "loss": 2.1498, "step": 2158 }, { "epoch": 0.3483381736043885, "grad_norm": 6.071094512939453, "learning_rate": 7.585484991007027e-05, "loss": 1.9775, "step": 2159 }, { "epoch": 0.34849951597289447, "grad_norm": 3.1601836681365967, "learning_rate": 7.583248299652571e-05, "loss": 2.1196, "step": 2160 }, { "epoch": 0.3486608583414004, "grad_norm": 3.5352375507354736, "learning_rate": 7.581010902910316e-05, "loss": 1.8535, "step": 2161 }, { "epoch": 0.34882220070990644, "grad_norm": 4.067091464996338, "learning_rate": 7.578772801391209e-05, "loss": 1.9708, "step": 2162 }, { "epoch": 0.3489835430784124, "grad_norm": 4.216934680938721, "learning_rate": 7.576533995706394e-05, "loss": 1.846, "step": 2163 }, { "epoch": 0.34914488544691835, "grad_norm": 5.02567195892334, "learning_rate": 7.574294486467204e-05, "loss": 1.9846, "step": 2164 }, { "epoch": 0.3493062278154243, "grad_norm": 4.7987165451049805, "learning_rate": 7.572054274285161e-05, "loss": 1.9039, "step": 2165 }, { "epoch": 0.3494675701839303, "grad_norm": 4.404263019561768, "learning_rate": 7.569813359771986e-05, "loss": 2.0985, "step": 2166 }, { "epoch": 0.3496289125524363, "grad_norm": 4.553297996520996, "learning_rate": 7.567571743539585e-05, "loss": 2.0523, "step": 2167 }, { "epoch": 0.34979025492094223, "grad_norm": 4.184520244598389, "learning_rate": 7.565329426200065e-05, "loss": 2.0261, "step": 2168 }, { "epoch": 0.3499515972894482, "grad_norm": 3.7621541023254395, "learning_rate": 7.563086408365712e-05, "loss": 1.8697, "step": 2169 }, { "epoch": 0.3501129396579542, "grad_norm": 3.4361226558685303, "learning_rate": 7.560842690649014e-05, "loss": 2.0835, "step": 2170 }, { "epoch": 0.35027428202646016, "grad_norm": 4.431186199188232, "learning_rate": 7.558598273662643e-05, "loss": 1.9047, "step": 2171 }, { "epoch": 0.3504356243949661, "grad_norm": 4.451605319976807, "learning_rate": 7.556353158019467e-05, "loss": 1.934, "step": 2172 }, { "epoch": 0.3505969667634721, "grad_norm": 3.9445595741271973, "learning_rate": 7.55410734433254e-05, "loss": 2.0204, "step": 2173 }, { "epoch": 0.35075830913197803, "grad_norm": 3.83076548576355, "learning_rate": 7.551860833215112e-05, "loss": 1.9018, "step": 2174 }, { "epoch": 0.35091965150048404, "grad_norm": 3.7835171222686768, "learning_rate": 7.549613625280617e-05, "loss": 1.8462, "step": 2175 }, { "epoch": 0.35108099386899, "grad_norm": 4.310787200927734, "learning_rate": 7.547365721142687e-05, "loss": 2.0906, "step": 2176 }, { "epoch": 0.35124233623749596, "grad_norm": 5.287550449371338, "learning_rate": 7.545117121415136e-05, "loss": 2.0887, "step": 2177 }, { "epoch": 0.3514036786060019, "grad_norm": 4.817923069000244, "learning_rate": 7.542867826711974e-05, "loss": 2.009, "step": 2178 }, { "epoch": 0.3515650209745079, "grad_norm": 3.8996381759643555, "learning_rate": 7.5406178376474e-05, "loss": 2.0563, "step": 2179 }, { "epoch": 0.3517263633430139, "grad_norm": 3.9155139923095703, "learning_rate": 7.538367154835797e-05, "loss": 1.9556, "step": 2180 }, { "epoch": 0.35188770571151984, "grad_norm": 4.121738433837891, "learning_rate": 7.536115778891746e-05, "loss": 1.9905, "step": 2181 }, { "epoch": 0.3520490480800258, "grad_norm": 4.250374794006348, "learning_rate": 7.533863710430012e-05, "loss": 2.1936, "step": 2182 }, { "epoch": 0.3522103904485318, "grad_norm": 3.8731632232666016, "learning_rate": 7.531610950065549e-05, "loss": 2.261, "step": 2183 }, { "epoch": 0.35237173281703776, "grad_norm": 3.329829216003418, "learning_rate": 7.529357498413501e-05, "loss": 1.903, "step": 2184 }, { "epoch": 0.3525330751855437, "grad_norm": 3.554926633834839, "learning_rate": 7.5271033560892e-05, "loss": 2.0567, "step": 2185 }, { "epoch": 0.3526944175540497, "grad_norm": 4.352489471435547, "learning_rate": 7.524848523708168e-05, "loss": 1.9621, "step": 2186 }, { "epoch": 0.3528557599225557, "grad_norm": 4.469099521636963, "learning_rate": 7.522593001886118e-05, "loss": 1.787, "step": 2187 }, { "epoch": 0.35301710229106165, "grad_norm": 4.401259899139404, "learning_rate": 7.520336791238942e-05, "loss": 2.2623, "step": 2188 }, { "epoch": 0.3531784446595676, "grad_norm": 4.640010833740234, "learning_rate": 7.518079892382732e-05, "loss": 1.9496, "step": 2189 }, { "epoch": 0.35333978702807356, "grad_norm": 5.381575107574463, "learning_rate": 7.515822305933758e-05, "loss": 2.1458, "step": 2190 }, { "epoch": 0.3535011293965795, "grad_norm": 4.868968486785889, "learning_rate": 7.513564032508484e-05, "loss": 2.3643, "step": 2191 }, { "epoch": 0.35366247176508553, "grad_norm": 3.7936339378356934, "learning_rate": 7.511305072723559e-05, "loss": 2.0604, "step": 2192 }, { "epoch": 0.3538238141335915, "grad_norm": 4.639528751373291, "learning_rate": 7.509045427195819e-05, "loss": 2.0235, "step": 2193 }, { "epoch": 0.35398515650209744, "grad_norm": 3.7656235694885254, "learning_rate": 7.506785096542289e-05, "loss": 1.9896, "step": 2194 }, { "epoch": 0.3541464988706034, "grad_norm": 3.891155242919922, "learning_rate": 7.50452408138018e-05, "loss": 1.972, "step": 2195 }, { "epoch": 0.3543078412391094, "grad_norm": 4.294320583343506, "learning_rate": 7.502262382326888e-05, "loss": 2.0749, "step": 2196 }, { "epoch": 0.35446918360761537, "grad_norm": 4.284241676330566, "learning_rate": 7.500000000000001e-05, "loss": 1.8442, "step": 2197 }, { "epoch": 0.3546305259761213, "grad_norm": 4.18466854095459, "learning_rate": 7.49773693501729e-05, "loss": 2.0265, "step": 2198 }, { "epoch": 0.3547918683446273, "grad_norm": 5.000754356384277, "learning_rate": 7.495473187996709e-05, "loss": 2.0031, "step": 2199 }, { "epoch": 0.3549532107131333, "grad_norm": 4.0818705558776855, "learning_rate": 7.493208759556406e-05, "loss": 1.8871, "step": 2200 }, { "epoch": 0.35511455308163925, "grad_norm": 4.289008140563965, "learning_rate": 7.49094365031471e-05, "loss": 1.9596, "step": 2201 }, { "epoch": 0.3552758954501452, "grad_norm": 4.734685897827148, "learning_rate": 7.48867786089014e-05, "loss": 2.2165, "step": 2202 }, { "epoch": 0.35543723781865116, "grad_norm": 3.9588840007781982, "learning_rate": 7.48641139190139e-05, "loss": 1.9112, "step": 2203 }, { "epoch": 0.3555985801871571, "grad_norm": 5.025766372680664, "learning_rate": 7.484144243967353e-05, "loss": 1.9741, "step": 2204 }, { "epoch": 0.35575992255566313, "grad_norm": 5.695304870605469, "learning_rate": 7.481876417707102e-05, "loss": 1.9144, "step": 2205 }, { "epoch": 0.3559212649241691, "grad_norm": 5.085302829742432, "learning_rate": 7.479607913739894e-05, "loss": 2.021, "step": 2206 }, { "epoch": 0.35608260729267505, "grad_norm": 4.136185169219971, "learning_rate": 7.47733873268517e-05, "loss": 1.879, "step": 2207 }, { "epoch": 0.356243949661181, "grad_norm": 6.267850875854492, "learning_rate": 7.475068875162561e-05, "loss": 1.9467, "step": 2208 }, { "epoch": 0.356405292029687, "grad_norm": 4.190792083740234, "learning_rate": 7.472798341791877e-05, "loss": 1.9547, "step": 2209 }, { "epoch": 0.35656663439819297, "grad_norm": 6.514063358306885, "learning_rate": 7.470527133193116e-05, "loss": 1.6638, "step": 2210 }, { "epoch": 0.35672797676669893, "grad_norm": 4.96861457824707, "learning_rate": 7.46825524998646e-05, "loss": 1.9349, "step": 2211 }, { "epoch": 0.3568893191352049, "grad_norm": 4.728672504425049, "learning_rate": 7.465982692792275e-05, "loss": 1.8288, "step": 2212 }, { "epoch": 0.3570506615037109, "grad_norm": 3.9773061275482178, "learning_rate": 7.46370946223111e-05, "loss": 1.7914, "step": 2213 }, { "epoch": 0.35721200387221685, "grad_norm": 4.387299060821533, "learning_rate": 7.461435558923698e-05, "loss": 1.8791, "step": 2214 }, { "epoch": 0.3573733462407228, "grad_norm": 4.533514499664307, "learning_rate": 7.459160983490959e-05, "loss": 2.2867, "step": 2215 }, { "epoch": 0.35753468860922877, "grad_norm": 4.0857977867126465, "learning_rate": 7.456885736553989e-05, "loss": 2.14, "step": 2216 }, { "epoch": 0.3576960309777348, "grad_norm": 4.624113082885742, "learning_rate": 7.454609818734076e-05, "loss": 1.7061, "step": 2217 }, { "epoch": 0.35785737334624074, "grad_norm": 5.158027648925781, "learning_rate": 7.452333230652688e-05, "loss": 1.7976, "step": 2218 }, { "epoch": 0.3580187157147467, "grad_norm": 4.00270938873291, "learning_rate": 7.450055972931473e-05, "loss": 2.1349, "step": 2219 }, { "epoch": 0.35818005808325265, "grad_norm": 5.796380519866943, "learning_rate": 7.447778046192267e-05, "loss": 1.9829, "step": 2220 }, { "epoch": 0.3583414004517586, "grad_norm": 4.402221202850342, "learning_rate": 7.445499451057084e-05, "loss": 2.1976, "step": 2221 }, { "epoch": 0.3585027428202646, "grad_norm": 3.4079225063323975, "learning_rate": 7.443220188148123e-05, "loss": 2.0053, "step": 2222 }, { "epoch": 0.3586640851887706, "grad_norm": 5.272507190704346, "learning_rate": 7.440940258087764e-05, "loss": 2.1123, "step": 2223 }, { "epoch": 0.35882542755727653, "grad_norm": 4.93818998336792, "learning_rate": 7.43865966149857e-05, "loss": 1.9708, "step": 2224 }, { "epoch": 0.3589867699257825, "grad_norm": 4.46998405456543, "learning_rate": 7.436378399003288e-05, "loss": 2.1915, "step": 2225 }, { "epoch": 0.3591481122942885, "grad_norm": 5.026282787322998, "learning_rate": 7.434096471224842e-05, "loss": 1.7489, "step": 2226 }, { "epoch": 0.35930945466279446, "grad_norm": 5.975449562072754, "learning_rate": 7.431813878786343e-05, "loss": 1.9611, "step": 2227 }, { "epoch": 0.3594707970313004, "grad_norm": 3.4423348903656006, "learning_rate": 7.42953062231108e-05, "loss": 2.0725, "step": 2228 }, { "epoch": 0.35963213939980637, "grad_norm": 3.4423348903656006, "learning_rate": 7.42953062231108e-05, "loss": 2.1422, "step": 2229 }, { "epoch": 0.3597934817683124, "grad_norm": 5.178089141845703, "learning_rate": 7.427246702422525e-05, "loss": 1.9105, "step": 2230 }, { "epoch": 0.35995482413681834, "grad_norm": 4.918189525604248, "learning_rate": 7.42496211974433e-05, "loss": 2.1764, "step": 2231 }, { "epoch": 0.3601161665053243, "grad_norm": 4.000021934509277, "learning_rate": 7.422676874900329e-05, "loss": 2.0969, "step": 2232 }, { "epoch": 0.36027750887383025, "grad_norm": 5.2618536949157715, "learning_rate": 7.420390968514535e-05, "loss": 1.9344, "step": 2233 }, { "epoch": 0.3604388512423362, "grad_norm": 6.015827178955078, "learning_rate": 7.418104401211143e-05, "loss": 2.1002, "step": 2234 }, { "epoch": 0.3606001936108422, "grad_norm": 4.103837013244629, "learning_rate": 7.41581717361453e-05, "loss": 2.0114, "step": 2235 }, { "epoch": 0.3607615359793482, "grad_norm": 4.670347213745117, "learning_rate": 7.41352928634925e-05, "loss": 2.2096, "step": 2236 }, { "epoch": 0.36092287834785414, "grad_norm": 3.644935131072998, "learning_rate": 7.41124074004004e-05, "loss": 2.0364, "step": 2237 }, { "epoch": 0.3610842207163601, "grad_norm": 5.363805294036865, "learning_rate": 7.408951535311815e-05, "loss": 2.0182, "step": 2238 }, { "epoch": 0.3612455630848661, "grad_norm": 5.148021697998047, "learning_rate": 7.40666167278967e-05, "loss": 2.1056, "step": 2239 }, { "epoch": 0.36140690545337206, "grad_norm": 5.083660125732422, "learning_rate": 7.404371153098883e-05, "loss": 1.9729, "step": 2240 }, { "epoch": 0.361568247821878, "grad_norm": 4.589669704437256, "learning_rate": 7.402079976864905e-05, "loss": 2.0438, "step": 2241 }, { "epoch": 0.361729590190384, "grad_norm": 5.363358020782471, "learning_rate": 7.399788144713374e-05, "loss": 1.9283, "step": 2242 }, { "epoch": 0.36189093255889, "grad_norm": 4.749497890472412, "learning_rate": 7.3974956572701e-05, "loss": 2.1566, "step": 2243 }, { "epoch": 0.36205227492739595, "grad_norm": 4.069309234619141, "learning_rate": 7.395202515161073e-05, "loss": 1.7986, "step": 2244 }, { "epoch": 0.3622136172959019, "grad_norm": 3.9743974208831787, "learning_rate": 7.392908719012468e-05, "loss": 2.138, "step": 2245 }, { "epoch": 0.36237495966440786, "grad_norm": 4.392673969268799, "learning_rate": 7.390614269450634e-05, "loss": 1.753, "step": 2246 }, { "epoch": 0.36253630203291387, "grad_norm": 5.113845348358154, "learning_rate": 7.388319167102097e-05, "loss": 1.8796, "step": 2247 }, { "epoch": 0.3626976444014198, "grad_norm": 3.84138560295105, "learning_rate": 7.386023412593563e-05, "loss": 2.0539, "step": 2248 }, { "epoch": 0.3628589867699258, "grad_norm": 3.8947153091430664, "learning_rate": 7.383727006551916e-05, "loss": 2.0031, "step": 2249 }, { "epoch": 0.36302032913843174, "grad_norm": 3.7585270404815674, "learning_rate": 7.381429949604218e-05, "loss": 1.9055, "step": 2250 }, { "epoch": 0.3631816715069377, "grad_norm": 4.749552249908447, "learning_rate": 7.379132242377712e-05, "loss": 1.9108, "step": 2251 }, { "epoch": 0.3633430138754437, "grad_norm": 5.605308532714844, "learning_rate": 7.37683388549981e-05, "loss": 1.9858, "step": 2252 }, { "epoch": 0.36350435624394967, "grad_norm": 4.167489051818848, "learning_rate": 7.374534879598109e-05, "loss": 1.9966, "step": 2253 }, { "epoch": 0.3636656986124556, "grad_norm": 4.343514442443848, "learning_rate": 7.372235225300382e-05, "loss": 2.2502, "step": 2254 }, { "epoch": 0.3638270409809616, "grad_norm": 4.107678413391113, "learning_rate": 7.369934923234577e-05, "loss": 2.0626, "step": 2255 }, { "epoch": 0.3639883833494676, "grad_norm": 3.700336217880249, "learning_rate": 7.36763397402882e-05, "loss": 1.8523, "step": 2256 }, { "epoch": 0.36414972571797355, "grad_norm": 4.870800495147705, "learning_rate": 7.365332378311414e-05, "loss": 2.0273, "step": 2257 }, { "epoch": 0.3643110680864795, "grad_norm": 5.010525226593018, "learning_rate": 7.363030136710836e-05, "loss": 2.1416, "step": 2258 }, { "epoch": 0.36447241045498546, "grad_norm": 5.208023548126221, "learning_rate": 7.360727249855744e-05, "loss": 1.8927, "step": 2259 }, { "epoch": 0.3646337528234915, "grad_norm": 3.7048261165618896, "learning_rate": 7.35842371837497e-05, "loss": 2.0855, "step": 2260 }, { "epoch": 0.36479509519199743, "grad_norm": 3.731630563735962, "learning_rate": 7.356119542897518e-05, "loss": 1.9924, "step": 2261 }, { "epoch": 0.3649564375605034, "grad_norm": 4.801681995391846, "learning_rate": 7.353814724052576e-05, "loss": 2.1764, "step": 2262 }, { "epoch": 0.36511777992900935, "grad_norm": 5.089466094970703, "learning_rate": 7.3515092624695e-05, "loss": 1.8582, "step": 2263 }, { "epoch": 0.3652791222975153, "grad_norm": 4.4316840171813965, "learning_rate": 7.349203158777826e-05, "loss": 2.1998, "step": 2264 }, { "epoch": 0.3654404646660213, "grad_norm": 3.751309394836426, "learning_rate": 7.346896413607262e-05, "loss": 2.053, "step": 2265 }, { "epoch": 0.36560180703452727, "grad_norm": 2.876962661743164, "learning_rate": 7.344589027587697e-05, "loss": 2.16, "step": 2266 }, { "epoch": 0.3657631494030332, "grad_norm": 4.779649257659912, "learning_rate": 7.34228100134919e-05, "loss": 1.84, "step": 2267 }, { "epoch": 0.3659244917715392, "grad_norm": 3.540619373321533, "learning_rate": 7.339972335521972e-05, "loss": 1.972, "step": 2268 }, { "epoch": 0.3660858341400452, "grad_norm": 4.27071475982666, "learning_rate": 7.33766303073646e-05, "loss": 1.913, "step": 2269 }, { "epoch": 0.36624717650855115, "grad_norm": 5.800755977630615, "learning_rate": 7.335353087623231e-05, "loss": 1.8588, "step": 2270 }, { "epoch": 0.3664085188770571, "grad_norm": 4.293838977813721, "learning_rate": 7.333042506813048e-05, "loss": 2.1418, "step": 2271 }, { "epoch": 0.36656986124556307, "grad_norm": 3.397761821746826, "learning_rate": 7.330731288936843e-05, "loss": 2.0779, "step": 2272 }, { "epoch": 0.3667312036140691, "grad_norm": 3.5220279693603516, "learning_rate": 7.32841943462572e-05, "loss": 1.8257, "step": 2273 }, { "epoch": 0.36689254598257504, "grad_norm": 3.7657272815704346, "learning_rate": 7.32610694451096e-05, "loss": 1.8189, "step": 2274 }, { "epoch": 0.367053888351081, "grad_norm": 4.797881126403809, "learning_rate": 7.32379381922402e-05, "loss": 2.0172, "step": 2275 }, { "epoch": 0.36721523071958695, "grad_norm": 3.328403949737549, "learning_rate": 7.321480059396523e-05, "loss": 1.9931, "step": 2276 }, { "epoch": 0.3673765730880929, "grad_norm": 6.129566192626953, "learning_rate": 7.319165665660273e-05, "loss": 1.9476, "step": 2277 }, { "epoch": 0.3675379154565989, "grad_norm": 4.815751075744629, "learning_rate": 7.316850638647243e-05, "loss": 1.9807, "step": 2278 }, { "epoch": 0.3676992578251049, "grad_norm": 5.611696243286133, "learning_rate": 7.31453497898958e-05, "loss": 1.8491, "step": 2279 }, { "epoch": 0.36786060019361083, "grad_norm": 4.9298224449157715, "learning_rate": 7.312218687319603e-05, "loss": 1.8654, "step": 2280 }, { "epoch": 0.3680219425621168, "grad_norm": 4.291321754455566, "learning_rate": 7.309901764269802e-05, "loss": 1.9494, "step": 2281 }, { "epoch": 0.3681832849306228, "grad_norm": 4.082421779632568, "learning_rate": 7.307584210472844e-05, "loss": 2.0036, "step": 2282 }, { "epoch": 0.36834462729912876, "grad_norm": 4.3880743980407715, "learning_rate": 7.305266026561565e-05, "loss": 1.9789, "step": 2283 }, { "epoch": 0.3685059696676347, "grad_norm": 3.8602914810180664, "learning_rate": 7.302947213168974e-05, "loss": 1.8724, "step": 2284 }, { "epoch": 0.36866731203614067, "grad_norm": 4.736367702484131, "learning_rate": 7.300627770928252e-05, "loss": 1.9731, "step": 2285 }, { "epoch": 0.3688286544046467, "grad_norm": 5.397782802581787, "learning_rate": 7.298307700472748e-05, "loss": 1.9259, "step": 2286 }, { "epoch": 0.36898999677315264, "grad_norm": 5.391665935516357, "learning_rate": 7.295987002435989e-05, "loss": 1.9912, "step": 2287 }, { "epoch": 0.3691513391416586, "grad_norm": 4.335827350616455, "learning_rate": 7.29366567745167e-05, "loss": 1.7932, "step": 2288 }, { "epoch": 0.36931268151016455, "grad_norm": 3.538144826889038, "learning_rate": 7.291343726153656e-05, "loss": 1.9518, "step": 2289 }, { "epoch": 0.36947402387867057, "grad_norm": 3.7503881454467773, "learning_rate": 7.289021149175986e-05, "loss": 2.0434, "step": 2290 }, { "epoch": 0.3696353662471765, "grad_norm": 3.6585474014282227, "learning_rate": 7.286697947152867e-05, "loss": 1.9967, "step": 2291 }, { "epoch": 0.3697967086156825, "grad_norm": 4.704679489135742, "learning_rate": 7.28437412071868e-05, "loss": 2.1335, "step": 2292 }, { "epoch": 0.36995805098418844, "grad_norm": 4.349279880523682, "learning_rate": 7.28204967050797e-05, "loss": 1.9715, "step": 2293 }, { "epoch": 0.3701193933526944, "grad_norm": 4.37742805480957, "learning_rate": 7.279724597155462e-05, "loss": 1.7624, "step": 2294 }, { "epoch": 0.3702807357212004, "grad_norm": 4.009250640869141, "learning_rate": 7.277398901296044e-05, "loss": 2.0692, "step": 2295 }, { "epoch": 0.37044207808970636, "grad_norm": 5.741460800170898, "learning_rate": 7.275072583564775e-05, "loss": 1.9758, "step": 2296 }, { "epoch": 0.3706034204582123, "grad_norm": 3.7926852703094482, "learning_rate": 7.272745644596887e-05, "loss": 1.9235, "step": 2297 }, { "epoch": 0.3707647628267183, "grad_norm": 3.9284520149230957, "learning_rate": 7.270418085027776e-05, "loss": 2.3003, "step": 2298 }, { "epoch": 0.3709261051952243, "grad_norm": 4.326069355010986, "learning_rate": 7.268089905493013e-05, "loss": 2.0143, "step": 2299 }, { "epoch": 0.37108744756373024, "grad_norm": 5.114227771759033, "learning_rate": 7.265761106628337e-05, "loss": 2.0741, "step": 2300 }, { "epoch": 0.3712487899322362, "grad_norm": 5.760619640350342, "learning_rate": 7.263431689069651e-05, "loss": 2.0508, "step": 2301 }, { "epoch": 0.37141013230074216, "grad_norm": 5.333743095397949, "learning_rate": 7.261101653453038e-05, "loss": 1.8713, "step": 2302 }, { "epoch": 0.37157147466924817, "grad_norm": 4.196969509124756, "learning_rate": 7.258771000414735e-05, "loss": 2.0845, "step": 2303 }, { "epoch": 0.3717328170377541, "grad_norm": 3.85819673538208, "learning_rate": 7.256439730591162e-05, "loss": 2.2329, "step": 2304 }, { "epoch": 0.3718941594062601, "grad_norm": 5.132253170013428, "learning_rate": 7.2541078446189e-05, "loss": 1.9991, "step": 2305 }, { "epoch": 0.37205550177476604, "grad_norm": 6.330166339874268, "learning_rate": 7.251775343134694e-05, "loss": 1.9949, "step": 2306 }, { "epoch": 0.372216844143272, "grad_norm": 3.9378414154052734, "learning_rate": 7.249442226775469e-05, "loss": 1.917, "step": 2307 }, { "epoch": 0.372378186511778, "grad_norm": 4.884683132171631, "learning_rate": 7.247108496178307e-05, "loss": 2.036, "step": 2308 }, { "epoch": 0.37253952888028397, "grad_norm": 3.73783540725708, "learning_rate": 7.244774151980466e-05, "loss": 1.905, "step": 2309 }, { "epoch": 0.3727008712487899, "grad_norm": 4.066815376281738, "learning_rate": 7.242439194819364e-05, "loss": 2.1589, "step": 2310 }, { "epoch": 0.3728622136172959, "grad_norm": 4.2685770988464355, "learning_rate": 7.240103625332589e-05, "loss": 2.2511, "step": 2311 }, { "epoch": 0.3730235559858019, "grad_norm": 3.9241483211517334, "learning_rate": 7.237767444157899e-05, "loss": 1.9503, "step": 2312 }, { "epoch": 0.37318489835430785, "grad_norm": 3.376739025115967, "learning_rate": 7.235430651933217e-05, "loss": 2.0288, "step": 2313 }, { "epoch": 0.3733462407228138, "grad_norm": 3.62248158454895, "learning_rate": 7.233093249296631e-05, "loss": 2.0128, "step": 2314 }, { "epoch": 0.37350758309131976, "grad_norm": 3.5569229125976562, "learning_rate": 7.230755236886401e-05, "loss": 2.3202, "step": 2315 }, { "epoch": 0.3736689254598258, "grad_norm": 2.885396718978882, "learning_rate": 7.228416615340949e-05, "loss": 1.9526, "step": 2316 }, { "epoch": 0.37383026782833173, "grad_norm": 3.926088571548462, "learning_rate": 7.226077385298862e-05, "loss": 1.8278, "step": 2317 }, { "epoch": 0.3739916101968377, "grad_norm": 4.93345832824707, "learning_rate": 7.223737547398898e-05, "loss": 2.3146, "step": 2318 }, { "epoch": 0.37415295256534364, "grad_norm": 3.6649792194366455, "learning_rate": 7.221397102279979e-05, "loss": 1.9611, "step": 2319 }, { "epoch": 0.37431429493384966, "grad_norm": 4.375561714172363, "learning_rate": 7.21905605058119e-05, "loss": 1.8566, "step": 2320 }, { "epoch": 0.3744756373023556, "grad_norm": 3.7766683101654053, "learning_rate": 7.216714392941785e-05, "loss": 2.3165, "step": 2321 }, { "epoch": 0.37463697967086157, "grad_norm": 3.7465713024139404, "learning_rate": 7.214372130001184e-05, "loss": 2.0361, "step": 2322 }, { "epoch": 0.3747983220393675, "grad_norm": 4.272165775299072, "learning_rate": 7.212029262398972e-05, "loss": 1.925, "step": 2323 }, { "epoch": 0.3749596644078735, "grad_norm": 4.022883415222168, "learning_rate": 7.209685790774892e-05, "loss": 1.8744, "step": 2324 }, { "epoch": 0.3751210067763795, "grad_norm": 4.182277202606201, "learning_rate": 7.207341715768863e-05, "loss": 2.1156, "step": 2325 }, { "epoch": 0.37528234914488545, "grad_norm": 3.665260076522827, "learning_rate": 7.20499703802096e-05, "loss": 2.047, "step": 2326 }, { "epoch": 0.3754436915133914, "grad_norm": 4.080063819885254, "learning_rate": 7.202651758171431e-05, "loss": 1.9225, "step": 2327 }, { "epoch": 0.37560503388189737, "grad_norm": 6.866401672363281, "learning_rate": 7.200305876860678e-05, "loss": 2.1732, "step": 2328 }, { "epoch": 0.3757663762504034, "grad_norm": 4.220969200134277, "learning_rate": 7.19795939472928e-05, "loss": 2.1178, "step": 2329 }, { "epoch": 0.37592771861890933, "grad_norm": 5.637753486633301, "learning_rate": 7.195612312417965e-05, "loss": 2.1014, "step": 2330 }, { "epoch": 0.3760890609874153, "grad_norm": 4.461935043334961, "learning_rate": 7.193264630567635e-05, "loss": 2.0136, "step": 2331 }, { "epoch": 0.37625040335592125, "grad_norm": 4.913003444671631, "learning_rate": 7.190916349819356e-05, "loss": 2.0294, "step": 2332 }, { "epoch": 0.37641174572442726, "grad_norm": 4.060622215270996, "learning_rate": 7.188567470814354e-05, "loss": 1.9674, "step": 2333 }, { "epoch": 0.3765730880929332, "grad_norm": 4.1861348152160645, "learning_rate": 7.18621799419402e-05, "loss": 1.9682, "step": 2334 }, { "epoch": 0.3767344304614392, "grad_norm": 3.848463296890259, "learning_rate": 7.183867920599906e-05, "loss": 1.8223, "step": 2335 }, { "epoch": 0.37689577282994513, "grad_norm": 6.9484968185424805, "learning_rate": 7.181517250673728e-05, "loss": 1.9903, "step": 2336 }, { "epoch": 0.3770571151984511, "grad_norm": 4.20919942855835, "learning_rate": 7.179165985057368e-05, "loss": 1.835, "step": 2337 }, { "epoch": 0.3772184575669571, "grad_norm": 3.890127658843994, "learning_rate": 7.176814124392866e-05, "loss": 1.8775, "step": 2338 }, { "epoch": 0.37737979993546306, "grad_norm": 6.341292381286621, "learning_rate": 7.174461669322427e-05, "loss": 2.2324, "step": 2339 }, { "epoch": 0.377541142303969, "grad_norm": 4.031415939331055, "learning_rate": 7.172108620488419e-05, "loss": 1.7621, "step": 2340 }, { "epoch": 0.37770248467247497, "grad_norm": 5.048474311828613, "learning_rate": 7.16975497853337e-05, "loss": 2.0766, "step": 2341 }, { "epoch": 0.377863827040981, "grad_norm": 4.281801223754883, "learning_rate": 7.16740074409997e-05, "loss": 1.9402, "step": 2342 }, { "epoch": 0.37802516940948694, "grad_norm": 4.380557060241699, "learning_rate": 7.165045917831074e-05, "loss": 2.0619, "step": 2343 }, { "epoch": 0.3781865117779929, "grad_norm": 4.43021821975708, "learning_rate": 7.162690500369694e-05, "loss": 1.9965, "step": 2344 }, { "epoch": 0.37834785414649885, "grad_norm": 4.685453414916992, "learning_rate": 7.160334492359007e-05, "loss": 1.942, "step": 2345 }, { "epoch": 0.37850919651500486, "grad_norm": 4.532835483551025, "learning_rate": 7.157977894442349e-05, "loss": 1.9803, "step": 2346 }, { "epoch": 0.3786705388835108, "grad_norm": 4.49874210357666, "learning_rate": 7.155620707263223e-05, "loss": 2.0574, "step": 2347 }, { "epoch": 0.3788318812520168, "grad_norm": 5.898449897766113, "learning_rate": 7.15326293146528e-05, "loss": 1.9402, "step": 2348 }, { "epoch": 0.37899322362052273, "grad_norm": 4.97932767868042, "learning_rate": 7.150904567692348e-05, "loss": 2.173, "step": 2349 }, { "epoch": 0.3791545659890287, "grad_norm": 3.5668933391571045, "learning_rate": 7.148545616588398e-05, "loss": 1.9776, "step": 2350 }, { "epoch": 0.3793159083575347, "grad_norm": 4.622684955596924, "learning_rate": 7.146186078797578e-05, "loss": 1.9501, "step": 2351 }, { "epoch": 0.37947725072604066, "grad_norm": 3.8975729942321777, "learning_rate": 7.143825954964187e-05, "loss": 1.9992, "step": 2352 }, { "epoch": 0.3796385930945466, "grad_norm": 4.953866004943848, "learning_rate": 7.141465245732686e-05, "loss": 1.9886, "step": 2353 }, { "epoch": 0.3797999354630526, "grad_norm": 4.589084625244141, "learning_rate": 7.139103951747695e-05, "loss": 2.0232, "step": 2354 }, { "epoch": 0.3799612778315586, "grad_norm": 5.781042575836182, "learning_rate": 7.136742073653994e-05, "loss": 1.9773, "step": 2355 }, { "epoch": 0.38012262020006454, "grad_norm": 3.937396287918091, "learning_rate": 7.134379612096525e-05, "loss": 2.1002, "step": 2356 }, { "epoch": 0.3802839625685705, "grad_norm": 4.775443077087402, "learning_rate": 7.132016567720385e-05, "loss": 2.2923, "step": 2357 }, { "epoch": 0.38044530493707646, "grad_norm": 5.597178936004639, "learning_rate": 7.129652941170835e-05, "loss": 2.1866, "step": 2358 }, { "epoch": 0.38060664730558247, "grad_norm": 3.9723527431488037, "learning_rate": 7.12728873309329e-05, "loss": 2.016, "step": 2359 }, { "epoch": 0.3807679896740884, "grad_norm": 4.569044589996338, "learning_rate": 7.124923944133326e-05, "loss": 2.0614, "step": 2360 }, { "epoch": 0.3809293320425944, "grad_norm": 4.657077312469482, "learning_rate": 7.12255857493668e-05, "loss": 2.1499, "step": 2361 }, { "epoch": 0.38109067441110034, "grad_norm": 4.39790153503418, "learning_rate": 7.120192626149242e-05, "loss": 1.9213, "step": 2362 }, { "epoch": 0.38125201677960635, "grad_norm": 4.823693752288818, "learning_rate": 7.117826098417068e-05, "loss": 2.2264, "step": 2363 }, { "epoch": 0.3814133591481123, "grad_norm": 4.211658477783203, "learning_rate": 7.115458992386364e-05, "loss": 2.1787, "step": 2364 }, { "epoch": 0.38157470151661826, "grad_norm": 3.9508187770843506, "learning_rate": 7.113091308703498e-05, "loss": 1.8984, "step": 2365 }, { "epoch": 0.3817360438851242, "grad_norm": 3.8432865142822266, "learning_rate": 7.110723048014996e-05, "loss": 1.7879, "step": 2366 }, { "epoch": 0.3818973862536302, "grad_norm": 7.136585235595703, "learning_rate": 7.108354210967541e-05, "loss": 1.9045, "step": 2367 }, { "epoch": 0.3820587286221362, "grad_norm": 5.7573018074035645, "learning_rate": 7.105984798207972e-05, "loss": 2.1248, "step": 2368 }, { "epoch": 0.38222007099064215, "grad_norm": 4.350116729736328, "learning_rate": 7.103614810383288e-05, "loss": 1.9145, "step": 2369 }, { "epoch": 0.3823814133591481, "grad_norm": 3.821668863296509, "learning_rate": 7.101244248140642e-05, "loss": 2.0725, "step": 2370 }, { "epoch": 0.38254275572765406, "grad_norm": 5.314784526824951, "learning_rate": 7.098873112127345e-05, "loss": 2.022, "step": 2371 }, { "epoch": 0.3827040980961601, "grad_norm": 3.54974102973938, "learning_rate": 7.096501402990865e-05, "loss": 1.7158, "step": 2372 }, { "epoch": 0.38286544046466603, "grad_norm": 4.784726619720459, "learning_rate": 7.09412912137883e-05, "loss": 2.0397, "step": 2373 }, { "epoch": 0.383026782833172, "grad_norm": 4.838008403778076, "learning_rate": 7.091756267939015e-05, "loss": 2.0775, "step": 2374 }, { "epoch": 0.38318812520167794, "grad_norm": 4.776695728302002, "learning_rate": 7.089382843319361e-05, "loss": 2.2584, "step": 2375 }, { "epoch": 0.38334946757018395, "grad_norm": 3.448340654373169, "learning_rate": 7.087008848167959e-05, "loss": 1.8765, "step": 2376 }, { "epoch": 0.3835108099386899, "grad_norm": 5.133579730987549, "learning_rate": 7.084634283133059e-05, "loss": 2.2597, "step": 2377 }, { "epoch": 0.38367215230719587, "grad_norm": 4.340801239013672, "learning_rate": 7.082259148863064e-05, "loss": 2.0125, "step": 2378 }, { "epoch": 0.3838334946757018, "grad_norm": 6.2008376121521, "learning_rate": 7.079883446006535e-05, "loss": 2.0045, "step": 2379 }, { "epoch": 0.3839948370442078, "grad_norm": 6.701147079467773, "learning_rate": 7.077507175212183e-05, "loss": 1.8485, "step": 2380 }, { "epoch": 0.3841561794127138, "grad_norm": 3.891029119491577, "learning_rate": 7.075130337128884e-05, "loss": 2.1758, "step": 2381 }, { "epoch": 0.38431752178121975, "grad_norm": 4.597936153411865, "learning_rate": 7.07275293240566e-05, "loss": 2.1513, "step": 2382 }, { "epoch": 0.3844788641497257, "grad_norm": 4.322336196899414, "learning_rate": 7.07037496169169e-05, "loss": 1.8242, "step": 2383 }, { "epoch": 0.38464020651823166, "grad_norm": 5.00579309463501, "learning_rate": 7.067996425636308e-05, "loss": 1.8396, "step": 2384 }, { "epoch": 0.3848015488867377, "grad_norm": 5.800734519958496, "learning_rate": 7.065617324889006e-05, "loss": 1.9924, "step": 2385 }, { "epoch": 0.38496289125524363, "grad_norm": 4.476178169250488, "learning_rate": 7.063237660099422e-05, "loss": 2.2004, "step": 2386 }, { "epoch": 0.3851242336237496, "grad_norm": 6.143631458282471, "learning_rate": 7.060857431917358e-05, "loss": 1.8849, "step": 2387 }, { "epoch": 0.38528557599225555, "grad_norm": 4.4568328857421875, "learning_rate": 7.058476640992759e-05, "loss": 1.9289, "step": 2388 }, { "epoch": 0.38544691836076156, "grad_norm": 4.568746089935303, "learning_rate": 7.056095287975733e-05, "loss": 2.0947, "step": 2389 }, { "epoch": 0.3856082607292675, "grad_norm": 3.585322380065918, "learning_rate": 7.053713373516538e-05, "loss": 1.8935, "step": 2390 }, { "epoch": 0.3857696030977735, "grad_norm": 3.811159610748291, "learning_rate": 7.051330898265582e-05, "loss": 1.9406, "step": 2391 }, { "epoch": 0.38593094546627943, "grad_norm": 4.124796390533447, "learning_rate": 7.048947862873434e-05, "loss": 1.9188, "step": 2392 }, { "epoch": 0.38609228783478544, "grad_norm": 4.1473846435546875, "learning_rate": 7.046564267990807e-05, "loss": 1.856, "step": 2393 }, { "epoch": 0.3862536302032914, "grad_norm": 4.339417934417725, "learning_rate": 7.044180114268572e-05, "loss": 2.0046, "step": 2394 }, { "epoch": 0.38641497257179735, "grad_norm": 5.697010040283203, "learning_rate": 7.041795402357753e-05, "loss": 2.0657, "step": 2395 }, { "epoch": 0.3865763149403033, "grad_norm": 4.324869155883789, "learning_rate": 7.039410132909524e-05, "loss": 2.2903, "step": 2396 }, { "epoch": 0.38673765730880927, "grad_norm": 3.6093547344207764, "learning_rate": 7.037024306575212e-05, "loss": 1.9665, "step": 2397 }, { "epoch": 0.3868989996773153, "grad_norm": 3.4312937259674072, "learning_rate": 7.034637924006297e-05, "loss": 1.7842, "step": 2398 }, { "epoch": 0.38706034204582124, "grad_norm": 4.827066898345947, "learning_rate": 7.032250985854409e-05, "loss": 2.0614, "step": 2399 }, { "epoch": 0.3872216844143272, "grad_norm": 3.1605043411254883, "learning_rate": 7.029863492771332e-05, "loss": 1.7812, "step": 2400 }, { "epoch": 0.38738302678283315, "grad_norm": 5.1260294914245605, "learning_rate": 7.027475445409e-05, "loss": 2.0561, "step": 2401 }, { "epoch": 0.38754436915133916, "grad_norm": 4.700570106506348, "learning_rate": 7.025086844419499e-05, "loss": 1.9542, "step": 2402 }, { "epoch": 0.3877057115198451, "grad_norm": 4.195440769195557, "learning_rate": 7.022697690455065e-05, "loss": 2.1843, "step": 2403 }, { "epoch": 0.3878670538883511, "grad_norm": 4.5467095375061035, "learning_rate": 7.020307984168088e-05, "loss": 1.8634, "step": 2404 }, { "epoch": 0.38802839625685703, "grad_norm": 3.495610237121582, "learning_rate": 7.017917726211106e-05, "loss": 1.8406, "step": 2405 }, { "epoch": 0.38818973862536305, "grad_norm": 4.1670708656311035, "learning_rate": 7.015526917236806e-05, "loss": 2.0324, "step": 2406 }, { "epoch": 0.388351080993869, "grad_norm": 4.634245872497559, "learning_rate": 7.013135557898032e-05, "loss": 2.1647, "step": 2407 }, { "epoch": 0.38851242336237496, "grad_norm": 4.569153785705566, "learning_rate": 7.01074364884777e-05, "loss": 1.957, "step": 2408 }, { "epoch": 0.3886737657308809, "grad_norm": 3.983964681625366, "learning_rate": 7.008351190739162e-05, "loss": 2.2292, "step": 2409 }, { "epoch": 0.3888351080993869, "grad_norm": 4.7621331214904785, "learning_rate": 7.0059581842255e-05, "loss": 2.4628, "step": 2410 }, { "epoch": 0.3889964504678929, "grad_norm": 3.6143999099731445, "learning_rate": 7.003564629960222e-05, "loss": 2.0751, "step": 2411 }, { "epoch": 0.38915779283639884, "grad_norm": 4.416424751281738, "learning_rate": 7.001170528596917e-05, "loss": 1.8244, "step": 2412 }, { "epoch": 0.3893191352049048, "grad_norm": 3.624743700027466, "learning_rate": 6.998775880789326e-05, "loss": 2.0942, "step": 2413 }, { "epoch": 0.38948047757341075, "grad_norm": 5.6762776374816895, "learning_rate": 6.996380687191335e-05, "loss": 2.1162, "step": 2414 }, { "epoch": 0.38964181994191677, "grad_norm": 4.099328517913818, "learning_rate": 6.993984948456981e-05, "loss": 2.1135, "step": 2415 }, { "epoch": 0.3898031623104227, "grad_norm": 4.025030136108398, "learning_rate": 6.991588665240454e-05, "loss": 1.9314, "step": 2416 }, { "epoch": 0.3899645046789287, "grad_norm": 4.785827159881592, "learning_rate": 6.989191838196082e-05, "loss": 1.8687, "step": 2417 }, { "epoch": 0.39012584704743464, "grad_norm": 4.294789791107178, "learning_rate": 6.986794467978355e-05, "loss": 1.9542, "step": 2418 }, { "epoch": 0.39028718941594065, "grad_norm": 4.796219825744629, "learning_rate": 6.984396555241899e-05, "loss": 2.0095, "step": 2419 }, { "epoch": 0.3904485317844466, "grad_norm": 4.830128192901611, "learning_rate": 6.981998100641497e-05, "loss": 2.1175, "step": 2420 }, { "epoch": 0.39060987415295256, "grad_norm": 4.1153950691223145, "learning_rate": 6.979599104832075e-05, "loss": 1.9986, "step": 2421 }, { "epoch": 0.3907712165214585, "grad_norm": 4.315544605255127, "learning_rate": 6.977199568468709e-05, "loss": 1.9064, "step": 2422 }, { "epoch": 0.39093255888996453, "grad_norm": 3.3176615238189697, "learning_rate": 6.974799492206622e-05, "loss": 1.8931, "step": 2423 }, { "epoch": 0.3910939012584705, "grad_norm": 4.308636665344238, "learning_rate": 6.972398876701187e-05, "loss": 2.1158, "step": 2424 }, { "epoch": 0.39125524362697645, "grad_norm": 5.303024768829346, "learning_rate": 6.969997722607916e-05, "loss": 2.2496, "step": 2425 }, { "epoch": 0.3914165859954824, "grad_norm": 4.133060455322266, "learning_rate": 6.967596030582478e-05, "loss": 2.0026, "step": 2426 }, { "epoch": 0.39157792836398836, "grad_norm": 4.185387134552002, "learning_rate": 6.965193801280683e-05, "loss": 2.1184, "step": 2427 }, { "epoch": 0.39173927073249437, "grad_norm": 3.4608633518218994, "learning_rate": 6.96279103535849e-05, "loss": 1.954, "step": 2428 }, { "epoch": 0.39190061310100033, "grad_norm": 4.845871448516846, "learning_rate": 6.960387733472003e-05, "loss": 1.9661, "step": 2429 }, { "epoch": 0.3920619554695063, "grad_norm": 4.5562286376953125, "learning_rate": 6.957983896277473e-05, "loss": 1.9637, "step": 2430 }, { "epoch": 0.39222329783801224, "grad_norm": 4.882256031036377, "learning_rate": 6.9555795244313e-05, "loss": 2.403, "step": 2431 }, { "epoch": 0.39238464020651825, "grad_norm": 4.715920925140381, "learning_rate": 6.953174618590026e-05, "loss": 1.6436, "step": 2432 }, { "epoch": 0.3925459825750242, "grad_norm": 4.414078235626221, "learning_rate": 6.950769179410336e-05, "loss": 1.8671, "step": 2433 }, { "epoch": 0.39270732494353017, "grad_norm": 4.121337890625, "learning_rate": 6.948363207549073e-05, "loss": 1.9667, "step": 2434 }, { "epoch": 0.3928686673120361, "grad_norm": 4.242137432098389, "learning_rate": 6.945956703663211e-05, "loss": 1.8209, "step": 2435 }, { "epoch": 0.39303000968054214, "grad_norm": 3.7549500465393066, "learning_rate": 6.943549668409879e-05, "loss": 1.8583, "step": 2436 }, { "epoch": 0.3931913520490481, "grad_norm": 4.3018879890441895, "learning_rate": 6.941142102446342e-05, "loss": 2.0814, "step": 2437 }, { "epoch": 0.39335269441755405, "grad_norm": 4.0712971687316895, "learning_rate": 6.938734006430024e-05, "loss": 1.7733, "step": 2438 }, { "epoch": 0.39351403678606, "grad_norm": 4.466480255126953, "learning_rate": 6.936325381018478e-05, "loss": 1.6663, "step": 2439 }, { "epoch": 0.39367537915456596, "grad_norm": 5.666343688964844, "learning_rate": 6.933916226869414e-05, "loss": 1.8364, "step": 2440 }, { "epoch": 0.393836721523072, "grad_norm": 3.993345260620117, "learning_rate": 6.931506544640677e-05, "loss": 1.7569, "step": 2441 }, { "epoch": 0.39399806389157793, "grad_norm": 4.941212177276611, "learning_rate": 6.929096334990264e-05, "loss": 2.1538, "step": 2442 }, { "epoch": 0.3941594062600839, "grad_norm": 3.743314504623413, "learning_rate": 6.92668559857631e-05, "loss": 1.9386, "step": 2443 }, { "epoch": 0.39432074862858985, "grad_norm": 3.9533352851867676, "learning_rate": 6.924274336057099e-05, "loss": 1.9302, "step": 2444 }, { "epoch": 0.39448209099709586, "grad_norm": 3.6654293537139893, "learning_rate": 6.921862548091051e-05, "loss": 1.9921, "step": 2445 }, { "epoch": 0.3946434333656018, "grad_norm": 5.150160789489746, "learning_rate": 6.91945023533674e-05, "loss": 2.0982, "step": 2446 }, { "epoch": 0.39480477573410777, "grad_norm": 3.7324631214141846, "learning_rate": 6.917037398452876e-05, "loss": 1.7244, "step": 2447 }, { "epoch": 0.39496611810261373, "grad_norm": 4.152327537536621, "learning_rate": 6.914624038098312e-05, "loss": 2.1647, "step": 2448 }, { "epoch": 0.39512746047111974, "grad_norm": 4.865609645843506, "learning_rate": 6.912210154932049e-05, "loss": 1.8787, "step": 2449 }, { "epoch": 0.3952888028396257, "grad_norm": 3.872842311859131, "learning_rate": 6.909795749613223e-05, "loss": 2.1031, "step": 2450 }, { "epoch": 0.39545014520813165, "grad_norm": 5.2089056968688965, "learning_rate": 6.90738082280112e-05, "loss": 1.7995, "step": 2451 }, { "epoch": 0.3956114875766376, "grad_norm": 4.466630935668945, "learning_rate": 6.904965375155167e-05, "loss": 2.015, "step": 2452 }, { "epoch": 0.39577282994514357, "grad_norm": 3.957719326019287, "learning_rate": 6.902549407334929e-05, "loss": 2.003, "step": 2453 }, { "epoch": 0.3959341723136496, "grad_norm": 3.738546848297119, "learning_rate": 6.900132920000117e-05, "loss": 1.8192, "step": 2454 }, { "epoch": 0.39609551468215554, "grad_norm": 3.4601757526397705, "learning_rate": 6.897715913810582e-05, "loss": 2.0163, "step": 2455 }, { "epoch": 0.3962568570506615, "grad_norm": 3.8311967849731445, "learning_rate": 6.89529838942632e-05, "loss": 1.9218, "step": 2456 }, { "epoch": 0.39641819941916745, "grad_norm": 3.2115988731384277, "learning_rate": 6.892880347507461e-05, "loss": 1.9858, "step": 2457 }, { "epoch": 0.39657954178767346, "grad_norm": 4.476034164428711, "learning_rate": 6.890461788714286e-05, "loss": 2.0909, "step": 2458 }, { "epoch": 0.3967408841561794, "grad_norm": 4.0941033363342285, "learning_rate": 6.88804271370721e-05, "loss": 1.9763, "step": 2459 }, { "epoch": 0.3969022265246854, "grad_norm": 4.4979987144470215, "learning_rate": 6.88562312314679e-05, "loss": 1.9758, "step": 2460 }, { "epoch": 0.39706356889319133, "grad_norm": 5.366666793823242, "learning_rate": 6.883203017693726e-05, "loss": 2.2577, "step": 2461 }, { "epoch": 0.39722491126169734, "grad_norm": 4.031561374664307, "learning_rate": 6.880782398008862e-05, "loss": 1.8966, "step": 2462 }, { "epoch": 0.3973862536302033, "grad_norm": 5.680191993713379, "learning_rate": 6.878361264753171e-05, "loss": 2.2706, "step": 2463 }, { "epoch": 0.39754759599870926, "grad_norm": 4.758927822113037, "learning_rate": 6.875939618587779e-05, "loss": 1.8676, "step": 2464 }, { "epoch": 0.3977089383672152, "grad_norm": 6.526544094085693, "learning_rate": 6.873517460173941e-05, "loss": 2.2248, "step": 2465 }, { "epoch": 0.3978702807357212, "grad_norm": 4.234228134155273, "learning_rate": 6.87109479017306e-05, "loss": 1.7944, "step": 2466 }, { "epoch": 0.3980316231042272, "grad_norm": 4.572405815124512, "learning_rate": 6.868671609246678e-05, "loss": 1.7055, "step": 2467 }, { "epoch": 0.39819296547273314, "grad_norm": 3.432278633117676, "learning_rate": 6.866247918056471e-05, "loss": 2.0137, "step": 2468 }, { "epoch": 0.3983543078412391, "grad_norm": 5.302343845367432, "learning_rate": 6.86382371726426e-05, "loss": 2.0285, "step": 2469 }, { "epoch": 0.39851565020974505, "grad_norm": 4.160001277923584, "learning_rate": 6.861399007532002e-05, "loss": 2.3, "step": 2470 }, { "epoch": 0.39867699257825107, "grad_norm": 4.059791088104248, "learning_rate": 6.858973789521793e-05, "loss": 2.1066, "step": 2471 }, { "epoch": 0.398838334946757, "grad_norm": 4.373169422149658, "learning_rate": 6.85654806389587e-05, "loss": 2.0145, "step": 2472 }, { "epoch": 0.398999677315263, "grad_norm": 3.703003168106079, "learning_rate": 6.854121831316607e-05, "loss": 1.822, "step": 2473 }, { "epoch": 0.39916101968376894, "grad_norm": 3.666779041290283, "learning_rate": 6.851695092446517e-05, "loss": 1.941, "step": 2474 }, { "epoch": 0.39932236205227495, "grad_norm": 5.37600564956665, "learning_rate": 6.84926784794825e-05, "loss": 2.2424, "step": 2475 }, { "epoch": 0.3994837044207809, "grad_norm": 3.541210651397705, "learning_rate": 6.846840098484596e-05, "loss": 1.9848, "step": 2476 }, { "epoch": 0.39964504678928686, "grad_norm": 3.728093385696411, "learning_rate": 6.844411844718481e-05, "loss": 2.1511, "step": 2477 }, { "epoch": 0.3998063891577928, "grad_norm": 5.245541572570801, "learning_rate": 6.841983087312971e-05, "loss": 1.7297, "step": 2478 }, { "epoch": 0.39996773152629883, "grad_norm": 4.711781024932861, "learning_rate": 6.839553826931267e-05, "loss": 1.9193, "step": 2479 }, { "epoch": 0.4001290738948048, "grad_norm": 3.592806100845337, "learning_rate": 6.837124064236709e-05, "loss": 1.9462, "step": 2480 }, { "epoch": 0.40029041626331074, "grad_norm": 7.024892807006836, "learning_rate": 6.834693799892773e-05, "loss": 1.9904, "step": 2481 }, { "epoch": 0.4004517586318167, "grad_norm": 4.028841018676758, "learning_rate": 6.832263034563073e-05, "loss": 1.914, "step": 2482 }, { "epoch": 0.40061310100032266, "grad_norm": 3.887765645980835, "learning_rate": 6.829831768911361e-05, "loss": 1.9009, "step": 2483 }, { "epoch": 0.40077444336882867, "grad_norm": 5.264831066131592, "learning_rate": 6.827400003601522e-05, "loss": 1.9565, "step": 2484 }, { "epoch": 0.4009357857373346, "grad_norm": 6.523907661437988, "learning_rate": 6.82496773929758e-05, "loss": 2.0184, "step": 2485 }, { "epoch": 0.4010971281058406, "grad_norm": 4.7822160720825195, "learning_rate": 6.822534976663695e-05, "loss": 1.7987, "step": 2486 }, { "epoch": 0.40125847047434654, "grad_norm": 3.6801819801330566, "learning_rate": 6.820101716364162e-05, "loss": 2.1259, "step": 2487 }, { "epoch": 0.40141981284285255, "grad_norm": 4.366987228393555, "learning_rate": 6.817667959063414e-05, "loss": 2.0377, "step": 2488 }, { "epoch": 0.4015811552113585, "grad_norm": 4.573003768920898, "learning_rate": 6.815233705426019e-05, "loss": 1.8376, "step": 2489 }, { "epoch": 0.40174249757986447, "grad_norm": 3.706190347671509, "learning_rate": 6.812798956116677e-05, "loss": 1.9558, "step": 2490 }, { "epoch": 0.4019038399483704, "grad_norm": 4.338067054748535, "learning_rate": 6.81036371180023e-05, "loss": 1.9312, "step": 2491 }, { "epoch": 0.40206518231687643, "grad_norm": 5.0187907218933105, "learning_rate": 6.807927973141651e-05, "loss": 2.0904, "step": 2492 }, { "epoch": 0.4022265246853824, "grad_norm": 4.582776069641113, "learning_rate": 6.805491740806043e-05, "loss": 1.9393, "step": 2493 }, { "epoch": 0.40238786705388835, "grad_norm": 3.3647243976593018, "learning_rate": 6.803055015458656e-05, "loss": 2.0611, "step": 2494 }, { "epoch": 0.4025492094223943, "grad_norm": 5.384952068328857, "learning_rate": 6.800617797764865e-05, "loss": 2.0093, "step": 2495 }, { "epoch": 0.4027105517909003, "grad_norm": 4.835544109344482, "learning_rate": 6.798180088390183e-05, "loss": 1.7988, "step": 2496 }, { "epoch": 0.4028718941594063, "grad_norm": 3.5792434215545654, "learning_rate": 6.795741888000256e-05, "loss": 1.949, "step": 2497 }, { "epoch": 0.40303323652791223, "grad_norm": 3.372053861618042, "learning_rate": 6.793303197260864e-05, "loss": 1.8905, "step": 2498 }, { "epoch": 0.4031945788964182, "grad_norm": 4.386721134185791, "learning_rate": 6.790864016837923e-05, "loss": 1.9178, "step": 2499 }, { "epoch": 0.40335592126492414, "grad_norm": 3.8055319786071777, "learning_rate": 6.788424347397482e-05, "loss": 1.9185, "step": 2500 }, { "epoch": 0.40351726363343016, "grad_norm": 4.16497278213501, "learning_rate": 6.785984189605721e-05, "loss": 2.0803, "step": 2501 }, { "epoch": 0.4036786060019361, "grad_norm": 4.593762397766113, "learning_rate": 6.783543544128957e-05, "loss": 1.9773, "step": 2502 }, { "epoch": 0.40383994837044207, "grad_norm": 4.823954105377197, "learning_rate": 6.781102411633635e-05, "loss": 2.1822, "step": 2503 }, { "epoch": 0.404001290738948, "grad_norm": 4.050856590270996, "learning_rate": 6.77866079278634e-05, "loss": 2.1946, "step": 2504 }, { "epoch": 0.40416263310745404, "grad_norm": 5.379467487335205, "learning_rate": 6.776218688253784e-05, "loss": 2.0267, "step": 2505 }, { "epoch": 0.40432397547596, "grad_norm": 5.673772811889648, "learning_rate": 6.773776098702816e-05, "loss": 2.0904, "step": 2506 }, { "epoch": 0.40448531784446595, "grad_norm": 3.2573928833007812, "learning_rate": 6.771333024800411e-05, "loss": 1.9578, "step": 2507 }, { "epoch": 0.4046466602129719, "grad_norm": 3.8524608612060547, "learning_rate": 6.768889467213684e-05, "loss": 1.8406, "step": 2508 }, { "epoch": 0.4048080025814779, "grad_norm": 4.494475841522217, "learning_rate": 6.766445426609877e-05, "loss": 1.7922, "step": 2509 }, { "epoch": 0.4049693449499839, "grad_norm": 4.574101448059082, "learning_rate": 6.764000903656366e-05, "loss": 1.7937, "step": 2510 }, { "epoch": 0.40513068731848983, "grad_norm": 4.136654853820801, "learning_rate": 6.76155589902066e-05, "loss": 1.8986, "step": 2511 }, { "epoch": 0.4052920296869958, "grad_norm": 3.4168479442596436, "learning_rate": 6.759110413370395e-05, "loss": 2.0704, "step": 2512 }, { "epoch": 0.40545337205550175, "grad_norm": 4.0800557136535645, "learning_rate": 6.756664447373344e-05, "loss": 2.0706, "step": 2513 }, { "epoch": 0.40561471442400776, "grad_norm": 4.138115882873535, "learning_rate": 6.754218001697402e-05, "loss": 2.1694, "step": 2514 }, { "epoch": 0.4057760567925137, "grad_norm": 5.320624828338623, "learning_rate": 6.751771077010607e-05, "loss": 1.8025, "step": 2515 }, { "epoch": 0.4059373991610197, "grad_norm": 4.6533379554748535, "learning_rate": 6.74932367398112e-05, "loss": 1.9051, "step": 2516 }, { "epoch": 0.40609874152952563, "grad_norm": 3.4974732398986816, "learning_rate": 6.746875793277233e-05, "loss": 1.7668, "step": 2517 }, { "epoch": 0.40626008389803164, "grad_norm": 4.321040153503418, "learning_rate": 6.744427435567373e-05, "loss": 1.7968, "step": 2518 }, { "epoch": 0.4064214262665376, "grad_norm": 5.548377990722656, "learning_rate": 6.741978601520092e-05, "loss": 1.7595, "step": 2519 }, { "epoch": 0.40658276863504356, "grad_norm": 4.565524578094482, "learning_rate": 6.739529291804076e-05, "loss": 2.0442, "step": 2520 }, { "epoch": 0.4067441110035495, "grad_norm": 3.937290668487549, "learning_rate": 6.737079507088139e-05, "loss": 1.8561, "step": 2521 }, { "epoch": 0.4069054533720555, "grad_norm": 4.6694183349609375, "learning_rate": 6.734629248041226e-05, "loss": 2.262, "step": 2522 }, { "epoch": 0.4070667957405615, "grad_norm": 4.72750186920166, "learning_rate": 6.732178515332406e-05, "loss": 2.0002, "step": 2523 }, { "epoch": 0.40722813810906744, "grad_norm": 4.05148458480835, "learning_rate": 6.729727309630885e-05, "loss": 2.0543, "step": 2524 }, { "epoch": 0.4073894804775734, "grad_norm": 4.776993274688721, "learning_rate": 6.727275631605995e-05, "loss": 1.8568, "step": 2525 }, { "epoch": 0.4075508228460794, "grad_norm": 5.503616809844971, "learning_rate": 6.724823481927198e-05, "loss": 1.936, "step": 2526 }, { "epoch": 0.40771216521458536, "grad_norm": 6.97620964050293, "learning_rate": 6.72237086126408e-05, "loss": 1.9444, "step": 2527 }, { "epoch": 0.4078735075830913, "grad_norm": 5.984745025634766, "learning_rate": 6.719917770286362e-05, "loss": 2.0345, "step": 2528 }, { "epoch": 0.4080348499515973, "grad_norm": 4.583484172821045, "learning_rate": 6.717464209663891e-05, "loss": 1.9219, "step": 2529 }, { "epoch": 0.40819619232010323, "grad_norm": 3.649150848388672, "learning_rate": 6.715010180066641e-05, "loss": 1.9231, "step": 2530 }, { "epoch": 0.40835753468860925, "grad_norm": 4.01031494140625, "learning_rate": 6.712555682164715e-05, "loss": 2.0998, "step": 2531 }, { "epoch": 0.4085188770571152, "grad_norm": 4.0240349769592285, "learning_rate": 6.710100716628344e-05, "loss": 1.8933, "step": 2532 }, { "epoch": 0.40868021942562116, "grad_norm": 3.484869956970215, "learning_rate": 6.707645284127887e-05, "loss": 1.8798, "step": 2533 }, { "epoch": 0.4088415617941271, "grad_norm": 5.766565799713135, "learning_rate": 6.70518938533383e-05, "loss": 1.8677, "step": 2534 }, { "epoch": 0.40900290416263313, "grad_norm": 5.190849781036377, "learning_rate": 6.702733020916786e-05, "loss": 2.1769, "step": 2535 }, { "epoch": 0.4091642465311391, "grad_norm": 3.1495325565338135, "learning_rate": 6.700276191547496e-05, "loss": 1.9317, "step": 2536 }, { "epoch": 0.40932558889964504, "grad_norm": 3.671961545944214, "learning_rate": 6.697818897896828e-05, "loss": 2.2158, "step": 2537 }, { "epoch": 0.409486931268151, "grad_norm": 4.503951549530029, "learning_rate": 6.695361140635776e-05, "loss": 1.8565, "step": 2538 }, { "epoch": 0.409648273636657, "grad_norm": 3.85292649269104, "learning_rate": 6.69290292043546e-05, "loss": 1.5461, "step": 2539 }, { "epoch": 0.40980961600516297, "grad_norm": 6.35936975479126, "learning_rate": 6.690444237967129e-05, "loss": 1.8699, "step": 2540 }, { "epoch": 0.4099709583736689, "grad_norm": 4.711306095123291, "learning_rate": 6.687985093902155e-05, "loss": 1.9941, "step": 2541 }, { "epoch": 0.4101323007421749, "grad_norm": 3.86368989944458, "learning_rate": 6.685525488912037e-05, "loss": 1.7849, "step": 2542 }, { "epoch": 0.41029364311068084, "grad_norm": 5.367621421813965, "learning_rate": 6.683065423668403e-05, "loss": 1.8176, "step": 2543 }, { "epoch": 0.41045498547918685, "grad_norm": 5.0536208152771, "learning_rate": 6.680604898843002e-05, "loss": 1.9325, "step": 2544 }, { "epoch": 0.4106163278476928, "grad_norm": 4.238080024719238, "learning_rate": 6.678143915107713e-05, "loss": 2.1431, "step": 2545 }, { "epoch": 0.41077767021619876, "grad_norm": 4.902920246124268, "learning_rate": 6.675682473134536e-05, "loss": 1.8925, "step": 2546 }, { "epoch": 0.4109390125847047, "grad_norm": 3.542854070663452, "learning_rate": 6.673220573595598e-05, "loss": 1.9062, "step": 2547 }, { "epoch": 0.41110035495321073, "grad_norm": 4.582688331604004, "learning_rate": 6.670758217163151e-05, "loss": 2.0112, "step": 2548 }, { "epoch": 0.4112616973217167, "grad_norm": 4.175585746765137, "learning_rate": 6.668295404509574e-05, "loss": 2.021, "step": 2549 }, { "epoch": 0.41142303969022265, "grad_norm": 4.21159553527832, "learning_rate": 6.665832136307366e-05, "loss": 1.8985, "step": 2550 }, { "epoch": 0.4115843820587286, "grad_norm": 4.019961357116699, "learning_rate": 6.663368413229155e-05, "loss": 1.855, "step": 2551 }, { "epoch": 0.4117457244272346, "grad_norm": 3.089282989501953, "learning_rate": 6.660904235947687e-05, "loss": 1.7267, "step": 2552 }, { "epoch": 0.4119070667957406, "grad_norm": 4.0074663162231445, "learning_rate": 6.65843960513584e-05, "loss": 1.857, "step": 2553 }, { "epoch": 0.41206840916424653, "grad_norm": 3.842273235321045, "learning_rate": 6.655974521466608e-05, "loss": 2.0801, "step": 2554 }, { "epoch": 0.4122297515327525, "grad_norm": 3.413011312484741, "learning_rate": 6.653508985613117e-05, "loss": 1.8709, "step": 2555 }, { "epoch": 0.41239109390125844, "grad_norm": 3.932624340057373, "learning_rate": 6.651042998248608e-05, "loss": 1.8626, "step": 2556 }, { "epoch": 0.41255243626976446, "grad_norm": 5.163025856018066, "learning_rate": 6.648576560046452e-05, "loss": 2.1256, "step": 2557 }, { "epoch": 0.4127137786382704, "grad_norm": 5.4284186363220215, "learning_rate": 6.64610967168014e-05, "loss": 1.775, "step": 2558 }, { "epoch": 0.41287512100677637, "grad_norm": 4.206260681152344, "learning_rate": 6.643642333823286e-05, "loss": 1.9807, "step": 2559 }, { "epoch": 0.4130364633752823, "grad_norm": 3.877509593963623, "learning_rate": 6.641174547149624e-05, "loss": 1.7632, "step": 2560 }, { "epoch": 0.41319780574378834, "grad_norm": 4.569438934326172, "learning_rate": 6.638706312333018e-05, "loss": 2.0044, "step": 2561 }, { "epoch": 0.4133591481122943, "grad_norm": 4.021951198577881, "learning_rate": 6.636237630047448e-05, "loss": 1.8816, "step": 2562 }, { "epoch": 0.41352049048080025, "grad_norm": 4.502031326293945, "learning_rate": 6.633768500967019e-05, "loss": 1.9982, "step": 2563 }, { "epoch": 0.4136818328493062, "grad_norm": 5.826133728027344, "learning_rate": 6.631298925765955e-05, "loss": 2.087, "step": 2564 }, { "epoch": 0.4138431752178122, "grad_norm": 3.8425328731536865, "learning_rate": 6.628828905118608e-05, "loss": 1.9743, "step": 2565 }, { "epoch": 0.4140045175863182, "grad_norm": 4.0037760734558105, "learning_rate": 6.626358439699442e-05, "loss": 2.1166, "step": 2566 }, { "epoch": 0.41416585995482413, "grad_norm": 3.5549118518829346, "learning_rate": 6.623887530183051e-05, "loss": 2.0896, "step": 2567 }, { "epoch": 0.4143272023233301, "grad_norm": 5.458221435546875, "learning_rate": 6.621416177244148e-05, "loss": 1.9489, "step": 2568 }, { "epoch": 0.4144885446918361, "grad_norm": 5.151551246643066, "learning_rate": 6.618944381557568e-05, "loss": 1.8281, "step": 2569 }, { "epoch": 0.41464988706034206, "grad_norm": 3.459108829498291, "learning_rate": 6.616472143798261e-05, "loss": 1.8917, "step": 2570 }, { "epoch": 0.414811229428848, "grad_norm": 3.567786693572998, "learning_rate": 6.613999464641304e-05, "loss": 1.9488, "step": 2571 }, { "epoch": 0.414972571797354, "grad_norm": 6.768561840057373, "learning_rate": 6.611526344761893e-05, "loss": 2.0894, "step": 2572 }, { "epoch": 0.41513391416585993, "grad_norm": 4.484906196594238, "learning_rate": 6.609052784835342e-05, "loss": 1.8344, "step": 2573 }, { "epoch": 0.41529525653436594, "grad_norm": 3.241314649581909, "learning_rate": 6.60657878553709e-05, "loss": 1.8785, "step": 2574 }, { "epoch": 0.4154565989028719, "grad_norm": 4.768258094787598, "learning_rate": 6.604104347542693e-05, "loss": 2.0155, "step": 2575 }, { "epoch": 0.41561794127137786, "grad_norm": 4.974591255187988, "learning_rate": 6.601629471527822e-05, "loss": 1.9884, "step": 2576 }, { "epoch": 0.4157792836398838, "grad_norm": 4.00331974029541, "learning_rate": 6.599154158168278e-05, "loss": 1.8776, "step": 2577 }, { "epoch": 0.4159406260083898, "grad_norm": 3.2258119583129883, "learning_rate": 6.596678408139973e-05, "loss": 1.9786, "step": 2578 }, { "epoch": 0.4161019683768958, "grad_norm": 4.530983924865723, "learning_rate": 6.594202222118942e-05, "loss": 2.0937, "step": 2579 }, { "epoch": 0.41626331074540174, "grad_norm": 3.984894275665283, "learning_rate": 6.591725600781336e-05, "loss": 1.9892, "step": 2580 }, { "epoch": 0.4164246531139077, "grad_norm": 3.5328073501586914, "learning_rate": 6.589248544803431e-05, "loss": 2.114, "step": 2581 }, { "epoch": 0.4165859954824137, "grad_norm": 4.023244857788086, "learning_rate": 6.586771054861613e-05, "loss": 1.9073, "step": 2582 }, { "epoch": 0.41674733785091966, "grad_norm": 4.235401153564453, "learning_rate": 6.584293131632396e-05, "loss": 1.9853, "step": 2583 }, { "epoch": 0.4169086802194256, "grad_norm": 3.4555716514587402, "learning_rate": 6.581814775792403e-05, "loss": 1.9779, "step": 2584 }, { "epoch": 0.4170700225879316, "grad_norm": 4.903307914733887, "learning_rate": 6.579335988018383e-05, "loss": 1.9086, "step": 2585 }, { "epoch": 0.41723136495643753, "grad_norm": 4.533055782318115, "learning_rate": 6.576856768987197e-05, "loss": 2.2401, "step": 2586 }, { "epoch": 0.41739270732494355, "grad_norm": 5.497504234313965, "learning_rate": 6.574377119375829e-05, "loss": 2.0772, "step": 2587 }, { "epoch": 0.4175540496934495, "grad_norm": 5.563016891479492, "learning_rate": 6.571897039861377e-05, "loss": 1.8796, "step": 2588 }, { "epoch": 0.41771539206195546, "grad_norm": 5.664429664611816, "learning_rate": 6.569416531121056e-05, "loss": 1.6566, "step": 2589 }, { "epoch": 0.4178767344304614, "grad_norm": 5.091784477233887, "learning_rate": 6.5669355938322e-05, "loss": 2.0411, "step": 2590 }, { "epoch": 0.41803807679896743, "grad_norm": 3.325024366378784, "learning_rate": 6.564454228672259e-05, "loss": 1.9708, "step": 2591 }, { "epoch": 0.4181994191674734, "grad_norm": 5.186642169952393, "learning_rate": 6.561972436318801e-05, "loss": 2.0746, "step": 2592 }, { "epoch": 0.41836076153597934, "grad_norm": 5.5119524002075195, "learning_rate": 6.559490217449513e-05, "loss": 1.7696, "step": 2593 }, { "epoch": 0.4185221039044853, "grad_norm": 5.138498306274414, "learning_rate": 6.55700757274219e-05, "loss": 2.1851, "step": 2594 }, { "epoch": 0.4186834462729913, "grad_norm": 4.437428951263428, "learning_rate": 6.554524502874752e-05, "loss": 2.0936, "step": 2595 }, { "epoch": 0.41884478864149727, "grad_norm": 2.9758529663085938, "learning_rate": 6.55204100852523e-05, "loss": 1.9382, "step": 2596 }, { "epoch": 0.4190061310100032, "grad_norm": 3.4663944244384766, "learning_rate": 6.549557090371776e-05, "loss": 2.0537, "step": 2597 }, { "epoch": 0.4191674733785092, "grad_norm": 4.565867900848389, "learning_rate": 6.547072749092652e-05, "loss": 1.965, "step": 2598 }, { "epoch": 0.4193288157470152, "grad_norm": 3.739332914352417, "learning_rate": 6.544587985366237e-05, "loss": 2.1783, "step": 2599 }, { "epoch": 0.41949015811552115, "grad_norm": 4.538175106048584, "learning_rate": 6.54210279987103e-05, "loss": 2.411, "step": 2600 }, { "epoch": 0.4196515004840271, "grad_norm": 7.21381139755249, "learning_rate": 6.539617193285639e-05, "loss": 2.1074, "step": 2601 }, { "epoch": 0.41981284285253306, "grad_norm": 5.1080193519592285, "learning_rate": 6.537131166288789e-05, "loss": 2.015, "step": 2602 }, { "epoch": 0.419974185221039, "grad_norm": 4.051692962646484, "learning_rate": 6.534644719559321e-05, "loss": 1.9574, "step": 2603 }, { "epoch": 0.42013552758954503, "grad_norm": 3.8842334747314453, "learning_rate": 6.532157853776191e-05, "loss": 1.9782, "step": 2604 }, { "epoch": 0.420296869958051, "grad_norm": 3.123093843460083, "learning_rate": 6.529670569618467e-05, "loss": 1.9015, "step": 2605 }, { "epoch": 0.42045821232655695, "grad_norm": 5.435220241546631, "learning_rate": 6.527182867765332e-05, "loss": 2.1036, "step": 2606 }, { "epoch": 0.4206195546950629, "grad_norm": 3.8733108043670654, "learning_rate": 6.524694748896086e-05, "loss": 1.9209, "step": 2607 }, { "epoch": 0.4207808970635689, "grad_norm": 3.982532024383545, "learning_rate": 6.522206213690141e-05, "loss": 1.9799, "step": 2608 }, { "epoch": 0.42094223943207487, "grad_norm": 4.876627445220947, "learning_rate": 6.519717262827018e-05, "loss": 1.9781, "step": 2609 }, { "epoch": 0.42110358180058083, "grad_norm": 4.795150279998779, "learning_rate": 6.517227896986359e-05, "loss": 1.9817, "step": 2610 }, { "epoch": 0.4212649241690868, "grad_norm": 4.908081531524658, "learning_rate": 6.514738116847915e-05, "loss": 1.6965, "step": 2611 }, { "epoch": 0.4214262665375928, "grad_norm": 5.139437198638916, "learning_rate": 6.512247923091552e-05, "loss": 2.0079, "step": 2612 }, { "epoch": 0.42158760890609875, "grad_norm": 5.754101276397705, "learning_rate": 6.509757316397248e-05, "loss": 1.9483, "step": 2613 }, { "epoch": 0.4217489512746047, "grad_norm": 3.195584297180176, "learning_rate": 6.507266297445092e-05, "loss": 1.8382, "step": 2614 }, { "epoch": 0.42191029364311067, "grad_norm": 5.019852638244629, "learning_rate": 6.50477486691529e-05, "loss": 2.0303, "step": 2615 }, { "epoch": 0.4220716360116166, "grad_norm": 3.8010637760162354, "learning_rate": 6.502283025488157e-05, "loss": 2.1413, "step": 2616 }, { "epoch": 0.42223297838012264, "grad_norm": 4.250025749206543, "learning_rate": 6.49979077384412e-05, "loss": 2.0895, "step": 2617 }, { "epoch": 0.4223943207486286, "grad_norm": 4.784327030181885, "learning_rate": 6.497298112663721e-05, "loss": 1.9401, "step": 2618 }, { "epoch": 0.42255566311713455, "grad_norm": 5.285989761352539, "learning_rate": 6.49480504262761e-05, "loss": 2.1307, "step": 2619 }, { "epoch": 0.4227170054856405, "grad_norm": 3.9630815982818604, "learning_rate": 6.49231156441655e-05, "loss": 2.1602, "step": 2620 }, { "epoch": 0.4228783478541465, "grad_norm": 4.505052089691162, "learning_rate": 6.489817678711418e-05, "loss": 2.1739, "step": 2621 }, { "epoch": 0.4230396902226525, "grad_norm": 4.205039024353027, "learning_rate": 6.487323386193199e-05, "loss": 2.0078, "step": 2622 }, { "epoch": 0.42320103259115843, "grad_norm": 4.105075359344482, "learning_rate": 6.48482868754299e-05, "loss": 2.0093, "step": 2623 }, { "epoch": 0.4233623749596644, "grad_norm": 5.950715065002441, "learning_rate": 6.482333583442002e-05, "loss": 2.0699, "step": 2624 }, { "epoch": 0.4235237173281704, "grad_norm": 3.457017183303833, "learning_rate": 6.479838074571551e-05, "loss": 2.0035, "step": 2625 }, { "epoch": 0.42368505969667636, "grad_norm": 3.98242449760437, "learning_rate": 6.477342161613068e-05, "loss": 2.0604, "step": 2626 }, { "epoch": 0.4238464020651823, "grad_norm": 4.566737651824951, "learning_rate": 6.47484584524809e-05, "loss": 2.2198, "step": 2627 }, { "epoch": 0.42400774443368827, "grad_norm": 3.8550214767456055, "learning_rate": 6.472349126158272e-05, "loss": 1.9878, "step": 2628 }, { "epoch": 0.4241690868021943, "grad_norm": 4.4448747634887695, "learning_rate": 6.46985200502537e-05, "loss": 1.8718, "step": 2629 }, { "epoch": 0.42433042917070024, "grad_norm": 3.811269998550415, "learning_rate": 6.467354482531253e-05, "loss": 1.9142, "step": 2630 }, { "epoch": 0.4244917715392062, "grad_norm": 4.5046796798706055, "learning_rate": 6.464856559357903e-05, "loss": 1.9578, "step": 2631 }, { "epoch": 0.42465311390771215, "grad_norm": 5.346692085266113, "learning_rate": 6.462358236187409e-05, "loss": 2.0217, "step": 2632 }, { "epoch": 0.4248144562762181, "grad_norm": 4.623473167419434, "learning_rate": 6.459859513701967e-05, "loss": 2.1269, "step": 2633 }, { "epoch": 0.4249757986447241, "grad_norm": 3.123763084411621, "learning_rate": 6.457360392583884e-05, "loss": 1.9277, "step": 2634 }, { "epoch": 0.4251371410132301, "grad_norm": 4.0693769454956055, "learning_rate": 6.454860873515577e-05, "loss": 1.9822, "step": 2635 }, { "epoch": 0.42529848338173604, "grad_norm": 3.4166367053985596, "learning_rate": 6.45236095717957e-05, "loss": 1.9156, "step": 2636 }, { "epoch": 0.425459825750242, "grad_norm": 3.774885416030884, "learning_rate": 6.449860644258497e-05, "loss": 1.8419, "step": 2637 }, { "epoch": 0.425621168118748, "grad_norm": 3.373805046081543, "learning_rate": 6.447359935435097e-05, "loss": 1.9435, "step": 2638 }, { "epoch": 0.42578251048725396, "grad_norm": 3.160585880279541, "learning_rate": 6.444858831392223e-05, "loss": 2.094, "step": 2639 }, { "epoch": 0.4259438528557599, "grad_norm": 3.7261202335357666, "learning_rate": 6.442357332812828e-05, "loss": 1.786, "step": 2640 }, { "epoch": 0.4261051952242659, "grad_norm": 5.06633186340332, "learning_rate": 6.439855440379978e-05, "loss": 1.9268, "step": 2641 }, { "epoch": 0.4262665375927719, "grad_norm": 4.561039447784424, "learning_rate": 6.437353154776849e-05, "loss": 2.221, "step": 2642 }, { "epoch": 0.42642787996127784, "grad_norm": 5.25416898727417, "learning_rate": 6.434850476686715e-05, "loss": 2.082, "step": 2643 }, { "epoch": 0.4265892223297838, "grad_norm": 4.456058979034424, "learning_rate": 6.43234740679297e-05, "loss": 1.682, "step": 2644 }, { "epoch": 0.42675056469828976, "grad_norm": 4.3220744132995605, "learning_rate": 6.429843945779104e-05, "loss": 2.0635, "step": 2645 }, { "epoch": 0.4269119070667957, "grad_norm": 4.297271251678467, "learning_rate": 6.427340094328718e-05, "loss": 1.9167, "step": 2646 }, { "epoch": 0.4270732494353017, "grad_norm": 4.471283435821533, "learning_rate": 6.424835853125521e-05, "loss": 1.9748, "step": 2647 }, { "epoch": 0.4272345918038077, "grad_norm": 4.177629470825195, "learning_rate": 6.422331222853326e-05, "loss": 2.0103, "step": 2648 }, { "epoch": 0.42739593417231364, "grad_norm": 4.565670967102051, "learning_rate": 6.419826204196052e-05, "loss": 1.8253, "step": 2649 }, { "epoch": 0.4275572765408196, "grad_norm": 3.8919923305511475, "learning_rate": 6.417320797837727e-05, "loss": 1.925, "step": 2650 }, { "epoch": 0.4277186189093256, "grad_norm": 4.406559467315674, "learning_rate": 6.414815004462483e-05, "loss": 1.8349, "step": 2651 }, { "epoch": 0.42787996127783157, "grad_norm": 3.911963939666748, "learning_rate": 6.412308824754557e-05, "loss": 1.7009, "step": 2652 }, { "epoch": 0.4280413036463375, "grad_norm": 4.386362552642822, "learning_rate": 6.409802259398293e-05, "loss": 2.1729, "step": 2653 }, { "epoch": 0.4282026460148435, "grad_norm": 4.710752964019775, "learning_rate": 6.407295309078138e-05, "loss": 2.0782, "step": 2654 }, { "epoch": 0.4283639883833495, "grad_norm": 4.254208087921143, "learning_rate": 6.404787974478649e-05, "loss": 1.9382, "step": 2655 }, { "epoch": 0.42852533075185545, "grad_norm": 4.716608047485352, "learning_rate": 6.402280256284481e-05, "loss": 2.0621, "step": 2656 }, { "epoch": 0.4286866731203614, "grad_norm": 4.185356616973877, "learning_rate": 6.3997721551804e-05, "loss": 1.8992, "step": 2657 }, { "epoch": 0.42884801548886736, "grad_norm": 4.865534782409668, "learning_rate": 6.397263671851273e-05, "loss": 2.0066, "step": 2658 }, { "epoch": 0.4290093578573733, "grad_norm": 5.55830717086792, "learning_rate": 6.39475480698207e-05, "loss": 1.7723, "step": 2659 }, { "epoch": 0.42917070022587933, "grad_norm": 3.3979928493499756, "learning_rate": 6.392245561257871e-05, "loss": 1.9591, "step": 2660 }, { "epoch": 0.4293320425943853, "grad_norm": 7.676721096038818, "learning_rate": 6.389735935363855e-05, "loss": 2.1379, "step": 2661 }, { "epoch": 0.42949338496289124, "grad_norm": 4.122786521911621, "learning_rate": 6.387225929985306e-05, "loss": 1.9849, "step": 2662 }, { "epoch": 0.4296547273313972, "grad_norm": 4.122786521911621, "learning_rate": 6.387225929985306e-05, "loss": 1.912, "step": 2663 }, { "epoch": 0.4298160696999032, "grad_norm": 3.77955961227417, "learning_rate": 6.384715545807613e-05, "loss": 2.0548, "step": 2664 }, { "epoch": 0.42997741206840917, "grad_norm": 6.101278781890869, "learning_rate": 6.382204783516267e-05, "loss": 2.0458, "step": 2665 }, { "epoch": 0.4301387544369151, "grad_norm": 3.9506847858428955, "learning_rate": 6.379693643796863e-05, "loss": 1.8902, "step": 2666 }, { "epoch": 0.4303000968054211, "grad_norm": 5.288346290588379, "learning_rate": 6.377182127335096e-05, "loss": 1.8848, "step": 2667 }, { "epoch": 0.4304614391739271, "grad_norm": 4.493162155151367, "learning_rate": 6.374670234816768e-05, "loss": 2.0725, "step": 2668 }, { "epoch": 0.43062278154243305, "grad_norm": 3.863415241241455, "learning_rate": 6.372157966927785e-05, "loss": 1.9583, "step": 2669 }, { "epoch": 0.430784123910939, "grad_norm": 4.328066349029541, "learning_rate": 6.369645324354149e-05, "loss": 1.9972, "step": 2670 }, { "epoch": 0.43094546627944497, "grad_norm": 4.514448165893555, "learning_rate": 6.36713230778197e-05, "loss": 1.8807, "step": 2671 }, { "epoch": 0.431106808647951, "grad_norm": 4.6742329597473145, "learning_rate": 6.364618917897456e-05, "loss": 1.8058, "step": 2672 }, { "epoch": 0.43126815101645694, "grad_norm": 3.9169187545776367, "learning_rate": 6.362105155386923e-05, "loss": 2.0142, "step": 2673 }, { "epoch": 0.4314294933849629, "grad_norm": 3.444545269012451, "learning_rate": 6.359591020936781e-05, "loss": 2.0093, "step": 2674 }, { "epoch": 0.43159083575346885, "grad_norm": 3.6630542278289795, "learning_rate": 6.357076515233548e-05, "loss": 1.8976, "step": 2675 }, { "epoch": 0.4317521781219748, "grad_norm": 5.304434776306152, "learning_rate": 6.35456163896384e-05, "loss": 1.9855, "step": 2676 }, { "epoch": 0.4319135204904808, "grad_norm": 4.996885776519775, "learning_rate": 6.352046392814375e-05, "loss": 1.9368, "step": 2677 }, { "epoch": 0.4320748628589868, "grad_norm": 4.649833679199219, "learning_rate": 6.34953077747197e-05, "loss": 1.9488, "step": 2678 }, { "epoch": 0.43223620522749273, "grad_norm": 3.217017889022827, "learning_rate": 6.347014793623547e-05, "loss": 2.1048, "step": 2679 }, { "epoch": 0.4323975475959987, "grad_norm": 3.99094820022583, "learning_rate": 6.344498441956127e-05, "loss": 2.0496, "step": 2680 }, { "epoch": 0.4325588899645047, "grad_norm": 3.524550199508667, "learning_rate": 6.341981723156829e-05, "loss": 1.8614, "step": 2681 }, { "epoch": 0.43272023233301066, "grad_norm": 3.9145429134368896, "learning_rate": 6.339464637912874e-05, "loss": 1.9051, "step": 2682 }, { "epoch": 0.4328815747015166, "grad_norm": 3.8224198818206787, "learning_rate": 6.336947186911585e-05, "loss": 2.0474, "step": 2683 }, { "epoch": 0.43304291707002257, "grad_norm": 4.108032703399658, "learning_rate": 6.334429370840381e-05, "loss": 1.9799, "step": 2684 }, { "epoch": 0.4332042594385286, "grad_norm": 3.7723186016082764, "learning_rate": 6.331911190386785e-05, "loss": 1.7414, "step": 2685 }, { "epoch": 0.43336560180703454, "grad_norm": 4.491466045379639, "learning_rate": 6.329392646238416e-05, "loss": 2.1263, "step": 2686 }, { "epoch": 0.4335269441755405, "grad_norm": 8.540251731872559, "learning_rate": 6.326873739082993e-05, "loss": 2.3336, "step": 2687 }, { "epoch": 0.43368828654404645, "grad_norm": 4.491308212280273, "learning_rate": 6.324354469608335e-05, "loss": 2.144, "step": 2688 }, { "epoch": 0.4338496289125524, "grad_norm": 3.9718141555786133, "learning_rate": 6.32183483850236e-05, "loss": 1.7607, "step": 2689 }, { "epoch": 0.4340109712810584, "grad_norm": 4.296075820922852, "learning_rate": 6.319314846453086e-05, "loss": 1.9612, "step": 2690 }, { "epoch": 0.4341723136495644, "grad_norm": 4.1339216232299805, "learning_rate": 6.316794494148625e-05, "loss": 1.932, "step": 2691 }, { "epoch": 0.43433365601807034, "grad_norm": 4.131509780883789, "learning_rate": 6.31427378227719e-05, "loss": 1.9349, "step": 2692 }, { "epoch": 0.4344949983865763, "grad_norm": 3.372370719909668, "learning_rate": 6.311752711527095e-05, "loss": 1.8986, "step": 2693 }, { "epoch": 0.4346563407550823, "grad_norm": 4.359404563903809, "learning_rate": 6.309231282586748e-05, "loss": 1.7414, "step": 2694 }, { "epoch": 0.43481768312358826, "grad_norm": 7.026899337768555, "learning_rate": 6.306709496144654e-05, "loss": 1.9072, "step": 2695 }, { "epoch": 0.4349790254920942, "grad_norm": 5.448616027832031, "learning_rate": 6.304187352889423e-05, "loss": 1.8877, "step": 2696 }, { "epoch": 0.4351403678606002, "grad_norm": 3.7698123455047607, "learning_rate": 6.301664853509754e-05, "loss": 2.2127, "step": 2697 }, { "epoch": 0.4353017102291062, "grad_norm": 4.5785441398620605, "learning_rate": 6.299141998694448e-05, "loss": 1.8905, "step": 2698 }, { "epoch": 0.43546305259761214, "grad_norm": 4.515157699584961, "learning_rate": 6.2966187891324e-05, "loss": 1.84, "step": 2699 }, { "epoch": 0.4356243949661181, "grad_norm": 5.423101425170898, "learning_rate": 6.294095225512603e-05, "loss": 1.8639, "step": 2700 }, { "epoch": 0.43578573733462406, "grad_norm": 5.622671127319336, "learning_rate": 6.29157130852415e-05, "loss": 1.9387, "step": 2701 }, { "epoch": 0.43594707970313007, "grad_norm": 3.312377691268921, "learning_rate": 6.289047038856226e-05, "loss": 2.1385, "step": 2702 }, { "epoch": 0.436108422071636, "grad_norm": 4.007981777191162, "learning_rate": 6.286522417198115e-05, "loss": 1.9194, "step": 2703 }, { "epoch": 0.436269764440142, "grad_norm": 4.537135124206543, "learning_rate": 6.283997444239194e-05, "loss": 1.9863, "step": 2704 }, { "epoch": 0.43643110680864794, "grad_norm": 3.8456368446350098, "learning_rate": 6.281472120668939e-05, "loss": 2.132, "step": 2705 }, { "epoch": 0.4365924491771539, "grad_norm": 4.065152645111084, "learning_rate": 6.278946447176923e-05, "loss": 1.8911, "step": 2706 }, { "epoch": 0.4367537915456599, "grad_norm": 3.647221565246582, "learning_rate": 6.27642042445281e-05, "loss": 1.9615, "step": 2707 }, { "epoch": 0.43691513391416587, "grad_norm": 6.745212554931641, "learning_rate": 6.27389405318636e-05, "loss": 1.9975, "step": 2708 }, { "epoch": 0.4370764762826718, "grad_norm": 4.661088466644287, "learning_rate": 6.271367334067431e-05, "loss": 2.1729, "step": 2709 }, { "epoch": 0.4372378186511778, "grad_norm": 3.776611328125, "learning_rate": 6.268840267785976e-05, "loss": 1.9579, "step": 2710 }, { "epoch": 0.4373991610196838, "grad_norm": 4.246987342834473, "learning_rate": 6.266312855032042e-05, "loss": 1.9564, "step": 2711 }, { "epoch": 0.43756050338818975, "grad_norm": 5.6204752922058105, "learning_rate": 6.26378509649577e-05, "loss": 2.0972, "step": 2712 }, { "epoch": 0.4377218457566957, "grad_norm": 4.941777229309082, "learning_rate": 6.261256992867392e-05, "loss": 2.0689, "step": 2713 }, { "epoch": 0.43788318812520166, "grad_norm": 4.1518964767456055, "learning_rate": 6.258728544837243e-05, "loss": 1.899, "step": 2714 }, { "epoch": 0.4380445304937077, "grad_norm": 4.4116411209106445, "learning_rate": 6.256199753095745e-05, "loss": 1.8749, "step": 2715 }, { "epoch": 0.43820587286221363, "grad_norm": 4.02710485458374, "learning_rate": 6.253670618333417e-05, "loss": 1.9701, "step": 2716 }, { "epoch": 0.4383672152307196, "grad_norm": 3.8983218669891357, "learning_rate": 6.251141141240866e-05, "loss": 1.9445, "step": 2717 }, { "epoch": 0.43852855759922554, "grad_norm": 5.86122989654541, "learning_rate": 6.2486113225088e-05, "loss": 1.9994, "step": 2718 }, { "epoch": 0.4386898999677315, "grad_norm": 4.823337078094482, "learning_rate": 6.246081162828016e-05, "loss": 1.9845, "step": 2719 }, { "epoch": 0.4388512423362375, "grad_norm": 3.7847797870635986, "learning_rate": 6.243550662889408e-05, "loss": 1.5948, "step": 2720 }, { "epoch": 0.43901258470474347, "grad_norm": 3.4540817737579346, "learning_rate": 6.241019823383959e-05, "loss": 2.0022, "step": 2721 }, { "epoch": 0.4391739270732494, "grad_norm": 4.29924201965332, "learning_rate": 6.238488645002744e-05, "loss": 1.8881, "step": 2722 }, { "epoch": 0.4393352694417554, "grad_norm": 4.001747131347656, "learning_rate": 6.235957128436936e-05, "loss": 2.1782, "step": 2723 }, { "epoch": 0.4394966118102614, "grad_norm": 3.5031304359436035, "learning_rate": 6.233425274377794e-05, "loss": 2.0432, "step": 2724 }, { "epoch": 0.43965795417876735, "grad_norm": 4.947254657745361, "learning_rate": 6.230893083516672e-05, "loss": 1.9521, "step": 2725 }, { "epoch": 0.4398192965472733, "grad_norm": 4.556342124938965, "learning_rate": 6.228360556545016e-05, "loss": 1.8621, "step": 2726 }, { "epoch": 0.43998063891577927, "grad_norm": 5.173356533050537, "learning_rate": 6.225827694154364e-05, "loss": 2.0605, "step": 2727 }, { "epoch": 0.4401419812842853, "grad_norm": 5.181009292602539, "learning_rate": 6.22329449703635e-05, "loss": 1.9213, "step": 2728 }, { "epoch": 0.44030332365279123, "grad_norm": 4.707743167877197, "learning_rate": 6.220760965882686e-05, "loss": 2.0548, "step": 2729 }, { "epoch": 0.4404646660212972, "grad_norm": 4.754821300506592, "learning_rate": 6.218227101385189e-05, "loss": 2.2322, "step": 2730 }, { "epoch": 0.44062600838980315, "grad_norm": 3.5412001609802246, "learning_rate": 6.215692904235762e-05, "loss": 1.9829, "step": 2731 }, { "epoch": 0.4407873507583091, "grad_norm": 3.518568992614746, "learning_rate": 6.213158375126398e-05, "loss": 1.7684, "step": 2732 }, { "epoch": 0.4409486931268151, "grad_norm": 4.005379676818848, "learning_rate": 6.21062351474918e-05, "loss": 1.8206, "step": 2733 }, { "epoch": 0.4411100354953211, "grad_norm": 4.170416355133057, "learning_rate": 6.208088323796286e-05, "loss": 1.9699, "step": 2734 }, { "epoch": 0.44127137786382703, "grad_norm": 3.935908317565918, "learning_rate": 6.20555280295998e-05, "loss": 1.7982, "step": 2735 }, { "epoch": 0.441432720232333, "grad_norm": 5.015724182128906, "learning_rate": 6.203016952932614e-05, "loss": 1.8578, "step": 2736 }, { "epoch": 0.441594062600839, "grad_norm": 6.472920894622803, "learning_rate": 6.200480774406637e-05, "loss": 2.1805, "step": 2737 }, { "epoch": 0.44175540496934496, "grad_norm": 4.625115394592285, "learning_rate": 6.197944268074583e-05, "loss": 1.9196, "step": 2738 }, { "epoch": 0.4419167473378509, "grad_norm": 4.94097375869751, "learning_rate": 6.195407434629077e-05, "loss": 1.9293, "step": 2739 }, { "epoch": 0.44207808970635687, "grad_norm": 5.129204273223877, "learning_rate": 6.192870274762831e-05, "loss": 1.8207, "step": 2740 }, { "epoch": 0.4422394320748629, "grad_norm": 4.711706638336182, "learning_rate": 6.190332789168648e-05, "loss": 2.3755, "step": 2741 }, { "epoch": 0.44240077444336884, "grad_norm": 6.469273567199707, "learning_rate": 6.187794978539419e-05, "loss": 2.1367, "step": 2742 }, { "epoch": 0.4425621168118748, "grad_norm": 6.1164870262146, "learning_rate": 6.185256843568127e-05, "loss": 2.1863, "step": 2743 }, { "epoch": 0.44272345918038075, "grad_norm": 4.332474708557129, "learning_rate": 6.18271838494784e-05, "loss": 1.992, "step": 2744 }, { "epoch": 0.44288480154888676, "grad_norm": 5.556947708129883, "learning_rate": 6.180179603371715e-05, "loss": 1.9286, "step": 2745 }, { "epoch": 0.4430461439173927, "grad_norm": 4.1797990798950195, "learning_rate": 6.177640499532996e-05, "loss": 1.7601, "step": 2746 }, { "epoch": 0.4432074862858987, "grad_norm": 3.9618661403656006, "learning_rate": 6.175101074125019e-05, "loss": 1.8364, "step": 2747 }, { "epoch": 0.44336882865440463, "grad_norm": 5.564877033233643, "learning_rate": 6.172561327841206e-05, "loss": 2.0964, "step": 2748 }, { "epoch": 0.4435301710229106, "grad_norm": 5.803670883178711, "learning_rate": 6.170021261375063e-05, "loss": 2.225, "step": 2749 }, { "epoch": 0.4436915133914166, "grad_norm": 3.9706003665924072, "learning_rate": 6.167480875420188e-05, "loss": 1.9598, "step": 2750 }, { "epoch": 0.44385285575992256, "grad_norm": 4.746767044067383, "learning_rate": 6.164940170670266e-05, "loss": 1.9101, "step": 2751 }, { "epoch": 0.4440141981284285, "grad_norm": 3.3168787956237793, "learning_rate": 6.162399147819066e-05, "loss": 2.02, "step": 2752 }, { "epoch": 0.4441755404969345, "grad_norm": 6.952641487121582, "learning_rate": 6.159857807560449e-05, "loss": 2.2001, "step": 2753 }, { "epoch": 0.4443368828654405, "grad_norm": 2.936422348022461, "learning_rate": 6.157316150588355e-05, "loss": 1.9248, "step": 2754 }, { "epoch": 0.44449822523394644, "grad_norm": 5.582231521606445, "learning_rate": 6.154774177596816e-05, "loss": 1.9168, "step": 2755 }, { "epoch": 0.4446595676024524, "grad_norm": 3.6427130699157715, "learning_rate": 6.15223188927995e-05, "loss": 1.774, "step": 2756 }, { "epoch": 0.44482090997095836, "grad_norm": 3.65397047996521, "learning_rate": 6.149689286331958e-05, "loss": 2.0293, "step": 2757 }, { "epoch": 0.44498225233946437, "grad_norm": 4.262415885925293, "learning_rate": 6.147146369447131e-05, "loss": 1.8036, "step": 2758 }, { "epoch": 0.4451435947079703, "grad_norm": 5.0748209953308105, "learning_rate": 6.144603139319845e-05, "loss": 2.052, "step": 2759 }, { "epoch": 0.4453049370764763, "grad_norm": 3.724123954772949, "learning_rate": 6.142059596644558e-05, "loss": 1.9632, "step": 2760 }, { "epoch": 0.44546627944498224, "grad_norm": 4.626696586608887, "learning_rate": 6.139515742115816e-05, "loss": 1.8951, "step": 2761 }, { "epoch": 0.4456276218134882, "grad_norm": 4.935206890106201, "learning_rate": 6.13697157642825e-05, "loss": 1.9481, "step": 2762 }, { "epoch": 0.4457889641819942, "grad_norm": 4.78384256362915, "learning_rate": 6.134427100276579e-05, "loss": 1.7873, "step": 2763 }, { "epoch": 0.44595030655050016, "grad_norm": 5.536062717437744, "learning_rate": 6.131882314355599e-05, "loss": 1.8287, "step": 2764 }, { "epoch": 0.4461116489190061, "grad_norm": 3.8308053016662598, "learning_rate": 6.129337219360196e-05, "loss": 2.2945, "step": 2765 }, { "epoch": 0.4462729912875121, "grad_norm": 3.763317584991455, "learning_rate": 6.126791815985343e-05, "loss": 2.2219, "step": 2766 }, { "epoch": 0.4464343336560181, "grad_norm": 3.8848650455474854, "learning_rate": 6.12424610492609e-05, "loss": 1.9936, "step": 2767 }, { "epoch": 0.44659567602452405, "grad_norm": 4.475578308105469, "learning_rate": 6.121700086877575e-05, "loss": 2.0197, "step": 2768 }, { "epoch": 0.44675701839303, "grad_norm": 5.381459712982178, "learning_rate": 6.119153762535021e-05, "loss": 2.2272, "step": 2769 }, { "epoch": 0.44691836076153596, "grad_norm": 4.24346399307251, "learning_rate": 6.116607132593733e-05, "loss": 2.2393, "step": 2770 }, { "epoch": 0.44707970313004197, "grad_norm": 5.122096538543701, "learning_rate": 6.114060197749101e-05, "loss": 1.9709, "step": 2771 }, { "epoch": 0.44724104549854793, "grad_norm": 5.45127534866333, "learning_rate": 6.111512958696594e-05, "loss": 2.0853, "step": 2772 }, { "epoch": 0.4474023878670539, "grad_norm": 4.182010173797607, "learning_rate": 6.10896541613177e-05, "loss": 1.8845, "step": 2773 }, { "epoch": 0.44756373023555984, "grad_norm": 4.00731897354126, "learning_rate": 6.106417570750265e-05, "loss": 2.0787, "step": 2774 }, { "epoch": 0.44772507260406585, "grad_norm": 5.120847702026367, "learning_rate": 6.1038694232478e-05, "loss": 1.6975, "step": 2775 }, { "epoch": 0.4478864149725718, "grad_norm": 4.001104354858398, "learning_rate": 6.1013209743201784e-05, "loss": 1.9372, "step": 2776 }, { "epoch": 0.44804775734107777, "grad_norm": 3.4694342613220215, "learning_rate": 6.098772224663285e-05, "loss": 2.0223, "step": 2777 }, { "epoch": 0.4482090997095837, "grad_norm": 3.6993019580841064, "learning_rate": 6.09622317497309e-05, "loss": 1.9419, "step": 2778 }, { "epoch": 0.4483704420780897, "grad_norm": 4.861392974853516, "learning_rate": 6.093673825945638e-05, "loss": 2.0335, "step": 2779 }, { "epoch": 0.4485317844465957, "grad_norm": 4.051278114318848, "learning_rate": 6.0911241782770644e-05, "loss": 1.7612, "step": 2780 }, { "epoch": 0.44869312681510165, "grad_norm": 3.8256630897521973, "learning_rate": 6.08857423266358e-05, "loss": 1.8444, "step": 2781 }, { "epoch": 0.4488544691836076, "grad_norm": 4.158143043518066, "learning_rate": 6.086023989801478e-05, "loss": 2.157, "step": 2782 }, { "epoch": 0.44901581155211356, "grad_norm": 4.315159797668457, "learning_rate": 6.0834734503871374e-05, "loss": 1.8622, "step": 2783 }, { "epoch": 0.4491771539206196, "grad_norm": 3.9262237548828125, "learning_rate": 6.0809226151170104e-05, "loss": 1.8943, "step": 2784 }, { "epoch": 0.44933849628912553, "grad_norm": 4.216813564300537, "learning_rate": 6.078371484687635e-05, "loss": 1.8937, "step": 2785 }, { "epoch": 0.4494998386576315, "grad_norm": 4.525496482849121, "learning_rate": 6.0758200597956306e-05, "loss": 2.1272, "step": 2786 }, { "epoch": 0.44966118102613745, "grad_norm": 3.2141411304473877, "learning_rate": 6.0732683411376935e-05, "loss": 1.9563, "step": 2787 }, { "epoch": 0.44982252339464346, "grad_norm": 3.9821486473083496, "learning_rate": 6.070716329410602e-05, "loss": 2.0271, "step": 2788 }, { "epoch": 0.4499838657631494, "grad_norm": 4.4023356437683105, "learning_rate": 6.068164025311215e-05, "loss": 2.0083, "step": 2789 }, { "epoch": 0.45014520813165537, "grad_norm": 6.945428848266602, "learning_rate": 6.065611429536471e-05, "loss": 2.1312, "step": 2790 }, { "epoch": 0.45030655050016133, "grad_norm": 4.356259346008301, "learning_rate": 6.0630585427833876e-05, "loss": 1.9445, "step": 2791 }, { "epoch": 0.4504678928686673, "grad_norm": 3.8623836040496826, "learning_rate": 6.060505365749061e-05, "loss": 1.8719, "step": 2792 }, { "epoch": 0.4506292352371733, "grad_norm": 8.641510009765625, "learning_rate": 6.057951899130668e-05, "loss": 2.0761, "step": 2793 }, { "epoch": 0.45079057760567925, "grad_norm": 3.494572162628174, "learning_rate": 6.055398143625465e-05, "loss": 1.9555, "step": 2794 }, { "epoch": 0.4509519199741852, "grad_norm": 4.154002666473389, "learning_rate": 6.0528440999307846e-05, "loss": 1.9841, "step": 2795 }, { "epoch": 0.45111326234269117, "grad_norm": 3.343074083328247, "learning_rate": 6.050289768744042e-05, "loss": 1.742, "step": 2796 }, { "epoch": 0.4512746047111972, "grad_norm": 4.662927150726318, "learning_rate": 6.0477351507627276e-05, "loss": 2.1056, "step": 2797 }, { "epoch": 0.45143594707970314, "grad_norm": 5.556552886962891, "learning_rate": 6.045180246684412e-05, "loss": 1.9523, "step": 2798 }, { "epoch": 0.4515972894482091, "grad_norm": 4.1540679931640625, "learning_rate": 6.042625057206742e-05, "loss": 1.9353, "step": 2799 }, { "epoch": 0.45175863181671505, "grad_norm": 4.263247013092041, "learning_rate": 6.0400695830274453e-05, "loss": 1.9561, "step": 2800 }, { "epoch": 0.45191997418522106, "grad_norm": 3.8728561401367188, "learning_rate": 6.037513824844326e-05, "loss": 1.9391, "step": 2801 }, { "epoch": 0.452081316553727, "grad_norm": 3.4104127883911133, "learning_rate": 6.034957783355264e-05, "loss": 1.8916, "step": 2802 }, { "epoch": 0.452242658922233, "grad_norm": 4.290063381195068, "learning_rate": 6.032401459258217e-05, "loss": 1.7813, "step": 2803 }, { "epoch": 0.45240400129073893, "grad_norm": 4.381455898284912, "learning_rate": 6.029844853251223e-05, "loss": 2.1467, "step": 2804 }, { "epoch": 0.45256534365924495, "grad_norm": 5.716737747192383, "learning_rate": 6.0272879660323934e-05, "loss": 1.8457, "step": 2805 }, { "epoch": 0.4527266860277509, "grad_norm": 5.184352397918701, "learning_rate": 6.024730798299918e-05, "loss": 2.0207, "step": 2806 }, { "epoch": 0.45288802839625686, "grad_norm": 5.929523944854736, "learning_rate": 6.022173350752064e-05, "loss": 1.9734, "step": 2807 }, { "epoch": 0.4530493707647628, "grad_norm": 3.670257329940796, "learning_rate": 6.0196156240871726e-05, "loss": 1.8307, "step": 2808 }, { "epoch": 0.45321071313326877, "grad_norm": 4.820164680480957, "learning_rate": 6.017057619003663e-05, "loss": 2.0091, "step": 2809 }, { "epoch": 0.4533720555017748, "grad_norm": 5.240920066833496, "learning_rate": 6.01449933620003e-05, "loss": 1.9866, "step": 2810 }, { "epoch": 0.45353339787028074, "grad_norm": 4.092869758605957, "learning_rate": 6.011940776374846e-05, "loss": 2.2141, "step": 2811 }, { "epoch": 0.4536947402387867, "grad_norm": 4.588827610015869, "learning_rate": 6.009381940226755e-05, "loss": 1.9181, "step": 2812 }, { "epoch": 0.45385608260729265, "grad_norm": 4.388471603393555, "learning_rate": 6.006822828454478e-05, "loss": 1.9646, "step": 2813 }, { "epoch": 0.45401742497579867, "grad_norm": 4.334965229034424, "learning_rate": 6.004263441756815e-05, "loss": 1.9277, "step": 2814 }, { "epoch": 0.4541787673443046, "grad_norm": 4.454052448272705, "learning_rate": 6.001703780832636e-05, "loss": 2.1556, "step": 2815 }, { "epoch": 0.4543401097128106, "grad_norm": 3.230821132659912, "learning_rate": 5.99914384638089e-05, "loss": 1.6343, "step": 2816 }, { "epoch": 0.45450145208131654, "grad_norm": 3.8127543926239014, "learning_rate": 5.9965836391005966e-05, "loss": 1.9881, "step": 2817 }, { "epoch": 0.45466279444982255, "grad_norm": 3.7591707706451416, "learning_rate": 5.9940231596908527e-05, "loss": 2.0796, "step": 2818 }, { "epoch": 0.4548241368183285, "grad_norm": 5.247003555297852, "learning_rate": 5.991462408850828e-05, "loss": 1.9372, "step": 2819 }, { "epoch": 0.45498547918683446, "grad_norm": 3.789271116256714, "learning_rate": 5.988901387279768e-05, "loss": 2.0667, "step": 2820 }, { "epoch": 0.4551468215553404, "grad_norm": 4.7549147605896, "learning_rate": 5.986340095676992e-05, "loss": 1.8772, "step": 2821 }, { "epoch": 0.4553081639238464, "grad_norm": 5.16868782043457, "learning_rate": 5.983778534741891e-05, "loss": 1.9445, "step": 2822 }, { "epoch": 0.4554695062923524, "grad_norm": 3.538343906402588, "learning_rate": 5.98121670517393e-05, "loss": 1.9306, "step": 2823 }, { "epoch": 0.45563084866085835, "grad_norm": 3.2506930828094482, "learning_rate": 5.97865460767265e-05, "loss": 2.1249, "step": 2824 }, { "epoch": 0.4557921910293643, "grad_norm": 3.847710609436035, "learning_rate": 5.976092242937663e-05, "loss": 1.8947, "step": 2825 }, { "epoch": 0.45595353339787026, "grad_norm": 4.152834415435791, "learning_rate": 5.9735296116686526e-05, "loss": 1.9749, "step": 2826 }, { "epoch": 0.45611487576637627, "grad_norm": 4.620455741882324, "learning_rate": 5.970966714565379e-05, "loss": 1.866, "step": 2827 }, { "epoch": 0.4562762181348822, "grad_norm": 3.5664255619049072, "learning_rate": 5.9684035523276716e-05, "loss": 1.8183, "step": 2828 }, { "epoch": 0.4564375605033882, "grad_norm": 4.250553607940674, "learning_rate": 5.9658401256554354e-05, "loss": 1.916, "step": 2829 }, { "epoch": 0.45659890287189414, "grad_norm": 5.06902551651001, "learning_rate": 5.963276435248642e-05, "loss": 2.0314, "step": 2830 }, { "epoch": 0.45676024524040015, "grad_norm": 3.892254590988159, "learning_rate": 5.9607124818073426e-05, "loss": 2.0672, "step": 2831 }, { "epoch": 0.4569215876089061, "grad_norm": 4.237087249755859, "learning_rate": 5.958148266031654e-05, "loss": 2.0516, "step": 2832 }, { "epoch": 0.45708292997741207, "grad_norm": 3.9006803035736084, "learning_rate": 5.955583788621766e-05, "loss": 1.8418, "step": 2833 }, { "epoch": 0.457244272345918, "grad_norm": 4.705850601196289, "learning_rate": 5.9530190502779425e-05, "loss": 1.7653, "step": 2834 }, { "epoch": 0.457405614714424, "grad_norm": 4.514369010925293, "learning_rate": 5.950454051700518e-05, "loss": 1.7485, "step": 2835 }, { "epoch": 0.45756695708293, "grad_norm": 3.989203453063965, "learning_rate": 5.947888793589894e-05, "loss": 1.9439, "step": 2836 }, { "epoch": 0.45772829945143595, "grad_norm": 3.7708961963653564, "learning_rate": 5.945323276646548e-05, "loss": 1.8588, "step": 2837 }, { "epoch": 0.4578896418199419, "grad_norm": 3.7233071327209473, "learning_rate": 5.942757501571026e-05, "loss": 1.875, "step": 2838 }, { "epoch": 0.45805098418844786, "grad_norm": 3.7089688777923584, "learning_rate": 5.940191469063943e-05, "loss": 1.7894, "step": 2839 }, { "epoch": 0.4582123265569539, "grad_norm": 4.522425174713135, "learning_rate": 5.937625179825988e-05, "loss": 2.1627, "step": 2840 }, { "epoch": 0.45837366892545983, "grad_norm": 6.098073959350586, "learning_rate": 5.9350586345579165e-05, "loss": 1.8362, "step": 2841 }, { "epoch": 0.4585350112939658, "grad_norm": 5.885412693023682, "learning_rate": 5.932491833960556e-05, "loss": 1.9232, "step": 2842 }, { "epoch": 0.45869635366247175, "grad_norm": 4.87391996383667, "learning_rate": 5.929924778734801e-05, "loss": 1.7168, "step": 2843 }, { "epoch": 0.45885769603097776, "grad_norm": 3.8623905181884766, "learning_rate": 5.9273574695816204e-05, "loss": 1.8761, "step": 2844 }, { "epoch": 0.4590190383994837, "grad_norm": 5.439482688903809, "learning_rate": 5.924789907202048e-05, "loss": 1.9109, "step": 2845 }, { "epoch": 0.45918038076798967, "grad_norm": 6.422399520874023, "learning_rate": 5.922222092297188e-05, "loss": 1.718, "step": 2846 }, { "epoch": 0.4593417231364956, "grad_norm": 4.699904441833496, "learning_rate": 5.919654025568215e-05, "loss": 2.2407, "step": 2847 }, { "epoch": 0.45950306550500164, "grad_norm": 3.374533176422119, "learning_rate": 5.917085707716372e-05, "loss": 2.0182, "step": 2848 }, { "epoch": 0.4596644078735076, "grad_norm": 6.88772439956665, "learning_rate": 5.914517139442968e-05, "loss": 1.9034, "step": 2849 }, { "epoch": 0.45982575024201355, "grad_norm": 4.0415940284729, "learning_rate": 5.9119483214493844e-05, "loss": 1.9439, "step": 2850 }, { "epoch": 0.4599870926105195, "grad_norm": 5.267615795135498, "learning_rate": 5.9093792544370665e-05, "loss": 1.7653, "step": 2851 }, { "epoch": 0.46014843497902547, "grad_norm": 4.279847621917725, "learning_rate": 5.9068099391075296e-05, "loss": 1.7213, "step": 2852 }, { "epoch": 0.4603097773475315, "grad_norm": 4.328471660614014, "learning_rate": 5.904240376162358e-05, "loss": 1.8291, "step": 2853 }, { "epoch": 0.46047111971603744, "grad_norm": 4.959749698638916, "learning_rate": 5.901670566303205e-05, "loss": 2.0452, "step": 2854 }, { "epoch": 0.4606324620845434, "grad_norm": 3.369969606399536, "learning_rate": 5.899100510231783e-05, "loss": 1.9867, "step": 2855 }, { "epoch": 0.46079380445304935, "grad_norm": 4.172391891479492, "learning_rate": 5.8965302086498816e-05, "loss": 1.9064, "step": 2856 }, { "epoch": 0.46095514682155536, "grad_norm": 3.6533727645874023, "learning_rate": 5.893959662259353e-05, "loss": 2.2306, "step": 2857 }, { "epoch": 0.4611164891900613, "grad_norm": 4.612024307250977, "learning_rate": 5.891388871762116e-05, "loss": 1.8467, "step": 2858 }, { "epoch": 0.4612778315585673, "grad_norm": 4.94940185546875, "learning_rate": 5.8888178378601565e-05, "loss": 1.77, "step": 2859 }, { "epoch": 0.46143917392707323, "grad_norm": 5.480003356933594, "learning_rate": 5.8862465612555286e-05, "loss": 2.1171, "step": 2860 }, { "epoch": 0.46160051629557924, "grad_norm": 4.599255084991455, "learning_rate": 5.8836750426503487e-05, "loss": 2.2328, "step": 2861 }, { "epoch": 0.4617618586640852, "grad_norm": 4.497989177703857, "learning_rate": 5.881103282746803e-05, "loss": 1.8345, "step": 2862 }, { "epoch": 0.46192320103259116, "grad_norm": 3.771289348602295, "learning_rate": 5.8785312822471405e-05, "loss": 2.0867, "step": 2863 }, { "epoch": 0.4620845434010971, "grad_norm": 5.573981285095215, "learning_rate": 5.8759590418536806e-05, "loss": 1.6688, "step": 2864 }, { "epoch": 0.46224588576960307, "grad_norm": 4.295119285583496, "learning_rate": 5.873386562268803e-05, "loss": 2.3071, "step": 2865 }, { "epoch": 0.4624072281381091, "grad_norm": 4.947121620178223, "learning_rate": 5.8708138441949556e-05, "loss": 1.8318, "step": 2866 }, { "epoch": 0.46256857050661504, "grad_norm": 3.5124878883361816, "learning_rate": 5.868240888334653e-05, "loss": 1.8992, "step": 2867 }, { "epoch": 0.462729912875121, "grad_norm": 3.6015613079071045, "learning_rate": 5.865667695390468e-05, "loss": 2.0089, "step": 2868 }, { "epoch": 0.46289125524362695, "grad_norm": 5.1361308097839355, "learning_rate": 5.863094266065046e-05, "loss": 2.0508, "step": 2869 }, { "epoch": 0.46305259761213297, "grad_norm": 4.714128017425537, "learning_rate": 5.860520601061093e-05, "loss": 1.7286, "step": 2870 }, { "epoch": 0.4632139399806389, "grad_norm": 4.3609938621521, "learning_rate": 5.857946701081379e-05, "loss": 2.0369, "step": 2871 }, { "epoch": 0.4633752823491449, "grad_norm": 5.165975570678711, "learning_rate": 5.855372566828741e-05, "loss": 1.9922, "step": 2872 }, { "epoch": 0.46353662471765084, "grad_norm": 3.714384078979492, "learning_rate": 5.8527981990060756e-05, "loss": 1.882, "step": 2873 }, { "epoch": 0.46369796708615685, "grad_norm": 4.203963279724121, "learning_rate": 5.850223598316347e-05, "loss": 2.0468, "step": 2874 }, { "epoch": 0.4638593094546628, "grad_norm": 5.084408760070801, "learning_rate": 5.8476487654625814e-05, "loss": 2.1784, "step": 2875 }, { "epoch": 0.46402065182316876, "grad_norm": 5.743606090545654, "learning_rate": 5.8450737011478686e-05, "loss": 2.0263, "step": 2876 }, { "epoch": 0.4641819941916747, "grad_norm": 4.035053730010986, "learning_rate": 5.842498406075363e-05, "loss": 1.9945, "step": 2877 }, { "epoch": 0.46434333656018073, "grad_norm": 5.210145950317383, "learning_rate": 5.8399228809482796e-05, "loss": 2.2279, "step": 2878 }, { "epoch": 0.4645046789286867, "grad_norm": 3.959944009780884, "learning_rate": 5.8373471264698975e-05, "loss": 1.8902, "step": 2879 }, { "epoch": 0.46466602129719264, "grad_norm": 3.5130996704101562, "learning_rate": 5.834771143343558e-05, "loss": 1.924, "step": 2880 }, { "epoch": 0.4648273636656986, "grad_norm": 3.5183193683624268, "learning_rate": 5.832194932272664e-05, "loss": 1.9975, "step": 2881 }, { "epoch": 0.46498870603420456, "grad_norm": 5.39290189743042, "learning_rate": 5.8296184939606834e-05, "loss": 1.8334, "step": 2882 }, { "epoch": 0.46515004840271057, "grad_norm": 3.941741943359375, "learning_rate": 5.827041829111144e-05, "loss": 1.9553, "step": 2883 }, { "epoch": 0.4653113907712165, "grad_norm": 5.16987419128418, "learning_rate": 5.824464938427636e-05, "loss": 1.9728, "step": 2884 }, { "epoch": 0.4654727331397225, "grad_norm": 4.827245235443115, "learning_rate": 5.82188782261381e-05, "loss": 1.8847, "step": 2885 }, { "epoch": 0.46563407550822844, "grad_norm": 4.589350700378418, "learning_rate": 5.81931048237338e-05, "loss": 1.8077, "step": 2886 }, { "epoch": 0.46579541787673445, "grad_norm": 5.262085437774658, "learning_rate": 5.8167329184101216e-05, "loss": 2.1506, "step": 2887 }, { "epoch": 0.4659567602452404, "grad_norm": 4.675025939941406, "learning_rate": 5.81415513142787e-05, "loss": 1.7157, "step": 2888 }, { "epoch": 0.46611810261374637, "grad_norm": 5.154076099395752, "learning_rate": 5.8115771221305204e-05, "loss": 2.0694, "step": 2889 }, { "epoch": 0.4662794449822523, "grad_norm": 3.6449477672576904, "learning_rate": 5.8089988912220306e-05, "loss": 1.7284, "step": 2890 }, { "epoch": 0.46644078735075833, "grad_norm": 4.152365684509277, "learning_rate": 5.806420439406419e-05, "loss": 1.9364, "step": 2891 }, { "epoch": 0.4666021297192643, "grad_norm": 6.4119553565979, "learning_rate": 5.8038417673877644e-05, "loss": 1.834, "step": 2892 }, { "epoch": 0.46676347208777025, "grad_norm": 4.654435157775879, "learning_rate": 5.8012628758702025e-05, "loss": 2.0098, "step": 2893 }, { "epoch": 0.4669248144562762, "grad_norm": 5.395243167877197, "learning_rate": 5.798683765557933e-05, "loss": 2.0146, "step": 2894 }, { "epoch": 0.46708615682478216, "grad_norm": 5.3816609382629395, "learning_rate": 5.796104437155213e-05, "loss": 2.2618, "step": 2895 }, { "epoch": 0.4672474991932882, "grad_norm": 3.4671974182128906, "learning_rate": 5.79352489136636e-05, "loss": 1.8487, "step": 2896 }, { "epoch": 0.46740884156179413, "grad_norm": 4.630336761474609, "learning_rate": 5.790945128895753e-05, "loss": 1.8822, "step": 2897 }, { "epoch": 0.4675701839303001, "grad_norm": 3.7400383949279785, "learning_rate": 5.7883651504478257e-05, "loss": 1.987, "step": 2898 }, { "epoch": 0.46773152629880604, "grad_norm": 5.043737888336182, "learning_rate": 5.7857849567270725e-05, "loss": 2.0825, "step": 2899 }, { "epoch": 0.46789286866731206, "grad_norm": 4.905903339385986, "learning_rate": 5.7832045484380495e-05, "loss": 1.9055, "step": 2900 }, { "epoch": 0.468054211035818, "grad_norm": 4.365562915802002, "learning_rate": 5.7806239262853665e-05, "loss": 2.0351, "step": 2901 }, { "epoch": 0.46821555340432397, "grad_norm": 4.589780807495117, "learning_rate": 5.778043090973696e-05, "loss": 1.7406, "step": 2902 }, { "epoch": 0.4683768957728299, "grad_norm": 3.2420592308044434, "learning_rate": 5.775462043207766e-05, "loss": 2.1973, "step": 2903 }, { "epoch": 0.46853823814133594, "grad_norm": 4.705362796783447, "learning_rate": 5.7728807836923624e-05, "loss": 1.8936, "step": 2904 }, { "epoch": 0.4686995805098419, "grad_norm": 3.618006944656372, "learning_rate": 5.770299313132334e-05, "loss": 2.0016, "step": 2905 }, { "epoch": 0.46886092287834785, "grad_norm": 3.33437442779541, "learning_rate": 5.767717632232579e-05, "loss": 1.8208, "step": 2906 }, { "epoch": 0.4690222652468538, "grad_norm": 3.5410573482513428, "learning_rate": 5.7651357416980575e-05, "loss": 2.1685, "step": 2907 }, { "epoch": 0.4691836076153598, "grad_norm": 4.212526798248291, "learning_rate": 5.76255364223379e-05, "loss": 1.9039, "step": 2908 }, { "epoch": 0.4693449499838658, "grad_norm": 3.523818016052246, "learning_rate": 5.759971334544847e-05, "loss": 1.97, "step": 2909 }, { "epoch": 0.46950629235237173, "grad_norm": 4.60017728805542, "learning_rate": 5.7573888193363603e-05, "loss": 1.8066, "step": 2910 }, { "epoch": 0.4696676347208777, "grad_norm": 4.738743782043457, "learning_rate": 5.754806097313516e-05, "loss": 1.8601, "step": 2911 }, { "epoch": 0.46982897708938365, "grad_norm": 4.0911478996276855, "learning_rate": 5.752223169181563e-05, "loss": 2.3781, "step": 2912 }, { "epoch": 0.46999031945788966, "grad_norm": 4.628448009490967, "learning_rate": 5.749640035645798e-05, "loss": 2.0009, "step": 2913 }, { "epoch": 0.4701516618263956, "grad_norm": 6.537326812744141, "learning_rate": 5.747056697411577e-05, "loss": 1.755, "step": 2914 }, { "epoch": 0.4703130041949016, "grad_norm": 4.529343128204346, "learning_rate": 5.7444731551843145e-05, "loss": 1.8524, "step": 2915 }, { "epoch": 0.47047434656340753, "grad_norm": 6.468570709228516, "learning_rate": 5.7418894096694785e-05, "loss": 1.9736, "step": 2916 }, { "epoch": 0.47063568893191354, "grad_norm": 4.821703910827637, "learning_rate": 5.739305461572591e-05, "loss": 1.9003, "step": 2917 }, { "epoch": 0.4707970313004195, "grad_norm": 4.382984161376953, "learning_rate": 5.736721311599232e-05, "loss": 1.9844, "step": 2918 }, { "epoch": 0.47095837366892546, "grad_norm": 5.662232398986816, "learning_rate": 5.734136960455035e-05, "loss": 2.1265, "step": 2919 }, { "epoch": 0.4711197160374314, "grad_norm": 4.168646812438965, "learning_rate": 5.731552408845689e-05, "loss": 2.1374, "step": 2920 }, { "epoch": 0.4712810584059374, "grad_norm": 4.660909175872803, "learning_rate": 5.728967657476936e-05, "loss": 1.6488, "step": 2921 }, { "epoch": 0.4714424007744434, "grad_norm": 4.780895709991455, "learning_rate": 5.7263827070545775e-05, "loss": 1.76, "step": 2922 }, { "epoch": 0.47160374314294934, "grad_norm": 5.367981910705566, "learning_rate": 5.723797558284464e-05, "loss": 2.1334, "step": 2923 }, { "epoch": 0.4717650855114553, "grad_norm": 4.828367710113525, "learning_rate": 5.721212211872502e-05, "loss": 1.9658, "step": 2924 }, { "epoch": 0.47192642787996125, "grad_norm": 4.586594581604004, "learning_rate": 5.718626668524655e-05, "loss": 1.7567, "step": 2925 }, { "epoch": 0.47208777024846726, "grad_norm": 4.738962650299072, "learning_rate": 5.716040928946935e-05, "loss": 1.915, "step": 2926 }, { "epoch": 0.4722491126169732, "grad_norm": 3.831724166870117, "learning_rate": 5.7134549938454095e-05, "loss": 2.1708, "step": 2927 }, { "epoch": 0.4724104549854792, "grad_norm": 4.1255879402160645, "learning_rate": 5.710868863926202e-05, "loss": 1.927, "step": 2928 }, { "epoch": 0.47257179735398513, "grad_norm": 3.5976734161376953, "learning_rate": 5.708282539895485e-05, "loss": 1.9019, "step": 2929 }, { "epoch": 0.47273313972249115, "grad_norm": 4.1457600593566895, "learning_rate": 5.70569602245949e-05, "loss": 1.9118, "step": 2930 }, { "epoch": 0.4728944820909971, "grad_norm": 3.9562318325042725, "learning_rate": 5.7031093123244925e-05, "loss": 1.9082, "step": 2931 }, { "epoch": 0.47305582445950306, "grad_norm": 3.8960869312286377, "learning_rate": 5.700522410196828e-05, "loss": 1.9282, "step": 2932 }, { "epoch": 0.473217166828009, "grad_norm": 4.371620178222656, "learning_rate": 5.697935316782883e-05, "loss": 1.9379, "step": 2933 }, { "epoch": 0.47337850919651503, "grad_norm": 3.91361665725708, "learning_rate": 5.695348032789093e-05, "loss": 1.9144, "step": 2934 }, { "epoch": 0.473539851565021, "grad_norm": 4.016637325286865, "learning_rate": 5.692760558921949e-05, "loss": 1.9151, "step": 2935 }, { "epoch": 0.47370119393352694, "grad_norm": 5.121694087982178, "learning_rate": 5.690172895887993e-05, "loss": 2.1035, "step": 2936 }, { "epoch": 0.4738625363020329, "grad_norm": 5.147009372711182, "learning_rate": 5.687585044393819e-05, "loss": 1.8547, "step": 2937 }, { "epoch": 0.47402387867053886, "grad_norm": 5.944944381713867, "learning_rate": 5.684997005146071e-05, "loss": 2.0241, "step": 2938 }, { "epoch": 0.47418522103904487, "grad_norm": 4.913337707519531, "learning_rate": 5.6824087788514424e-05, "loss": 1.8984, "step": 2939 }, { "epoch": 0.4743465634075508, "grad_norm": 4.556769847869873, "learning_rate": 5.679820366216684e-05, "loss": 2.1496, "step": 2940 }, { "epoch": 0.4745079057760568, "grad_norm": 4.924295902252197, "learning_rate": 5.677231767948592e-05, "loss": 1.9009, "step": 2941 }, { "epoch": 0.47466924814456274, "grad_norm": 4.699732303619385, "learning_rate": 5.674642984754016e-05, "loss": 1.8048, "step": 2942 }, { "epoch": 0.47483059051306875, "grad_norm": 5.167842864990234, "learning_rate": 5.672054017339855e-05, "loss": 1.8607, "step": 2943 }, { "epoch": 0.4749919328815747, "grad_norm": 4.477108478546143, "learning_rate": 5.669464866413058e-05, "loss": 1.9929, "step": 2944 }, { "epoch": 0.47515327525008066, "grad_norm": 4.411334037780762, "learning_rate": 5.666875532680624e-05, "loss": 2.0333, "step": 2945 }, { "epoch": 0.4753146176185866, "grad_norm": 4.497818946838379, "learning_rate": 5.664286016849604e-05, "loss": 1.9406, "step": 2946 }, { "epoch": 0.47547595998709263, "grad_norm": 5.784490585327148, "learning_rate": 5.661696319627097e-05, "loss": 1.9808, "step": 2947 }, { "epoch": 0.4756373023555986, "grad_norm": 3.4934520721435547, "learning_rate": 5.65910644172025e-05, "loss": 1.8308, "step": 2948 }, { "epoch": 0.47579864472410455, "grad_norm": 5.852362155914307, "learning_rate": 5.656516383836262e-05, "loss": 2.1092, "step": 2949 }, { "epoch": 0.4759599870926105, "grad_norm": 5.923304080963135, "learning_rate": 5.6539261466823814e-05, "loss": 1.7185, "step": 2950 }, { "epoch": 0.4761213294611165, "grad_norm": 4.36501407623291, "learning_rate": 5.651335730965902e-05, "loss": 1.9015, "step": 2951 }, { "epoch": 0.4762826718296225, "grad_norm": 5.649499893188477, "learning_rate": 5.648745137394171e-05, "loss": 1.9853, "step": 2952 }, { "epoch": 0.47644401419812843, "grad_norm": 3.385308265686035, "learning_rate": 5.646154366674582e-05, "loss": 1.9835, "step": 2953 }, { "epoch": 0.4766053565666344, "grad_norm": 4.42126989364624, "learning_rate": 5.643563419514576e-05, "loss": 1.9702, "step": 2954 }, { "epoch": 0.47676669893514034, "grad_norm": 4.301641464233398, "learning_rate": 5.6409722966216436e-05, "loss": 1.811, "step": 2955 }, { "epoch": 0.47692804130364636, "grad_norm": 4.124393463134766, "learning_rate": 5.638380998703322e-05, "loss": 2.02, "step": 2956 }, { "epoch": 0.4770893836721523, "grad_norm": 4.3341827392578125, "learning_rate": 5.6357895264671976e-05, "loss": 1.9654, "step": 2957 }, { "epoch": 0.47725072604065827, "grad_norm": 5.749555587768555, "learning_rate": 5.633197880620904e-05, "loss": 2.0513, "step": 2958 }, { "epoch": 0.4774120684091642, "grad_norm": 4.620724201202393, "learning_rate": 5.63060606187212e-05, "loss": 2.005, "step": 2959 }, { "epoch": 0.47757341077767024, "grad_norm": 5.979537010192871, "learning_rate": 5.6280140709285765e-05, "loss": 2.0375, "step": 2960 }, { "epoch": 0.4777347531461762, "grad_norm": 4.459228515625, "learning_rate": 5.625421908498048e-05, "loss": 1.948, "step": 2961 }, { "epoch": 0.47789609551468215, "grad_norm": 4.105099201202393, "learning_rate": 5.622829575288355e-05, "loss": 1.8121, "step": 2962 }, { "epoch": 0.4780574378831881, "grad_norm": 6.522353649139404, "learning_rate": 5.620237072007367e-05, "loss": 2.0249, "step": 2963 }, { "epoch": 0.4782187802516941, "grad_norm": 4.792418479919434, "learning_rate": 5.617644399363e-05, "loss": 2.0696, "step": 2964 }, { "epoch": 0.4783801226202001, "grad_norm": 4.468806743621826, "learning_rate": 5.6150515580632146e-05, "loss": 2.0892, "step": 2965 }, { "epoch": 0.47854146498870603, "grad_norm": 3.8313448429107666, "learning_rate": 5.6124585488160165e-05, "loss": 2.0339, "step": 2966 }, { "epoch": 0.478702807357212, "grad_norm": 3.5952653884887695, "learning_rate": 5.6098653723294604e-05, "loss": 1.9474, "step": 2967 }, { "epoch": 0.47886414972571795, "grad_norm": 4.359584808349609, "learning_rate": 5.6072720293116453e-05, "loss": 1.9697, "step": 2968 }, { "epoch": 0.47902549209422396, "grad_norm": 3.3227756023406982, "learning_rate": 5.604678520470714e-05, "loss": 1.8438, "step": 2969 }, { "epoch": 0.4791868344627299, "grad_norm": 4.508224010467529, "learning_rate": 5.6020848465148565e-05, "loss": 1.7845, "step": 2970 }, { "epoch": 0.4793481768312359, "grad_norm": 3.500623941421509, "learning_rate": 5.599491008152309e-05, "loss": 1.8891, "step": 2971 }, { "epoch": 0.47950951919974183, "grad_norm": 5.407317638397217, "learning_rate": 5.59689700609135e-05, "loss": 1.9803, "step": 2972 }, { "epoch": 0.47967086156824784, "grad_norm": 3.8324880599975586, "learning_rate": 5.5943028410403034e-05, "loss": 2.0477, "step": 2973 }, { "epoch": 0.4798322039367538, "grad_norm": 3.5237014293670654, "learning_rate": 5.5917085137075375e-05, "loss": 1.7231, "step": 2974 }, { "epoch": 0.47999354630525976, "grad_norm": 4.768789291381836, "learning_rate": 5.589114024801468e-05, "loss": 2.037, "step": 2975 }, { "epoch": 0.4801548886737657, "grad_norm": 5.117864608764648, "learning_rate": 5.586519375030549e-05, "loss": 2.1124, "step": 2976 }, { "epoch": 0.4803162310422717, "grad_norm": 3.719688653945923, "learning_rate": 5.583924565103283e-05, "loss": 1.9257, "step": 2977 }, { "epoch": 0.4804775734107777, "grad_norm": 3.956259250640869, "learning_rate": 5.5813295957282155e-05, "loss": 1.8487, "step": 2978 }, { "epoch": 0.48063891577928364, "grad_norm": 4.621438503265381, "learning_rate": 5.578734467613933e-05, "loss": 2.2352, "step": 2979 }, { "epoch": 0.4808002581477896, "grad_norm": 6.483243465423584, "learning_rate": 5.576139181469069e-05, "loss": 2.1027, "step": 2980 }, { "epoch": 0.4809616005162956, "grad_norm": 5.146411418914795, "learning_rate": 5.573543738002298e-05, "loss": 2.0156, "step": 2981 }, { "epoch": 0.48112294288480156, "grad_norm": 4.754197597503662, "learning_rate": 5.570948137922336e-05, "loss": 2.1035, "step": 2982 }, { "epoch": 0.4812842852533075, "grad_norm": 3.280088186264038, "learning_rate": 5.568352381937947e-05, "loss": 1.8788, "step": 2983 }, { "epoch": 0.4814456276218135, "grad_norm": 6.88657283782959, "learning_rate": 5.5657564707579315e-05, "loss": 1.8554, "step": 2984 }, { "epoch": 0.48160696999031943, "grad_norm": 5.31023645401001, "learning_rate": 5.563160405091136e-05, "loss": 1.7064, "step": 2985 }, { "epoch": 0.48176831235882545, "grad_norm": 4.084466934204102, "learning_rate": 5.5605641856464483e-05, "loss": 1.9841, "step": 2986 }, { "epoch": 0.4819296547273314, "grad_norm": 3.7623586654663086, "learning_rate": 5.557967813132797e-05, "loss": 1.9674, "step": 2987 }, { "epoch": 0.48209099709583736, "grad_norm": 5.303673267364502, "learning_rate": 5.555371288259155e-05, "loss": 2.184, "step": 2988 }, { "epoch": 0.4822523394643433, "grad_norm": 3.9341800212860107, "learning_rate": 5.552774611734535e-05, "loss": 2.0964, "step": 2989 }, { "epoch": 0.48241368183284933, "grad_norm": 3.611407518386841, "learning_rate": 5.550177784267991e-05, "loss": 1.8166, "step": 2990 }, { "epoch": 0.4825750242013553, "grad_norm": 4.035516262054443, "learning_rate": 5.547580806568621e-05, "loss": 1.7636, "step": 2991 }, { "epoch": 0.48273636656986124, "grad_norm": 4.942079544067383, "learning_rate": 5.544983679345559e-05, "loss": 2.0076, "step": 2992 }, { "epoch": 0.4828977089383672, "grad_norm": 3.7774479389190674, "learning_rate": 5.542386403307984e-05, "loss": 2.0257, "step": 2993 }, { "epoch": 0.4830590513068732, "grad_norm": 5.655243873596191, "learning_rate": 5.5397889791651145e-05, "loss": 1.774, "step": 2994 }, { "epoch": 0.48322039367537917, "grad_norm": 6.693843841552734, "learning_rate": 5.5371914076262085e-05, "loss": 2.0392, "step": 2995 }, { "epoch": 0.4833817360438851, "grad_norm": 6.32671594619751, "learning_rate": 5.534593689400565e-05, "loss": 1.8613, "step": 2996 }, { "epoch": 0.4835430784123911, "grad_norm": 5.015989780426025, "learning_rate": 5.531995825197522e-05, "loss": 1.8816, "step": 2997 }, { "epoch": 0.48370442078089704, "grad_norm": 4.0533952713012695, "learning_rate": 5.5293978157264605e-05, "loss": 1.7802, "step": 2998 }, { "epoch": 0.48386576314940305, "grad_norm": 3.7793238162994385, "learning_rate": 5.5267996616967966e-05, "loss": 2.1053, "step": 2999 }, { "epoch": 0.484027105517909, "grad_norm": 7.2383222579956055, "learning_rate": 5.524201363817991e-05, "loss": 2.0855, "step": 3000 }, { "epoch": 0.48418844788641496, "grad_norm": 6.114560127258301, "learning_rate": 5.521602922799539e-05, "loss": 1.9759, "step": 3001 }, { "epoch": 0.4843497902549209, "grad_norm": 6.07392692565918, "learning_rate": 5.519004339350977e-05, "loss": 2.1192, "step": 3002 }, { "epoch": 0.48451113262342693, "grad_norm": 4.475742340087891, "learning_rate": 5.516405614181883e-05, "loss": 1.8879, "step": 3003 }, { "epoch": 0.4846724749919329, "grad_norm": 3.745962619781494, "learning_rate": 5.513806748001866e-05, "loss": 2.0256, "step": 3004 }, { "epoch": 0.48483381736043885, "grad_norm": 4.5964508056640625, "learning_rate": 5.5112077415205834e-05, "loss": 1.9467, "step": 3005 }, { "epoch": 0.4849951597289448, "grad_norm": 6.5591936111450195, "learning_rate": 5.508608595447724e-05, "loss": 2.0158, "step": 3006 }, { "epoch": 0.4851565020974508, "grad_norm": 4.011103630065918, "learning_rate": 5.506009310493014e-05, "loss": 1.945, "step": 3007 }, { "epoch": 0.48531784446595677, "grad_norm": 4.502754211425781, "learning_rate": 5.5034098873662244e-05, "loss": 1.9072, "step": 3008 }, { "epoch": 0.48547918683446273, "grad_norm": 4.582625865936279, "learning_rate": 5.5008103267771585e-05, "loss": 1.8755, "step": 3009 }, { "epoch": 0.4856405292029687, "grad_norm": 6.4384989738464355, "learning_rate": 5.498210629435656e-05, "loss": 2.1877, "step": 3010 }, { "epoch": 0.48580187157147464, "grad_norm": 4.0016255378723145, "learning_rate": 5.495610796051599e-05, "loss": 1.9966, "step": 3011 }, { "epoch": 0.48596321393998065, "grad_norm": 5.0590434074401855, "learning_rate": 5.493010827334904e-05, "loss": 2.0301, "step": 3012 }, { "epoch": 0.4861245563084866, "grad_norm": 5.322793006896973, "learning_rate": 5.490410723995524e-05, "loss": 1.7476, "step": 3013 }, { "epoch": 0.48628589867699257, "grad_norm": 3.817650556564331, "learning_rate": 5.487810486743448e-05, "loss": 1.8565, "step": 3014 }, { "epoch": 0.4864472410454985, "grad_norm": 4.500606536865234, "learning_rate": 5.485210116288704e-05, "loss": 2.0309, "step": 3015 }, { "epoch": 0.48660858341400454, "grad_norm": 5.041215896606445, "learning_rate": 5.482609613341355e-05, "loss": 2.004, "step": 3016 }, { "epoch": 0.4867699257825105, "grad_norm": 4.371964454650879, "learning_rate": 5.4800089786115e-05, "loss": 1.6451, "step": 3017 }, { "epoch": 0.48693126815101645, "grad_norm": 3.7129971981048584, "learning_rate": 5.477408212809277e-05, "loss": 1.8587, "step": 3018 }, { "epoch": 0.4870926105195224, "grad_norm": 3.990514039993286, "learning_rate": 5.4748073166448545e-05, "loss": 1.9835, "step": 3019 }, { "epoch": 0.4872539528880284, "grad_norm": 3.36611270904541, "learning_rate": 5.472206290828438e-05, "loss": 1.8937, "step": 3020 }, { "epoch": 0.4874152952565344, "grad_norm": 3.641436815261841, "learning_rate": 5.4696051360702725e-05, "loss": 2.0468, "step": 3021 }, { "epoch": 0.48757663762504033, "grad_norm": 3.7677080631256104, "learning_rate": 5.467003853080634e-05, "loss": 1.9942, "step": 3022 }, { "epoch": 0.4877379799935463, "grad_norm": 3.343273639678955, "learning_rate": 5.464402442569837e-05, "loss": 1.9385, "step": 3023 }, { "epoch": 0.4878993223620523, "grad_norm": 5.741142749786377, "learning_rate": 5.461800905248225e-05, "loss": 1.9521, "step": 3024 }, { "epoch": 0.48806066473055826, "grad_norm": 4.08474588394165, "learning_rate": 5.459199241826183e-05, "loss": 2.1862, "step": 3025 }, { "epoch": 0.4882220070990642, "grad_norm": 3.6710474491119385, "learning_rate": 5.456597453014125e-05, "loss": 2.0495, "step": 3026 }, { "epoch": 0.48838334946757017, "grad_norm": 4.435654640197754, "learning_rate": 5.453995539522503e-05, "loss": 1.8065, "step": 3027 }, { "epoch": 0.48854469183607613, "grad_norm": 4.730984210968018, "learning_rate": 5.451393502061801e-05, "loss": 1.7733, "step": 3028 }, { "epoch": 0.48870603420458214, "grad_norm": 3.4625661373138428, "learning_rate": 5.448791341342538e-05, "loss": 1.9444, "step": 3029 }, { "epoch": 0.4888673765730881, "grad_norm": 4.53637170791626, "learning_rate": 5.446189058075265e-05, "loss": 1.9885, "step": 3030 }, { "epoch": 0.48902871894159405, "grad_norm": 5.451572895050049, "learning_rate": 5.4435866529705706e-05, "loss": 1.8764, "step": 3031 }, { "epoch": 0.4891900613101, "grad_norm": 6.030241012573242, "learning_rate": 5.4409841267390684e-05, "loss": 2.2974, "step": 3032 }, { "epoch": 0.489351403678606, "grad_norm": 3.6585793495178223, "learning_rate": 5.4383814800914135e-05, "loss": 2.0521, "step": 3033 }, { "epoch": 0.489512746047112, "grad_norm": 4.740541934967041, "learning_rate": 5.435778713738292e-05, "loss": 1.9246, "step": 3034 }, { "epoch": 0.48967408841561794, "grad_norm": 4.058865547180176, "learning_rate": 5.433175828390418e-05, "loss": 2.0038, "step": 3035 }, { "epoch": 0.4898354307841239, "grad_norm": 4.699848651885986, "learning_rate": 5.430572824758543e-05, "loss": 1.8504, "step": 3036 }, { "epoch": 0.4899967731526299, "grad_norm": 3.6751012802124023, "learning_rate": 5.4279697035534496e-05, "loss": 2.1574, "step": 3037 }, { "epoch": 0.49015811552113586, "grad_norm": 4.686280250549316, "learning_rate": 5.4253664654859515e-05, "loss": 1.8434, "step": 3038 }, { "epoch": 0.4903194578896418, "grad_norm": 7.2674880027771, "learning_rate": 5.4227631112668955e-05, "loss": 1.946, "step": 3039 }, { "epoch": 0.4904808002581478, "grad_norm": 4.655464172363281, "learning_rate": 5.4201596416071585e-05, "loss": 1.9058, "step": 3040 }, { "epoch": 0.49064214262665373, "grad_norm": 5.856889724731445, "learning_rate": 5.417556057217652e-05, "loss": 1.8253, "step": 3041 }, { "epoch": 0.49080348499515974, "grad_norm": 5.061101913452148, "learning_rate": 5.4149523588093156e-05, "loss": 1.9645, "step": 3042 }, { "epoch": 0.4909648273636657, "grad_norm": 5.176707744598389, "learning_rate": 5.41234854709312e-05, "loss": 1.7489, "step": 3043 }, { "epoch": 0.49112616973217166, "grad_norm": 4.296136856079102, "learning_rate": 5.4097446227800716e-05, "loss": 1.9497, "step": 3044 }, { "epoch": 0.4912875121006776, "grad_norm": 4.513986587524414, "learning_rate": 5.4071405865812e-05, "loss": 1.6907, "step": 3045 }, { "epoch": 0.4914488544691836, "grad_norm": 3.427591323852539, "learning_rate": 5.404536439207571e-05, "loss": 1.9645, "step": 3046 }, { "epoch": 0.4916101968376896, "grad_norm": 4.151731967926025, "learning_rate": 5.401932181370281e-05, "loss": 1.9425, "step": 3047 }, { "epoch": 0.49177153920619554, "grad_norm": 5.4362711906433105, "learning_rate": 5.3993278137804505e-05, "loss": 1.9941, "step": 3048 }, { "epoch": 0.4919328815747015, "grad_norm": 5.981296539306641, "learning_rate": 5.3967233371492385e-05, "loss": 1.9203, "step": 3049 }, { "epoch": 0.4920942239432075, "grad_norm": 5.723951816558838, "learning_rate": 5.3941187521878265e-05, "loss": 1.9091, "step": 3050 }, { "epoch": 0.49225556631171347, "grad_norm": 7.49515962600708, "learning_rate": 5.391514059607431e-05, "loss": 2.1372, "step": 3051 }, { "epoch": 0.4924169086802194, "grad_norm": 4.525052547454834, "learning_rate": 5.388909260119295e-05, "loss": 1.9937, "step": 3052 }, { "epoch": 0.4925782510487254, "grad_norm": 5.0868611335754395, "learning_rate": 5.386304354434688e-05, "loss": 1.8576, "step": 3053 }, { "epoch": 0.4927395934172314, "grad_norm": 4.022094249725342, "learning_rate": 5.383699343264915e-05, "loss": 1.7156, "step": 3054 }, { "epoch": 0.49290093578573735, "grad_norm": 4.555119037628174, "learning_rate": 5.381094227321305e-05, "loss": 2.0519, "step": 3055 }, { "epoch": 0.4930622781542433, "grad_norm": 6.8928022384643555, "learning_rate": 5.3784890073152184e-05, "loss": 1.9971, "step": 3056 }, { "epoch": 0.49322362052274926, "grad_norm": 3.7264394760131836, "learning_rate": 5.375883683958041e-05, "loss": 1.9227, "step": 3057 }, { "epoch": 0.4933849628912552, "grad_norm": 6.371485710144043, "learning_rate": 5.3732782579611885e-05, "loss": 2.0955, "step": 3058 }, { "epoch": 0.49354630525976123, "grad_norm": 5.1360063552856445, "learning_rate": 5.370672730036105e-05, "loss": 2.2286, "step": 3059 }, { "epoch": 0.4937076476282672, "grad_norm": 4.728470802307129, "learning_rate": 5.368067100894263e-05, "loss": 2.0495, "step": 3060 }, { "epoch": 0.49386898999677314, "grad_norm": 5.989841461181641, "learning_rate": 5.36546137124716e-05, "loss": 2.1039, "step": 3061 }, { "epoch": 0.4940303323652791, "grad_norm": 4.98484468460083, "learning_rate": 5.362855541806324e-05, "loss": 1.7701, "step": 3062 }, { "epoch": 0.4941916747337851, "grad_norm": 4.288937568664551, "learning_rate": 5.360249613283308e-05, "loss": 2.1159, "step": 3063 }, { "epoch": 0.49435301710229107, "grad_norm": 4.535952091217041, "learning_rate": 5.357643586389693e-05, "loss": 2.1569, "step": 3064 }, { "epoch": 0.494514359470797, "grad_norm": 4.4559102058410645, "learning_rate": 5.355037461837088e-05, "loss": 2.1523, "step": 3065 }, { "epoch": 0.494675701839303, "grad_norm": 5.075308799743652, "learning_rate": 5.3524312403371257e-05, "loss": 1.8285, "step": 3066 }, { "epoch": 0.494837044207809, "grad_norm": 4.463019847869873, "learning_rate": 5.349824922601467e-05, "loss": 2.0525, "step": 3067 }, { "epoch": 0.49499838657631495, "grad_norm": 6.228049278259277, "learning_rate": 5.3472185093418e-05, "loss": 2.192, "step": 3068 }, { "epoch": 0.4951597289448209, "grad_norm": 5.267539024353027, "learning_rate": 5.34461200126984e-05, "loss": 1.9324, "step": 3069 }, { "epoch": 0.49532107131332687, "grad_norm": 4.502386093139648, "learning_rate": 5.342005399097323e-05, "loss": 2.1786, "step": 3070 }, { "epoch": 0.4954824136818328, "grad_norm": 3.724005937576294, "learning_rate": 5.339398703536014e-05, "loss": 1.9359, "step": 3071 }, { "epoch": 0.49564375605033884, "grad_norm": 3.993429183959961, "learning_rate": 5.336791915297705e-05, "loss": 1.9509, "step": 3072 }, { "epoch": 0.4958050984188448, "grad_norm": 3.7583045959472656, "learning_rate": 5.33418503509421e-05, "loss": 1.9695, "step": 3073 }, { "epoch": 0.49596644078735075, "grad_norm": 5.138786792755127, "learning_rate": 5.331578063637371e-05, "loss": 1.8942, "step": 3074 }, { "epoch": 0.4961277831558567, "grad_norm": 3.136106014251709, "learning_rate": 5.3289710016390535e-05, "loss": 2.0119, "step": 3075 }, { "epoch": 0.4962891255243627, "grad_norm": 3.523416757583618, "learning_rate": 5.326363849811148e-05, "loss": 1.6708, "step": 3076 }, { "epoch": 0.4964504678928687, "grad_norm": 6.248582363128662, "learning_rate": 5.3237566088655686e-05, "loss": 1.9164, "step": 3077 }, { "epoch": 0.49661181026137463, "grad_norm": 5.099051475524902, "learning_rate": 5.321149279514256e-05, "loss": 2.0021, "step": 3078 }, { "epoch": 0.4967731526298806, "grad_norm": 4.510593414306641, "learning_rate": 5.318541862469172e-05, "loss": 1.9441, "step": 3079 }, { "epoch": 0.4969344949983866, "grad_norm": 3.972517251968384, "learning_rate": 5.315934358442306e-05, "loss": 1.8179, "step": 3080 }, { "epoch": 0.49709583736689256, "grad_norm": 4.122412204742432, "learning_rate": 5.313326768145668e-05, "loss": 2.0284, "step": 3081 }, { "epoch": 0.4972571797353985, "grad_norm": 4.420566558837891, "learning_rate": 5.310719092291292e-05, "loss": 1.9152, "step": 3082 }, { "epoch": 0.49741852210390447, "grad_norm": 4.935243129730225, "learning_rate": 5.308111331591237e-05, "loss": 1.9614, "step": 3083 }, { "epoch": 0.4975798644724105, "grad_norm": 3.7829785346984863, "learning_rate": 5.3055034867575826e-05, "loss": 2.0301, "step": 3084 }, { "epoch": 0.49774120684091644, "grad_norm": 5.601998805999756, "learning_rate": 5.302895558502435e-05, "loss": 1.9055, "step": 3085 }, { "epoch": 0.4979025492094224, "grad_norm": 3.415437936782837, "learning_rate": 5.300287547537921e-05, "loss": 1.8584, "step": 3086 }, { "epoch": 0.49806389157792835, "grad_norm": 3.8595170974731445, "learning_rate": 5.297679454576189e-05, "loss": 1.8926, "step": 3087 }, { "epoch": 0.4982252339464343, "grad_norm": 5.454835891723633, "learning_rate": 5.295071280329411e-05, "loss": 2.0281, "step": 3088 }, { "epoch": 0.4983865763149403, "grad_norm": 4.356043815612793, "learning_rate": 5.292463025509783e-05, "loss": 1.8235, "step": 3089 }, { "epoch": 0.4985479186834463, "grad_norm": 4.273855686187744, "learning_rate": 5.2898546908295196e-05, "loss": 1.9884, "step": 3090 }, { "epoch": 0.49870926105195224, "grad_norm": 4.208919048309326, "learning_rate": 5.287246277000859e-05, "loss": 1.9723, "step": 3091 }, { "epoch": 0.4988706034204582, "grad_norm": 4.538530349731445, "learning_rate": 5.284637784736059e-05, "loss": 1.8405, "step": 3092 }, { "epoch": 0.4990319457889642, "grad_norm": 4.146410942077637, "learning_rate": 5.282029214747404e-05, "loss": 1.9356, "step": 3093 }, { "epoch": 0.49919328815747016, "grad_norm": 3.9748480319976807, "learning_rate": 5.279420567747195e-05, "loss": 1.8358, "step": 3094 }, { "epoch": 0.4993546305259761, "grad_norm": 3.9157555103302, "learning_rate": 5.276811844447754e-05, "loss": 1.9643, "step": 3095 }, { "epoch": 0.4995159728944821, "grad_norm": 5.487942218780518, "learning_rate": 5.274203045561426e-05, "loss": 2.1133, "step": 3096 }, { "epoch": 0.4996773152629881, "grad_norm": 4.531738758087158, "learning_rate": 5.2715941718005747e-05, "loss": 1.7476, "step": 3097 }, { "epoch": 0.49983865763149404, "grad_norm": 5.055044174194336, "learning_rate": 5.268985223877586e-05, "loss": 1.9826, "step": 3098 }, { "epoch": 0.5, "grad_norm": 3.6202011108398438, "learning_rate": 5.266376202504866e-05, "loss": 1.9412, "step": 3099 }, { "epoch": 0.500161342368506, "grad_norm": 3.817711591720581, "learning_rate": 5.263767108394839e-05, "loss": 1.8635, "step": 3100 }, { "epoch": 0.5003226847370119, "grad_norm": 5.228734493255615, "learning_rate": 5.261157942259951e-05, "loss": 2.0059, "step": 3101 }, { "epoch": 0.5004840271055179, "grad_norm": 3.718454360961914, "learning_rate": 5.258548704812667e-05, "loss": 1.7086, "step": 3102 }, { "epoch": 0.5006453694740238, "grad_norm": 3.11460018157959, "learning_rate": 5.255939396765471e-05, "loss": 1.7851, "step": 3103 }, { "epoch": 0.5008067118425299, "grad_norm": 4.518640041351318, "learning_rate": 5.253330018830868e-05, "loss": 1.8237, "step": 3104 }, { "epoch": 0.5009680542110359, "grad_norm": 7.6932148933410645, "learning_rate": 5.250720571721378e-05, "loss": 2.1752, "step": 3105 }, { "epoch": 0.5011293965795418, "grad_norm": 5.212931156158447, "learning_rate": 5.248111056149545e-05, "loss": 1.7233, "step": 3106 }, { "epoch": 0.5012907389480478, "grad_norm": 5.633206844329834, "learning_rate": 5.2455014728279304e-05, "loss": 1.8938, "step": 3107 }, { "epoch": 0.5014520813165537, "grad_norm": 3.8108067512512207, "learning_rate": 5.2428918224691107e-05, "loss": 1.8824, "step": 3108 }, { "epoch": 0.5016134236850597, "grad_norm": 4.1044087409973145, "learning_rate": 5.240282105785683e-05, "loss": 1.7429, "step": 3109 }, { "epoch": 0.5017747660535656, "grad_norm": 6.116450309753418, "learning_rate": 5.237672323490266e-05, "loss": 1.8687, "step": 3110 }, { "epoch": 0.5019361084220716, "grad_norm": 5.414731502532959, "learning_rate": 5.2350624762954884e-05, "loss": 1.8503, "step": 3111 }, { "epoch": 0.5020974507905777, "grad_norm": 4.763272762298584, "learning_rate": 5.232452564914004e-05, "loss": 2.0168, "step": 3112 }, { "epoch": 0.5022587931590836, "grad_norm": 4.471733093261719, "learning_rate": 5.2298425900584805e-05, "loss": 1.8949, "step": 3113 }, { "epoch": 0.5024201355275896, "grad_norm": 3.62532901763916, "learning_rate": 5.2272325524416034e-05, "loss": 1.7583, "step": 3114 }, { "epoch": 0.5025814778960955, "grad_norm": 4.908361911773682, "learning_rate": 5.2246224527760765e-05, "loss": 2.134, "step": 3115 }, { "epoch": 0.5027428202646015, "grad_norm": 5.122865676879883, "learning_rate": 5.22201229177462e-05, "loss": 2.0385, "step": 3116 }, { "epoch": 0.5029041626331074, "grad_norm": 7.8537774085998535, "learning_rate": 5.219402070149968e-05, "loss": 1.8947, "step": 3117 }, { "epoch": 0.5030655050016134, "grad_norm": 6.622297763824463, "learning_rate": 5.2167917886148765e-05, "loss": 1.8418, "step": 3118 }, { "epoch": 0.5032268473701194, "grad_norm": 5.4166765213012695, "learning_rate": 5.2141814478821146e-05, "loss": 2.0358, "step": 3119 }, { "epoch": 0.5033881897386253, "grad_norm": 5.115591526031494, "learning_rate": 5.211571048664469e-05, "loss": 1.8973, "step": 3120 }, { "epoch": 0.5035495321071314, "grad_norm": 4.430712699890137, "learning_rate": 5.2089605916747374e-05, "loss": 1.9662, "step": 3121 }, { "epoch": 0.5037108744756373, "grad_norm": 5.537373065948486, "learning_rate": 5.20635007762574e-05, "loss": 2.0732, "step": 3122 }, { "epoch": 0.5038722168441433, "grad_norm": 3.0885260105133057, "learning_rate": 5.203739507230311e-05, "loss": 1.8567, "step": 3123 }, { "epoch": 0.5040335592126493, "grad_norm": 4.156231880187988, "learning_rate": 5.201128881201296e-05, "loss": 1.6745, "step": 3124 }, { "epoch": 0.5041949015811552, "grad_norm": 4.524260520935059, "learning_rate": 5.1985182002515595e-05, "loss": 2.0716, "step": 3125 }, { "epoch": 0.5043562439496612, "grad_norm": 4.900241851806641, "learning_rate": 5.195907465093982e-05, "loss": 1.7979, "step": 3126 }, { "epoch": 0.5045175863181671, "grad_norm": 3.2450215816497803, "learning_rate": 5.1932966764414545e-05, "loss": 1.8963, "step": 3127 }, { "epoch": 0.5046789286866731, "grad_norm": 6.959305286407471, "learning_rate": 5.190685835006888e-05, "loss": 2.1063, "step": 3128 }, { "epoch": 0.504840271055179, "grad_norm": 5.322142124176025, "learning_rate": 5.188074941503203e-05, "loss": 2.1626, "step": 3129 }, { "epoch": 0.5050016134236851, "grad_norm": 4.897068500518799, "learning_rate": 5.185463996643335e-05, "loss": 2.1379, "step": 3130 }, { "epoch": 0.5051629557921911, "grad_norm": 4.031672477722168, "learning_rate": 5.182853001140235e-05, "loss": 1.9897, "step": 3131 }, { "epoch": 0.505324298160697, "grad_norm": 3.338622570037842, "learning_rate": 5.180241955706872e-05, "loss": 2.0872, "step": 3132 }, { "epoch": 0.505485640529203, "grad_norm": 4.088404655456543, "learning_rate": 5.1776308610562175e-05, "loss": 1.9367, "step": 3133 }, { "epoch": 0.5056469828977089, "grad_norm": 4.4865217208862305, "learning_rate": 5.175019717901267e-05, "loss": 1.9833, "step": 3134 }, { "epoch": 0.5058083252662149, "grad_norm": 5.236238956451416, "learning_rate": 5.172408526955025e-05, "loss": 1.9675, "step": 3135 }, { "epoch": 0.5059696676347208, "grad_norm": 4.071722984313965, "learning_rate": 5.169797288930508e-05, "loss": 1.9855, "step": 3136 }, { "epoch": 0.5061310100032268, "grad_norm": 3.554135322570801, "learning_rate": 5.1671860045407484e-05, "loss": 2.0495, "step": 3137 }, { "epoch": 0.5062923523717329, "grad_norm": 4.239428520202637, "learning_rate": 5.164574674498788e-05, "loss": 1.9573, "step": 3138 }, { "epoch": 0.5064536947402388, "grad_norm": 4.632482528686523, "learning_rate": 5.1619632995176845e-05, "loss": 1.8922, "step": 3139 }, { "epoch": 0.5066150371087448, "grad_norm": 4.101121425628662, "learning_rate": 5.1593518803105055e-05, "loss": 1.8699, "step": 3140 }, { "epoch": 0.5067763794772507, "grad_norm": 4.697978973388672, "learning_rate": 5.1567404175903286e-05, "loss": 1.961, "step": 3141 }, { "epoch": 0.5069377218457567, "grad_norm": 3.9164650440216064, "learning_rate": 5.15412891207025e-05, "loss": 1.8541, "step": 3142 }, { "epoch": 0.5070990642142627, "grad_norm": 4.599571228027344, "learning_rate": 5.151517364463371e-05, "loss": 1.9727, "step": 3143 }, { "epoch": 0.5072604065827686, "grad_norm": 5.059162139892578, "learning_rate": 5.1489057754828075e-05, "loss": 1.9445, "step": 3144 }, { "epoch": 0.5074217489512746, "grad_norm": 3.971083641052246, "learning_rate": 5.146294145841687e-05, "loss": 1.88, "step": 3145 }, { "epoch": 0.5075830913197805, "grad_norm": 4.057460308074951, "learning_rate": 5.1436824762531444e-05, "loss": 2.0365, "step": 3146 }, { "epoch": 0.5077444336882866, "grad_norm": 3.408555507659912, "learning_rate": 5.14107076743033e-05, "loss": 1.9953, "step": 3147 }, { "epoch": 0.5079057760567925, "grad_norm": 4.3520660400390625, "learning_rate": 5.1384590200864047e-05, "loss": 2.0391, "step": 3148 }, { "epoch": 0.5080671184252985, "grad_norm": 4.447436332702637, "learning_rate": 5.1358472349345366e-05, "loss": 1.8032, "step": 3149 }, { "epoch": 0.5082284607938045, "grad_norm": 3.5266644954681396, "learning_rate": 5.1332354126879055e-05, "loss": 1.8932, "step": 3150 }, { "epoch": 0.5083898031623104, "grad_norm": 4.616915702819824, "learning_rate": 5.1306235540597016e-05, "loss": 1.9985, "step": 3151 }, { "epoch": 0.5085511455308164, "grad_norm": 4.386703968048096, "learning_rate": 5.128011659763125e-05, "loss": 1.8963, "step": 3152 }, { "epoch": 0.5087124878993223, "grad_norm": 3.514730215072632, "learning_rate": 5.125399730511388e-05, "loss": 2.1354, "step": 3153 }, { "epoch": 0.5088738302678283, "grad_norm": 4.217443943023682, "learning_rate": 5.1227877670177084e-05, "loss": 1.8325, "step": 3154 }, { "epoch": 0.5090351726363344, "grad_norm": 4.520450592041016, "learning_rate": 5.1201757699953134e-05, "loss": 1.9591, "step": 3155 }, { "epoch": 0.5091965150048403, "grad_norm": 4.804542064666748, "learning_rate": 5.117563740157444e-05, "loss": 1.9719, "step": 3156 }, { "epoch": 0.5093578573733463, "grad_norm": 4.119200229644775, "learning_rate": 5.1149516782173465e-05, "loss": 2.1569, "step": 3157 }, { "epoch": 0.5095191997418522, "grad_norm": 5.18499231338501, "learning_rate": 5.112339584888275e-05, "loss": 2.0214, "step": 3158 }, { "epoch": 0.5096805421103582, "grad_norm": 4.811190128326416, "learning_rate": 5.1097274608834955e-05, "loss": 2.2339, "step": 3159 }, { "epoch": 0.5098418844788641, "grad_norm": 5.532687187194824, "learning_rate": 5.107115306916278e-05, "loss": 1.6811, "step": 3160 }, { "epoch": 0.5100032268473701, "grad_norm": 4.852935314178467, "learning_rate": 5.104503123699906e-05, "loss": 2.2254, "step": 3161 }, { "epoch": 0.510164569215876, "grad_norm": 5.2240495681762695, "learning_rate": 5.101890911947668e-05, "loss": 1.9324, "step": 3162 }, { "epoch": 0.510325911584382, "grad_norm": 4.980324745178223, "learning_rate": 5.099278672372859e-05, "loss": 1.7929, "step": 3163 }, { "epoch": 0.5104872539528881, "grad_norm": 5.097092628479004, "learning_rate": 5.096666405688786e-05, "loss": 2.1415, "step": 3164 }, { "epoch": 0.510648596321394, "grad_norm": 6.43392276763916, "learning_rate": 5.094054112608758e-05, "loss": 2.025, "step": 3165 }, { "epoch": 0.5108099386899, "grad_norm": 6.511987686157227, "learning_rate": 5.0914417938460946e-05, "loss": 1.8817, "step": 3166 }, { "epoch": 0.510971281058406, "grad_norm": 2.9716813564300537, "learning_rate": 5.0888294501141245e-05, "loss": 1.7831, "step": 3167 }, { "epoch": 0.5111326234269119, "grad_norm": 4.300621032714844, "learning_rate": 5.0862170821261746e-05, "loss": 1.8249, "step": 3168 }, { "epoch": 0.5112939657954179, "grad_norm": 4.005673408508301, "learning_rate": 5.083604690595589e-05, "loss": 2.0527, "step": 3169 }, { "epoch": 0.5114553081639238, "grad_norm": 3.7447731494903564, "learning_rate": 5.080992276235712e-05, "loss": 1.8692, "step": 3170 }, { "epoch": 0.5116166505324298, "grad_norm": 4.364311695098877, "learning_rate": 5.078379839759895e-05, "loss": 2.2357, "step": 3171 }, { "epoch": 0.5117779929009358, "grad_norm": 3.762906789779663, "learning_rate": 5.0757673818814956e-05, "loss": 2.1641, "step": 3172 }, { "epoch": 0.5119393352694418, "grad_norm": 5.468181133270264, "learning_rate": 5.073154903313878e-05, "loss": 2.2245, "step": 3173 }, { "epoch": 0.5121006776379478, "grad_norm": 4.8055500984191895, "learning_rate": 5.070542404770413e-05, "loss": 1.8051, "step": 3174 }, { "epoch": 0.5122620200064537, "grad_norm": 4.544543743133545, "learning_rate": 5.0679298869644745e-05, "loss": 2.0303, "step": 3175 }, { "epoch": 0.5124233623749597, "grad_norm": 5.0618510246276855, "learning_rate": 5.065317350609443e-05, "loss": 1.7931, "step": 3176 }, { "epoch": 0.5125847047434656, "grad_norm": 3.694225549697876, "learning_rate": 5.062704796418703e-05, "loss": 2.0482, "step": 3177 }, { "epoch": 0.5127460471119716, "grad_norm": 4.4860358238220215, "learning_rate": 5.060092225105646e-05, "loss": 1.9537, "step": 3178 }, { "epoch": 0.5129073894804775, "grad_norm": 5.659295558929443, "learning_rate": 5.0574796373836654e-05, "loss": 1.79, "step": 3179 }, { "epoch": 0.5130687318489835, "grad_norm": 4.734598636627197, "learning_rate": 5.0548670339661605e-05, "loss": 1.7415, "step": 3180 }, { "epoch": 0.5132300742174896, "grad_norm": 4.6034955978393555, "learning_rate": 5.052254415566536e-05, "loss": 1.9392, "step": 3181 }, { "epoch": 0.5133914165859955, "grad_norm": 4.362708568572998, "learning_rate": 5.049641782898199e-05, "loss": 2.0239, "step": 3182 }, { "epoch": 0.5135527589545015, "grad_norm": 3.8116865158081055, "learning_rate": 5.047029136674563e-05, "loss": 1.8929, "step": 3183 }, { "epoch": 0.5137141013230074, "grad_norm": 4.168476104736328, "learning_rate": 5.044416477609038e-05, "loss": 1.905, "step": 3184 }, { "epoch": 0.5138754436915134, "grad_norm": 4.413907051086426, "learning_rate": 5.041803806415049e-05, "loss": 1.9336, "step": 3185 }, { "epoch": 0.5140367860600193, "grad_norm": 4.9159770011901855, "learning_rate": 5.039191123806013e-05, "loss": 1.9518, "step": 3186 }, { "epoch": 0.5141981284285253, "grad_norm": 3.9071576595306396, "learning_rate": 5.03657843049536e-05, "loss": 1.8817, "step": 3187 }, { "epoch": 0.5143594707970313, "grad_norm": 3.7660021781921387, "learning_rate": 5.033965727196513e-05, "loss": 1.8742, "step": 3188 }, { "epoch": 0.5145208131655372, "grad_norm": 5.279468536376953, "learning_rate": 5.031353014622907e-05, "loss": 1.8084, "step": 3189 }, { "epoch": 0.5146821555340433, "grad_norm": 4.2043681144714355, "learning_rate": 5.0287402934879725e-05, "loss": 2.1356, "step": 3190 }, { "epoch": 0.5148434979025492, "grad_norm": 4.737074851989746, "learning_rate": 5.026127564505147e-05, "loss": 2.0505, "step": 3191 }, { "epoch": 0.5150048402710552, "grad_norm": 3.856682062149048, "learning_rate": 5.0235148283878675e-05, "loss": 1.8909, "step": 3192 }, { "epoch": 0.5151661826395612, "grad_norm": 3.9248077869415283, "learning_rate": 5.020902085849575e-05, "loss": 1.8531, "step": 3193 }, { "epoch": 0.5153275250080671, "grad_norm": 3.782620668411255, "learning_rate": 5.018289337603709e-05, "loss": 2.0022, "step": 3194 }, { "epoch": 0.5154888673765731, "grad_norm": 6.275498390197754, "learning_rate": 5.0156765843637156e-05, "loss": 2.1021, "step": 3195 }, { "epoch": 0.515650209745079, "grad_norm": 5.375777244567871, "learning_rate": 5.013063826843036e-05, "loss": 1.7244, "step": 3196 }, { "epoch": 0.515811552113585, "grad_norm": 4.985101699829102, "learning_rate": 5.01045106575512e-05, "loss": 1.7357, "step": 3197 }, { "epoch": 0.515972894482091, "grad_norm": 4.513815879821777, "learning_rate": 5.007838301813409e-05, "loss": 1.8166, "step": 3198 }, { "epoch": 0.516134236850597, "grad_norm": 3.676476240158081, "learning_rate": 5.0052255357313536e-05, "loss": 1.7123, "step": 3199 }, { "epoch": 0.516295579219103, "grad_norm": 4.253829479217529, "learning_rate": 5.002612768222401e-05, "loss": 1.8319, "step": 3200 }, { "epoch": 0.5164569215876089, "grad_norm": 3.100123167037964, "learning_rate": 5e-05, "loss": 1.9244, "step": 3201 }, { "epoch": 0.5166182639561149, "grad_norm": 3.931265115737915, "learning_rate": 4.997387231777601e-05, "loss": 2.0529, "step": 3202 }, { "epoch": 0.5167796063246208, "grad_norm": 3.888005256652832, "learning_rate": 4.9947744642686476e-05, "loss": 1.9436, "step": 3203 }, { "epoch": 0.5169409486931268, "grad_norm": 4.62919807434082, "learning_rate": 4.9921616981865926e-05, "loss": 1.9782, "step": 3204 }, { "epoch": 0.5171022910616327, "grad_norm": 3.9387950897216797, "learning_rate": 4.9895489342448814e-05, "loss": 1.894, "step": 3205 }, { "epoch": 0.5172636334301387, "grad_norm": 7.06707763671875, "learning_rate": 4.9869361731569645e-05, "loss": 1.8967, "step": 3206 }, { "epoch": 0.5174249757986448, "grad_norm": 4.015041351318359, "learning_rate": 4.984323415636285e-05, "loss": 1.8313, "step": 3207 }, { "epoch": 0.5175863181671507, "grad_norm": 4.058426856994629, "learning_rate": 4.9817106623962915e-05, "loss": 1.9187, "step": 3208 }, { "epoch": 0.5177476605356567, "grad_norm": 4.128994941711426, "learning_rate": 4.9790979141504254e-05, "loss": 1.7979, "step": 3209 }, { "epoch": 0.5179090029041626, "grad_norm": 3.9643068313598633, "learning_rate": 4.9764851716121337e-05, "loss": 1.8926, "step": 3210 }, { "epoch": 0.5180703452726686, "grad_norm": 5.316287994384766, "learning_rate": 4.973872435494853e-05, "loss": 1.7501, "step": 3211 }, { "epoch": 0.5182316876411746, "grad_norm": 6.077383995056152, "learning_rate": 4.971259706512029e-05, "loss": 1.9895, "step": 3212 }, { "epoch": 0.5183930300096805, "grad_norm": 3.991368055343628, "learning_rate": 4.968646985377093e-05, "loss": 1.8354, "step": 3213 }, { "epoch": 0.5185543723781865, "grad_norm": 5.030738353729248, "learning_rate": 4.966034272803488e-05, "loss": 2.3001, "step": 3214 }, { "epoch": 0.5187157147466925, "grad_norm": 4.330802917480469, "learning_rate": 4.9634215695046425e-05, "loss": 1.9873, "step": 3215 }, { "epoch": 0.5188770571151985, "grad_norm": 4.938310623168945, "learning_rate": 4.960808876193987e-05, "loss": 1.9398, "step": 3216 }, { "epoch": 0.5190383994837044, "grad_norm": 3.9098732471466064, "learning_rate": 4.9581961935849536e-05, "loss": 1.8656, "step": 3217 }, { "epoch": 0.5191997418522104, "grad_norm": 4.209815979003906, "learning_rate": 4.955583522390962e-05, "loss": 2.0892, "step": 3218 }, { "epoch": 0.5193610842207164, "grad_norm": 3.854658842086792, "learning_rate": 4.95297086332544e-05, "loss": 1.8602, "step": 3219 }, { "epoch": 0.5195224265892223, "grad_norm": 3.757833480834961, "learning_rate": 4.9503582171018e-05, "loss": 1.9197, "step": 3220 }, { "epoch": 0.5196837689577283, "grad_norm": 5.500919818878174, "learning_rate": 4.9477455844334645e-05, "loss": 2.1579, "step": 3221 }, { "epoch": 0.5198451113262342, "grad_norm": 4.940765380859375, "learning_rate": 4.945132966033839e-05, "loss": 2.0228, "step": 3222 }, { "epoch": 0.5200064536947402, "grad_norm": 3.5651445388793945, "learning_rate": 4.942520362616336e-05, "loss": 1.8485, "step": 3223 }, { "epoch": 0.5201677960632463, "grad_norm": 4.323605537414551, "learning_rate": 4.9399077748943554e-05, "loss": 1.8058, "step": 3224 }, { "epoch": 0.5203291384317522, "grad_norm": 4.6020708084106445, "learning_rate": 4.937295203581297e-05, "loss": 1.903, "step": 3225 }, { "epoch": 0.5204904808002582, "grad_norm": 5.301663875579834, "learning_rate": 4.934682649390557e-05, "loss": 2.0427, "step": 3226 }, { "epoch": 0.5206518231687641, "grad_norm": 6.134703159332275, "learning_rate": 4.932070113035527e-05, "loss": 1.9043, "step": 3227 }, { "epoch": 0.5208131655372701, "grad_norm": 4.341734886169434, "learning_rate": 4.929457595229589e-05, "loss": 1.8886, "step": 3228 }, { "epoch": 0.520974507905776, "grad_norm": 4.849643707275391, "learning_rate": 4.926845096686122e-05, "loss": 2.1224, "step": 3229 }, { "epoch": 0.521135850274282, "grad_norm": 4.229915142059326, "learning_rate": 4.924232618118507e-05, "loss": 1.8808, "step": 3230 }, { "epoch": 0.521297192642788, "grad_norm": 5.061223983764648, "learning_rate": 4.9216201602401065e-05, "loss": 1.8286, "step": 3231 }, { "epoch": 0.5214585350112939, "grad_norm": 4.838642120361328, "learning_rate": 4.91900772376429e-05, "loss": 1.771, "step": 3232 }, { "epoch": 0.5216198773798, "grad_norm": 4.382593154907227, "learning_rate": 4.9163953094044114e-05, "loss": 2.0494, "step": 3233 }, { "epoch": 0.5217812197483059, "grad_norm": 4.9571709632873535, "learning_rate": 4.913782917873826e-05, "loss": 2.0606, "step": 3234 }, { "epoch": 0.5219425621168119, "grad_norm": 5.578112602233887, "learning_rate": 4.911170549885877e-05, "loss": 2.061, "step": 3235 }, { "epoch": 0.5221039044853178, "grad_norm": 4.324019908905029, "learning_rate": 4.908558206153906e-05, "loss": 2.0398, "step": 3236 }, { "epoch": 0.5222652468538238, "grad_norm": 4.468944549560547, "learning_rate": 4.905945887391242e-05, "loss": 1.8025, "step": 3237 }, { "epoch": 0.5224265892223298, "grad_norm": 4.0166120529174805, "learning_rate": 4.903333594311215e-05, "loss": 2.0221, "step": 3238 }, { "epoch": 0.5225879315908357, "grad_norm": 4.2127180099487305, "learning_rate": 4.900721327627143e-05, "loss": 2.0657, "step": 3239 }, { "epoch": 0.5227492739593417, "grad_norm": 4.6842732429504395, "learning_rate": 4.898109088052333e-05, "loss": 2.1807, "step": 3240 }, { "epoch": 0.5229106163278477, "grad_norm": 5.23138952255249, "learning_rate": 4.895496876300096e-05, "loss": 2.0688, "step": 3241 }, { "epoch": 0.5230719586963537, "grad_norm": 4.115994453430176, "learning_rate": 4.892884693083723e-05, "loss": 1.8319, "step": 3242 }, { "epoch": 0.5232333010648597, "grad_norm": 3.9842920303344727, "learning_rate": 4.890272539116507e-05, "loss": 1.8929, "step": 3243 }, { "epoch": 0.5233946434333656, "grad_norm": 4.264963150024414, "learning_rate": 4.887660415111727e-05, "loss": 1.9543, "step": 3244 }, { "epoch": 0.5235559858018716, "grad_norm": 4.318862438201904, "learning_rate": 4.8850483217826546e-05, "loss": 1.7397, "step": 3245 }, { "epoch": 0.5237173281703775, "grad_norm": 4.763772487640381, "learning_rate": 4.882436259842556e-05, "loss": 1.9595, "step": 3246 }, { "epoch": 0.5238786705388835, "grad_norm": 4.464093208312988, "learning_rate": 4.879824230004688e-05, "loss": 1.7959, "step": 3247 }, { "epoch": 0.5240400129073894, "grad_norm": 5.059757232666016, "learning_rate": 4.877212232982292e-05, "loss": 2.1096, "step": 3248 }, { "epoch": 0.5242013552758954, "grad_norm": 3.138390302658081, "learning_rate": 4.874600269488613e-05, "loss": 2.0268, "step": 3249 }, { "epoch": 0.5243626976444015, "grad_norm": 4.7847700119018555, "learning_rate": 4.8719883402368745e-05, "loss": 1.7649, "step": 3250 }, { "epoch": 0.5245240400129074, "grad_norm": 5.590267181396484, "learning_rate": 4.8693764459402996e-05, "loss": 2.3039, "step": 3251 }, { "epoch": 0.5246853823814134, "grad_norm": 3.87428617477417, "learning_rate": 4.866764587312097e-05, "loss": 1.8538, "step": 3252 }, { "epoch": 0.5248467247499193, "grad_norm": 4.53293514251709, "learning_rate": 4.8641527650654646e-05, "loss": 1.7762, "step": 3253 }, { "epoch": 0.5250080671184253, "grad_norm": 5.136706829071045, "learning_rate": 4.861540979913597e-05, "loss": 1.9608, "step": 3254 }, { "epoch": 0.5251694094869312, "grad_norm": 3.199901580810547, "learning_rate": 4.858929232569671e-05, "loss": 2.3918, "step": 3255 }, { "epoch": 0.5253307518554372, "grad_norm": 5.633172035217285, "learning_rate": 4.8563175237468575e-05, "loss": 2.1478, "step": 3256 }, { "epoch": 0.5254920942239432, "grad_norm": 5.483176231384277, "learning_rate": 4.853705854158315e-05, "loss": 1.8405, "step": 3257 }, { "epoch": 0.5256534365924492, "grad_norm": 6.112262725830078, "learning_rate": 4.8510942245171937e-05, "loss": 1.9673, "step": 3258 }, { "epoch": 0.5258147789609552, "grad_norm": 3.62973690032959, "learning_rate": 4.8484826355366295e-05, "loss": 1.7978, "step": 3259 }, { "epoch": 0.5259761213294611, "grad_norm": 3.750770330429077, "learning_rate": 4.845871087929751e-05, "loss": 1.8656, "step": 3260 }, { "epoch": 0.5261374636979671, "grad_norm": 5.85239315032959, "learning_rate": 4.8432595824096705e-05, "loss": 1.9442, "step": 3261 }, { "epoch": 0.5262988060664731, "grad_norm": 5.854916572570801, "learning_rate": 4.8406481196894956e-05, "loss": 1.8402, "step": 3262 }, { "epoch": 0.526460148434979, "grad_norm": 5.106485843658447, "learning_rate": 4.838036700482316e-05, "loss": 1.9989, "step": 3263 }, { "epoch": 0.526621490803485, "grad_norm": 3.8806090354919434, "learning_rate": 4.8354253255012134e-05, "loss": 1.9908, "step": 3264 }, { "epoch": 0.5267828331719909, "grad_norm": 3.4543380737304688, "learning_rate": 4.8328139954592534e-05, "loss": 1.8149, "step": 3265 }, { "epoch": 0.5269441755404969, "grad_norm": 4.389228343963623, "learning_rate": 4.830202711069493e-05, "loss": 1.9839, "step": 3266 }, { "epoch": 0.527105517909003, "grad_norm": 3.474240779876709, "learning_rate": 4.827591473044978e-05, "loss": 2.18, "step": 3267 }, { "epoch": 0.5272668602775089, "grad_norm": 3.6651768684387207, "learning_rate": 4.824980282098734e-05, "loss": 2.0556, "step": 3268 }, { "epoch": 0.5274282026460149, "grad_norm": 3.4399147033691406, "learning_rate": 4.8223691389437844e-05, "loss": 1.9626, "step": 3269 }, { "epoch": 0.5275895450145208, "grad_norm": 4.90449857711792, "learning_rate": 4.8197580442931295e-05, "loss": 1.8866, "step": 3270 }, { "epoch": 0.5277508873830268, "grad_norm": 4.023404121398926, "learning_rate": 4.817146998859765e-05, "loss": 2.0522, "step": 3271 }, { "epoch": 0.5279122297515327, "grad_norm": 3.915118932723999, "learning_rate": 4.814536003356666e-05, "loss": 2.2881, "step": 3272 }, { "epoch": 0.5280735721200387, "grad_norm": 4.261370658874512, "learning_rate": 4.811925058496798e-05, "loss": 2.1968, "step": 3273 }, { "epoch": 0.5282349144885446, "grad_norm": 3.4954028129577637, "learning_rate": 4.8093141649931126e-05, "loss": 1.8929, "step": 3274 }, { "epoch": 0.5283962568570507, "grad_norm": 3.987055778503418, "learning_rate": 4.806703323558546e-05, "loss": 2.2058, "step": 3275 }, { "epoch": 0.5285575992255567, "grad_norm": 4.587975025177002, "learning_rate": 4.804092534906018e-05, "loss": 1.9484, "step": 3276 }, { "epoch": 0.5287189415940626, "grad_norm": 4.985721111297607, "learning_rate": 4.801481799748441e-05, "loss": 1.8801, "step": 3277 }, { "epoch": 0.5288802839625686, "grad_norm": 3.6657228469848633, "learning_rate": 4.798871118798707e-05, "loss": 1.8155, "step": 3278 }, { "epoch": 0.5290416263310745, "grad_norm": 4.231720447540283, "learning_rate": 4.796260492769691e-05, "loss": 1.9869, "step": 3279 }, { "epoch": 0.5292029686995805, "grad_norm": 3.9197402000427246, "learning_rate": 4.7936499223742616e-05, "loss": 2.2224, "step": 3280 }, { "epoch": 0.5293643110680865, "grad_norm": 3.874336004257202, "learning_rate": 4.791039408325264e-05, "loss": 2.0934, "step": 3281 }, { "epoch": 0.5295256534365924, "grad_norm": 4.764644145965576, "learning_rate": 4.788428951335534e-05, "loss": 1.8271, "step": 3282 }, { "epoch": 0.5296869958050984, "grad_norm": 4.530116081237793, "learning_rate": 4.785818552117886e-05, "loss": 1.7182, "step": 3283 }, { "epoch": 0.5298483381736044, "grad_norm": 3.121483564376831, "learning_rate": 4.7832082113851247e-05, "loss": 2.1423, "step": 3284 }, { "epoch": 0.5300096805421104, "grad_norm": 3.517976760864258, "learning_rate": 4.780597929850032e-05, "loss": 1.7439, "step": 3285 }, { "epoch": 0.5301710229106164, "grad_norm": 4.023410797119141, "learning_rate": 4.777987708225382e-05, "loss": 1.7658, "step": 3286 }, { "epoch": 0.5303323652791223, "grad_norm": 3.6459848880767822, "learning_rate": 4.775377547223924e-05, "loss": 2.0099, "step": 3287 }, { "epoch": 0.5304937076476283, "grad_norm": 3.7603046894073486, "learning_rate": 4.772767447558398e-05, "loss": 1.8944, "step": 3288 }, { "epoch": 0.5306550500161342, "grad_norm": 3.734020948410034, "learning_rate": 4.77015740994152e-05, "loss": 1.7808, "step": 3289 }, { "epoch": 0.5308163923846402, "grad_norm": 3.546699285507202, "learning_rate": 4.767547435085997e-05, "loss": 1.9671, "step": 3290 }, { "epoch": 0.5309777347531461, "grad_norm": 4.612760543823242, "learning_rate": 4.7649375237045135e-05, "loss": 1.9822, "step": 3291 }, { "epoch": 0.5311390771216521, "grad_norm": 4.891711235046387, "learning_rate": 4.762327676509736e-05, "loss": 1.7121, "step": 3292 }, { "epoch": 0.5313004194901582, "grad_norm": 4.840190410614014, "learning_rate": 4.759717894214318e-05, "loss": 1.9484, "step": 3293 }, { "epoch": 0.5314617618586641, "grad_norm": 3.971648931503296, "learning_rate": 4.7571081775308905e-05, "loss": 2.027, "step": 3294 }, { "epoch": 0.5316231042271701, "grad_norm": 3.5140857696533203, "learning_rate": 4.754498527172072e-05, "loss": 1.935, "step": 3295 }, { "epoch": 0.531784446595676, "grad_norm": 3.765691041946411, "learning_rate": 4.751888943850455e-05, "loss": 1.8352, "step": 3296 }, { "epoch": 0.531945788964182, "grad_norm": 6.4853034019470215, "learning_rate": 4.7492794282786236e-05, "loss": 1.9982, "step": 3297 }, { "epoch": 0.5321071313326879, "grad_norm": 4.060369491577148, "learning_rate": 4.7466699811691326e-05, "loss": 1.8903, "step": 3298 }, { "epoch": 0.5322684737011939, "grad_norm": 8.457684516906738, "learning_rate": 4.74406060323453e-05, "loss": 2.0126, "step": 3299 }, { "epoch": 0.5324298160696999, "grad_norm": 4.315256118774414, "learning_rate": 4.741451295187332e-05, "loss": 1.9478, "step": 3300 }, { "epoch": 0.5325911584382059, "grad_norm": 3.9448628425598145, "learning_rate": 4.7388420577400496e-05, "loss": 1.6127, "step": 3301 }, { "epoch": 0.5327525008067119, "grad_norm": 4.897828578948975, "learning_rate": 4.736232891605161e-05, "loss": 2.1037, "step": 3302 }, { "epoch": 0.5329138431752178, "grad_norm": 5.935902118682861, "learning_rate": 4.733623797495136e-05, "loss": 1.8929, "step": 3303 }, { "epoch": 0.5330751855437238, "grad_norm": 4.652507781982422, "learning_rate": 4.731014776122416e-05, "loss": 1.8967, "step": 3304 }, { "epoch": 0.5332365279122298, "grad_norm": 3.986088275909424, "learning_rate": 4.728405828199427e-05, "loss": 1.9336, "step": 3305 }, { "epoch": 0.5333978702807357, "grad_norm": 4.08700704574585, "learning_rate": 4.725796954438577e-05, "loss": 1.8997, "step": 3306 }, { "epoch": 0.5335592126492417, "grad_norm": 5.69546365737915, "learning_rate": 4.723188155552247e-05, "loss": 1.9908, "step": 3307 }, { "epoch": 0.5337205550177476, "grad_norm": 4.448991298675537, "learning_rate": 4.720579432252807e-05, "loss": 2.1116, "step": 3308 }, { "epoch": 0.5338818973862536, "grad_norm": 3.5392987728118896, "learning_rate": 4.717970785252595e-05, "loss": 1.6317, "step": 3309 }, { "epoch": 0.5340432397547596, "grad_norm": 3.903165102005005, "learning_rate": 4.715362215263941e-05, "loss": 1.8261, "step": 3310 }, { "epoch": 0.5342045821232656, "grad_norm": 4.5423359870910645, "learning_rate": 4.712753722999143e-05, "loss": 1.8005, "step": 3311 }, { "epoch": 0.5343659244917716, "grad_norm": 4.870762348175049, "learning_rate": 4.710145309170481e-05, "loss": 2.1654, "step": 3312 }, { "epoch": 0.5345272668602775, "grad_norm": 4.262883186340332, "learning_rate": 4.7075369744902175e-05, "loss": 1.9795, "step": 3313 }, { "epoch": 0.5346886092287835, "grad_norm": 4.609106063842773, "learning_rate": 4.70492871967059e-05, "loss": 1.7828, "step": 3314 }, { "epoch": 0.5348499515972894, "grad_norm": 4.6488494873046875, "learning_rate": 4.7023205454238136e-05, "loss": 2.0667, "step": 3315 }, { "epoch": 0.5350112939657954, "grad_norm": 5.292774677276611, "learning_rate": 4.69971245246208e-05, "loss": 1.8568, "step": 3316 }, { "epoch": 0.5351726363343013, "grad_norm": 4.1986188888549805, "learning_rate": 4.6971044414975666e-05, "loss": 2.1148, "step": 3317 }, { "epoch": 0.5353339787028074, "grad_norm": 4.246354579925537, "learning_rate": 4.6944965132424185e-05, "loss": 1.8744, "step": 3318 }, { "epoch": 0.5354953210713134, "grad_norm": 6.183987140655518, "learning_rate": 4.691888668408766e-05, "loss": 2.0153, "step": 3319 }, { "epoch": 0.5356566634398193, "grad_norm": 4.5026044845581055, "learning_rate": 4.689280907708709e-05, "loss": 1.8281, "step": 3320 }, { "epoch": 0.5358180058083253, "grad_norm": 4.956146717071533, "learning_rate": 4.686673231854334e-05, "loss": 1.7823, "step": 3321 }, { "epoch": 0.5359793481768312, "grad_norm": 5.603353977203369, "learning_rate": 4.684065641557695e-05, "loss": 1.9652, "step": 3322 }, { "epoch": 0.5361406905453372, "grad_norm": 4.635777473449707, "learning_rate": 4.681458137530829e-05, "loss": 2.0288, "step": 3323 }, { "epoch": 0.5363020329138432, "grad_norm": 4.934343338012695, "learning_rate": 4.6788507204857446e-05, "loss": 1.888, "step": 3324 }, { "epoch": 0.5364633752823491, "grad_norm": 5.475190162658691, "learning_rate": 4.6762433911344325e-05, "loss": 1.7964, "step": 3325 }, { "epoch": 0.5366247176508551, "grad_norm": 4.2750396728515625, "learning_rate": 4.673636150188852e-05, "loss": 1.8277, "step": 3326 }, { "epoch": 0.5367860600193611, "grad_norm": 3.8814730644226074, "learning_rate": 4.671028998360947e-05, "loss": 1.9308, "step": 3327 }, { "epoch": 0.5369474023878671, "grad_norm": 4.442133903503418, "learning_rate": 4.6684219363626306e-05, "loss": 1.9073, "step": 3328 }, { "epoch": 0.537108744756373, "grad_norm": 3.948798894882202, "learning_rate": 4.66581496490579e-05, "loss": 1.8738, "step": 3329 }, { "epoch": 0.537270087124879, "grad_norm": 4.171131610870361, "learning_rate": 4.663208084702297e-05, "loss": 1.837, "step": 3330 }, { "epoch": 0.537431429493385, "grad_norm": 5.258491039276123, "learning_rate": 4.6606012964639874e-05, "loss": 2.0329, "step": 3331 }, { "epoch": 0.5375927718618909, "grad_norm": 4.995650291442871, "learning_rate": 4.6579946009026786e-05, "loss": 1.9554, "step": 3332 }, { "epoch": 0.5377541142303969, "grad_norm": 4.612634658813477, "learning_rate": 4.655387998730161e-05, "loss": 1.926, "step": 3333 }, { "epoch": 0.5379154565989028, "grad_norm": 3.0606601238250732, "learning_rate": 4.6527814906582e-05, "loss": 1.9066, "step": 3334 }, { "epoch": 0.5380767989674088, "grad_norm": 4.619715213775635, "learning_rate": 4.6501750773985326e-05, "loss": 1.632, "step": 3335 }, { "epoch": 0.5382381413359149, "grad_norm": 3.8743972778320312, "learning_rate": 4.647568759662876e-05, "loss": 2.0805, "step": 3336 }, { "epoch": 0.5383994837044208, "grad_norm": 5.274535179138184, "learning_rate": 4.644962538162913e-05, "loss": 1.919, "step": 3337 }, { "epoch": 0.5385608260729268, "grad_norm": 5.369234561920166, "learning_rate": 4.642356413610308e-05, "loss": 1.9116, "step": 3338 }, { "epoch": 0.5387221684414327, "grad_norm": 4.213049411773682, "learning_rate": 4.6397503867166926e-05, "loss": 1.8896, "step": 3339 }, { "epoch": 0.5388835108099387, "grad_norm": 4.885444164276123, "learning_rate": 4.637144458193677e-05, "loss": 1.8171, "step": 3340 }, { "epoch": 0.5390448531784446, "grad_norm": 3.53621768951416, "learning_rate": 4.634538628752841e-05, "loss": 1.9278, "step": 3341 }, { "epoch": 0.5392061955469506, "grad_norm": 4.078290939331055, "learning_rate": 4.631932899105739e-05, "loss": 2.1414, "step": 3342 }, { "epoch": 0.5393675379154566, "grad_norm": 5.2463908195495605, "learning_rate": 4.629327269963897e-05, "loss": 2.2, "step": 3343 }, { "epoch": 0.5395288802839626, "grad_norm": 4.759715557098389, "learning_rate": 4.6267217420388126e-05, "loss": 1.9083, "step": 3344 }, { "epoch": 0.5396902226524686, "grad_norm": 4.195973873138428, "learning_rate": 4.6241163160419616e-05, "loss": 1.8039, "step": 3345 }, { "epoch": 0.5398515650209745, "grad_norm": 4.239854335784912, "learning_rate": 4.621510992684783e-05, "loss": 2.1011, "step": 3346 }, { "epoch": 0.5400129073894805, "grad_norm": 3.7908120155334473, "learning_rate": 4.618905772678696e-05, "loss": 1.8025, "step": 3347 }, { "epoch": 0.5401742497579864, "grad_norm": 4.560600757598877, "learning_rate": 4.616300656735085e-05, "loss": 2.0401, "step": 3348 }, { "epoch": 0.5403355921264924, "grad_norm": 3.953225612640381, "learning_rate": 4.613695645565312e-05, "loss": 1.8735, "step": 3349 }, { "epoch": 0.5404969344949984, "grad_norm": 4.011920928955078, "learning_rate": 4.611090739880707e-05, "loss": 1.9614, "step": 3350 }, { "epoch": 0.5406582768635043, "grad_norm": 4.15378475189209, "learning_rate": 4.6084859403925704e-05, "loss": 1.9246, "step": 3351 }, { "epoch": 0.5408196192320103, "grad_norm": 3.286672830581665, "learning_rate": 4.6058812478121726e-05, "loss": 1.9275, "step": 3352 }, { "epoch": 0.5409809616005163, "grad_norm": 4.174089431762695, "learning_rate": 4.6032766628507626e-05, "loss": 1.9762, "step": 3353 }, { "epoch": 0.5411423039690223, "grad_norm": 4.887900352478027, "learning_rate": 4.600672186219551e-05, "loss": 2.0994, "step": 3354 }, { "epoch": 0.5413036463375283, "grad_norm": 3.994049310684204, "learning_rate": 4.598067818629721e-05, "loss": 1.5674, "step": 3355 }, { "epoch": 0.5414649887060342, "grad_norm": 3.9422547817230225, "learning_rate": 4.5954635607924306e-05, "loss": 2.0313, "step": 3356 }, { "epoch": 0.5416263310745402, "grad_norm": 3.7704057693481445, "learning_rate": 4.5928594134188006e-05, "loss": 1.5559, "step": 3357 }, { "epoch": 0.5417876734430461, "grad_norm": 4.104972839355469, "learning_rate": 4.590255377219931e-05, "loss": 1.7469, "step": 3358 }, { "epoch": 0.5419490158115521, "grad_norm": 4.480334281921387, "learning_rate": 4.5876514529068805e-05, "loss": 1.8588, "step": 3359 }, { "epoch": 0.542110358180058, "grad_norm": 3.6700620651245117, "learning_rate": 4.5850476411906856e-05, "loss": 1.9828, "step": 3360 }, { "epoch": 0.5422717005485641, "grad_norm": 4.9810357093811035, "learning_rate": 4.582443942782348e-05, "loss": 2.0797, "step": 3361 }, { "epoch": 0.5424330429170701, "grad_norm": 3.96756911277771, "learning_rate": 4.579840358392842e-05, "loss": 1.8334, "step": 3362 }, { "epoch": 0.542594385285576, "grad_norm": 5.304398536682129, "learning_rate": 4.577236888733105e-05, "loss": 1.9863, "step": 3363 }, { "epoch": 0.542755727654082, "grad_norm": 3.4439711570739746, "learning_rate": 4.5746335345140497e-05, "loss": 1.7999, "step": 3364 }, { "epoch": 0.5429170700225879, "grad_norm": 5.821254730224609, "learning_rate": 4.57203029644655e-05, "loss": 1.9819, "step": 3365 }, { "epoch": 0.5430784123910939, "grad_norm": 3.3282694816589355, "learning_rate": 4.569427175241458e-05, "loss": 1.7094, "step": 3366 }, { "epoch": 0.5432397547595998, "grad_norm": 5.027764320373535, "learning_rate": 4.566824171609584e-05, "loss": 2.05, "step": 3367 }, { "epoch": 0.5434010971281058, "grad_norm": 3.789264678955078, "learning_rate": 4.564221286261709e-05, "loss": 1.8987, "step": 3368 }, { "epoch": 0.5435624394966118, "grad_norm": 4.646523952484131, "learning_rate": 4.561618519908587e-05, "loss": 2.1439, "step": 3369 }, { "epoch": 0.5437237818651178, "grad_norm": 5.464221954345703, "learning_rate": 4.559015873260933e-05, "loss": 1.8975, "step": 3370 }, { "epoch": 0.5438851242336238, "grad_norm": 3.8311192989349365, "learning_rate": 4.5564133470294325e-05, "loss": 1.9645, "step": 3371 }, { "epoch": 0.5440464666021297, "grad_norm": 3.876864194869995, "learning_rate": 4.553810941924735e-05, "loss": 1.8745, "step": 3372 }, { "epoch": 0.5442078089706357, "grad_norm": 4.448172569274902, "learning_rate": 4.551208658657463e-05, "loss": 1.8245, "step": 3373 }, { "epoch": 0.5443691513391417, "grad_norm": 3.681504249572754, "learning_rate": 4.548606497938199e-05, "loss": 1.7546, "step": 3374 }, { "epoch": 0.5445304937076476, "grad_norm": 5.545989990234375, "learning_rate": 4.546004460477498e-05, "loss": 1.7352, "step": 3375 }, { "epoch": 0.5446918360761536, "grad_norm": 3.8270070552825928, "learning_rate": 4.543402546985875e-05, "loss": 1.8872, "step": 3376 }, { "epoch": 0.5448531784446595, "grad_norm": 3.8583481311798096, "learning_rate": 4.5408007581738185e-05, "loss": 2.1329, "step": 3377 }, { "epoch": 0.5450145208131656, "grad_norm": 3.3820762634277344, "learning_rate": 4.5381990947517766e-05, "loss": 1.9026, "step": 3378 }, { "epoch": 0.5451758631816715, "grad_norm": 5.060296058654785, "learning_rate": 4.535597557430164e-05, "loss": 2.004, "step": 3379 }, { "epoch": 0.5453372055501775, "grad_norm": 4.134612083435059, "learning_rate": 4.532996146919367e-05, "loss": 1.7844, "step": 3380 }, { "epoch": 0.5454985479186835, "grad_norm": 3.6750853061676025, "learning_rate": 4.5303948639297287e-05, "loss": 2.001, "step": 3381 }, { "epoch": 0.5456598902871894, "grad_norm": 4.092704772949219, "learning_rate": 4.527793709171564e-05, "loss": 1.9174, "step": 3382 }, { "epoch": 0.5458212326556954, "grad_norm": 4.162294864654541, "learning_rate": 4.525192683355147e-05, "loss": 1.9129, "step": 3383 }, { "epoch": 0.5459825750242013, "grad_norm": 4.528878211975098, "learning_rate": 4.5225917871907245e-05, "loss": 2.0338, "step": 3384 }, { "epoch": 0.5461439173927073, "grad_norm": 3.582669973373413, "learning_rate": 4.5199910213884996e-05, "loss": 1.9855, "step": 3385 }, { "epoch": 0.5463052597612132, "grad_norm": 4.393901348114014, "learning_rate": 4.517390386658646e-05, "loss": 1.8243, "step": 3386 }, { "epoch": 0.5464666021297193, "grad_norm": 6.063263893127441, "learning_rate": 4.514789883711296e-05, "loss": 2.0507, "step": 3387 }, { "epoch": 0.5466279444982253, "grad_norm": 4.6668782234191895, "learning_rate": 4.5121895132565534e-05, "loss": 1.8884, "step": 3388 }, { "epoch": 0.5467892868667312, "grad_norm": 5.516592025756836, "learning_rate": 4.509589276004477e-05, "loss": 1.9339, "step": 3389 }, { "epoch": 0.5469506292352372, "grad_norm": 3.8053150177001953, "learning_rate": 4.5069891726650974e-05, "loss": 2.1245, "step": 3390 }, { "epoch": 0.5471119716037431, "grad_norm": 3.28340220451355, "learning_rate": 4.504389203948403e-05, "loss": 1.8647, "step": 3391 }, { "epoch": 0.5472733139722491, "grad_norm": 4.491049289703369, "learning_rate": 4.501789370564345e-05, "loss": 1.7355, "step": 3392 }, { "epoch": 0.547434656340755, "grad_norm": 3.9153873920440674, "learning_rate": 4.499189673222845e-05, "loss": 1.7228, "step": 3393 }, { "epoch": 0.547595998709261, "grad_norm": 4.5766987800598145, "learning_rate": 4.496590112633776e-05, "loss": 2.1064, "step": 3394 }, { "epoch": 0.547757341077767, "grad_norm": 4.931046962738037, "learning_rate": 4.493990689506987e-05, "loss": 1.8381, "step": 3395 }, { "epoch": 0.547918683446273, "grad_norm": 4.126611232757568, "learning_rate": 4.491391404552278e-05, "loss": 2.0963, "step": 3396 }, { "epoch": 0.548080025814779, "grad_norm": 3.649404287338257, "learning_rate": 4.488792258479418e-05, "loss": 1.7649, "step": 3397 }, { "epoch": 0.548241368183285, "grad_norm": 5.112252712249756, "learning_rate": 4.486193251998134e-05, "loss": 1.6735, "step": 3398 }, { "epoch": 0.5484027105517909, "grad_norm": 3.7966115474700928, "learning_rate": 4.483594385818118e-05, "loss": 1.7547, "step": 3399 }, { "epoch": 0.5485640529202969, "grad_norm": 4.304330348968506, "learning_rate": 4.4809956606490226e-05, "loss": 2.1255, "step": 3400 }, { "epoch": 0.5487253952888028, "grad_norm": 4.657885551452637, "learning_rate": 4.478397077200463e-05, "loss": 2.0066, "step": 3401 }, { "epoch": 0.5488867376573088, "grad_norm": 4.4623260498046875, "learning_rate": 4.4757986361820094e-05, "loss": 2.3324, "step": 3402 }, { "epoch": 0.5490480800258147, "grad_norm": 4.30092191696167, "learning_rate": 4.473200338303204e-05, "loss": 1.8566, "step": 3403 }, { "epoch": 0.5492094223943208, "grad_norm": 4.318838119506836, "learning_rate": 4.470602184273543e-05, "loss": 1.93, "step": 3404 }, { "epoch": 0.5493707647628268, "grad_norm": 4.305547714233398, "learning_rate": 4.468004174802479e-05, "loss": 1.9673, "step": 3405 }, { "epoch": 0.5495321071313327, "grad_norm": 5.350564479827881, "learning_rate": 4.465406310599438e-05, "loss": 2.031, "step": 3406 }, { "epoch": 0.5496934494998387, "grad_norm": 5.960731506347656, "learning_rate": 4.462808592373792e-05, "loss": 1.7523, "step": 3407 }, { "epoch": 0.5498547918683446, "grad_norm": 3.5292716026306152, "learning_rate": 4.460211020834887e-05, "loss": 1.8345, "step": 3408 }, { "epoch": 0.5500161342368506, "grad_norm": 4.717067241668701, "learning_rate": 4.4576135966920165e-05, "loss": 2.1662, "step": 3409 }, { "epoch": 0.5501774766053565, "grad_norm": 4.258657455444336, "learning_rate": 4.455016320654442e-05, "loss": 1.9488, "step": 3410 }, { "epoch": 0.5503388189738625, "grad_norm": 3.420463800430298, "learning_rate": 4.452419193431379e-05, "loss": 1.8774, "step": 3411 }, { "epoch": 0.5505001613423685, "grad_norm": 5.453540325164795, "learning_rate": 4.4498222157320094e-05, "loss": 2.1955, "step": 3412 }, { "epoch": 0.5506615037108745, "grad_norm": 4.927591800689697, "learning_rate": 4.447225388265465e-05, "loss": 1.9469, "step": 3413 }, { "epoch": 0.5508228460793805, "grad_norm": 6.436439037322998, "learning_rate": 4.4446287117408456e-05, "loss": 2.0445, "step": 3414 }, { "epoch": 0.5509841884478864, "grad_norm": 4.598954200744629, "learning_rate": 4.4420321868672026e-05, "loss": 1.9913, "step": 3415 }, { "epoch": 0.5511455308163924, "grad_norm": 3.52225399017334, "learning_rate": 4.439435814353553e-05, "loss": 2.1382, "step": 3416 }, { "epoch": 0.5513068731848983, "grad_norm": 3.898519992828369, "learning_rate": 4.436839594908866e-05, "loss": 1.9294, "step": 3417 }, { "epoch": 0.5514682155534043, "grad_norm": 3.636260747909546, "learning_rate": 4.43424352924207e-05, "loss": 1.9964, "step": 3418 }, { "epoch": 0.5516295579219103, "grad_norm": 4.266148090362549, "learning_rate": 4.431647618062055e-05, "loss": 2.0357, "step": 3419 }, { "epoch": 0.5517909002904162, "grad_norm": 4.997242450714111, "learning_rate": 4.4290518620776645e-05, "loss": 2.213, "step": 3420 }, { "epoch": 0.5519522426589223, "grad_norm": 4.129906177520752, "learning_rate": 4.4264562619977044e-05, "loss": 1.9439, "step": 3421 }, { "epoch": 0.5521135850274282, "grad_norm": 5.237497806549072, "learning_rate": 4.423860818530932e-05, "loss": 1.9271, "step": 3422 }, { "epoch": 0.5522749273959342, "grad_norm": 3.959937810897827, "learning_rate": 4.4212655323860684e-05, "loss": 1.8816, "step": 3423 }, { "epoch": 0.5524362697644402, "grad_norm": 3.959937810897827, "learning_rate": 4.4212655323860684e-05, "loss": 2.0413, "step": 3424 }, { "epoch": 0.5525976121329461, "grad_norm": 5.576602458953857, "learning_rate": 4.418670404271785e-05, "loss": 1.9931, "step": 3425 }, { "epoch": 0.5527589545014521, "grad_norm": 6.255558967590332, "learning_rate": 4.416075434896717e-05, "loss": 2.0849, "step": 3426 }, { "epoch": 0.552920296869958, "grad_norm": 6.220365524291992, "learning_rate": 4.413480624969452e-05, "loss": 1.7052, "step": 3427 }, { "epoch": 0.553081639238464, "grad_norm": 4.45718240737915, "learning_rate": 4.410885975198533e-05, "loss": 1.9092, "step": 3428 }, { "epoch": 0.5532429816069699, "grad_norm": 4.006811618804932, "learning_rate": 4.408291486292462e-05, "loss": 1.734, "step": 3429 }, { "epoch": 0.553404323975476, "grad_norm": 3.4543488025665283, "learning_rate": 4.405697158959698e-05, "loss": 1.7235, "step": 3430 }, { "epoch": 0.553565666343982, "grad_norm": 5.664391040802002, "learning_rate": 4.403102993908653e-05, "loss": 1.8804, "step": 3431 }, { "epoch": 0.5537270087124879, "grad_norm": 5.415212154388428, "learning_rate": 4.400508991847692e-05, "loss": 1.9773, "step": 3432 }, { "epoch": 0.5538883510809939, "grad_norm": 8.737555503845215, "learning_rate": 4.3979151534851446e-05, "loss": 1.9354, "step": 3433 }, { "epoch": 0.5540496934494998, "grad_norm": 4.1277289390563965, "learning_rate": 4.395321479529287e-05, "loss": 1.9464, "step": 3434 }, { "epoch": 0.5542110358180058, "grad_norm": 3.8060250282287598, "learning_rate": 4.3927279706883565e-05, "loss": 1.6957, "step": 3435 }, { "epoch": 0.5543723781865117, "grad_norm": 5.43300199508667, "learning_rate": 4.39013462767054e-05, "loss": 2.0556, "step": 3436 }, { "epoch": 0.5545337205550177, "grad_norm": 5.492201328277588, "learning_rate": 4.3875414511839847e-05, "loss": 1.8884, "step": 3437 }, { "epoch": 0.5546950629235237, "grad_norm": 5.079095840454102, "learning_rate": 4.3849484419367866e-05, "loss": 1.6756, "step": 3438 }, { "epoch": 0.5548564052920297, "grad_norm": 4.2949604988098145, "learning_rate": 4.382355600637002e-05, "loss": 1.5468, "step": 3439 }, { "epoch": 0.5550177476605357, "grad_norm": 5.174224853515625, "learning_rate": 4.3797629279926325e-05, "loss": 1.9101, "step": 3440 }, { "epoch": 0.5551790900290416, "grad_norm": 4.244043350219727, "learning_rate": 4.377170424711646e-05, "loss": 1.9175, "step": 3441 }, { "epoch": 0.5553404323975476, "grad_norm": 4.246883869171143, "learning_rate": 4.374578091501954e-05, "loss": 1.6105, "step": 3442 }, { "epoch": 0.5555017747660536, "grad_norm": 3.899681806564331, "learning_rate": 4.371985929071424e-05, "loss": 2.0417, "step": 3443 }, { "epoch": 0.5556631171345595, "grad_norm": 4.2130632400512695, "learning_rate": 4.3693939381278815e-05, "loss": 2.2013, "step": 3444 }, { "epoch": 0.5558244595030655, "grad_norm": 4.7803754806518555, "learning_rate": 4.3668021193790974e-05, "loss": 2.0384, "step": 3445 }, { "epoch": 0.5559858018715714, "grad_norm": 4.410373210906982, "learning_rate": 4.364210473532804e-05, "loss": 1.9194, "step": 3446 }, { "epoch": 0.5561471442400775, "grad_norm": 4.31472110748291, "learning_rate": 4.36161900129668e-05, "loss": 1.9001, "step": 3447 }, { "epoch": 0.5563084866085835, "grad_norm": 6.867047309875488, "learning_rate": 4.359027703378357e-05, "loss": 1.9496, "step": 3448 }, { "epoch": 0.5564698289770894, "grad_norm": 4.220922946929932, "learning_rate": 4.356436580485424e-05, "loss": 1.6947, "step": 3449 }, { "epoch": 0.5566311713455954, "grad_norm": 4.422349452972412, "learning_rate": 4.3538456333254186e-05, "loss": 1.8319, "step": 3450 }, { "epoch": 0.5567925137141013, "grad_norm": 3.9616305828094482, "learning_rate": 4.351254862605828e-05, "loss": 1.9927, "step": 3451 }, { "epoch": 0.5569538560826073, "grad_norm": 4.980384826660156, "learning_rate": 4.3486642690340986e-05, "loss": 1.774, "step": 3452 }, { "epoch": 0.5571151984511132, "grad_norm": 4.41873025894165, "learning_rate": 4.346073853317619e-05, "loss": 1.8737, "step": 3453 }, { "epoch": 0.5572765408196192, "grad_norm": 3.8804445266723633, "learning_rate": 4.343483616163739e-05, "loss": 1.7567, "step": 3454 }, { "epoch": 0.5574378831881251, "grad_norm": 3.948648691177368, "learning_rate": 4.340893558279753e-05, "loss": 1.7944, "step": 3455 }, { "epoch": 0.5575992255566312, "grad_norm": 3.5727169513702393, "learning_rate": 4.338303680372905e-05, "loss": 1.6433, "step": 3456 }, { "epoch": 0.5577605679251372, "grad_norm": 3.9678032398223877, "learning_rate": 4.335713983150398e-05, "loss": 1.9791, "step": 3457 }, { "epoch": 0.5579219102936431, "grad_norm": 4.215176105499268, "learning_rate": 4.333124467319377e-05, "loss": 1.9158, "step": 3458 }, { "epoch": 0.5580832526621491, "grad_norm": 4.337172985076904, "learning_rate": 4.330535133586944e-05, "loss": 2.0593, "step": 3459 }, { "epoch": 0.558244595030655, "grad_norm": 4.706808090209961, "learning_rate": 4.3279459826601455e-05, "loss": 2.1515, "step": 3460 }, { "epoch": 0.558405937399161, "grad_norm": 4.440176963806152, "learning_rate": 4.325357015245985e-05, "loss": 1.8315, "step": 3461 }, { "epoch": 0.558567279767667, "grad_norm": 4.729047775268555, "learning_rate": 4.322768232051407e-05, "loss": 2.0323, "step": 3462 }, { "epoch": 0.5587286221361729, "grad_norm": 5.698724746704102, "learning_rate": 4.320179633783317e-05, "loss": 2.3371, "step": 3463 }, { "epoch": 0.558889964504679, "grad_norm": 3.6514177322387695, "learning_rate": 4.317591221148557e-05, "loss": 1.6228, "step": 3464 }, { "epoch": 0.5590513068731849, "grad_norm": 5.009037971496582, "learning_rate": 4.315002994853931e-05, "loss": 1.9361, "step": 3465 }, { "epoch": 0.5592126492416909, "grad_norm": 4.258859634399414, "learning_rate": 4.312414955606181e-05, "loss": 2.0629, "step": 3466 }, { "epoch": 0.5593739916101969, "grad_norm": 4.319770812988281, "learning_rate": 4.3098271041120076e-05, "loss": 1.9236, "step": 3467 }, { "epoch": 0.5595353339787028, "grad_norm": 4.81157112121582, "learning_rate": 4.3072394410780515e-05, "loss": 1.85, "step": 3468 }, { "epoch": 0.5596966763472088, "grad_norm": 4.622674942016602, "learning_rate": 4.3046519672109084e-05, "loss": 1.7307, "step": 3469 }, { "epoch": 0.5598580187157147, "grad_norm": 4.097692489624023, "learning_rate": 4.30206468321712e-05, "loss": 1.8736, "step": 3470 }, { "epoch": 0.5600193610842207, "grad_norm": 4.507349967956543, "learning_rate": 4.2994775898031726e-05, "loss": 1.8505, "step": 3471 }, { "epoch": 0.5601807034527266, "grad_norm": 3.4053497314453125, "learning_rate": 4.29689068767551e-05, "loss": 1.9851, "step": 3472 }, { "epoch": 0.5603420458212327, "grad_norm": 5.3332624435424805, "learning_rate": 4.2943039775405116e-05, "loss": 1.9102, "step": 3473 }, { "epoch": 0.5605033881897387, "grad_norm": 4.859854698181152, "learning_rate": 4.291717460104516e-05, "loss": 1.8431, "step": 3474 }, { "epoch": 0.5606647305582446, "grad_norm": 5.216459274291992, "learning_rate": 4.289131136073799e-05, "loss": 2.0909, "step": 3475 }, { "epoch": 0.5608260729267506, "grad_norm": 5.3039727210998535, "learning_rate": 4.286545006154591e-05, "loss": 1.9839, "step": 3476 }, { "epoch": 0.5609874152952565, "grad_norm": 3.725084066390991, "learning_rate": 4.283959071053066e-05, "loss": 1.6874, "step": 3477 }, { "epoch": 0.5611487576637625, "grad_norm": 4.183096885681152, "learning_rate": 4.281373331475347e-05, "loss": 1.7348, "step": 3478 }, { "epoch": 0.5613101000322684, "grad_norm": 5.283970832824707, "learning_rate": 4.2787877881274974e-05, "loss": 1.8059, "step": 3479 }, { "epoch": 0.5614714424007744, "grad_norm": 7.493622303009033, "learning_rate": 4.276202441715538e-05, "loss": 2.048, "step": 3480 }, { "epoch": 0.5616327847692804, "grad_norm": 3.493828773498535, "learning_rate": 4.273617292945425e-05, "loss": 2.0378, "step": 3481 }, { "epoch": 0.5617941271377864, "grad_norm": 4.150033473968506, "learning_rate": 4.2710323425230644e-05, "loss": 1.8063, "step": 3482 }, { "epoch": 0.5619554695062924, "grad_norm": 3.346498966217041, "learning_rate": 4.2684475911543145e-05, "loss": 1.9965, "step": 3483 }, { "epoch": 0.5621168118747983, "grad_norm": 7.276752471923828, "learning_rate": 4.2658630395449665e-05, "loss": 2.0183, "step": 3484 }, { "epoch": 0.5622781542433043, "grad_norm": 4.562815189361572, "learning_rate": 4.26327868840077e-05, "loss": 1.982, "step": 3485 }, { "epoch": 0.5624394966118103, "grad_norm": 4.3584394454956055, "learning_rate": 4.26069453842741e-05, "loss": 1.5823, "step": 3486 }, { "epoch": 0.5626008389803162, "grad_norm": 3.395272970199585, "learning_rate": 4.258110590330523e-05, "loss": 2.1431, "step": 3487 }, { "epoch": 0.5627621813488222, "grad_norm": 3.7238121032714844, "learning_rate": 4.255526844815685e-05, "loss": 2.133, "step": 3488 }, { "epoch": 0.5629235237173281, "grad_norm": 3.6202895641326904, "learning_rate": 4.252943302588423e-05, "loss": 1.9198, "step": 3489 }, { "epoch": 0.5630848660858342, "grad_norm": 5.4320902824401855, "learning_rate": 4.2503599643542024e-05, "loss": 2.0379, "step": 3490 }, { "epoch": 0.5632462084543401, "grad_norm": 5.918498516082764, "learning_rate": 4.247776830818439e-05, "loss": 1.9996, "step": 3491 }, { "epoch": 0.5634075508228461, "grad_norm": 4.040607929229736, "learning_rate": 4.245193902686483e-05, "loss": 1.7064, "step": 3492 }, { "epoch": 0.5635688931913521, "grad_norm": 4.910560131072998, "learning_rate": 4.2426111806636415e-05, "loss": 1.8541, "step": 3493 }, { "epoch": 0.563730235559858, "grad_norm": 4.844120025634766, "learning_rate": 4.240028665455156e-05, "loss": 2.1311, "step": 3494 }, { "epoch": 0.563891577928364, "grad_norm": 4.727542877197266, "learning_rate": 4.2374463577662116e-05, "loss": 2.1812, "step": 3495 }, { "epoch": 0.5640529202968699, "grad_norm": 4.263223648071289, "learning_rate": 4.234864258301943e-05, "loss": 1.9294, "step": 3496 }, { "epoch": 0.5642142626653759, "grad_norm": 4.178357124328613, "learning_rate": 4.232282367767422e-05, "loss": 1.8706, "step": 3497 }, { "epoch": 0.5643756050338818, "grad_norm": 4.068645000457764, "learning_rate": 4.229700686867668e-05, "loss": 2.0063, "step": 3498 }, { "epoch": 0.5645369474023879, "grad_norm": 3.6699161529541016, "learning_rate": 4.227119216307637e-05, "loss": 2.024, "step": 3499 }, { "epoch": 0.5646982897708939, "grad_norm": 4.167466163635254, "learning_rate": 4.224537956792235e-05, "loss": 2.0087, "step": 3500 }, { "epoch": 0.5648596321393998, "grad_norm": 4.946616172790527, "learning_rate": 4.221956909026304e-05, "loss": 2.1271, "step": 3501 }, { "epoch": 0.5650209745079058, "grad_norm": 5.21272087097168, "learning_rate": 4.2193760737146346e-05, "loss": 2.0256, "step": 3502 }, { "epoch": 0.5651823168764117, "grad_norm": 5.247081279754639, "learning_rate": 4.21679545156195e-05, "loss": 1.7813, "step": 3503 }, { "epoch": 0.5653436592449177, "grad_norm": 3.512169599533081, "learning_rate": 4.214215043272928e-05, "loss": 1.9385, "step": 3504 }, { "epoch": 0.5655050016134237, "grad_norm": 5.469997406005859, "learning_rate": 4.211634849552175e-05, "loss": 1.9733, "step": 3505 }, { "epoch": 0.5656663439819296, "grad_norm": 4.067634582519531, "learning_rate": 4.209054871104249e-05, "loss": 1.8343, "step": 3506 }, { "epoch": 0.5658276863504357, "grad_norm": 5.573502540588379, "learning_rate": 4.2064751086336405e-05, "loss": 1.5979, "step": 3507 }, { "epoch": 0.5659890287189416, "grad_norm": 3.959174156188965, "learning_rate": 4.203895562844789e-05, "loss": 1.8609, "step": 3508 }, { "epoch": 0.5661503710874476, "grad_norm": 3.4906005859375, "learning_rate": 4.2013162344420695e-05, "loss": 1.7882, "step": 3509 }, { "epoch": 0.5663117134559535, "grad_norm": 4.062140941619873, "learning_rate": 4.198737124129799e-05, "loss": 2.0697, "step": 3510 }, { "epoch": 0.5664730558244595, "grad_norm": 5.04123592376709, "learning_rate": 4.196158232612238e-05, "loss": 1.9035, "step": 3511 }, { "epoch": 0.5666343981929655, "grad_norm": 3.7623865604400635, "learning_rate": 4.193579560593581e-05, "loss": 2.055, "step": 3512 }, { "epoch": 0.5667957405614714, "grad_norm": 5.195204257965088, "learning_rate": 4.19100110877797e-05, "loss": 1.8476, "step": 3513 }, { "epoch": 0.5669570829299774, "grad_norm": 4.111303329467773, "learning_rate": 4.188422877869481e-05, "loss": 1.6723, "step": 3514 }, { "epoch": 0.5671184252984833, "grad_norm": 4.042357921600342, "learning_rate": 4.1858448685721306e-05, "loss": 1.8876, "step": 3515 }, { "epoch": 0.5672797676669894, "grad_norm": 3.8034486770629883, "learning_rate": 4.183267081589878e-05, "loss": 1.9028, "step": 3516 }, { "epoch": 0.5674411100354954, "grad_norm": 7.3269853591918945, "learning_rate": 4.18068951762662e-05, "loss": 1.7978, "step": 3517 }, { "epoch": 0.5676024524040013, "grad_norm": 4.7104692459106445, "learning_rate": 4.178112177386192e-05, "loss": 1.8569, "step": 3518 }, { "epoch": 0.5677637947725073, "grad_norm": 4.491191387176514, "learning_rate": 4.175535061572365e-05, "loss": 2.2193, "step": 3519 }, { "epoch": 0.5679251371410132, "grad_norm": 4.737609386444092, "learning_rate": 4.172958170888858e-05, "loss": 1.9274, "step": 3520 }, { "epoch": 0.5680864795095192, "grad_norm": 4.3185343742370605, "learning_rate": 4.170381506039317e-05, "loss": 1.8375, "step": 3521 }, { "epoch": 0.5682478218780251, "grad_norm": 5.701799392700195, "learning_rate": 4.1678050677273375e-05, "loss": 1.8371, "step": 3522 }, { "epoch": 0.5684091642465311, "grad_norm": 4.154476642608643, "learning_rate": 4.165228856656443e-05, "loss": 1.9134, "step": 3523 }, { "epoch": 0.5685705066150372, "grad_norm": 5.257763862609863, "learning_rate": 4.162652873530104e-05, "loss": 1.8649, "step": 3524 }, { "epoch": 0.5687318489835431, "grad_norm": 4.1844048500061035, "learning_rate": 4.1600771190517216e-05, "loss": 1.8578, "step": 3525 }, { "epoch": 0.5688931913520491, "grad_norm": 4.613462924957275, "learning_rate": 4.1575015939246384e-05, "loss": 2.0878, "step": 3526 }, { "epoch": 0.569054533720555, "grad_norm": 4.987559795379639, "learning_rate": 4.154926298852131e-05, "loss": 1.9917, "step": 3527 }, { "epoch": 0.569215876089061, "grad_norm": 5.124747276306152, "learning_rate": 4.15235123453742e-05, "loss": 1.9757, "step": 3528 }, { "epoch": 0.569377218457567, "grad_norm": 3.884608745574951, "learning_rate": 4.149776401683654e-05, "loss": 1.8557, "step": 3529 }, { "epoch": 0.5695385608260729, "grad_norm": 4.390828609466553, "learning_rate": 4.147201800993926e-05, "loss": 1.8436, "step": 3530 }, { "epoch": 0.5696999031945789, "grad_norm": 4.279433727264404, "learning_rate": 4.144627433171262e-05, "loss": 1.9243, "step": 3531 }, { "epoch": 0.5698612455630848, "grad_norm": 3.7821900844573975, "learning_rate": 4.142053298918622e-05, "loss": 2.0182, "step": 3532 }, { "epoch": 0.5700225879315909, "grad_norm": 3.6471571922302246, "learning_rate": 4.139479398938909e-05, "loss": 1.8736, "step": 3533 }, { "epoch": 0.5701839303000968, "grad_norm": 4.540952205657959, "learning_rate": 4.136905733934955e-05, "loss": 2.1067, "step": 3534 }, { "epoch": 0.5703452726686028, "grad_norm": 4.5960469245910645, "learning_rate": 4.134332304609533e-05, "loss": 1.7422, "step": 3535 }, { "epoch": 0.5705066150371088, "grad_norm": 5.430549621582031, "learning_rate": 4.131759111665349e-05, "loss": 2.1988, "step": 3536 }, { "epoch": 0.5706679574056147, "grad_norm": 5.013370990753174, "learning_rate": 4.1291861558050456e-05, "loss": 2.0637, "step": 3537 }, { "epoch": 0.5708292997741207, "grad_norm": 3.6673946380615234, "learning_rate": 4.126613437731197e-05, "loss": 1.6338, "step": 3538 }, { "epoch": 0.5709906421426266, "grad_norm": 4.181926250457764, "learning_rate": 4.1240409581463206e-05, "loss": 1.7872, "step": 3539 }, { "epoch": 0.5711519845111326, "grad_norm": 3.8125319480895996, "learning_rate": 4.121468717752859e-05, "loss": 1.7137, "step": 3540 }, { "epoch": 0.5713133268796385, "grad_norm": 5.434143543243408, "learning_rate": 4.118896717253199e-05, "loss": 2.1284, "step": 3541 }, { "epoch": 0.5714746692481446, "grad_norm": 5.147502422332764, "learning_rate": 4.116324957349652e-05, "loss": 1.7338, "step": 3542 }, { "epoch": 0.5716360116166506, "grad_norm": 4.157900810241699, "learning_rate": 4.113753438744472e-05, "loss": 1.8178, "step": 3543 }, { "epoch": 0.5717973539851565, "grad_norm": 4.950132846832275, "learning_rate": 4.1111821621398446e-05, "loss": 1.7818, "step": 3544 }, { "epoch": 0.5719586963536625, "grad_norm": 5.8169732093811035, "learning_rate": 4.1086111282378846e-05, "loss": 1.8208, "step": 3545 }, { "epoch": 0.5721200387221684, "grad_norm": 3.5307111740112305, "learning_rate": 4.1060403377406486e-05, "loss": 1.8114, "step": 3546 }, { "epoch": 0.5722813810906744, "grad_norm": 4.841145038604736, "learning_rate": 4.103469791350119e-05, "loss": 1.7671, "step": 3547 }, { "epoch": 0.5724427234591803, "grad_norm": 3.4705560207366943, "learning_rate": 4.100899489768219e-05, "loss": 1.8366, "step": 3548 }, { "epoch": 0.5726040658276863, "grad_norm": 3.809705972671509, "learning_rate": 4.098329433696797e-05, "loss": 2.1217, "step": 3549 }, { "epoch": 0.5727654081961924, "grad_norm": 4.063918590545654, "learning_rate": 4.095759623837643e-05, "loss": 1.8421, "step": 3550 }, { "epoch": 0.5729267505646983, "grad_norm": 6.015925407409668, "learning_rate": 4.093190060892471e-05, "loss": 1.6947, "step": 3551 }, { "epoch": 0.5730880929332043, "grad_norm": 5.437800407409668, "learning_rate": 4.090620745562935e-05, "loss": 1.9608, "step": 3552 }, { "epoch": 0.5732494353017102, "grad_norm": 4.587212562561035, "learning_rate": 4.088051678550617e-05, "loss": 1.965, "step": 3553 }, { "epoch": 0.5734107776702162, "grad_norm": 5.168732166290283, "learning_rate": 4.085482860557033e-05, "loss": 1.936, "step": 3554 }, { "epoch": 0.5735721200387222, "grad_norm": 4.875430583953857, "learning_rate": 4.0829142922836284e-05, "loss": 1.8593, "step": 3555 }, { "epoch": 0.5737334624072281, "grad_norm": 4.574604511260986, "learning_rate": 4.0803459744317854e-05, "loss": 1.943, "step": 3556 }, { "epoch": 0.5738948047757341, "grad_norm": 3.4781699180603027, "learning_rate": 4.077777907702814e-05, "loss": 1.8509, "step": 3557 }, { "epoch": 0.57405614714424, "grad_norm": 6.551433563232422, "learning_rate": 4.0752100927979535e-05, "loss": 2.0652, "step": 3558 }, { "epoch": 0.5742174895127461, "grad_norm": 4.630098819732666, "learning_rate": 4.072642530418382e-05, "loss": 1.8877, "step": 3559 }, { "epoch": 0.574378831881252, "grad_norm": 5.285862445831299, "learning_rate": 4.0700752212651996e-05, "loss": 2.2135, "step": 3560 }, { "epoch": 0.574540174249758, "grad_norm": 3.9878244400024414, "learning_rate": 4.067508166039446e-05, "loss": 1.8804, "step": 3561 }, { "epoch": 0.574701516618264, "grad_norm": 5.5384979248046875, "learning_rate": 4.064941365442084e-05, "loss": 1.9407, "step": 3562 }, { "epoch": 0.5748628589867699, "grad_norm": 5.246916770935059, "learning_rate": 4.062374820174013e-05, "loss": 1.9085, "step": 3563 }, { "epoch": 0.5750242013552759, "grad_norm": 4.07591438293457, "learning_rate": 4.0598085309360575e-05, "loss": 1.792, "step": 3564 }, { "epoch": 0.5751855437237818, "grad_norm": 3.93082594871521, "learning_rate": 4.057242498428976e-05, "loss": 1.9746, "step": 3565 }, { "epoch": 0.5753468860922878, "grad_norm": 4.674701690673828, "learning_rate": 4.0546767233534525e-05, "loss": 1.938, "step": 3566 }, { "epoch": 0.5755082284607939, "grad_norm": 4.735285758972168, "learning_rate": 4.0521112064101075e-05, "loss": 1.6369, "step": 3567 }, { "epoch": 0.5756695708292998, "grad_norm": 3.8902482986450195, "learning_rate": 4.049545948299482e-05, "loss": 2.0786, "step": 3568 }, { "epoch": 0.5758309131978058, "grad_norm": 3.805572509765625, "learning_rate": 4.046980949722058e-05, "loss": 1.7551, "step": 3569 }, { "epoch": 0.5759922555663117, "grad_norm": 5.669454097747803, "learning_rate": 4.044416211378236e-05, "loss": 2.1775, "step": 3570 }, { "epoch": 0.5761535979348177, "grad_norm": 4.8873820304870605, "learning_rate": 4.0418517339683474e-05, "loss": 1.8586, "step": 3571 }, { "epoch": 0.5763149403033236, "grad_norm": 6.248045921325684, "learning_rate": 4.039287518192659e-05, "loss": 1.9961, "step": 3572 }, { "epoch": 0.5764762826718296, "grad_norm": 4.519307613372803, "learning_rate": 4.036723564751358e-05, "loss": 2.2647, "step": 3573 }, { "epoch": 0.5766376250403356, "grad_norm": 4.497463226318359, "learning_rate": 4.034159874344566e-05, "loss": 1.5972, "step": 3574 }, { "epoch": 0.5767989674088415, "grad_norm": 4.406150817871094, "learning_rate": 4.031596447672328e-05, "loss": 1.9623, "step": 3575 }, { "epoch": 0.5769603097773476, "grad_norm": 4.784985542297363, "learning_rate": 4.029033285434623e-05, "loss": 1.8943, "step": 3576 }, { "epoch": 0.5771216521458535, "grad_norm": 4.867000102996826, "learning_rate": 4.026470388331347e-05, "loss": 1.8319, "step": 3577 }, { "epoch": 0.5772829945143595, "grad_norm": 5.884014129638672, "learning_rate": 4.0239077570623385e-05, "loss": 1.8488, "step": 3578 }, { "epoch": 0.5774443368828654, "grad_norm": 4.11932897567749, "learning_rate": 4.02134539232735e-05, "loss": 1.8305, "step": 3579 }, { "epoch": 0.5776056792513714, "grad_norm": 6.096425533294678, "learning_rate": 4.0187832948260705e-05, "loss": 1.7353, "step": 3580 }, { "epoch": 0.5777670216198774, "grad_norm": 4.391887664794922, "learning_rate": 4.0162214652581116e-05, "loss": 1.8747, "step": 3581 }, { "epoch": 0.5779283639883833, "grad_norm": 5.163760185241699, "learning_rate": 4.013659904323009e-05, "loss": 1.913, "step": 3582 }, { "epoch": 0.5780897063568893, "grad_norm": 5.652235507965088, "learning_rate": 4.011098612720233e-05, "loss": 2.1561, "step": 3583 }, { "epoch": 0.5782510487253952, "grad_norm": 5.187814235687256, "learning_rate": 4.0085375911491733e-05, "loss": 2.0876, "step": 3584 }, { "epoch": 0.5784123910939013, "grad_norm": 4.556102752685547, "learning_rate": 4.0059768403091505e-05, "loss": 1.8114, "step": 3585 }, { "epoch": 0.5785737334624073, "grad_norm": 4.740772247314453, "learning_rate": 4.0034163608994045e-05, "loss": 1.6901, "step": 3586 }, { "epoch": 0.5787350758309132, "grad_norm": 3.779283046722412, "learning_rate": 4.000856153619112e-05, "loss": 1.9207, "step": 3587 }, { "epoch": 0.5788964181994192, "grad_norm": 6.24001407623291, "learning_rate": 3.998296219167364e-05, "loss": 1.7858, "step": 3588 }, { "epoch": 0.5790577605679251, "grad_norm": 5.309969902038574, "learning_rate": 3.995736558243186e-05, "loss": 1.8324, "step": 3589 }, { "epoch": 0.5792191029364311, "grad_norm": 4.181309223175049, "learning_rate": 3.993177171545522e-05, "loss": 2.1065, "step": 3590 }, { "epoch": 0.579380445304937, "grad_norm": 4.735886573791504, "learning_rate": 3.990618059773247e-05, "loss": 2.1249, "step": 3591 }, { "epoch": 0.579541787673443, "grad_norm": 5.766643047332764, "learning_rate": 3.9880592236251554e-05, "loss": 1.9642, "step": 3592 }, { "epoch": 0.5797031300419491, "grad_norm": 3.530975818634033, "learning_rate": 3.985500663799972e-05, "loss": 1.9122, "step": 3593 }, { "epoch": 0.579864472410455, "grad_norm": 5.174084663391113, "learning_rate": 3.982942380996338e-05, "loss": 1.9706, "step": 3594 }, { "epoch": 0.580025814778961, "grad_norm": 3.8036489486694336, "learning_rate": 3.980384375912829e-05, "loss": 1.6745, "step": 3595 }, { "epoch": 0.5801871571474669, "grad_norm": 3.690394639968872, "learning_rate": 3.977826649247938e-05, "loss": 2.0887, "step": 3596 }, { "epoch": 0.5803484995159729, "grad_norm": 5.070824146270752, "learning_rate": 3.9752692017000827e-05, "loss": 1.8814, "step": 3597 }, { "epoch": 0.5805098418844788, "grad_norm": 3.071666955947876, "learning_rate": 3.9727120339676084e-05, "loss": 1.8724, "step": 3598 }, { "epoch": 0.5806711842529848, "grad_norm": 7.061601638793945, "learning_rate": 3.9701551467487776e-05, "loss": 2.0402, "step": 3599 }, { "epoch": 0.5808325266214908, "grad_norm": 5.6859002113342285, "learning_rate": 3.9675985407417836e-05, "loss": 1.9293, "step": 3600 }, { "epoch": 0.5809938689899967, "grad_norm": 4.9450554847717285, "learning_rate": 3.965042216644738e-05, "loss": 1.7457, "step": 3601 }, { "epoch": 0.5811552113585028, "grad_norm": 4.177241325378418, "learning_rate": 3.962486175155675e-05, "loss": 1.8122, "step": 3602 }, { "epoch": 0.5813165537270087, "grad_norm": 5.32080078125, "learning_rate": 3.9599304169725545e-05, "loss": 1.9454, "step": 3603 }, { "epoch": 0.5814778960955147, "grad_norm": 5.042309284210205, "learning_rate": 3.957374942793258e-05, "loss": 1.8161, "step": 3604 }, { "epoch": 0.5816392384640207, "grad_norm": 5.364151477813721, "learning_rate": 3.954819753315588e-05, "loss": 1.9463, "step": 3605 }, { "epoch": 0.5818005808325266, "grad_norm": 4.314451694488525, "learning_rate": 3.9522648492372735e-05, "loss": 1.6316, "step": 3606 }, { "epoch": 0.5819619232010326, "grad_norm": 4.097662925720215, "learning_rate": 3.94971023125596e-05, "loss": 1.9523, "step": 3607 }, { "epoch": 0.5821232655695385, "grad_norm": 5.013316631317139, "learning_rate": 3.947155900069216e-05, "loss": 1.9738, "step": 3608 }, { "epoch": 0.5822846079380445, "grad_norm": 4.259307384490967, "learning_rate": 3.944601856374537e-05, "loss": 1.6727, "step": 3609 }, { "epoch": 0.5824459503065506, "grad_norm": 4.392331600189209, "learning_rate": 3.942048100869333e-05, "loss": 1.9385, "step": 3610 }, { "epoch": 0.5826072926750565, "grad_norm": 3.544884204864502, "learning_rate": 3.939494634250941e-05, "loss": 1.6515, "step": 3611 }, { "epoch": 0.5827686350435625, "grad_norm": 4.88466739654541, "learning_rate": 3.936941457216614e-05, "loss": 1.7893, "step": 3612 }, { "epoch": 0.5829299774120684, "grad_norm": 4.783680438995361, "learning_rate": 3.934388570463531e-05, "loss": 2.0079, "step": 3613 }, { "epoch": 0.5830913197805744, "grad_norm": 4.30138635635376, "learning_rate": 3.931835974688785e-05, "loss": 1.9392, "step": 3614 }, { "epoch": 0.5832526621490803, "grad_norm": 3.4884119033813477, "learning_rate": 3.929283670589399e-05, "loss": 1.8283, "step": 3615 }, { "epoch": 0.5834140045175863, "grad_norm": 5.524956703186035, "learning_rate": 3.926731658862307e-05, "loss": 1.9238, "step": 3616 }, { "epoch": 0.5835753468860922, "grad_norm": 5.218562126159668, "learning_rate": 3.9241799402043705e-05, "loss": 1.9534, "step": 3617 }, { "epoch": 0.5837366892545982, "grad_norm": 5.083594799041748, "learning_rate": 3.9216285153123646e-05, "loss": 2.1464, "step": 3618 }, { "epoch": 0.5838980316231043, "grad_norm": 4.488029956817627, "learning_rate": 3.919077384882991e-05, "loss": 1.8756, "step": 3619 }, { "epoch": 0.5840593739916102, "grad_norm": 6.185628414154053, "learning_rate": 3.9165265496128644e-05, "loss": 1.8962, "step": 3620 }, { "epoch": 0.5842207163601162, "grad_norm": 5.930531024932861, "learning_rate": 3.9139760101985225e-05, "loss": 2.1463, "step": 3621 }, { "epoch": 0.5843820587286221, "grad_norm": 7.0362629890441895, "learning_rate": 3.911425767336421e-05, "loss": 1.9975, "step": 3622 }, { "epoch": 0.5845434010971281, "grad_norm": 4.510569095611572, "learning_rate": 3.908875821722937e-05, "loss": 1.8865, "step": 3623 }, { "epoch": 0.5847047434656341, "grad_norm": 7.450949192047119, "learning_rate": 3.9063261740543636e-05, "loss": 1.6148, "step": 3624 }, { "epoch": 0.58486608583414, "grad_norm": 5.08569860458374, "learning_rate": 3.903776825026911e-05, "loss": 1.8569, "step": 3625 }, { "epoch": 0.585027428202646, "grad_norm": 3.9076004028320312, "learning_rate": 3.901227775336715e-05, "loss": 1.9772, "step": 3626 }, { "epoch": 0.585188770571152, "grad_norm": 4.3509955406188965, "learning_rate": 3.8986790256798214e-05, "loss": 1.8251, "step": 3627 }, { "epoch": 0.585350112939658, "grad_norm": 6.478168964385986, "learning_rate": 3.896130576752201e-05, "loss": 1.8171, "step": 3628 }, { "epoch": 0.585511455308164, "grad_norm": 4.264913082122803, "learning_rate": 3.893582429249735e-05, "loss": 1.9046, "step": 3629 }, { "epoch": 0.5856727976766699, "grad_norm": 5.290229320526123, "learning_rate": 3.891034583868231e-05, "loss": 1.9554, "step": 3630 }, { "epoch": 0.5858341400451759, "grad_norm": 4.591266632080078, "learning_rate": 3.8884870413034064e-05, "loss": 1.9888, "step": 3631 }, { "epoch": 0.5859954824136818, "grad_norm": 4.453161239624023, "learning_rate": 3.885939802250901e-05, "loss": 2.1186, "step": 3632 }, { "epoch": 0.5861568247821878, "grad_norm": 4.925012111663818, "learning_rate": 3.883392867406269e-05, "loss": 1.849, "step": 3633 }, { "epoch": 0.5863181671506937, "grad_norm": 5.076260089874268, "learning_rate": 3.8808462374649803e-05, "loss": 1.8674, "step": 3634 }, { "epoch": 0.5864795095191997, "grad_norm": 4.231884002685547, "learning_rate": 3.878299913122427e-05, "loss": 1.9519, "step": 3635 }, { "epoch": 0.5866408518877058, "grad_norm": 3.9470291137695312, "learning_rate": 3.875753895073913e-05, "loss": 2.0958, "step": 3636 }, { "epoch": 0.5868021942562117, "grad_norm": 3.966118335723877, "learning_rate": 3.87320818401466e-05, "loss": 1.689, "step": 3637 }, { "epoch": 0.5869635366247177, "grad_norm": 4.230781078338623, "learning_rate": 3.8706627806398046e-05, "loss": 1.8142, "step": 3638 }, { "epoch": 0.5871248789932236, "grad_norm": 3.732171058654785, "learning_rate": 3.868117685644403e-05, "loss": 1.8904, "step": 3639 }, { "epoch": 0.5872862213617296, "grad_norm": 4.025249004364014, "learning_rate": 3.865572899723422e-05, "loss": 2.0185, "step": 3640 }, { "epoch": 0.5874475637302355, "grad_norm": 4.11497163772583, "learning_rate": 3.8630284235717504e-05, "loss": 1.7455, "step": 3641 }, { "epoch": 0.5876089060987415, "grad_norm": 4.722634792327881, "learning_rate": 3.860484257884184e-05, "loss": 1.8329, "step": 3642 }, { "epoch": 0.5877702484672475, "grad_norm": 4.155136585235596, "learning_rate": 3.857940403355444e-05, "loss": 1.8166, "step": 3643 }, { "epoch": 0.5879315908357534, "grad_norm": 4.59343957901001, "learning_rate": 3.855396860680155e-05, "loss": 2.0221, "step": 3644 }, { "epoch": 0.5880929332042595, "grad_norm": 4.209593296051025, "learning_rate": 3.8528536305528695e-05, "loss": 1.9369, "step": 3645 }, { "epoch": 0.5882542755727654, "grad_norm": 4.7490363121032715, "learning_rate": 3.850310713668044e-05, "loss": 1.8632, "step": 3646 }, { "epoch": 0.5884156179412714, "grad_norm": 4.539514541625977, "learning_rate": 3.847768110720052e-05, "loss": 1.9854, "step": 3647 }, { "epoch": 0.5885769603097774, "grad_norm": 3.969621419906616, "learning_rate": 3.845225822403186e-05, "loss": 1.8744, "step": 3648 }, { "epoch": 0.5887383026782833, "grad_norm": 4.80803108215332, "learning_rate": 3.842683849411646e-05, "loss": 2.061, "step": 3649 }, { "epoch": 0.5888996450467893, "grad_norm": 5.44177770614624, "learning_rate": 3.840142192439552e-05, "loss": 2.037, "step": 3650 }, { "epoch": 0.5890609874152952, "grad_norm": 4.776845932006836, "learning_rate": 3.837600852180933e-05, "loss": 1.8394, "step": 3651 }, { "epoch": 0.5892223297838012, "grad_norm": 3.8401901721954346, "learning_rate": 3.835059829329735e-05, "loss": 1.9709, "step": 3652 }, { "epoch": 0.5893836721523072, "grad_norm": 4.661105155944824, "learning_rate": 3.832519124579811e-05, "loss": 1.9098, "step": 3653 }, { "epoch": 0.5895450145208132, "grad_norm": 4.342999458312988, "learning_rate": 3.829978738624938e-05, "loss": 1.9601, "step": 3654 }, { "epoch": 0.5897063568893192, "grad_norm": 5.721419334411621, "learning_rate": 3.827438672158795e-05, "loss": 1.7262, "step": 3655 }, { "epoch": 0.5898676992578251, "grad_norm": 3.692500352859497, "learning_rate": 3.824898925874982e-05, "loss": 2.0584, "step": 3656 }, { "epoch": 0.5900290416263311, "grad_norm": 5.073899269104004, "learning_rate": 3.822359500467006e-05, "loss": 2.1524, "step": 3657 }, { "epoch": 0.590190383994837, "grad_norm": 3.4609923362731934, "learning_rate": 3.819820396628287e-05, "loss": 1.9398, "step": 3658 }, { "epoch": 0.590351726363343, "grad_norm": 4.638036251068115, "learning_rate": 3.8172816150521616e-05, "loss": 1.7962, "step": 3659 }, { "epoch": 0.590513068731849, "grad_norm": 3.909994125366211, "learning_rate": 3.8147431564318745e-05, "loss": 2.0509, "step": 3660 }, { "epoch": 0.5906744111003549, "grad_norm": 5.345953464508057, "learning_rate": 3.812205021460582e-05, "loss": 1.9027, "step": 3661 }, { "epoch": 0.590835753468861, "grad_norm": 4.3474626541137695, "learning_rate": 3.809667210831353e-05, "loss": 1.7642, "step": 3662 }, { "epoch": 0.5909970958373669, "grad_norm": 4.156850814819336, "learning_rate": 3.807129725237171e-05, "loss": 2.0627, "step": 3663 }, { "epoch": 0.5911584382058729, "grad_norm": 4.340262413024902, "learning_rate": 3.8045925653709233e-05, "loss": 2.0942, "step": 3664 }, { "epoch": 0.5913197805743788, "grad_norm": 5.7656683921813965, "learning_rate": 3.8020557319254174e-05, "loss": 2.28, "step": 3665 }, { "epoch": 0.5914811229428848, "grad_norm": 5.124190807342529, "learning_rate": 3.799519225593362e-05, "loss": 2.0985, "step": 3666 }, { "epoch": 0.5916424653113908, "grad_norm": 3.294722557067871, "learning_rate": 3.7969830470673866e-05, "loss": 1.9777, "step": 3667 }, { "epoch": 0.5918038076798967, "grad_norm": 5.506333351135254, "learning_rate": 3.794447197040022e-05, "loss": 1.8562, "step": 3668 }, { "epoch": 0.5919651500484027, "grad_norm": 4.190850734710693, "learning_rate": 3.7919116762037146e-05, "loss": 1.8515, "step": 3669 }, { "epoch": 0.5921264924169087, "grad_norm": 4.269705295562744, "learning_rate": 3.789376485250821e-05, "loss": 1.9124, "step": 3670 }, { "epoch": 0.5922878347854147, "grad_norm": 5.131028175354004, "learning_rate": 3.786841624873604e-05, "loss": 1.8684, "step": 3671 }, { "epoch": 0.5924491771539206, "grad_norm": 3.8137192726135254, "learning_rate": 3.784307095764241e-05, "loss": 1.8912, "step": 3672 }, { "epoch": 0.5926105195224266, "grad_norm": 4.432575702667236, "learning_rate": 3.781772898614812e-05, "loss": 2.1149, "step": 3673 }, { "epoch": 0.5927718618909326, "grad_norm": 3.3182921409606934, "learning_rate": 3.779239034117316e-05, "loss": 1.8549, "step": 3674 }, { "epoch": 0.5929332042594385, "grad_norm": 4.513149738311768, "learning_rate": 3.776705502963652e-05, "loss": 2.0607, "step": 3675 }, { "epoch": 0.5930945466279445, "grad_norm": 4.182868480682373, "learning_rate": 3.774172305845636e-05, "loss": 2.0251, "step": 3676 }, { "epoch": 0.5932558889964504, "grad_norm": 3.6221394538879395, "learning_rate": 3.771639443454984e-05, "loss": 1.7068, "step": 3677 }, { "epoch": 0.5934172313649564, "grad_norm": 3.82957124710083, "learning_rate": 3.76910691648333e-05, "loss": 1.7472, "step": 3678 }, { "epoch": 0.5935785737334625, "grad_norm": 5.206719398498535, "learning_rate": 3.7665747256222075e-05, "loss": 1.9413, "step": 3679 }, { "epoch": 0.5937399161019684, "grad_norm": 3.959946393966675, "learning_rate": 3.764042871563066e-05, "loss": 1.875, "step": 3680 }, { "epoch": 0.5939012584704744, "grad_norm": 4.654921054840088, "learning_rate": 3.761511354997256e-05, "loss": 2.055, "step": 3681 }, { "epoch": 0.5940626008389803, "grad_norm": 5.93565559387207, "learning_rate": 3.758980176616042e-05, "loss": 1.8814, "step": 3682 }, { "epoch": 0.5942239432074863, "grad_norm": 5.584222793579102, "learning_rate": 3.7564493371105934e-05, "loss": 1.9983, "step": 3683 }, { "epoch": 0.5943852855759922, "grad_norm": 4.700340747833252, "learning_rate": 3.753918837171984e-05, "loss": 1.9201, "step": 3684 }, { "epoch": 0.5945466279444982, "grad_norm": 4.035298824310303, "learning_rate": 3.7513886774912024e-05, "loss": 2.1135, "step": 3685 }, { "epoch": 0.5947079703130042, "grad_norm": 3.782871723175049, "learning_rate": 3.7488588587591356e-05, "loss": 1.8368, "step": 3686 }, { "epoch": 0.5948693126815101, "grad_norm": 4.397482872009277, "learning_rate": 3.7463293816665866e-05, "loss": 1.9948, "step": 3687 }, { "epoch": 0.5950306550500162, "grad_norm": 4.766461372375488, "learning_rate": 3.7438002469042565e-05, "loss": 2.1219, "step": 3688 }, { "epoch": 0.5951919974185221, "grad_norm": 4.941188812255859, "learning_rate": 3.741271455162757e-05, "loss": 2.0298, "step": 3689 }, { "epoch": 0.5953533397870281, "grad_norm": 3.774362325668335, "learning_rate": 3.738743007132608e-05, "loss": 2.0194, "step": 3690 }, { "epoch": 0.595514682155534, "grad_norm": 3.833545207977295, "learning_rate": 3.736214903504233e-05, "loss": 1.9942, "step": 3691 }, { "epoch": 0.59567602452404, "grad_norm": 3.833545207977295, "learning_rate": 3.736214903504233e-05, "loss": 1.9729, "step": 3692 }, { "epoch": 0.595837366892546, "grad_norm": 4.3665313720703125, "learning_rate": 3.7336871449679586e-05, "loss": 1.8651, "step": 3693 }, { "epoch": 0.5959987092610519, "grad_norm": 4.348609924316406, "learning_rate": 3.7311597322140246e-05, "loss": 2.2343, "step": 3694 }, { "epoch": 0.5961600516295579, "grad_norm": 3.520056962966919, "learning_rate": 3.728632665932569e-05, "loss": 1.8885, "step": 3695 }, { "epoch": 0.5963213939980639, "grad_norm": 5.269471168518066, "learning_rate": 3.726105946813642e-05, "loss": 1.9451, "step": 3696 }, { "epoch": 0.5964827363665699, "grad_norm": 4.60741662979126, "learning_rate": 3.723579575547194e-05, "loss": 1.8556, "step": 3697 }, { "epoch": 0.5966440787350759, "grad_norm": 4.3959641456604, "learning_rate": 3.721053552823078e-05, "loss": 1.7342, "step": 3698 }, { "epoch": 0.5968054211035818, "grad_norm": 4.219175815582275, "learning_rate": 3.7185278793310605e-05, "loss": 1.8752, "step": 3699 }, { "epoch": 0.5969667634720878, "grad_norm": 3.3393092155456543, "learning_rate": 3.716002555760806e-05, "loss": 1.8868, "step": 3700 }, { "epoch": 0.5971281058405937, "grad_norm": 3.608086109161377, "learning_rate": 3.713477582801886e-05, "loss": 1.9005, "step": 3701 }, { "epoch": 0.5972894482090997, "grad_norm": 4.510677337646484, "learning_rate": 3.710952961143773e-05, "loss": 1.844, "step": 3702 }, { "epoch": 0.5974507905776056, "grad_norm": 3.330909013748169, "learning_rate": 3.7084286914758505e-05, "loss": 1.8193, "step": 3703 }, { "epoch": 0.5976121329461116, "grad_norm": 3.9209372997283936, "learning_rate": 3.705904774487396e-05, "loss": 1.9915, "step": 3704 }, { "epoch": 0.5977734753146177, "grad_norm": 4.028830051422119, "learning_rate": 3.703381210867601e-05, "loss": 1.9291, "step": 3705 }, { "epoch": 0.5979348176831236, "grad_norm": 3.884798288345337, "learning_rate": 3.7008580013055524e-05, "loss": 2.0992, "step": 3706 }, { "epoch": 0.5980961600516296, "grad_norm": 4.469210624694824, "learning_rate": 3.698335146490246e-05, "loss": 1.9829, "step": 3707 }, { "epoch": 0.5982575024201355, "grad_norm": 5.87947940826416, "learning_rate": 3.695812647110577e-05, "loss": 1.7308, "step": 3708 }, { "epoch": 0.5984188447886415, "grad_norm": 4.097621917724609, "learning_rate": 3.6932905038553464e-05, "loss": 1.919, "step": 3709 }, { "epoch": 0.5985801871571474, "grad_norm": 5.142523288726807, "learning_rate": 3.690768717413254e-05, "loss": 2.1386, "step": 3710 }, { "epoch": 0.5987415295256534, "grad_norm": 4.307010650634766, "learning_rate": 3.6882472884729066e-05, "loss": 1.7298, "step": 3711 }, { "epoch": 0.5989028718941594, "grad_norm": 4.007396697998047, "learning_rate": 3.6857262177228125e-05, "loss": 1.7682, "step": 3712 }, { "epoch": 0.5990642142626654, "grad_norm": 3.942392110824585, "learning_rate": 3.683205505851377e-05, "loss": 1.9615, "step": 3713 }, { "epoch": 0.5992255566311714, "grad_norm": 6.172910690307617, "learning_rate": 3.680685153546916e-05, "loss": 2.005, "step": 3714 }, { "epoch": 0.5993868989996773, "grad_norm": 4.684194087982178, "learning_rate": 3.6781651614976386e-05, "loss": 1.753, "step": 3715 }, { "epoch": 0.5995482413681833, "grad_norm": 5.116608142852783, "learning_rate": 3.675645530391665e-05, "loss": 2.0317, "step": 3716 }, { "epoch": 0.5997095837366893, "grad_norm": 3.8065035343170166, "learning_rate": 3.673126260917006e-05, "loss": 2.0698, "step": 3717 }, { "epoch": 0.5998709261051952, "grad_norm": 5.370408535003662, "learning_rate": 3.670607353761584e-05, "loss": 1.9875, "step": 3718 }, { "epoch": 0.6000322684737012, "grad_norm": 3.7742254734039307, "learning_rate": 3.668088809613215e-05, "loss": 1.7665, "step": 3719 }, { "epoch": 0.6001936108422071, "grad_norm": 4.05025053024292, "learning_rate": 3.665570629159619e-05, "loss": 1.7383, "step": 3720 }, { "epoch": 0.6003549532107131, "grad_norm": 3.3534834384918213, "learning_rate": 3.663052813088417e-05, "loss": 1.8674, "step": 3721 }, { "epoch": 0.6005162955792192, "grad_norm": 5.025367259979248, "learning_rate": 3.6605353620871266e-05, "loss": 1.9043, "step": 3722 }, { "epoch": 0.6006776379477251, "grad_norm": 4.877417087554932, "learning_rate": 3.6580182768431735e-05, "loss": 1.6918, "step": 3723 }, { "epoch": 0.6008389803162311, "grad_norm": 4.852436542510986, "learning_rate": 3.6555015580438745e-05, "loss": 2.0019, "step": 3724 }, { "epoch": 0.601000322684737, "grad_norm": 5.720123767852783, "learning_rate": 3.6529852063764545e-05, "loss": 1.8533, "step": 3725 }, { "epoch": 0.601161665053243, "grad_norm": 4.051018714904785, "learning_rate": 3.65046922252803e-05, "loss": 2.2468, "step": 3726 }, { "epoch": 0.6013230074217489, "grad_norm": 4.266456604003906, "learning_rate": 3.6479536071856265e-05, "loss": 1.7491, "step": 3727 }, { "epoch": 0.6014843497902549, "grad_norm": 3.0934205055236816, "learning_rate": 3.645438361036161e-05, "loss": 1.9993, "step": 3728 }, { "epoch": 0.6016456921587608, "grad_norm": 4.6057329177856445, "learning_rate": 3.6429234847664535e-05, "loss": 1.8901, "step": 3729 }, { "epoch": 0.6018070345272669, "grad_norm": 7.07024621963501, "learning_rate": 3.640408979063219e-05, "loss": 2.274, "step": 3730 }, { "epoch": 0.6019683768957729, "grad_norm": 3.6758999824523926, "learning_rate": 3.6378948446130786e-05, "loss": 1.6481, "step": 3731 }, { "epoch": 0.6021297192642788, "grad_norm": 3.463942050933838, "learning_rate": 3.6353810821025436e-05, "loss": 1.7646, "step": 3732 }, { "epoch": 0.6022910616327848, "grad_norm": 5.796482086181641, "learning_rate": 3.632867692218032e-05, "loss": 2.008, "step": 3733 }, { "epoch": 0.6024524040012907, "grad_norm": 4.755577087402344, "learning_rate": 3.630354675645853e-05, "loss": 1.8181, "step": 3734 }, { "epoch": 0.6026137463697967, "grad_norm": 4.626229763031006, "learning_rate": 3.627842033072216e-05, "loss": 1.7967, "step": 3735 }, { "epoch": 0.6027750887383027, "grad_norm": 5.281877517700195, "learning_rate": 3.625329765183233e-05, "loss": 1.9245, "step": 3736 }, { "epoch": 0.6029364311068086, "grad_norm": 4.206997871398926, "learning_rate": 3.6228178726649047e-05, "loss": 1.9532, "step": 3737 }, { "epoch": 0.6030977734753146, "grad_norm": 3.6428096294403076, "learning_rate": 3.620306356203139e-05, "loss": 1.7949, "step": 3738 }, { "epoch": 0.6032591158438206, "grad_norm": 6.974172115325928, "learning_rate": 3.6177952164837335e-05, "loss": 1.8432, "step": 3739 }, { "epoch": 0.6034204582123266, "grad_norm": 4.038150310516357, "learning_rate": 3.615284454192388e-05, "loss": 1.8784, "step": 3740 }, { "epoch": 0.6035818005808326, "grad_norm": 7.101322650909424, "learning_rate": 3.612774070014694e-05, "loss": 2.1201, "step": 3741 }, { "epoch": 0.6037431429493385, "grad_norm": 9.740883827209473, "learning_rate": 3.610264064636146e-05, "loss": 2.1495, "step": 3742 }, { "epoch": 0.6039044853178445, "grad_norm": 5.0577712059021, "learning_rate": 3.607754438742129e-05, "loss": 1.9048, "step": 3743 }, { "epoch": 0.6040658276863504, "grad_norm": 6.079409599304199, "learning_rate": 3.605245193017931e-05, "loss": 1.7851, "step": 3744 }, { "epoch": 0.6042271700548564, "grad_norm": 4.561661243438721, "learning_rate": 3.602736328148728e-05, "loss": 1.6548, "step": 3745 }, { "epoch": 0.6043885124233623, "grad_norm": 4.185081481933594, "learning_rate": 3.600227844819601e-05, "loss": 1.7029, "step": 3746 }, { "epoch": 0.6045498547918683, "grad_norm": 4.4725117683410645, "learning_rate": 3.5977197437155205e-05, "loss": 1.8245, "step": 3747 }, { "epoch": 0.6047111971603744, "grad_norm": 4.323187351226807, "learning_rate": 3.5952120255213526e-05, "loss": 1.9139, "step": 3748 }, { "epoch": 0.6048725395288803, "grad_norm": 4.430797100067139, "learning_rate": 3.592704690921863e-05, "loss": 1.9795, "step": 3749 }, { "epoch": 0.6050338818973863, "grad_norm": 5.248645782470703, "learning_rate": 3.5901977406017085e-05, "loss": 1.9281, "step": 3750 }, { "epoch": 0.6051952242658922, "grad_norm": 3.7061660289764404, "learning_rate": 3.5876911752454447e-05, "loss": 1.6232, "step": 3751 }, { "epoch": 0.6053565666343982, "grad_norm": 4.320940971374512, "learning_rate": 3.585184995537518e-05, "loss": 1.8486, "step": 3752 }, { "epoch": 0.6055179090029041, "grad_norm": 3.9274215698242188, "learning_rate": 3.5826792021622744e-05, "loss": 1.7725, "step": 3753 }, { "epoch": 0.6056792513714101, "grad_norm": 4.034487724304199, "learning_rate": 3.580173795803948e-05, "loss": 1.917, "step": 3754 }, { "epoch": 0.6058405937399161, "grad_norm": 3.7273917198181152, "learning_rate": 3.577668777146676e-05, "loss": 1.933, "step": 3755 }, { "epoch": 0.6060019361084221, "grad_norm": 5.619368553161621, "learning_rate": 3.57516414687448e-05, "loss": 2.0396, "step": 3756 }, { "epoch": 0.6061632784769281, "grad_norm": 4.922757625579834, "learning_rate": 3.572659905671283e-05, "loss": 1.8759, "step": 3757 }, { "epoch": 0.606324620845434, "grad_norm": 4.182972431182861, "learning_rate": 3.5701560542208965e-05, "loss": 1.9355, "step": 3758 }, { "epoch": 0.60648596321394, "grad_norm": 3.7297675609588623, "learning_rate": 3.5676525932070317e-05, "loss": 1.9765, "step": 3759 }, { "epoch": 0.606647305582446, "grad_norm": 4.987452030181885, "learning_rate": 3.565149523313286e-05, "loss": 1.7428, "step": 3760 }, { "epoch": 0.6068086479509519, "grad_norm": 4.672898292541504, "learning_rate": 3.562646845223153e-05, "loss": 1.8289, "step": 3761 }, { "epoch": 0.6069699903194579, "grad_norm": 5.305594444274902, "learning_rate": 3.560144559620023e-05, "loss": 1.8384, "step": 3762 }, { "epoch": 0.6071313326879638, "grad_norm": 3.834249973297119, "learning_rate": 3.5576426671871736e-05, "loss": 1.5013, "step": 3763 }, { "epoch": 0.6072926750564698, "grad_norm": 4.112975120544434, "learning_rate": 3.55514116860778e-05, "loss": 1.889, "step": 3764 }, { "epoch": 0.6074540174249758, "grad_norm": 4.56207799911499, "learning_rate": 3.552640064564903e-05, "loss": 1.7246, "step": 3765 }, { "epoch": 0.6076153597934818, "grad_norm": 4.238980770111084, "learning_rate": 3.550139355741504e-05, "loss": 2.1032, "step": 3766 }, { "epoch": 0.6077767021619878, "grad_norm": 4.23140287399292, "learning_rate": 3.54763904282043e-05, "loss": 1.91, "step": 3767 }, { "epoch": 0.6079380445304937, "grad_norm": 5.2451863288879395, "learning_rate": 3.5451391264844244e-05, "loss": 1.7813, "step": 3768 }, { "epoch": 0.6080993868989997, "grad_norm": 4.276443004608154, "learning_rate": 3.542639607416116e-05, "loss": 1.9734, "step": 3769 }, { "epoch": 0.6082607292675056, "grad_norm": 3.4733471870422363, "learning_rate": 3.540140486298035e-05, "loss": 1.8424, "step": 3770 }, { "epoch": 0.6084220716360116, "grad_norm": 4.838557720184326, "learning_rate": 3.5376417638125914e-05, "loss": 1.9435, "step": 3771 }, { "epoch": 0.6085834140045175, "grad_norm": 4.959713935852051, "learning_rate": 3.535143440642097e-05, "loss": 2.0424, "step": 3772 }, { "epoch": 0.6087447563730236, "grad_norm": 6.249136447906494, "learning_rate": 3.532645517468748e-05, "loss": 1.7399, "step": 3773 }, { "epoch": 0.6089060987415296, "grad_norm": 4.988114356994629, "learning_rate": 3.5301479949746314e-05, "loss": 2.0397, "step": 3774 }, { "epoch": 0.6090674411100355, "grad_norm": 3.851431369781494, "learning_rate": 3.52765087384173e-05, "loss": 1.8279, "step": 3775 }, { "epoch": 0.6092287834785415, "grad_norm": 3.687272787094116, "learning_rate": 3.5251541547519094e-05, "loss": 2.0134, "step": 3776 }, { "epoch": 0.6093901258470474, "grad_norm": 4.138549327850342, "learning_rate": 3.522657838386933e-05, "loss": 1.9043, "step": 3777 }, { "epoch": 0.6095514682155534, "grad_norm": 4.868224143981934, "learning_rate": 3.520161925428449e-05, "loss": 1.8726, "step": 3778 }, { "epoch": 0.6097128105840594, "grad_norm": 6.811543941497803, "learning_rate": 3.5176664165579986e-05, "loss": 2.0306, "step": 3779 }, { "epoch": 0.6098741529525653, "grad_norm": 3.8094887733459473, "learning_rate": 3.5151713124570086e-05, "loss": 1.7307, "step": 3780 }, { "epoch": 0.6100354953210713, "grad_norm": 5.4980950355529785, "learning_rate": 3.512676613806802e-05, "loss": 1.799, "step": 3781 }, { "epoch": 0.6101968376895773, "grad_norm": 4.2599711418151855, "learning_rate": 3.510182321288582e-05, "loss": 1.9913, "step": 3782 }, { "epoch": 0.6103581800580833, "grad_norm": 5.045507907867432, "learning_rate": 3.507688435583451e-05, "loss": 1.6917, "step": 3783 }, { "epoch": 0.6105195224265892, "grad_norm": 4.295395374298096, "learning_rate": 3.5051949573723926e-05, "loss": 1.7401, "step": 3784 }, { "epoch": 0.6106808647950952, "grad_norm": 4.576744556427002, "learning_rate": 3.50270188733628e-05, "loss": 2.2997, "step": 3785 }, { "epoch": 0.6108422071636012, "grad_norm": 4.532271385192871, "learning_rate": 3.5002092261558814e-05, "loss": 1.7825, "step": 3786 }, { "epoch": 0.6110035495321071, "grad_norm": 3.389321804046631, "learning_rate": 3.497716974511844e-05, "loss": 1.8228, "step": 3787 }, { "epoch": 0.6111648919006131, "grad_norm": 4.300199031829834, "learning_rate": 3.495225133084712e-05, "loss": 1.9312, "step": 3788 }, { "epoch": 0.611326234269119, "grad_norm": 5.128062725067139, "learning_rate": 3.4927337025549077e-05, "loss": 2.3558, "step": 3789 }, { "epoch": 0.611487576637625, "grad_norm": 5.287378311157227, "learning_rate": 3.4902426836027534e-05, "loss": 1.8695, "step": 3790 }, { "epoch": 0.611648919006131, "grad_norm": 4.9498748779296875, "learning_rate": 3.4877520769084484e-05, "loss": 2.448, "step": 3791 }, { "epoch": 0.611810261374637, "grad_norm": 3.59617280960083, "learning_rate": 3.4852618831520855e-05, "loss": 1.7811, "step": 3792 }, { "epoch": 0.611971603743143, "grad_norm": 3.691159725189209, "learning_rate": 3.482772103013641e-05, "loss": 1.926, "step": 3793 }, { "epoch": 0.6121329461116489, "grad_norm": 4.8668532371521, "learning_rate": 3.480282737172983e-05, "loss": 1.8611, "step": 3794 }, { "epoch": 0.6122942884801549, "grad_norm": 4.09564208984375, "learning_rate": 3.477793786309861e-05, "loss": 2.0656, "step": 3795 }, { "epoch": 0.6124556308486608, "grad_norm": 4.320098400115967, "learning_rate": 3.4753052511039155e-05, "loss": 1.7872, "step": 3796 }, { "epoch": 0.6126169732171668, "grad_norm": 3.934811592102051, "learning_rate": 3.4728171322346694e-05, "loss": 1.7117, "step": 3797 }, { "epoch": 0.6127783155856728, "grad_norm": 5.060281753540039, "learning_rate": 3.470329430381535e-05, "loss": 1.7985, "step": 3798 }, { "epoch": 0.6129396579541788, "grad_norm": 4.571928977966309, "learning_rate": 3.467842146223812e-05, "loss": 1.9876, "step": 3799 }, { "epoch": 0.6131010003226848, "grad_norm": 5.835862159729004, "learning_rate": 3.4653552804406805e-05, "loss": 1.7337, "step": 3800 }, { "epoch": 0.6132623426911907, "grad_norm": 4.562862396240234, "learning_rate": 3.462868833711214e-05, "loss": 1.8781, "step": 3801 }, { "epoch": 0.6134236850596967, "grad_norm": 4.681947231292725, "learning_rate": 3.460382806714362e-05, "loss": 1.9091, "step": 3802 }, { "epoch": 0.6135850274282026, "grad_norm": 4.201179504394531, "learning_rate": 3.457897200128971e-05, "loss": 1.7072, "step": 3803 }, { "epoch": 0.6137463697967086, "grad_norm": 4.7126383781433105, "learning_rate": 3.455412014633763e-05, "loss": 1.9345, "step": 3804 }, { "epoch": 0.6139077121652146, "grad_norm": 5.711738109588623, "learning_rate": 3.452927250907349e-05, "loss": 2.0958, "step": 3805 }, { "epoch": 0.6140690545337205, "grad_norm": 3.8236541748046875, "learning_rate": 3.450442909628224e-05, "loss": 1.8577, "step": 3806 }, { "epoch": 0.6142303969022265, "grad_norm": 3.954463243484497, "learning_rate": 3.4479589914747706e-05, "loss": 1.7468, "step": 3807 }, { "epoch": 0.6143917392707325, "grad_norm": 3.7589309215545654, "learning_rate": 3.445475497125249e-05, "loss": 1.8477, "step": 3808 }, { "epoch": 0.6145530816392385, "grad_norm": 3.4328806400299072, "learning_rate": 3.4429924272578116e-05, "loss": 1.6261, "step": 3809 }, { "epoch": 0.6147144240077445, "grad_norm": 3.705533027648926, "learning_rate": 3.4405097825504906e-05, "loss": 1.6058, "step": 3810 }, { "epoch": 0.6148757663762504, "grad_norm": 4.693033695220947, "learning_rate": 3.4380275636811986e-05, "loss": 2.0203, "step": 3811 }, { "epoch": 0.6150371087447564, "grad_norm": 3.9044811725616455, "learning_rate": 3.435545771327743e-05, "loss": 1.9886, "step": 3812 }, { "epoch": 0.6151984511132623, "grad_norm": 4.10264778137207, "learning_rate": 3.433064406167801e-05, "loss": 1.8557, "step": 3813 }, { "epoch": 0.6153597934817683, "grad_norm": 4.515639781951904, "learning_rate": 3.4305834688789465e-05, "loss": 1.9711, "step": 3814 }, { "epoch": 0.6155211358502742, "grad_norm": 5.027278900146484, "learning_rate": 3.428102960138625e-05, "loss": 1.9102, "step": 3815 }, { "epoch": 0.6156824782187803, "grad_norm": 3.451860189437866, "learning_rate": 3.4256228806241733e-05, "loss": 1.9471, "step": 3816 }, { "epoch": 0.6158438205872863, "grad_norm": 4.987911701202393, "learning_rate": 3.423143231012803e-05, "loss": 1.6859, "step": 3817 }, { "epoch": 0.6160051629557922, "grad_norm": 4.156739234924316, "learning_rate": 3.4206640119816187e-05, "loss": 1.6867, "step": 3818 }, { "epoch": 0.6161665053242982, "grad_norm": 4.318131446838379, "learning_rate": 3.418185224207597e-05, "loss": 1.9572, "step": 3819 }, { "epoch": 0.6163278476928041, "grad_norm": 4.359016418457031, "learning_rate": 3.4157068683676055e-05, "loss": 1.8555, "step": 3820 }, { "epoch": 0.6164891900613101, "grad_norm": 5.2175140380859375, "learning_rate": 3.4132289451383866e-05, "loss": 1.9462, "step": 3821 }, { "epoch": 0.616650532429816, "grad_norm": 4.045276165008545, "learning_rate": 3.410751455196571e-05, "loss": 1.8704, "step": 3822 }, { "epoch": 0.616811874798322, "grad_norm": 4.262764930725098, "learning_rate": 3.4082743992186655e-05, "loss": 1.9725, "step": 3823 }, { "epoch": 0.616973217166828, "grad_norm": 4.094369888305664, "learning_rate": 3.405797777881059e-05, "loss": 1.8869, "step": 3824 }, { "epoch": 0.617134559535334, "grad_norm": 3.974959135055542, "learning_rate": 3.4033215918600285e-05, "loss": 1.9153, "step": 3825 }, { "epoch": 0.61729590190384, "grad_norm": 3.951637029647827, "learning_rate": 3.400845841831723e-05, "loss": 2.0925, "step": 3826 }, { "epoch": 0.6174572442723459, "grad_norm": 4.118864059448242, "learning_rate": 3.3983705284721795e-05, "loss": 1.6755, "step": 3827 }, { "epoch": 0.6176185866408519, "grad_norm": 5.3612775802612305, "learning_rate": 3.3958956524573085e-05, "loss": 1.8371, "step": 3828 }, { "epoch": 0.6177799290093579, "grad_norm": 5.615139484405518, "learning_rate": 3.3934212144629104e-05, "loss": 2.0056, "step": 3829 }, { "epoch": 0.6179412713778638, "grad_norm": 3.605287551879883, "learning_rate": 3.390947215164657e-05, "loss": 1.8722, "step": 3830 }, { "epoch": 0.6181026137463698, "grad_norm": 3.9551656246185303, "learning_rate": 3.388473655238109e-05, "loss": 1.7807, "step": 3831 }, { "epoch": 0.6182639561148757, "grad_norm": 4.237039566040039, "learning_rate": 3.386000535358696e-05, "loss": 1.7805, "step": 3832 }, { "epoch": 0.6184252984833818, "grad_norm": 4.651735305786133, "learning_rate": 3.38352785620174e-05, "loss": 1.7804, "step": 3833 }, { "epoch": 0.6185866408518877, "grad_norm": 4.280829906463623, "learning_rate": 3.381055618442434e-05, "loss": 1.9884, "step": 3834 }, { "epoch": 0.6187479832203937, "grad_norm": 4.486728191375732, "learning_rate": 3.378583822755853e-05, "loss": 1.7981, "step": 3835 }, { "epoch": 0.6189093255888997, "grad_norm": 4.050109386444092, "learning_rate": 3.376112469816951e-05, "loss": 1.9284, "step": 3836 }, { "epoch": 0.6190706679574056, "grad_norm": 4.030494689941406, "learning_rate": 3.37364156030056e-05, "loss": 1.926, "step": 3837 }, { "epoch": 0.6192320103259116, "grad_norm": 4.372687816619873, "learning_rate": 3.3711710948813956e-05, "loss": 1.7221, "step": 3838 }, { "epoch": 0.6193933526944175, "grad_norm": 5.265523433685303, "learning_rate": 3.368701074234045e-05, "loss": 2.1795, "step": 3839 }, { "epoch": 0.6195546950629235, "grad_norm": 4.2426629066467285, "learning_rate": 3.366231499032983e-05, "loss": 2.1742, "step": 3840 }, { "epoch": 0.6197160374314294, "grad_norm": 6.021782398223877, "learning_rate": 3.363762369952552e-05, "loss": 2.0044, "step": 3841 }, { "epoch": 0.6198773797999355, "grad_norm": 4.393467426300049, "learning_rate": 3.3612936876669834e-05, "loss": 1.8005, "step": 3842 }, { "epoch": 0.6200387221684415, "grad_norm": 4.665099620819092, "learning_rate": 3.358825452850376e-05, "loss": 2.0681, "step": 3843 }, { "epoch": 0.6202000645369474, "grad_norm": 4.036859035491943, "learning_rate": 3.356357666176716e-05, "loss": 1.7972, "step": 3844 }, { "epoch": 0.6203614069054534, "grad_norm": 5.150184154510498, "learning_rate": 3.353890328319861e-05, "loss": 2.0301, "step": 3845 }, { "epoch": 0.6205227492739593, "grad_norm": 4.837057113647461, "learning_rate": 3.3514234399535485e-05, "loss": 2.0091, "step": 3846 }, { "epoch": 0.6206840916424653, "grad_norm": 5.241817951202393, "learning_rate": 3.3489570017513914e-05, "loss": 2.0307, "step": 3847 }, { "epoch": 0.6208454340109713, "grad_norm": 5.212489128112793, "learning_rate": 3.3464910143868844e-05, "loss": 1.9736, "step": 3848 }, { "epoch": 0.6210067763794772, "grad_norm": 4.452176094055176, "learning_rate": 3.3440254785333936e-05, "loss": 1.8269, "step": 3849 }, { "epoch": 0.6211681187479832, "grad_norm": 4.795321941375732, "learning_rate": 3.341560394864162e-05, "loss": 2.0215, "step": 3850 }, { "epoch": 0.6213294611164892, "grad_norm": 4.669750213623047, "learning_rate": 3.3390957640523145e-05, "loss": 1.9653, "step": 3851 }, { "epoch": 0.6214908034849952, "grad_norm": 4.048704147338867, "learning_rate": 3.3366315867708466e-05, "loss": 1.6575, "step": 3852 }, { "epoch": 0.6216521458535011, "grad_norm": 3.779632091522217, "learning_rate": 3.334167863692634e-05, "loss": 1.9369, "step": 3853 }, { "epoch": 0.6218134882220071, "grad_norm": 4.874792575836182, "learning_rate": 3.331704595490426e-05, "loss": 1.8517, "step": 3854 }, { "epoch": 0.6219748305905131, "grad_norm": 6.897017478942871, "learning_rate": 3.3292417828368495e-05, "loss": 2.0271, "step": 3855 }, { "epoch": 0.622136172959019, "grad_norm": 4.712216377258301, "learning_rate": 3.326779426404402e-05, "loss": 1.8385, "step": 3856 }, { "epoch": 0.622297515327525, "grad_norm": 6.182539463043213, "learning_rate": 3.324317526865465e-05, "loss": 2.05, "step": 3857 }, { "epoch": 0.6224588576960309, "grad_norm": 4.800398826599121, "learning_rate": 3.321856084892287e-05, "loss": 1.9153, "step": 3858 }, { "epoch": 0.622620200064537, "grad_norm": 4.936999797821045, "learning_rate": 3.319395101156998e-05, "loss": 1.6719, "step": 3859 }, { "epoch": 0.622781542433043, "grad_norm": 3.894695997238159, "learning_rate": 3.316934576331598e-05, "loss": 1.8424, "step": 3860 }, { "epoch": 0.6229428848015489, "grad_norm": 5.069485664367676, "learning_rate": 3.314474511087964e-05, "loss": 1.6743, "step": 3861 }, { "epoch": 0.6231042271700549, "grad_norm": 4.204349517822266, "learning_rate": 3.312014906097848e-05, "loss": 1.8606, "step": 3862 }, { "epoch": 0.6232655695385608, "grad_norm": 3.7627949714660645, "learning_rate": 3.309555762032873e-05, "loss": 1.7485, "step": 3863 }, { "epoch": 0.6234269119070668, "grad_norm": 4.631115436553955, "learning_rate": 3.307097079564542e-05, "loss": 1.7235, "step": 3864 }, { "epoch": 0.6235882542755727, "grad_norm": 8.076692581176758, "learning_rate": 3.304638859364225e-05, "loss": 2.1334, "step": 3865 }, { "epoch": 0.6237495966440787, "grad_norm": 5.298736095428467, "learning_rate": 3.302181102103173e-05, "loss": 1.6522, "step": 3866 }, { "epoch": 0.6239109390125847, "grad_norm": 5.513214111328125, "learning_rate": 3.299723808452504e-05, "loss": 1.7262, "step": 3867 }, { "epoch": 0.6240722813810907, "grad_norm": 4.249666690826416, "learning_rate": 3.297266979083215e-05, "loss": 1.94, "step": 3868 }, { "epoch": 0.6242336237495967, "grad_norm": 3.5764763355255127, "learning_rate": 3.29481061466617e-05, "loss": 1.6686, "step": 3869 }, { "epoch": 0.6243949661181026, "grad_norm": 4.225545406341553, "learning_rate": 3.292354715872113e-05, "loss": 1.8488, "step": 3870 }, { "epoch": 0.6245563084866086, "grad_norm": 3.946922540664673, "learning_rate": 3.289899283371657e-05, "loss": 1.8323, "step": 3871 }, { "epoch": 0.6247176508551145, "grad_norm": 3.8466291427612305, "learning_rate": 3.287444317835285e-05, "loss": 2.327, "step": 3872 }, { "epoch": 0.6248789932236205, "grad_norm": 3.4624412059783936, "learning_rate": 3.2849898199333605e-05, "loss": 1.8043, "step": 3873 }, { "epoch": 0.6250403355921265, "grad_norm": 4.060080051422119, "learning_rate": 3.28253579033611e-05, "loss": 1.8283, "step": 3874 }, { "epoch": 0.6252016779606324, "grad_norm": 3.958866834640503, "learning_rate": 3.280082229713639e-05, "loss": 1.7741, "step": 3875 }, { "epoch": 0.6253630203291385, "grad_norm": 4.636531829833984, "learning_rate": 3.27762913873592e-05, "loss": 1.7591, "step": 3876 }, { "epoch": 0.6255243626976444, "grad_norm": 4.473749160766602, "learning_rate": 3.275176518072804e-05, "loss": 1.9947, "step": 3877 }, { "epoch": 0.6256857050661504, "grad_norm": 4.48061466217041, "learning_rate": 3.2727243683940045e-05, "loss": 1.7359, "step": 3878 }, { "epoch": 0.6258470474346564, "grad_norm": 3.749462604522705, "learning_rate": 3.2702726903691156e-05, "loss": 1.7407, "step": 3879 }, { "epoch": 0.6260083898031623, "grad_norm": 3.427985429763794, "learning_rate": 3.267821484667594e-05, "loss": 1.7586, "step": 3880 }, { "epoch": 0.6261697321716683, "grad_norm": 3.203972339630127, "learning_rate": 3.265370751958776e-05, "loss": 1.9288, "step": 3881 }, { "epoch": 0.6263310745401742, "grad_norm": 6.207522392272949, "learning_rate": 3.2629204929118605e-05, "loss": 1.9522, "step": 3882 }, { "epoch": 0.6264924169086802, "grad_norm": 5.126103401184082, "learning_rate": 3.260470708195924e-05, "loss": 1.9996, "step": 3883 }, { "epoch": 0.6266537592771861, "grad_norm": 4.193669319152832, "learning_rate": 3.258021398479907e-05, "loss": 1.6116, "step": 3884 }, { "epoch": 0.6268151016456922, "grad_norm": 5.127233028411865, "learning_rate": 3.255572564432628e-05, "loss": 2.2015, "step": 3885 }, { "epoch": 0.6269764440141982, "grad_norm": 3.4576432704925537, "learning_rate": 3.253124206722768e-05, "loss": 1.8934, "step": 3886 }, { "epoch": 0.6271377863827041, "grad_norm": 3.747927665710449, "learning_rate": 3.250676326018882e-05, "loss": 2.0763, "step": 3887 }, { "epoch": 0.6272991287512101, "grad_norm": 6.565838813781738, "learning_rate": 3.248228922989396e-05, "loss": 2.1956, "step": 3888 }, { "epoch": 0.627460471119716, "grad_norm": 5.520340919494629, "learning_rate": 3.245781998302599e-05, "loss": 1.9575, "step": 3889 }, { "epoch": 0.627621813488222, "grad_norm": 4.918091297149658, "learning_rate": 3.2433355526266595e-05, "loss": 1.9128, "step": 3890 }, { "epoch": 0.627783155856728, "grad_norm": 4.920687675476074, "learning_rate": 3.2408895866296056e-05, "loss": 1.9027, "step": 3891 }, { "epoch": 0.6279444982252339, "grad_norm": 4.013607978820801, "learning_rate": 3.2384441009793395e-05, "loss": 1.8993, "step": 3892 }, { "epoch": 0.6281058405937399, "grad_norm": 4.493187427520752, "learning_rate": 3.235999096343633e-05, "loss": 1.8503, "step": 3893 }, { "epoch": 0.6282671829622459, "grad_norm": 3.9341964721679688, "learning_rate": 3.233554573390123e-05, "loss": 1.9401, "step": 3894 }, { "epoch": 0.6284285253307519, "grad_norm": 5.569158554077148, "learning_rate": 3.231110532786316e-05, "loss": 1.9528, "step": 3895 }, { "epoch": 0.6285898676992578, "grad_norm": 3.7844202518463135, "learning_rate": 3.2286669751995904e-05, "loss": 1.9074, "step": 3896 }, { "epoch": 0.6287512100677638, "grad_norm": 5.982578277587891, "learning_rate": 3.226223901297185e-05, "loss": 1.8828, "step": 3897 }, { "epoch": 0.6289125524362698, "grad_norm": 4.350557804107666, "learning_rate": 3.2237813117462166e-05, "loss": 1.8033, "step": 3898 }, { "epoch": 0.6290738948047757, "grad_norm": 3.943612813949585, "learning_rate": 3.2213392072136616e-05, "loss": 2.1312, "step": 3899 }, { "epoch": 0.6292352371732817, "grad_norm": 4.545783519744873, "learning_rate": 3.218897588366365e-05, "loss": 1.8734, "step": 3900 }, { "epoch": 0.6293965795417876, "grad_norm": 3.6101181507110596, "learning_rate": 3.2164564558710456e-05, "loss": 2.0215, "step": 3901 }, { "epoch": 0.6295579219102937, "grad_norm": 4.474806308746338, "learning_rate": 3.2140158103942794e-05, "loss": 1.8417, "step": 3902 }, { "epoch": 0.6297192642787997, "grad_norm": 3.520000696182251, "learning_rate": 3.2115756526025195e-05, "loss": 1.9087, "step": 3903 }, { "epoch": 0.6298806066473056, "grad_norm": 5.004185199737549, "learning_rate": 3.209135983162077e-05, "loss": 1.9131, "step": 3904 }, { "epoch": 0.6300419490158116, "grad_norm": 4.817507266998291, "learning_rate": 3.2066968027391374e-05, "loss": 1.7474, "step": 3905 }, { "epoch": 0.6302032913843175, "grad_norm": 3.5767366886138916, "learning_rate": 3.204258111999745e-05, "loss": 1.7991, "step": 3906 }, { "epoch": 0.6303646337528235, "grad_norm": 4.406589984893799, "learning_rate": 3.201819911609819e-05, "loss": 1.7198, "step": 3907 }, { "epoch": 0.6305259761213294, "grad_norm": 4.156294822692871, "learning_rate": 3.199382202235135e-05, "loss": 1.7961, "step": 3908 }, { "epoch": 0.6306873184898354, "grad_norm": 6.422266483306885, "learning_rate": 3.1969449845413454e-05, "loss": 1.7938, "step": 3909 }, { "epoch": 0.6308486608583413, "grad_norm": 4.197117805480957, "learning_rate": 3.194508259193958e-05, "loss": 1.7533, "step": 3910 }, { "epoch": 0.6310100032268474, "grad_norm": 5.0357890129089355, "learning_rate": 3.192072026858352e-05, "loss": 1.9352, "step": 3911 }, { "epoch": 0.6311713455953534, "grad_norm": 3.7781126499176025, "learning_rate": 3.189636288199771e-05, "loss": 1.8253, "step": 3912 }, { "epoch": 0.6313326879638593, "grad_norm": 4.784677505493164, "learning_rate": 3.187201043883323e-05, "loss": 1.9062, "step": 3913 }, { "epoch": 0.6314940303323653, "grad_norm": 4.113647937774658, "learning_rate": 3.184766294573983e-05, "loss": 1.644, "step": 3914 }, { "epoch": 0.6316553727008712, "grad_norm": 4.529688358306885, "learning_rate": 3.1823320409365865e-05, "loss": 2.0294, "step": 3915 }, { "epoch": 0.6318167150693772, "grad_norm": 3.490339994430542, "learning_rate": 3.179898283635839e-05, "loss": 1.8773, "step": 3916 }, { "epoch": 0.6319780574378832, "grad_norm": 5.333843231201172, "learning_rate": 3.1774650233363055e-05, "loss": 1.9404, "step": 3917 }, { "epoch": 0.6321393998063891, "grad_norm": 3.6392922401428223, "learning_rate": 3.175032260702422e-05, "loss": 2.0004, "step": 3918 }, { "epoch": 0.6323007421748952, "grad_norm": 6.618895053863525, "learning_rate": 3.172599996398479e-05, "loss": 1.8705, "step": 3919 }, { "epoch": 0.6324620845434011, "grad_norm": 4.968805313110352, "learning_rate": 3.170168231088641e-05, "loss": 1.7857, "step": 3920 }, { "epoch": 0.6326234269119071, "grad_norm": 5.2753705978393555, "learning_rate": 3.167736965436927e-05, "loss": 1.7994, "step": 3921 }, { "epoch": 0.632784769280413, "grad_norm": 4.1668195724487305, "learning_rate": 3.165306200107229e-05, "loss": 1.9737, "step": 3922 }, { "epoch": 0.632946111648919, "grad_norm": 6.095056056976318, "learning_rate": 3.162875935763294e-05, "loss": 1.6118, "step": 3923 }, { "epoch": 0.633107454017425, "grad_norm": 3.8742151260375977, "learning_rate": 3.1604461730687346e-05, "loss": 1.7393, "step": 3924 }, { "epoch": 0.6332687963859309, "grad_norm": 5.468986511230469, "learning_rate": 3.158016912687032e-05, "loss": 1.8525, "step": 3925 }, { "epoch": 0.6334301387544369, "grad_norm": 4.554811954498291, "learning_rate": 3.15558815528152e-05, "loss": 2.0176, "step": 3926 }, { "epoch": 0.6335914811229428, "grad_norm": 4.299023628234863, "learning_rate": 3.153159901515406e-05, "loss": 2.0495, "step": 3927 }, { "epoch": 0.6337528234914489, "grad_norm": 5.944100856781006, "learning_rate": 3.150732152051751e-05, "loss": 1.8224, "step": 3928 }, { "epoch": 0.6339141658599549, "grad_norm": 4.19835901260376, "learning_rate": 3.148304907553485e-05, "loss": 2.0148, "step": 3929 }, { "epoch": 0.6340755082284608, "grad_norm": 3.8810858726501465, "learning_rate": 3.145878168683395e-05, "loss": 1.9752, "step": 3930 }, { "epoch": 0.6342368505969668, "grad_norm": 5.0286736488342285, "learning_rate": 3.143451936104131e-05, "loss": 1.9513, "step": 3931 }, { "epoch": 0.6343981929654727, "grad_norm": 4.726410388946533, "learning_rate": 3.1410262104782085e-05, "loss": 1.7122, "step": 3932 }, { "epoch": 0.6345595353339787, "grad_norm": 3.8950557708740234, "learning_rate": 3.138600992468e-05, "loss": 1.8204, "step": 3933 }, { "epoch": 0.6347208777024846, "grad_norm": 3.7046666145324707, "learning_rate": 3.136176282735741e-05, "loss": 1.9165, "step": 3934 }, { "epoch": 0.6348822200709906, "grad_norm": 3.938035249710083, "learning_rate": 3.1337520819435303e-05, "loss": 1.7226, "step": 3935 }, { "epoch": 0.6350435624394967, "grad_norm": 7.021340370178223, "learning_rate": 3.131328390753324e-05, "loss": 1.8995, "step": 3936 }, { "epoch": 0.6352049048080026, "grad_norm": 4.165858745574951, "learning_rate": 3.12890520982694e-05, "loss": 1.9557, "step": 3937 }, { "epoch": 0.6353662471765086, "grad_norm": 3.9283430576324463, "learning_rate": 3.126482539826061e-05, "loss": 1.8387, "step": 3938 }, { "epoch": 0.6355275895450145, "grad_norm": 3.8701705932617188, "learning_rate": 3.124060381412223e-05, "loss": 1.8546, "step": 3939 }, { "epoch": 0.6356889319135205, "grad_norm": 3.4759013652801514, "learning_rate": 3.1216387352468305e-05, "loss": 1.8128, "step": 3940 }, { "epoch": 0.6358502742820265, "grad_norm": 4.323712348937988, "learning_rate": 3.119217601991139e-05, "loss": 1.797, "step": 3941 }, { "epoch": 0.6360116166505324, "grad_norm": 3.5588533878326416, "learning_rate": 3.1167969823062734e-05, "loss": 1.9885, "step": 3942 }, { "epoch": 0.6361729590190384, "grad_norm": 3.617077589035034, "learning_rate": 3.11437687685321e-05, "loss": 2.1947, "step": 3943 }, { "epoch": 0.6363343013875443, "grad_norm": 4.54804801940918, "learning_rate": 3.1119572862927916e-05, "loss": 1.6814, "step": 3944 }, { "epoch": 0.6364956437560504, "grad_norm": 3.9997711181640625, "learning_rate": 3.109538211285714e-05, "loss": 1.8827, "step": 3945 }, { "epoch": 0.6366569861245563, "grad_norm": 3.9156370162963867, "learning_rate": 3.10711965249254e-05, "loss": 1.8592, "step": 3946 }, { "epoch": 0.6368183284930623, "grad_norm": 4.000783443450928, "learning_rate": 3.10470161057368e-05, "loss": 1.7229, "step": 3947 }, { "epoch": 0.6369796708615683, "grad_norm": 4.192603588104248, "learning_rate": 3.1022840861894174e-05, "loss": 1.7745, "step": 3948 }, { "epoch": 0.6371410132300742, "grad_norm": 5.290771484375, "learning_rate": 3.0998670799998844e-05, "loss": 2.181, "step": 3949 }, { "epoch": 0.6373023555985802, "grad_norm": 4.79047155380249, "learning_rate": 3.0974505926650724e-05, "loss": 1.8853, "step": 3950 }, { "epoch": 0.6374636979670861, "grad_norm": 4.0281171798706055, "learning_rate": 3.095034624844835e-05, "loss": 1.9216, "step": 3951 }, { "epoch": 0.6376250403355921, "grad_norm": 3.3467166423797607, "learning_rate": 3.092619177198881e-05, "loss": 1.8407, "step": 3952 }, { "epoch": 0.637786382704098, "grad_norm": 4.751272678375244, "learning_rate": 3.090204250386779e-05, "loss": 1.7854, "step": 3953 }, { "epoch": 0.6379477250726041, "grad_norm": 6.508646488189697, "learning_rate": 3.087789845067953e-05, "loss": 1.8706, "step": 3954 }, { "epoch": 0.6381090674411101, "grad_norm": 3.634064197540283, "learning_rate": 3.0853759619016896e-05, "loss": 1.733, "step": 3955 }, { "epoch": 0.638270409809616, "grad_norm": 7.2373151779174805, "learning_rate": 3.0829626015471245e-05, "loss": 1.9196, "step": 3956 }, { "epoch": 0.638431752178122, "grad_norm": 4.87274694442749, "learning_rate": 3.080549764663261e-05, "loss": 1.8556, "step": 3957 }, { "epoch": 0.6385930945466279, "grad_norm": 3.414288282394409, "learning_rate": 3.078137451908949e-05, "loss": 1.9769, "step": 3958 }, { "epoch": 0.6387544369151339, "grad_norm": 4.94952392578125, "learning_rate": 3.0757256639429025e-05, "loss": 1.8745, "step": 3959 }, { "epoch": 0.6389157792836399, "grad_norm": 4.189605236053467, "learning_rate": 3.07331440142369e-05, "loss": 1.6985, "step": 3960 }, { "epoch": 0.6390771216521458, "grad_norm": 3.7005834579467773, "learning_rate": 3.070903665009738e-05, "loss": 1.8928, "step": 3961 }, { "epoch": 0.6392384640206519, "grad_norm": 5.443114757537842, "learning_rate": 3.0684934553593244e-05, "loss": 1.8405, "step": 3962 }, { "epoch": 0.6393998063891578, "grad_norm": 4.24349308013916, "learning_rate": 3.066083773130588e-05, "loss": 1.8022, "step": 3963 }, { "epoch": 0.6395611487576638, "grad_norm": 3.877264976501465, "learning_rate": 3.0636746189815235e-05, "loss": 2.1855, "step": 3964 }, { "epoch": 0.6397224911261697, "grad_norm": 5.466297149658203, "learning_rate": 3.0612659935699774e-05, "loss": 2.0294, "step": 3965 }, { "epoch": 0.6398838334946757, "grad_norm": 4.056169509887695, "learning_rate": 3.058857897553659e-05, "loss": 1.9896, "step": 3966 }, { "epoch": 0.6400451758631817, "grad_norm": 4.240475654602051, "learning_rate": 3.0564503315901226e-05, "loss": 1.8254, "step": 3967 }, { "epoch": 0.6402065182316876, "grad_norm": 3.5806643962860107, "learning_rate": 3.0540432963367905e-05, "loss": 1.7046, "step": 3968 }, { "epoch": 0.6403678606001936, "grad_norm": 4.3338117599487305, "learning_rate": 3.051636792450928e-05, "loss": 1.9001, "step": 3969 }, { "epoch": 0.6405292029686995, "grad_norm": 3.7095282077789307, "learning_rate": 3.0492308205896635e-05, "loss": 1.5849, "step": 3970 }, { "epoch": 0.6406905453372056, "grad_norm": 4.299651145935059, "learning_rate": 3.0468253814099756e-05, "loss": 1.8346, "step": 3971 }, { "epoch": 0.6408518877057116, "grad_norm": 5.332083225250244, "learning_rate": 3.044420475568701e-05, "loss": 1.8271, "step": 3972 }, { "epoch": 0.6410132300742175, "grad_norm": 4.321498394012451, "learning_rate": 3.042016103722526e-05, "loss": 1.8567, "step": 3973 }, { "epoch": 0.6411745724427235, "grad_norm": 4.005444526672363, "learning_rate": 3.039612266527998e-05, "loss": 1.6229, "step": 3974 }, { "epoch": 0.6413359148112294, "grad_norm": 3.9515342712402344, "learning_rate": 3.0372089646415125e-05, "loss": 1.938, "step": 3975 }, { "epoch": 0.6414972571797354, "grad_norm": 4.833498954772949, "learning_rate": 3.0348061987193178e-05, "loss": 1.8208, "step": 3976 }, { "epoch": 0.6416585995482413, "grad_norm": 4.180032253265381, "learning_rate": 3.0324039694175233e-05, "loss": 1.9654, "step": 3977 }, { "epoch": 0.6418199419167473, "grad_norm": 5.152658462524414, "learning_rate": 3.030002277392085e-05, "loss": 1.9309, "step": 3978 }, { "epoch": 0.6419812842852534, "grad_norm": 3.8878352642059326, "learning_rate": 3.0276011232988145e-05, "loss": 1.8735, "step": 3979 }, { "epoch": 0.6421426266537593, "grad_norm": 5.087482929229736, "learning_rate": 3.0252005077933775e-05, "loss": 1.86, "step": 3980 }, { "epoch": 0.6423039690222653, "grad_norm": 3.9759790897369385, "learning_rate": 3.0228004315312917e-05, "loss": 1.6493, "step": 3981 }, { "epoch": 0.6424653113907712, "grad_norm": 6.068326473236084, "learning_rate": 3.0204008951679246e-05, "loss": 1.9676, "step": 3982 }, { "epoch": 0.6426266537592772, "grad_norm": 4.612680912017822, "learning_rate": 3.018001899358504e-05, "loss": 1.8544, "step": 3983 }, { "epoch": 0.6427879961277831, "grad_norm": 4.5577826499938965, "learning_rate": 3.0156034447581005e-05, "loss": 1.9273, "step": 3984 }, { "epoch": 0.6429493384962891, "grad_norm": 3.682194948196411, "learning_rate": 3.0132055320216468e-05, "loss": 1.7893, "step": 3985 }, { "epoch": 0.6431106808647951, "grad_norm": 4.775866508483887, "learning_rate": 3.0108081618039167e-05, "loss": 1.8938, "step": 3986 }, { "epoch": 0.643272023233301, "grad_norm": 4.182424545288086, "learning_rate": 3.008411334759548e-05, "loss": 1.809, "step": 3987 }, { "epoch": 0.6434333656018071, "grad_norm": 3.62164568901062, "learning_rate": 3.0060150515430198e-05, "loss": 2.0824, "step": 3988 }, { "epoch": 0.643594707970313, "grad_norm": 3.801424503326416, "learning_rate": 3.0036193128086665e-05, "loss": 1.654, "step": 3989 }, { "epoch": 0.643756050338819, "grad_norm": 4.317459583282471, "learning_rate": 3.001224119210676e-05, "loss": 1.8066, "step": 3990 }, { "epoch": 0.643917392707325, "grad_norm": 4.665870666503906, "learning_rate": 2.9988294714030833e-05, "loss": 2.182, "step": 3991 }, { "epoch": 0.6440787350758309, "grad_norm": 4.350362777709961, "learning_rate": 2.9964353700397797e-05, "loss": 1.8528, "step": 3992 }, { "epoch": 0.6442400774443369, "grad_norm": 3.7095420360565186, "learning_rate": 2.9940418157745004e-05, "loss": 1.8254, "step": 3993 }, { "epoch": 0.6444014198128428, "grad_norm": 4.114126205444336, "learning_rate": 2.9916488092608387e-05, "loss": 2.025, "step": 3994 }, { "epoch": 0.6445627621813488, "grad_norm": 5.245405673980713, "learning_rate": 2.9892563511522304e-05, "loss": 1.8729, "step": 3995 }, { "epoch": 0.6447241045498547, "grad_norm": 3.8701298236846924, "learning_rate": 2.98686444210197e-05, "loss": 1.8739, "step": 3996 }, { "epoch": 0.6448854469183608, "grad_norm": 5.725188255310059, "learning_rate": 2.9844730827631943e-05, "loss": 1.9052, "step": 3997 }, { "epoch": 0.6450467892868668, "grad_norm": 4.8308000564575195, "learning_rate": 2.9820822737888965e-05, "loss": 1.934, "step": 3998 }, { "epoch": 0.6452081316553727, "grad_norm": 4.190275192260742, "learning_rate": 2.979692015831913e-05, "loss": 1.7112, "step": 3999 }, { "epoch": 0.6453694740238787, "grad_norm": 3.8999693393707275, "learning_rate": 2.9773023095449355e-05, "loss": 1.8895, "step": 4000 }, { "epoch": 0.6455308163923846, "grad_norm": 5.109170913696289, "learning_rate": 2.9749131555805033e-05, "loss": 1.7905, "step": 4001 }, { "epoch": 0.6456921587608906, "grad_norm": 5.036904335021973, "learning_rate": 2.9725245545910008e-05, "loss": 1.8134, "step": 4002 }, { "epoch": 0.6458535011293965, "grad_norm": 4.558663368225098, "learning_rate": 2.97013650722867e-05, "loss": 2.1622, "step": 4003 }, { "epoch": 0.6460148434979025, "grad_norm": 6.16807222366333, "learning_rate": 2.9677490141455916e-05, "loss": 1.7927, "step": 4004 }, { "epoch": 0.6461761858664086, "grad_norm": 5.861327171325684, "learning_rate": 2.965362075993705e-05, "loss": 1.9503, "step": 4005 }, { "epoch": 0.6463375282349145, "grad_norm": 3.5244321823120117, "learning_rate": 2.9629756934247883e-05, "loss": 2.043, "step": 4006 }, { "epoch": 0.6464988706034205, "grad_norm": 4.2635416984558105, "learning_rate": 2.9605898670904774e-05, "loss": 1.99, "step": 4007 }, { "epoch": 0.6466602129719264, "grad_norm": 5.670888423919678, "learning_rate": 2.958204597642248e-05, "loss": 1.8356, "step": 4008 }, { "epoch": 0.6468215553404324, "grad_norm": 5.056757926940918, "learning_rate": 2.955819885731429e-05, "loss": 1.6895, "step": 4009 }, { "epoch": 0.6469828977089384, "grad_norm": 4.226497650146484, "learning_rate": 2.9534357320091937e-05, "loss": 1.9286, "step": 4010 }, { "epoch": 0.6471442400774443, "grad_norm": 4.541499614715576, "learning_rate": 2.9510521371265676e-05, "loss": 1.7026, "step": 4011 }, { "epoch": 0.6473055824459503, "grad_norm": 5.744909763336182, "learning_rate": 2.948669101734419e-05, "loss": 2.0614, "step": 4012 }, { "epoch": 0.6474669248144562, "grad_norm": 5.761630535125732, "learning_rate": 2.946286626483463e-05, "loss": 1.7397, "step": 4013 }, { "epoch": 0.6476282671829623, "grad_norm": 3.778589963912964, "learning_rate": 2.943904712024268e-05, "loss": 1.7743, "step": 4014 }, { "epoch": 0.6477896095514682, "grad_norm": 5.145801067352295, "learning_rate": 2.941523359007241e-05, "loss": 2.186, "step": 4015 }, { "epoch": 0.6479509519199742, "grad_norm": 3.8931326866149902, "learning_rate": 2.9391425680826444e-05, "loss": 1.8601, "step": 4016 }, { "epoch": 0.6481122942884802, "grad_norm": 4.508549690246582, "learning_rate": 2.9367623399005782e-05, "loss": 1.8606, "step": 4017 }, { "epoch": 0.6482736366569861, "grad_norm": 5.005433559417725, "learning_rate": 2.9343826751109955e-05, "loss": 1.7911, "step": 4018 }, { "epoch": 0.6484349790254921, "grad_norm": 4.451712608337402, "learning_rate": 2.932003574363692e-05, "loss": 2.0797, "step": 4019 }, { "epoch": 0.648596321393998, "grad_norm": 4.714862823486328, "learning_rate": 2.9296250383083118e-05, "loss": 1.7912, "step": 4020 }, { "epoch": 0.648757663762504, "grad_norm": 6.01436710357666, "learning_rate": 2.9272470675943408e-05, "loss": 1.602, "step": 4021 }, { "epoch": 0.6489190061310101, "grad_norm": 3.64809250831604, "learning_rate": 2.924869662871117e-05, "loss": 1.8688, "step": 4022 }, { "epoch": 0.649080348499516, "grad_norm": 4.9753546714782715, "learning_rate": 2.922492824787816e-05, "loss": 1.9459, "step": 4023 }, { "epoch": 0.649241690868022, "grad_norm": 6.607090473175049, "learning_rate": 2.9201165539934673e-05, "loss": 1.8438, "step": 4024 }, { "epoch": 0.6494030332365279, "grad_norm": 6.046296119689941, "learning_rate": 2.917740851136939e-05, "loss": 2.0968, "step": 4025 }, { "epoch": 0.6495643756050339, "grad_norm": 4.133302688598633, "learning_rate": 2.9153657168669428e-05, "loss": 1.9329, "step": 4026 }, { "epoch": 0.6497257179735398, "grad_norm": 4.30959939956665, "learning_rate": 2.912991151832043e-05, "loss": 1.9147, "step": 4027 }, { "epoch": 0.6498870603420458, "grad_norm": 5.278558254241943, "learning_rate": 2.91061715668064e-05, "loss": 1.8902, "step": 4028 }, { "epoch": 0.6500484027105518, "grad_norm": 7.491674423217773, "learning_rate": 2.9082437320609867e-05, "loss": 2.0245, "step": 4029 }, { "epoch": 0.6502097450790577, "grad_norm": 3.6092584133148193, "learning_rate": 2.9058708786211718e-05, "loss": 1.7284, "step": 4030 }, { "epoch": 0.6503710874475638, "grad_norm": 4.31028413772583, "learning_rate": 2.9034985970091355e-05, "loss": 1.6524, "step": 4031 }, { "epoch": 0.6505324298160697, "grad_norm": 5.1833672523498535, "learning_rate": 2.9011268878726556e-05, "loss": 1.7917, "step": 4032 }, { "epoch": 0.6506937721845757, "grad_norm": 3.9345691204071045, "learning_rate": 2.89875575185936e-05, "loss": 2.0056, "step": 4033 }, { "epoch": 0.6508551145530816, "grad_norm": 4.322170257568359, "learning_rate": 2.8963851896167128e-05, "loss": 1.9763, "step": 4034 }, { "epoch": 0.6510164569215876, "grad_norm": 3.895662307739258, "learning_rate": 2.8940152017920286e-05, "loss": 1.9022, "step": 4035 }, { "epoch": 0.6511777992900936, "grad_norm": 5.397833824157715, "learning_rate": 2.891645789032459e-05, "loss": 1.9762, "step": 4036 }, { "epoch": 0.6513391416585995, "grad_norm": 5.5054826736450195, "learning_rate": 2.889276951985005e-05, "loss": 2.0662, "step": 4037 }, { "epoch": 0.6515004840271055, "grad_norm": 5.851878643035889, "learning_rate": 2.886908691296504e-05, "loss": 2.0854, "step": 4038 }, { "epoch": 0.6516618263956114, "grad_norm": 5.049559116363525, "learning_rate": 2.884541007613637e-05, "loss": 1.9758, "step": 4039 }, { "epoch": 0.6518231687641175, "grad_norm": 3.9392457008361816, "learning_rate": 2.8821739015829337e-05, "loss": 1.833, "step": 4040 }, { "epoch": 0.6519845111326235, "grad_norm": 5.671417236328125, "learning_rate": 2.879807373850759e-05, "loss": 1.9672, "step": 4041 }, { "epoch": 0.6521458535011294, "grad_norm": 5.260490417480469, "learning_rate": 2.8774414250633212e-05, "loss": 1.8732, "step": 4042 }, { "epoch": 0.6523071958696354, "grad_norm": 5.128913879394531, "learning_rate": 2.8750760558666757e-05, "loss": 1.7957, "step": 4043 }, { "epoch": 0.6524685382381413, "grad_norm": 4.581787586212158, "learning_rate": 2.872711266906713e-05, "loss": 1.7629, "step": 4044 }, { "epoch": 0.6526298806066473, "grad_norm": 4.499758243560791, "learning_rate": 2.870347058829167e-05, "loss": 2.0256, "step": 4045 }, { "epoch": 0.6527912229751532, "grad_norm": 3.7136127948760986, "learning_rate": 2.867983432279616e-05, "loss": 1.8258, "step": 4046 }, { "epoch": 0.6529525653436592, "grad_norm": 3.6342873573303223, "learning_rate": 2.865620387903476e-05, "loss": 1.8578, "step": 4047 }, { "epoch": 0.6531139077121653, "grad_norm": 3.641798973083496, "learning_rate": 2.8632579263460068e-05, "loss": 1.7957, "step": 4048 }, { "epoch": 0.6532752500806712, "grad_norm": 3.681467056274414, "learning_rate": 2.8608960482523056e-05, "loss": 1.9007, "step": 4049 }, { "epoch": 0.6534365924491772, "grad_norm": 4.51108455657959, "learning_rate": 2.8585347542673156e-05, "loss": 1.7851, "step": 4050 }, { "epoch": 0.6535979348176831, "grad_norm": 3.876131534576416, "learning_rate": 2.8561740450358142e-05, "loss": 1.668, "step": 4051 }, { "epoch": 0.6537592771861891, "grad_norm": 3.4206247329711914, "learning_rate": 2.853813921202423e-05, "loss": 1.99, "step": 4052 }, { "epoch": 0.653920619554695, "grad_norm": 4.4058122634887695, "learning_rate": 2.8514543834116037e-05, "loss": 1.8108, "step": 4053 }, { "epoch": 0.654081961923201, "grad_norm": 5.689077377319336, "learning_rate": 2.8490954323076546e-05, "loss": 2.0066, "step": 4054 }, { "epoch": 0.654243304291707, "grad_norm": 5.168551921844482, "learning_rate": 2.8467370685347205e-05, "loss": 1.8448, "step": 4055 }, { "epoch": 0.6544046466602129, "grad_norm": 4.483346939086914, "learning_rate": 2.844379292736778e-05, "loss": 1.6863, "step": 4056 }, { "epoch": 0.654565989028719, "grad_norm": 4.934708118438721, "learning_rate": 2.84202210555765e-05, "loss": 1.993, "step": 4057 }, { "epoch": 0.6547273313972249, "grad_norm": 3.775606155395508, "learning_rate": 2.8396655076409923e-05, "loss": 1.8005, "step": 4058 }, { "epoch": 0.6548886737657309, "grad_norm": 3.6856369972229004, "learning_rate": 2.837309499630306e-05, "loss": 1.964, "step": 4059 }, { "epoch": 0.6550500161342369, "grad_norm": 4.163701057434082, "learning_rate": 2.834954082168928e-05, "loss": 1.9154, "step": 4060 }, { "epoch": 0.6552113585027428, "grad_norm": 4.498649597167969, "learning_rate": 2.8325992559000313e-05, "loss": 1.6676, "step": 4061 }, { "epoch": 0.6553727008712488, "grad_norm": 4.522988319396973, "learning_rate": 2.830245021466631e-05, "loss": 1.7289, "step": 4062 }, { "epoch": 0.6555340432397547, "grad_norm": 3.821260690689087, "learning_rate": 2.8278913795115825e-05, "loss": 1.9413, "step": 4063 }, { "epoch": 0.6556953856082607, "grad_norm": 4.8176093101501465, "learning_rate": 2.825538330677575e-05, "loss": 1.5925, "step": 4064 }, { "epoch": 0.6558567279767668, "grad_norm": 4.966172695159912, "learning_rate": 2.823185875607135e-05, "loss": 1.8659, "step": 4065 }, { "epoch": 0.6560180703452727, "grad_norm": 4.94246768951416, "learning_rate": 2.8208340149426338e-05, "loss": 1.8868, "step": 4066 }, { "epoch": 0.6561794127137787, "grad_norm": 6.051825523376465, "learning_rate": 2.818482749326272e-05, "loss": 1.6712, "step": 4067 }, { "epoch": 0.6563407550822846, "grad_norm": 6.057215213775635, "learning_rate": 2.8161320794000955e-05, "loss": 1.6316, "step": 4068 }, { "epoch": 0.6565020974507906, "grad_norm": 3.5899481773376465, "learning_rate": 2.8137820058059804e-05, "loss": 1.8105, "step": 4069 }, { "epoch": 0.6566634398192965, "grad_norm": 4.588062763214111, "learning_rate": 2.8114325291856465e-05, "loss": 1.7202, "step": 4070 }, { "epoch": 0.6568247821878025, "grad_norm": 5.241124629974365, "learning_rate": 2.8090836501806432e-05, "loss": 1.7447, "step": 4071 }, { "epoch": 0.6569861245563084, "grad_norm": 5.199393272399902, "learning_rate": 2.806735369432365e-05, "loss": 1.8149, "step": 4072 }, { "epoch": 0.6571474669248144, "grad_norm": 3.5903971195220947, "learning_rate": 2.8043876875820363e-05, "loss": 1.6505, "step": 4073 }, { "epoch": 0.6573088092933205, "grad_norm": 4.2009406089782715, "learning_rate": 2.802040605270722e-05, "loss": 1.6893, "step": 4074 }, { "epoch": 0.6574701516618264, "grad_norm": 4.394023418426514, "learning_rate": 2.799694123139322e-05, "loss": 1.9336, "step": 4075 }, { "epoch": 0.6576314940303324, "grad_norm": 4.379487991333008, "learning_rate": 2.797348241828569e-05, "loss": 1.9616, "step": 4076 }, { "epoch": 0.6577928363988383, "grad_norm": 3.5809361934661865, "learning_rate": 2.7950029619790397e-05, "loss": 1.555, "step": 4077 }, { "epoch": 0.6579541787673443, "grad_norm": 4.558257102966309, "learning_rate": 2.7926582842311378e-05, "loss": 1.8658, "step": 4078 }, { "epoch": 0.6581155211358503, "grad_norm": 4.692923545837402, "learning_rate": 2.790314209225109e-05, "loss": 1.6488, "step": 4079 }, { "epoch": 0.6582768635043562, "grad_norm": 4.130902290344238, "learning_rate": 2.787970737601031e-05, "loss": 1.9426, "step": 4080 }, { "epoch": 0.6584382058728622, "grad_norm": 4.575941562652588, "learning_rate": 2.785627869998817e-05, "loss": 1.8641, "step": 4081 }, { "epoch": 0.6585995482413682, "grad_norm": 3.7854466438293457, "learning_rate": 2.7832856070582146e-05, "loss": 1.8893, "step": 4082 }, { "epoch": 0.6587608906098742, "grad_norm": 4.291102886199951, "learning_rate": 2.7809439494188117e-05, "loss": 1.8129, "step": 4083 }, { "epoch": 0.6589222329783802, "grad_norm": 5.6730194091796875, "learning_rate": 2.7786028977200225e-05, "loss": 1.7007, "step": 4084 }, { "epoch": 0.6590835753468861, "grad_norm": 4.0196404457092285, "learning_rate": 2.7762624526011038e-05, "loss": 1.7915, "step": 4085 }, { "epoch": 0.6592449177153921, "grad_norm": 4.962471008300781, "learning_rate": 2.773922614701139e-05, "loss": 2.0735, "step": 4086 }, { "epoch": 0.659406260083898, "grad_norm": 5.386277675628662, "learning_rate": 2.7715833846590532e-05, "loss": 1.752, "step": 4087 }, { "epoch": 0.659567602452404, "grad_norm": 5.766901016235352, "learning_rate": 2.769244763113601e-05, "loss": 1.8034, "step": 4088 }, { "epoch": 0.6597289448209099, "grad_norm": 4.224059581756592, "learning_rate": 2.7669067507033697e-05, "loss": 1.8776, "step": 4089 }, { "epoch": 0.6598902871894159, "grad_norm": 5.296021938323975, "learning_rate": 2.7645693480667856e-05, "loss": 1.7963, "step": 4090 }, { "epoch": 0.660051629557922, "grad_norm": 4.75903844833374, "learning_rate": 2.7622325558421026e-05, "loss": 2.0486, "step": 4091 }, { "epoch": 0.6602129719264279, "grad_norm": 3.8697707653045654, "learning_rate": 2.7598963746674132e-05, "loss": 1.9202, "step": 4092 }, { "epoch": 0.6603743142949339, "grad_norm": 5.087195873260498, "learning_rate": 2.7575608051806374e-05, "loss": 2.0034, "step": 4093 }, { "epoch": 0.6605356566634398, "grad_norm": 4.344791889190674, "learning_rate": 2.7552258480195347e-05, "loss": 1.822, "step": 4094 }, { "epoch": 0.6606969990319458, "grad_norm": 3.8124961853027344, "learning_rate": 2.7528915038216908e-05, "loss": 1.8831, "step": 4095 }, { "epoch": 0.6608583414004517, "grad_norm": 4.018934726715088, "learning_rate": 2.750557773224531e-05, "loss": 1.7947, "step": 4096 }, { "epoch": 0.6610196837689577, "grad_norm": 3.811086654663086, "learning_rate": 2.7482246568653043e-05, "loss": 1.9091, "step": 4097 }, { "epoch": 0.6611810261374637, "grad_norm": 5.172258377075195, "learning_rate": 2.745892155381101e-05, "loss": 1.8699, "step": 4098 }, { "epoch": 0.6613423685059696, "grad_norm": 4.130087375640869, "learning_rate": 2.7435602694088386e-05, "loss": 1.8169, "step": 4099 }, { "epoch": 0.6615037108744757, "grad_norm": 5.704213619232178, "learning_rate": 2.7412289995852657e-05, "loss": 1.8186, "step": 4100 }, { "epoch": 0.6616650532429816, "grad_norm": 3.660961389541626, "learning_rate": 2.7388983465469665e-05, "loss": 1.9181, "step": 4101 }, { "epoch": 0.6618263956114876, "grad_norm": 4.935924530029297, "learning_rate": 2.7365683109303498e-05, "loss": 1.9634, "step": 4102 }, { "epoch": 0.6619877379799936, "grad_norm": 4.407436370849609, "learning_rate": 2.7342388933716668e-05, "loss": 1.8782, "step": 4103 }, { "epoch": 0.6621490803484995, "grad_norm": 4.4017205238342285, "learning_rate": 2.731910094506988e-05, "loss": 1.75, "step": 4104 }, { "epoch": 0.6623104227170055, "grad_norm": 3.8433196544647217, "learning_rate": 2.7295819149722258e-05, "loss": 1.9688, "step": 4105 }, { "epoch": 0.6624717650855114, "grad_norm": 3.9323267936706543, "learning_rate": 2.7272543554031137e-05, "loss": 1.6916, "step": 4106 }, { "epoch": 0.6626331074540174, "grad_norm": 4.648200988769531, "learning_rate": 2.7249274164352255e-05, "loss": 1.8686, "step": 4107 }, { "epoch": 0.6627944498225234, "grad_norm": 4.506359577178955, "learning_rate": 2.7226010987039552e-05, "loss": 2.0056, "step": 4108 }, { "epoch": 0.6629557921910294, "grad_norm": 4.010270595550537, "learning_rate": 2.7202754028445376e-05, "loss": 1.9273, "step": 4109 }, { "epoch": 0.6631171345595354, "grad_norm": 5.485168933868408, "learning_rate": 2.717950329492028e-05, "loss": 1.9363, "step": 4110 }, { "epoch": 0.6632784769280413, "grad_norm": 4.302667140960693, "learning_rate": 2.7156258792813218e-05, "loss": 2.0121, "step": 4111 }, { "epoch": 0.6634398192965473, "grad_norm": 5.655758857727051, "learning_rate": 2.713302052847132e-05, "loss": 2.0036, "step": 4112 }, { "epoch": 0.6636011616650532, "grad_norm": 3.56538724899292, "learning_rate": 2.710978850824014e-05, "loss": 2.0621, "step": 4113 }, { "epoch": 0.6637625040335592, "grad_norm": 4.988595008850098, "learning_rate": 2.708656273846345e-05, "loss": 1.692, "step": 4114 }, { "epoch": 0.6639238464020651, "grad_norm": 4.9897613525390625, "learning_rate": 2.7063343225483308e-05, "loss": 1.8756, "step": 4115 }, { "epoch": 0.6640851887705711, "grad_norm": 3.808546781539917, "learning_rate": 2.7040129975640123e-05, "loss": 2.2095, "step": 4116 }, { "epoch": 0.6642465311390772, "grad_norm": 4.039368629455566, "learning_rate": 2.701692299527252e-05, "loss": 1.7944, "step": 4117 }, { "epoch": 0.6644078735075831, "grad_norm": 5.334078311920166, "learning_rate": 2.69937222907175e-05, "loss": 1.7066, "step": 4118 }, { "epoch": 0.6645692158760891, "grad_norm": 3.4958536624908447, "learning_rate": 2.697052786831027e-05, "loss": 1.8442, "step": 4119 }, { "epoch": 0.664730558244595, "grad_norm": 4.124898433685303, "learning_rate": 2.6947339734384364e-05, "loss": 1.7974, "step": 4120 }, { "epoch": 0.664891900613101, "grad_norm": 4.844030857086182, "learning_rate": 2.6924157895271563e-05, "loss": 1.7956, "step": 4121 }, { "epoch": 0.665053242981607, "grad_norm": 4.001951694488525, "learning_rate": 2.6900982357301997e-05, "loss": 1.9523, "step": 4122 }, { "epoch": 0.6652145853501129, "grad_norm": 4.164713382720947, "learning_rate": 2.687781312680398e-05, "loss": 1.5221, "step": 4123 }, { "epoch": 0.6653759277186189, "grad_norm": 4.255041122436523, "learning_rate": 2.685465021010421e-05, "loss": 2.0482, "step": 4124 }, { "epoch": 0.6655372700871249, "grad_norm": 3.6462719440460205, "learning_rate": 2.683149361352756e-05, "loss": 1.8136, "step": 4125 }, { "epoch": 0.6656986124556309, "grad_norm": 4.150946617126465, "learning_rate": 2.680834334339727e-05, "loss": 2.3349, "step": 4126 }, { "epoch": 0.6658599548241368, "grad_norm": 5.911656379699707, "learning_rate": 2.6785199406034784e-05, "loss": 1.6946, "step": 4127 }, { "epoch": 0.6660212971926428, "grad_norm": 4.190311431884766, "learning_rate": 2.676206180775982e-05, "loss": 1.7547, "step": 4128 }, { "epoch": 0.6661826395611488, "grad_norm": 4.114874839782715, "learning_rate": 2.6738930554890418e-05, "loss": 1.8269, "step": 4129 }, { "epoch": 0.6663439819296547, "grad_norm": 3.957735061645508, "learning_rate": 2.671580565374282e-05, "loss": 1.701, "step": 4130 }, { "epoch": 0.6665053242981607, "grad_norm": 6.421628475189209, "learning_rate": 2.6692687110631597e-05, "loss": 1.6187, "step": 4131 }, { "epoch": 0.6666666666666666, "grad_norm": 6.102411270141602, "learning_rate": 2.6669574931869523e-05, "loss": 1.9229, "step": 4132 }, { "epoch": 0.6668280090351726, "grad_norm": 3.5106594562530518, "learning_rate": 2.6646469123767694e-05, "loss": 1.8732, "step": 4133 }, { "epoch": 0.6669893514036787, "grad_norm": 4.111449241638184, "learning_rate": 2.6623369692635404e-05, "loss": 1.6204, "step": 4134 }, { "epoch": 0.6671506937721846, "grad_norm": 4.131481647491455, "learning_rate": 2.6600276644780275e-05, "loss": 2.0642, "step": 4135 }, { "epoch": 0.6673120361406906, "grad_norm": 5.979979991912842, "learning_rate": 2.6577189986508123e-05, "loss": 2.0188, "step": 4136 }, { "epoch": 0.6674733785091965, "grad_norm": 4.239624500274658, "learning_rate": 2.6554109724123027e-05, "loss": 1.9138, "step": 4137 }, { "epoch": 0.6676347208777025, "grad_norm": 5.137545108795166, "learning_rate": 2.6531035863927378e-05, "loss": 1.9357, "step": 4138 }, { "epoch": 0.6677960632462084, "grad_norm": 4.375874042510986, "learning_rate": 2.650796841222176e-05, "loss": 1.7126, "step": 4139 }, { "epoch": 0.6679574056147144, "grad_norm": 4.2277021408081055, "learning_rate": 2.648490737530503e-05, "loss": 1.8765, "step": 4140 }, { "epoch": 0.6681187479832204, "grad_norm": 5.449302673339844, "learning_rate": 2.646185275947426e-05, "loss": 1.6811, "step": 4141 }, { "epoch": 0.6682800903517263, "grad_norm": 4.741269111633301, "learning_rate": 2.6438804571024835e-05, "loss": 1.8743, "step": 4142 }, { "epoch": 0.6684414327202324, "grad_norm": 5.969488143920898, "learning_rate": 2.641576281625031e-05, "loss": 2.1562, "step": 4143 }, { "epoch": 0.6686027750887383, "grad_norm": 4.281977653503418, "learning_rate": 2.6392727501442572e-05, "loss": 1.8982, "step": 4144 }, { "epoch": 0.6687641174572443, "grad_norm": 3.7031009197235107, "learning_rate": 2.6369698632891638e-05, "loss": 1.972, "step": 4145 }, { "epoch": 0.6689254598257502, "grad_norm": 4.476454257965088, "learning_rate": 2.6346676216885873e-05, "loss": 1.9243, "step": 4146 }, { "epoch": 0.6690868021942562, "grad_norm": 4.0415730476379395, "learning_rate": 2.6323660259711795e-05, "loss": 1.7471, "step": 4147 }, { "epoch": 0.6692481445627622, "grad_norm": 4.301098346710205, "learning_rate": 2.6300650767654234e-05, "loss": 1.8926, "step": 4148 }, { "epoch": 0.6694094869312681, "grad_norm": 5.061169147491455, "learning_rate": 2.627764774699617e-05, "loss": 1.8014, "step": 4149 }, { "epoch": 0.6695708292997741, "grad_norm": 4.341492652893066, "learning_rate": 2.625465120401891e-05, "loss": 1.9786, "step": 4150 }, { "epoch": 0.6697321716682801, "grad_norm": 5.11741304397583, "learning_rate": 2.623166114500192e-05, "loss": 2.1888, "step": 4151 }, { "epoch": 0.6698935140367861, "grad_norm": 4.154516220092773, "learning_rate": 2.6208677576222896e-05, "loss": 1.7038, "step": 4152 }, { "epoch": 0.670054856405292, "grad_norm": 3.9316623210906982, "learning_rate": 2.6185700503957823e-05, "loss": 1.8391, "step": 4153 }, { "epoch": 0.670216198773798, "grad_norm": 7.304197788238525, "learning_rate": 2.6162729934480844e-05, "loss": 2.0465, "step": 4154 }, { "epoch": 0.670377541142304, "grad_norm": 3.8144609928131104, "learning_rate": 2.6139765874064382e-05, "loss": 1.7615, "step": 4155 }, { "epoch": 0.6705388835108099, "grad_norm": 3.904538869857788, "learning_rate": 2.6116808328979054e-05, "loss": 2.3631, "step": 4156 }, { "epoch": 0.6707002258793159, "grad_norm": 5.528686046600342, "learning_rate": 2.6093857305493664e-05, "loss": 1.728, "step": 4157 }, { "epoch": 0.6708615682478218, "grad_norm": 4.866077899932861, "learning_rate": 2.6070912809875324e-05, "loss": 1.9217, "step": 4158 }, { "epoch": 0.6710229106163278, "grad_norm": 3.7308342456817627, "learning_rate": 2.6047974848389285e-05, "loss": 1.8429, "step": 4159 }, { "epoch": 0.6711842529848339, "grad_norm": 4.611958026885986, "learning_rate": 2.602504342729902e-05, "loss": 1.5838, "step": 4160 }, { "epoch": 0.6713455953533398, "grad_norm": 6.402384281158447, "learning_rate": 2.6002118552866284e-05, "loss": 2.021, "step": 4161 }, { "epoch": 0.6715069377218458, "grad_norm": 4.659445762634277, "learning_rate": 2.5979200231350946e-05, "loss": 1.8546, "step": 4162 }, { "epoch": 0.6716682800903517, "grad_norm": 3.7538390159606934, "learning_rate": 2.595628846901118e-05, "loss": 1.7822, "step": 4163 }, { "epoch": 0.6718296224588577, "grad_norm": 4.030730724334717, "learning_rate": 2.593338327210332e-05, "loss": 1.5749, "step": 4164 }, { "epoch": 0.6719909648273636, "grad_norm": 3.891728401184082, "learning_rate": 2.5910484646881862e-05, "loss": 1.7048, "step": 4165 }, { "epoch": 0.6721523071958696, "grad_norm": 3.5389225482940674, "learning_rate": 2.5887592599599618e-05, "loss": 1.6243, "step": 4166 }, { "epoch": 0.6723136495643756, "grad_norm": 4.831793785095215, "learning_rate": 2.586470713650751e-05, "loss": 2.1921, "step": 4167 }, { "epoch": 0.6724749919328816, "grad_norm": 4.646327495574951, "learning_rate": 2.5841828263854717e-05, "loss": 1.8703, "step": 4168 }, { "epoch": 0.6726363343013876, "grad_norm": 3.7683122158050537, "learning_rate": 2.581895598788857e-05, "loss": 1.7485, "step": 4169 }, { "epoch": 0.6727976766698935, "grad_norm": 4.0349626541137695, "learning_rate": 2.5796090314854663e-05, "loss": 1.7957, "step": 4170 }, { "epoch": 0.6729590190383995, "grad_norm": 3.9039664268493652, "learning_rate": 2.577323125099671e-05, "loss": 2.0349, "step": 4171 }, { "epoch": 0.6731203614069055, "grad_norm": 3.512286901473999, "learning_rate": 2.5750378802556707e-05, "loss": 1.8589, "step": 4172 }, { "epoch": 0.6732817037754114, "grad_norm": 5.282522201538086, "learning_rate": 2.5727532975774737e-05, "loss": 1.8648, "step": 4173 }, { "epoch": 0.6734430461439174, "grad_norm": 5.973352909088135, "learning_rate": 2.57046937768892e-05, "loss": 1.8859, "step": 4174 }, { "epoch": 0.6736043885124233, "grad_norm": 4.13266658782959, "learning_rate": 2.5681861212136578e-05, "loss": 1.8761, "step": 4175 }, { "epoch": 0.6737657308809293, "grad_norm": 5.284298896789551, "learning_rate": 2.5659035287751575e-05, "loss": 2.4626, "step": 4176 }, { "epoch": 0.6739270732494353, "grad_norm": 3.9527158737182617, "learning_rate": 2.563621600996714e-05, "loss": 2.0104, "step": 4177 }, { "epoch": 0.6740884156179413, "grad_norm": 5.228986740112305, "learning_rate": 2.5613403385014323e-05, "loss": 2.0879, "step": 4178 }, { "epoch": 0.6742497579864473, "grad_norm": 3.9151477813720703, "learning_rate": 2.5590597419122396e-05, "loss": 1.8697, "step": 4179 }, { "epoch": 0.6744111003549532, "grad_norm": 4.744083404541016, "learning_rate": 2.5567798118518792e-05, "loss": 1.6128, "step": 4180 }, { "epoch": 0.6745724427234592, "grad_norm": 4.110485076904297, "learning_rate": 2.5545005489429187e-05, "loss": 1.8802, "step": 4181 }, { "epoch": 0.6747337850919651, "grad_norm": 3.936558723449707, "learning_rate": 2.552221953807734e-05, "loss": 2.0994, "step": 4182 }, { "epoch": 0.6748951274604711, "grad_norm": 4.597769737243652, "learning_rate": 2.5499440270685277e-05, "loss": 1.8867, "step": 4183 }, { "epoch": 0.675056469828977, "grad_norm": 3.885920524597168, "learning_rate": 2.547666769347312e-05, "loss": 2.1563, "step": 4184 }, { "epoch": 0.6752178121974831, "grad_norm": 4.100528717041016, "learning_rate": 2.5453901812659242e-05, "loss": 1.8185, "step": 4185 }, { "epoch": 0.6753791545659891, "grad_norm": 5.49969482421875, "learning_rate": 2.5431142634460115e-05, "loss": 1.9517, "step": 4186 }, { "epoch": 0.675540496934495, "grad_norm": 5.925865173339844, "learning_rate": 2.5408390165090433e-05, "loss": 1.8116, "step": 4187 }, { "epoch": 0.675701839303001, "grad_norm": 4.8131256103515625, "learning_rate": 2.538564441076302e-05, "loss": 1.7581, "step": 4188 }, { "epoch": 0.6758631816715069, "grad_norm": 4.608528137207031, "learning_rate": 2.5362905377688912e-05, "loss": 1.8789, "step": 4189 }, { "epoch": 0.6760245240400129, "grad_norm": 4.1504340171813965, "learning_rate": 2.5340173072077267e-05, "loss": 1.8511, "step": 4190 }, { "epoch": 0.6761858664085189, "grad_norm": 3.9584457874298096, "learning_rate": 2.5317447500135406e-05, "loss": 1.8161, "step": 4191 }, { "epoch": 0.6763472087770248, "grad_norm": 5.7329254150390625, "learning_rate": 2.529472866806885e-05, "loss": 1.8829, "step": 4192 }, { "epoch": 0.6765085511455308, "grad_norm": 4.299564361572266, "learning_rate": 2.5272016582081236e-05, "loss": 1.8923, "step": 4193 }, { "epoch": 0.6766698935140368, "grad_norm": 4.187342166900635, "learning_rate": 2.5249311248374406e-05, "loss": 1.9306, "step": 4194 }, { "epoch": 0.6768312358825428, "grad_norm": 4.859185695648193, "learning_rate": 2.5226612673148314e-05, "loss": 1.9833, "step": 4195 }, { "epoch": 0.6769925782510487, "grad_norm": 4.390213966369629, "learning_rate": 2.5203920862601073e-05, "loss": 1.8682, "step": 4196 }, { "epoch": 0.6771539206195547, "grad_norm": 3.902789354324341, "learning_rate": 2.5181235822928996e-05, "loss": 1.6186, "step": 4197 }, { "epoch": 0.6773152629880607, "grad_norm": 4.4394450187683105, "learning_rate": 2.5158557560326483e-05, "loss": 1.8283, "step": 4198 }, { "epoch": 0.6774766053565666, "grad_norm": 5.325255870819092, "learning_rate": 2.5135886080986114e-05, "loss": 1.812, "step": 4199 }, { "epoch": 0.6776379477250726, "grad_norm": 5.705172538757324, "learning_rate": 2.5113221391098642e-05, "loss": 2.1034, "step": 4200 }, { "epoch": 0.6777992900935785, "grad_norm": 4.679775238037109, "learning_rate": 2.509056349685292e-05, "loss": 1.8822, "step": 4201 }, { "epoch": 0.6779606324620845, "grad_norm": 4.4186201095581055, "learning_rate": 2.506791240443595e-05, "loss": 2.0473, "step": 4202 }, { "epoch": 0.6781219748305906, "grad_norm": 5.696232318878174, "learning_rate": 2.5045268120032932e-05, "loss": 1.8613, "step": 4203 }, { "epoch": 0.6782833171990965, "grad_norm": 4.359367370605469, "learning_rate": 2.5022630649827128e-05, "loss": 2.0278, "step": 4204 }, { "epoch": 0.6784446595676025, "grad_norm": 4.115344524383545, "learning_rate": 2.500000000000001e-05, "loss": 1.8859, "step": 4205 }, { "epoch": 0.6786060019361084, "grad_norm": 3.563119649887085, "learning_rate": 2.4977376176731127e-05, "loss": 2.0457, "step": 4206 }, { "epoch": 0.6787673443046144, "grad_norm": 5.598097324371338, "learning_rate": 2.4954759186198223e-05, "loss": 1.9112, "step": 4207 }, { "epoch": 0.6789286866731203, "grad_norm": 3.9232401847839355, "learning_rate": 2.4932149034577117e-05, "loss": 1.7803, "step": 4208 }, { "epoch": 0.6790900290416263, "grad_norm": 4.832250118255615, "learning_rate": 2.4909545728041822e-05, "loss": 1.8358, "step": 4209 }, { "epoch": 0.6792513714101323, "grad_norm": 4.787143230438232, "learning_rate": 2.488694927276441e-05, "loss": 1.9455, "step": 4210 }, { "epoch": 0.6794127137786383, "grad_norm": 5.10698127746582, "learning_rate": 2.486435967491516e-05, "loss": 2.0101, "step": 4211 }, { "epoch": 0.6795740561471443, "grad_norm": 4.221131801605225, "learning_rate": 2.4841776940662408e-05, "loss": 2.0602, "step": 4212 }, { "epoch": 0.6797353985156502, "grad_norm": 4.46644926071167, "learning_rate": 2.481920107617268e-05, "loss": 1.7403, "step": 4213 }, { "epoch": 0.6798967408841562, "grad_norm": 4.517517566680908, "learning_rate": 2.4796632087610583e-05, "loss": 2.1973, "step": 4214 }, { "epoch": 0.6800580832526621, "grad_norm": 4.196491241455078, "learning_rate": 2.4774069981138848e-05, "loss": 2.1414, "step": 4215 }, { "epoch": 0.6802194256211681, "grad_norm": 5.910837650299072, "learning_rate": 2.475151476291832e-05, "loss": 2.0071, "step": 4216 }, { "epoch": 0.6803807679896741, "grad_norm": 3.324673652648926, "learning_rate": 2.472896643910802e-05, "loss": 1.8987, "step": 4217 }, { "epoch": 0.68054211035818, "grad_norm": 4.347282886505127, "learning_rate": 2.4706425015865025e-05, "loss": 1.7767, "step": 4218 }, { "epoch": 0.680703452726686, "grad_norm": 4.210080146789551, "learning_rate": 2.4683890499344532e-05, "loss": 1.8631, "step": 4219 }, { "epoch": 0.680864795095192, "grad_norm": 3.8832204341888428, "learning_rate": 2.4661362895699903e-05, "loss": 1.687, "step": 4220 }, { "epoch": 0.681026137463698, "grad_norm": 4.468530654907227, "learning_rate": 2.4638842211082542e-05, "loss": 1.8627, "step": 4221 }, { "epoch": 0.681187479832204, "grad_norm": 3.586127996444702, "learning_rate": 2.461632845164204e-05, "loss": 1.774, "step": 4222 }, { "epoch": 0.6813488222007099, "grad_norm": 4.217072486877441, "learning_rate": 2.4593821623526013e-05, "loss": 1.7192, "step": 4223 }, { "epoch": 0.6815101645692159, "grad_norm": 5.281252861022949, "learning_rate": 2.457132173288027e-05, "loss": 1.8081, "step": 4224 }, { "epoch": 0.6816715069377218, "grad_norm": 4.885616302490234, "learning_rate": 2.4548828785848645e-05, "loss": 2.1488, "step": 4225 }, { "epoch": 0.6818328493062278, "grad_norm": 5.50325345993042, "learning_rate": 2.4526342788573146e-05, "loss": 2.0298, "step": 4226 }, { "epoch": 0.6819941916747337, "grad_norm": 5.605451583862305, "learning_rate": 2.4503863747193844e-05, "loss": 1.7473, "step": 4227 }, { "epoch": 0.6821555340432398, "grad_norm": 6.766907691955566, "learning_rate": 2.4481391667848895e-05, "loss": 2.145, "step": 4228 }, { "epoch": 0.6823168764117458, "grad_norm": 5.514566421508789, "learning_rate": 2.4458926556674615e-05, "loss": 1.8165, "step": 4229 }, { "epoch": 0.6824782187802517, "grad_norm": 3.5636658668518066, "learning_rate": 2.4436468419805336e-05, "loss": 1.7146, "step": 4230 }, { "epoch": 0.6826395611487577, "grad_norm": 3.876129150390625, "learning_rate": 2.441401726337358e-05, "loss": 1.8953, "step": 4231 }, { "epoch": 0.6828009035172636, "grad_norm": 5.891449928283691, "learning_rate": 2.439157309350986e-05, "loss": 1.8175, "step": 4232 }, { "epoch": 0.6829622458857696, "grad_norm": 5.565365791320801, "learning_rate": 2.4369135916342884e-05, "loss": 1.7192, "step": 4233 }, { "epoch": 0.6831235882542755, "grad_norm": 4.477042198181152, "learning_rate": 2.434670573799937e-05, "loss": 1.8873, "step": 4234 }, { "epoch": 0.6832849306227815, "grad_norm": 4.289474010467529, "learning_rate": 2.4324282564604157e-05, "loss": 1.7617, "step": 4235 }, { "epoch": 0.6834462729912875, "grad_norm": 3.986325263977051, "learning_rate": 2.4301866402280154e-05, "loss": 1.6774, "step": 4236 }, { "epoch": 0.6836076153597935, "grad_norm": 6.523401737213135, "learning_rate": 2.4279457257148407e-05, "loss": 1.9909, "step": 4237 }, { "epoch": 0.6837689577282995, "grad_norm": 4.952888011932373, "learning_rate": 2.4257055135327976e-05, "loss": 1.8376, "step": 4238 }, { "epoch": 0.6839303000968054, "grad_norm": 5.137017726898193, "learning_rate": 2.4234660042936064e-05, "loss": 2.0815, "step": 4239 }, { "epoch": 0.6840916424653114, "grad_norm": 5.571824550628662, "learning_rate": 2.421227198608792e-05, "loss": 1.8191, "step": 4240 }, { "epoch": 0.6842529848338174, "grad_norm": 4.201406955718994, "learning_rate": 2.418989097089685e-05, "loss": 1.8579, "step": 4241 }, { "epoch": 0.6844143272023233, "grad_norm": 5.3459696769714355, "learning_rate": 2.4167517003474304e-05, "loss": 1.898, "step": 4242 }, { "epoch": 0.6845756695708293, "grad_norm": 5.0349650382995605, "learning_rate": 2.4145150089929743e-05, "loss": 2.2807, "step": 4243 }, { "epoch": 0.6847370119393352, "grad_norm": 4.771966934204102, "learning_rate": 2.4122790236370756e-05, "loss": 2.0599, "step": 4244 }, { "epoch": 0.6848983543078412, "grad_norm": 4.0456624031066895, "learning_rate": 2.410043744890294e-05, "loss": 1.9737, "step": 4245 }, { "epoch": 0.6850596966763473, "grad_norm": 5.042922496795654, "learning_rate": 2.4078091733630043e-05, "loss": 1.6734, "step": 4246 }, { "epoch": 0.6852210390448532, "grad_norm": 7.21657133102417, "learning_rate": 2.4055753096653794e-05, "loss": 1.7746, "step": 4247 }, { "epoch": 0.6853823814133592, "grad_norm": 3.572934627532959, "learning_rate": 2.4033421544074073e-05, "loss": 1.8811, "step": 4248 }, { "epoch": 0.6855437237818651, "grad_norm": 3.880200147628784, "learning_rate": 2.4011097081988747e-05, "loss": 1.9709, "step": 4249 }, { "epoch": 0.6857050661503711, "grad_norm": 5.2548723220825195, "learning_rate": 2.3988779716493832e-05, "loss": 2.0609, "step": 4250 }, { "epoch": 0.685866408518877, "grad_norm": 5.024470329284668, "learning_rate": 2.396646945368331e-05, "loss": 1.7785, "step": 4251 }, { "epoch": 0.686027750887383, "grad_norm": 3.6195952892303467, "learning_rate": 2.3944166299649317e-05, "loss": 1.7746, "step": 4252 }, { "epoch": 0.686189093255889, "grad_norm": 3.8889803886413574, "learning_rate": 2.392187026048198e-05, "loss": 1.9318, "step": 4253 }, { "epoch": 0.686350435624395, "grad_norm": 3.503079652786255, "learning_rate": 2.3899581342269516e-05, "loss": 1.8082, "step": 4254 }, { "epoch": 0.686511777992901, "grad_norm": 4.067113876342773, "learning_rate": 2.3877299551098185e-05, "loss": 1.7902, "step": 4255 }, { "epoch": 0.6866731203614069, "grad_norm": 3.907656669616699, "learning_rate": 2.3855024893052285e-05, "loss": 2.1521, "step": 4256 }, { "epoch": 0.6868344627299129, "grad_norm": 4.6518425941467285, "learning_rate": 2.3832757374214222e-05, "loss": 1.8983, "step": 4257 }, { "epoch": 0.6869958050984188, "grad_norm": 4.6036295890808105, "learning_rate": 2.3810497000664382e-05, "loss": 1.8195, "step": 4258 }, { "epoch": 0.6871571474669248, "grad_norm": 4.095510959625244, "learning_rate": 2.3788243778481275e-05, "loss": 1.7026, "step": 4259 }, { "epoch": 0.6873184898354308, "grad_norm": 5.651637554168701, "learning_rate": 2.3765997713741374e-05, "loss": 1.9468, "step": 4260 }, { "epoch": 0.6874798322039367, "grad_norm": 4.282829284667969, "learning_rate": 2.3743758812519278e-05, "loss": 1.9029, "step": 4261 }, { "epoch": 0.6876411745724427, "grad_norm": 4.165994644165039, "learning_rate": 2.372152708088756e-05, "loss": 1.8754, "step": 4262 }, { "epoch": 0.6878025169409487, "grad_norm": 3.732685089111328, "learning_rate": 2.369930252491691e-05, "loss": 1.9601, "step": 4263 }, { "epoch": 0.6879638593094547, "grad_norm": 5.241577625274658, "learning_rate": 2.3677085150675994e-05, "loss": 1.765, "step": 4264 }, { "epoch": 0.6881252016779607, "grad_norm": 4.694557189941406, "learning_rate": 2.3654874964231518e-05, "loss": 1.7963, "step": 4265 }, { "epoch": 0.6882865440464666, "grad_norm": 3.5138211250305176, "learning_rate": 2.3632671971648277e-05, "loss": 1.725, "step": 4266 }, { "epoch": 0.6884478864149726, "grad_norm": 4.039546966552734, "learning_rate": 2.3610476178989054e-05, "loss": 1.633, "step": 4267 }, { "epoch": 0.6886092287834785, "grad_norm": 4.485110759735107, "learning_rate": 2.3588287592314717e-05, "loss": 1.7488, "step": 4268 }, { "epoch": 0.6887705711519845, "grad_norm": 3.9109413623809814, "learning_rate": 2.356610621768408e-05, "loss": 1.8953, "step": 4269 }, { "epoch": 0.6889319135204904, "grad_norm": 4.28920841217041, "learning_rate": 2.3543932061154096e-05, "loss": 2.0305, "step": 4270 }, { "epoch": 0.6890932558889965, "grad_norm": 6.500674724578857, "learning_rate": 2.3521765128779643e-05, "loss": 2.0727, "step": 4271 }, { "epoch": 0.6892545982575025, "grad_norm": 4.590287685394287, "learning_rate": 2.349960542661372e-05, "loss": 1.904, "step": 4272 }, { "epoch": 0.6894159406260084, "grad_norm": 3.8235952854156494, "learning_rate": 2.3477452960707285e-05, "loss": 1.6787, "step": 4273 }, { "epoch": 0.6895772829945144, "grad_norm": 3.4124348163604736, "learning_rate": 2.345530773710934e-05, "loss": 1.8031, "step": 4274 }, { "epoch": 0.6897386253630203, "grad_norm": 4.249532222747803, "learning_rate": 2.3433169761866898e-05, "loss": 1.6595, "step": 4275 }, { "epoch": 0.6898999677315263, "grad_norm": 4.482729434967041, "learning_rate": 2.341103904102504e-05, "loss": 1.9365, "step": 4276 }, { "epoch": 0.6900613101000322, "grad_norm": 4.421379566192627, "learning_rate": 2.3388915580626808e-05, "loss": 1.758, "step": 4277 }, { "epoch": 0.6902226524685382, "grad_norm": 6.173230171203613, "learning_rate": 2.3366799386713277e-05, "loss": 1.932, "step": 4278 }, { "epoch": 0.6903839948370442, "grad_norm": 5.280002117156982, "learning_rate": 2.3344690465323583e-05, "loss": 1.9702, "step": 4279 }, { "epoch": 0.6905453372055502, "grad_norm": 4.441904067993164, "learning_rate": 2.332258882249479e-05, "loss": 2.1519, "step": 4280 }, { "epoch": 0.6907066795740562, "grad_norm": 5.471134185791016, "learning_rate": 2.330049446426208e-05, "loss": 2.1167, "step": 4281 }, { "epoch": 0.6908680219425621, "grad_norm": 3.910367250442505, "learning_rate": 2.3278407396658536e-05, "loss": 1.9564, "step": 4282 }, { "epoch": 0.6910293643110681, "grad_norm": 4.43289852142334, "learning_rate": 2.3256327625715347e-05, "loss": 1.8651, "step": 4283 }, { "epoch": 0.691190706679574, "grad_norm": 4.122693061828613, "learning_rate": 2.323425515746164e-05, "loss": 1.791, "step": 4284 }, { "epoch": 0.69135204904808, "grad_norm": 4.674631118774414, "learning_rate": 2.3212189997924594e-05, "loss": 1.9087, "step": 4285 }, { "epoch": 0.691513391416586, "grad_norm": 4.20643424987793, "learning_rate": 2.3190132153129345e-05, "loss": 1.7867, "step": 4286 }, { "epoch": 0.6916747337850919, "grad_norm": 4.392679691314697, "learning_rate": 2.31680816290991e-05, "loss": 1.8619, "step": 4287 }, { "epoch": 0.691836076153598, "grad_norm": 4.6754045486450195, "learning_rate": 2.3146038431854977e-05, "loss": 1.9409, "step": 4288 }, { "epoch": 0.691997418522104, "grad_norm": 4.793563365936279, "learning_rate": 2.3124002567416197e-05, "loss": 1.7599, "step": 4289 }, { "epoch": 0.6921587608906099, "grad_norm": 5.450015544891357, "learning_rate": 2.310197404179989e-05, "loss": 1.7792, "step": 4290 }, { "epoch": 0.6923201032591159, "grad_norm": 4.160528659820557, "learning_rate": 2.307995286102121e-05, "loss": 2.0111, "step": 4291 }, { "epoch": 0.6924814456276218, "grad_norm": 4.624261379241943, "learning_rate": 2.3057939031093344e-05, "loss": 2.035, "step": 4292 }, { "epoch": 0.6926427879961278, "grad_norm": 4.306497097015381, "learning_rate": 2.3035932558027418e-05, "loss": 1.7682, "step": 4293 }, { "epoch": 0.6928041303646337, "grad_norm": 4.892125129699707, "learning_rate": 2.3013933447832574e-05, "loss": 1.7313, "step": 4294 }, { "epoch": 0.6929654727331397, "grad_norm": 4.463599681854248, "learning_rate": 2.2991941706515922e-05, "loss": 1.9946, "step": 4295 }, { "epoch": 0.6931268151016456, "grad_norm": 3.9114396572113037, "learning_rate": 2.296995734008262e-05, "loss": 1.8424, "step": 4296 }, { "epoch": 0.6932881574701517, "grad_norm": 3.474891424179077, "learning_rate": 2.2947980354535726e-05, "loss": 2.013, "step": 4297 }, { "epoch": 0.6934494998386577, "grad_norm": 3.928380250930786, "learning_rate": 2.2926010755876364e-05, "loss": 1.6954, "step": 4298 }, { "epoch": 0.6936108422071636, "grad_norm": 4.158672332763672, "learning_rate": 2.290404855010357e-05, "loss": 2.2009, "step": 4299 }, { "epoch": 0.6937721845756696, "grad_norm": 4.239987850189209, "learning_rate": 2.2882093743214426e-05, "loss": 1.8984, "step": 4300 }, { "epoch": 0.6939335269441755, "grad_norm": 4.503664016723633, "learning_rate": 2.2860146341203937e-05, "loss": 1.7688, "step": 4301 }, { "epoch": 0.6940948693126815, "grad_norm": 4.7965087890625, "learning_rate": 2.2838206350065145e-05, "loss": 1.7498, "step": 4302 }, { "epoch": 0.6942562116811875, "grad_norm": 5.290661811828613, "learning_rate": 2.281627377578901e-05, "loss": 2.0565, "step": 4303 }, { "epoch": 0.6944175540496934, "grad_norm": 3.9973301887512207, "learning_rate": 2.2794348624364476e-05, "loss": 1.9461, "step": 4304 }, { "epoch": 0.6945788964181994, "grad_norm": 5.286022663116455, "learning_rate": 2.2772430901778514e-05, "loss": 1.7823, "step": 4305 }, { "epoch": 0.6947402387867054, "grad_norm": 4.26722526550293, "learning_rate": 2.2750520614015993e-05, "loss": 2.0004, "step": 4306 }, { "epoch": 0.6949015811552114, "grad_norm": 5.046075820922852, "learning_rate": 2.2728617767059824e-05, "loss": 1.8403, "step": 4307 }, { "epoch": 0.6950629235237173, "grad_norm": 4.135906219482422, "learning_rate": 2.2706722366890807e-05, "loss": 1.8142, "step": 4308 }, { "epoch": 0.6952242658922233, "grad_norm": 9.872830390930176, "learning_rate": 2.2684834419487798e-05, "loss": 1.9971, "step": 4309 }, { "epoch": 0.6953856082607293, "grad_norm": 4.156027317047119, "learning_rate": 2.2662953930827546e-05, "loss": 1.7378, "step": 4310 }, { "epoch": 0.6955469506292352, "grad_norm": 4.600287914276123, "learning_rate": 2.2641080906884764e-05, "loss": 1.9674, "step": 4311 }, { "epoch": 0.6957082929977412, "grad_norm": 4.224839210510254, "learning_rate": 2.26192153536322e-05, "loss": 1.6652, "step": 4312 }, { "epoch": 0.6958696353662471, "grad_norm": 4.38108491897583, "learning_rate": 2.2597357277040493e-05, "loss": 1.733, "step": 4313 }, { "epoch": 0.6960309777347532, "grad_norm": 3.771247625350952, "learning_rate": 2.257550668307823e-05, "loss": 1.9172, "step": 4314 }, { "epoch": 0.6961923201032592, "grad_norm": 4.842396259307861, "learning_rate": 2.255366357771203e-05, "loss": 1.8974, "step": 4315 }, { "epoch": 0.6963536624717651, "grad_norm": 4.061734199523926, "learning_rate": 2.253182796690641e-05, "loss": 2.0048, "step": 4316 }, { "epoch": 0.6965150048402711, "grad_norm": 4.278356075286865, "learning_rate": 2.250999985662382e-05, "loss": 1.7021, "step": 4317 }, { "epoch": 0.696676347208777, "grad_norm": 4.616026878356934, "learning_rate": 2.2488179252824747e-05, "loss": 1.7921, "step": 4318 }, { "epoch": 0.696837689577283, "grad_norm": 4.216506481170654, "learning_rate": 2.246636616146753e-05, "loss": 1.9744, "step": 4319 }, { "epoch": 0.6969990319457889, "grad_norm": 3.369396209716797, "learning_rate": 2.2444560588508533e-05, "loss": 1.65, "step": 4320 }, { "epoch": 0.6971603743142949, "grad_norm": 4.403998374938965, "learning_rate": 2.2422762539902013e-05, "loss": 1.9481, "step": 4321 }, { "epoch": 0.6973217166828009, "grad_norm": 4.213418483734131, "learning_rate": 2.2400972021600226e-05, "loss": 2.0417, "step": 4322 }, { "epoch": 0.6974830590513069, "grad_norm": 3.916518211364746, "learning_rate": 2.2379189039553305e-05, "loss": 1.8633, "step": 4323 }, { "epoch": 0.6976444014198129, "grad_norm": 3.631333827972412, "learning_rate": 2.2357413599709402e-05, "loss": 1.9357, "step": 4324 }, { "epoch": 0.6978057437883188, "grad_norm": 4.107703685760498, "learning_rate": 2.233564570801453e-05, "loss": 2.1404, "step": 4325 }, { "epoch": 0.6979670861568248, "grad_norm": 3.8593263626098633, "learning_rate": 2.2313885370412718e-05, "loss": 1.8382, "step": 4326 }, { "epoch": 0.6981284285253307, "grad_norm": 4.466026306152344, "learning_rate": 2.229213259284586e-05, "loss": 2.0149, "step": 4327 }, { "epoch": 0.6982897708938367, "grad_norm": 3.4434030055999756, "learning_rate": 2.227038738125385e-05, "loss": 2.0153, "step": 4328 }, { "epoch": 0.6984511132623427, "grad_norm": 3.6990842819213867, "learning_rate": 2.224864974157447e-05, "loss": 2.0005, "step": 4329 }, { "epoch": 0.6986124556308486, "grad_norm": 3.6912689208984375, "learning_rate": 2.2226919679743453e-05, "loss": 1.7826, "step": 4330 }, { "epoch": 0.6987737979993547, "grad_norm": 4.0772199630737305, "learning_rate": 2.2205197201694446e-05, "loss": 1.9131, "step": 4331 }, { "epoch": 0.6989351403678606, "grad_norm": 3.928609609603882, "learning_rate": 2.2183482313359066e-05, "loss": 2.0697, "step": 4332 }, { "epoch": 0.6990964827363666, "grad_norm": 4.35366153717041, "learning_rate": 2.2161775020666818e-05, "loss": 1.8996, "step": 4333 }, { "epoch": 0.6992578251048726, "grad_norm": 4.93165397644043, "learning_rate": 2.214007532954513e-05, "loss": 1.9748, "step": 4334 }, { "epoch": 0.6994191674733785, "grad_norm": 3.9155633449554443, "learning_rate": 2.2118383245919406e-05, "loss": 1.7677, "step": 4335 }, { "epoch": 0.6995805098418845, "grad_norm": 4.280554294586182, "learning_rate": 2.2096698775712894e-05, "loss": 1.6805, "step": 4336 }, { "epoch": 0.6997418522103904, "grad_norm": 3.150489091873169, "learning_rate": 2.207502192484685e-05, "loss": 1.7984, "step": 4337 }, { "epoch": 0.6999031945788964, "grad_norm": 4.621681213378906, "learning_rate": 2.2053352699240365e-05, "loss": 1.9041, "step": 4338 }, { "epoch": 0.7000645369474023, "grad_norm": 4.467617034912109, "learning_rate": 2.2031691104810525e-05, "loss": 1.9356, "step": 4339 }, { "epoch": 0.7002258793159084, "grad_norm": 4.062320709228516, "learning_rate": 2.201003714747228e-05, "loss": 1.6686, "step": 4340 }, { "epoch": 0.7003872216844144, "grad_norm": 4.962801933288574, "learning_rate": 2.198839083313849e-05, "loss": 2.0948, "step": 4341 }, { "epoch": 0.7005485640529203, "grad_norm": 5.018860340118408, "learning_rate": 2.1966752167719984e-05, "loss": 1.6878, "step": 4342 }, { "epoch": 0.7007099064214263, "grad_norm": 4.773665904998779, "learning_rate": 2.194512115712543e-05, "loss": 1.6959, "step": 4343 }, { "epoch": 0.7008712487899322, "grad_norm": 4.268318176269531, "learning_rate": 2.1923497807261477e-05, "loss": 1.9142, "step": 4344 }, { "epoch": 0.7010325911584382, "grad_norm": 4.606492519378662, "learning_rate": 2.190188212403262e-05, "loss": 2.0696, "step": 4345 }, { "epoch": 0.7011939335269441, "grad_norm": 4.331446647644043, "learning_rate": 2.188027411334131e-05, "loss": 1.8616, "step": 4346 }, { "epoch": 0.7013552758954501, "grad_norm": 5.121551990509033, "learning_rate": 2.1858673781087852e-05, "loss": 1.8384, "step": 4347 }, { "epoch": 0.7015166182639561, "grad_norm": 3.758315324783325, "learning_rate": 2.1837081133170523e-05, "loss": 1.7561, "step": 4348 }, { "epoch": 0.7016779606324621, "grad_norm": 3.490201473236084, "learning_rate": 2.1815496175485434e-05, "loss": 1.7623, "step": 4349 }, { "epoch": 0.7018393030009681, "grad_norm": 5.102977275848389, "learning_rate": 2.1793918913926636e-05, "loss": 1.9809, "step": 4350 }, { "epoch": 0.702000645369474, "grad_norm": 3.9732844829559326, "learning_rate": 2.1772349354386034e-05, "loss": 1.6756, "step": 4351 }, { "epoch": 0.70216198773798, "grad_norm": 6.685610771179199, "learning_rate": 2.1750787502753512e-05, "loss": 2.042, "step": 4352 }, { "epoch": 0.702323330106486, "grad_norm": 4.101576328277588, "learning_rate": 2.1729233364916775e-05, "loss": 1.7987, "step": 4353 }, { "epoch": 0.7024846724749919, "grad_norm": 4.826432228088379, "learning_rate": 2.1707686946761418e-05, "loss": 1.9823, "step": 4354 }, { "epoch": 0.7026460148434979, "grad_norm": 4.119592666625977, "learning_rate": 2.1686148254171013e-05, "loss": 2.2493, "step": 4355 }, { "epoch": 0.7028073572120038, "grad_norm": 4.75625467300415, "learning_rate": 2.1664617293026917e-05, "loss": 2.1533, "step": 4356 }, { "epoch": 0.7029686995805099, "grad_norm": 5.245871543884277, "learning_rate": 2.164309406920846e-05, "loss": 1.8152, "step": 4357 }, { "epoch": 0.7031300419490158, "grad_norm": 4.917736053466797, "learning_rate": 2.1621578588592793e-05, "loss": 1.7487, "step": 4358 }, { "epoch": 0.7032913843175218, "grad_norm": 6.010506629943848, "learning_rate": 2.1600070857055015e-05, "loss": 1.8144, "step": 4359 }, { "epoch": 0.7034527266860278, "grad_norm": 4.053597450256348, "learning_rate": 2.157857088046804e-05, "loss": 1.5957, "step": 4360 }, { "epoch": 0.7036140690545337, "grad_norm": 5.857894420623779, "learning_rate": 2.1557078664702746e-05, "loss": 1.9174, "step": 4361 }, { "epoch": 0.7037754114230397, "grad_norm": 5.7891035079956055, "learning_rate": 2.1535594215627803e-05, "loss": 1.8816, "step": 4362 }, { "epoch": 0.7039367537915456, "grad_norm": 4.0410614013671875, "learning_rate": 2.151411753910984e-05, "loss": 1.7359, "step": 4363 }, { "epoch": 0.7040980961600516, "grad_norm": 4.876594066619873, "learning_rate": 2.14926486410133e-05, "loss": 1.8103, "step": 4364 }, { "epoch": 0.7042594385285575, "grad_norm": 4.136269569396973, "learning_rate": 2.147118752720056e-05, "loss": 1.7087, "step": 4365 }, { "epoch": 0.7044207808970636, "grad_norm": 4.762213230133057, "learning_rate": 2.1449734203531828e-05, "loss": 1.9995, "step": 4366 }, { "epoch": 0.7045821232655696, "grad_norm": 3.8271641731262207, "learning_rate": 2.1428288675865176e-05, "loss": 1.5648, "step": 4367 }, { "epoch": 0.7047434656340755, "grad_norm": 4.143524169921875, "learning_rate": 2.1406850950056612e-05, "loss": 1.822, "step": 4368 }, { "epoch": 0.7049048080025815, "grad_norm": 3.7212743759155273, "learning_rate": 2.1385421031959947e-05, "loss": 1.918, "step": 4369 }, { "epoch": 0.7050661503710874, "grad_norm": 3.9864838123321533, "learning_rate": 2.136399892742687e-05, "loss": 1.7706, "step": 4370 }, { "epoch": 0.7052274927395934, "grad_norm": 5.322200775146484, "learning_rate": 2.1342584642306985e-05, "loss": 1.8825, "step": 4371 }, { "epoch": 0.7053888351080994, "grad_norm": 4.411141395568848, "learning_rate": 2.132117818244771e-05, "loss": 1.6506, "step": 4372 }, { "epoch": 0.7055501774766053, "grad_norm": 3.4371304512023926, "learning_rate": 2.1299779553694323e-05, "loss": 1.9689, "step": 4373 }, { "epoch": 0.7057115198451114, "grad_norm": 5.998043060302734, "learning_rate": 2.1278388761890022e-05, "loss": 1.9898, "step": 4374 }, { "epoch": 0.7058728622136173, "grad_norm": 3.8267035484313965, "learning_rate": 2.125700581287579e-05, "loss": 2.1537, "step": 4375 }, { "epoch": 0.7060342045821233, "grad_norm": 4.702099800109863, "learning_rate": 2.1235630712490538e-05, "loss": 1.7038, "step": 4376 }, { "epoch": 0.7061955469506292, "grad_norm": 5.004813194274902, "learning_rate": 2.1214263466570965e-05, "loss": 1.8292, "step": 4377 }, { "epoch": 0.7063568893191352, "grad_norm": 5.187638282775879, "learning_rate": 2.1192904080951704e-05, "loss": 1.9856, "step": 4378 }, { "epoch": 0.7065182316876412, "grad_norm": 4.296599388122559, "learning_rate": 2.117155256146517e-05, "loss": 1.8487, "step": 4379 }, { "epoch": 0.7066795740561471, "grad_norm": 3.953773260116577, "learning_rate": 2.115020891394165e-05, "loss": 1.8964, "step": 4380 }, { "epoch": 0.7068409164246531, "grad_norm": 4.438882827758789, "learning_rate": 2.1128873144209317e-05, "loss": 1.8821, "step": 4381 }, { "epoch": 0.707002258793159, "grad_norm": 5.1794023513793945, "learning_rate": 2.1107545258094135e-05, "loss": 2.0808, "step": 4382 }, { "epoch": 0.7071636011616651, "grad_norm": 3.880484104156494, "learning_rate": 2.108622526141999e-05, "loss": 1.8642, "step": 4383 }, { "epoch": 0.7073249435301711, "grad_norm": 4.025290489196777, "learning_rate": 2.106491316000852e-05, "loss": 2.0761, "step": 4384 }, { "epoch": 0.707486285898677, "grad_norm": 3.9385993480682373, "learning_rate": 2.10436089596793e-05, "loss": 1.8177, "step": 4385 }, { "epoch": 0.707647628267183, "grad_norm": 3.9777097702026367, "learning_rate": 2.1022312666249665e-05, "loss": 1.8149, "step": 4386 }, { "epoch": 0.7078089706356889, "grad_norm": 3.2477285861968994, "learning_rate": 2.1001024285534878e-05, "loss": 1.9699, "step": 4387 }, { "epoch": 0.7079703130041949, "grad_norm": 4.853583335876465, "learning_rate": 2.0979743823347957e-05, "loss": 1.7118, "step": 4388 }, { "epoch": 0.7081316553727008, "grad_norm": 4.169306755065918, "learning_rate": 2.095847128549981e-05, "loss": 1.7005, "step": 4389 }, { "epoch": 0.7082929977412068, "grad_norm": 4.811184883117676, "learning_rate": 2.0937206677799142e-05, "loss": 1.9682, "step": 4390 }, { "epoch": 0.7084543401097129, "grad_norm": 4.938937664031982, "learning_rate": 2.0915950006052553e-05, "loss": 1.6893, "step": 4391 }, { "epoch": 0.7086156824782188, "grad_norm": 5.426696300506592, "learning_rate": 2.089470127606442e-05, "loss": 2.0481, "step": 4392 }, { "epoch": 0.7087770248467248, "grad_norm": 4.347787857055664, "learning_rate": 2.087346049363696e-05, "loss": 1.8751, "step": 4393 }, { "epoch": 0.7089383672152307, "grad_norm": 5.035215377807617, "learning_rate": 2.085222766457025e-05, "loss": 1.7715, "step": 4394 }, { "epoch": 0.7090997095837367, "grad_norm": 4.536472320556641, "learning_rate": 2.0831002794662157e-05, "loss": 2.015, "step": 4395 }, { "epoch": 0.7092610519522426, "grad_norm": 5.405670642852783, "learning_rate": 2.0809785889708423e-05, "loss": 1.9514, "step": 4396 }, { "epoch": 0.7094223943207486, "grad_norm": 4.638657569885254, "learning_rate": 2.0788576955502547e-05, "loss": 1.8223, "step": 4397 }, { "epoch": 0.7095837366892546, "grad_norm": 5.7009453773498535, "learning_rate": 2.076737599783593e-05, "loss": 1.8343, "step": 4398 }, { "epoch": 0.7097450790577605, "grad_norm": 4.524998664855957, "learning_rate": 2.074618302249772e-05, "loss": 2.0693, "step": 4399 }, { "epoch": 0.7099064214262666, "grad_norm": 5.167919158935547, "learning_rate": 2.0724998035274945e-05, "loss": 1.9377, "step": 4400 }, { "epoch": 0.7100677637947725, "grad_norm": 3.5350277423858643, "learning_rate": 2.0703821041952404e-05, "loss": 1.8358, "step": 4401 }, { "epoch": 0.7102291061632785, "grad_norm": 4.2776055335998535, "learning_rate": 2.0682652048312767e-05, "loss": 2.0817, "step": 4402 }, { "epoch": 0.7103904485317845, "grad_norm": 4.363068103790283, "learning_rate": 2.0661491060136467e-05, "loss": 1.6639, "step": 4403 }, { "epoch": 0.7105517909002904, "grad_norm": 3.997136116027832, "learning_rate": 2.0640338083201766e-05, "loss": 1.7986, "step": 4404 }, { "epoch": 0.7107131332687964, "grad_norm": 4.03304386138916, "learning_rate": 2.061919312328477e-05, "loss": 1.8539, "step": 4405 }, { "epoch": 0.7108744756373023, "grad_norm": 4.127360820770264, "learning_rate": 2.059805618615934e-05, "loss": 1.8774, "step": 4406 }, { "epoch": 0.7110358180058083, "grad_norm": 3.846283197402954, "learning_rate": 2.0576927277597213e-05, "loss": 1.8691, "step": 4407 }, { "epoch": 0.7111971603743142, "grad_norm": 4.539346694946289, "learning_rate": 2.0555806403367878e-05, "loss": 2.0739, "step": 4408 }, { "epoch": 0.7113585027428203, "grad_norm": 5.109747409820557, "learning_rate": 2.053469356923865e-05, "loss": 1.6983, "step": 4409 }, { "epoch": 0.7115198451113263, "grad_norm": 4.709097385406494, "learning_rate": 2.0513588780974637e-05, "loss": 1.749, "step": 4410 }, { "epoch": 0.7116811874798322, "grad_norm": 4.342752456665039, "learning_rate": 2.049249204433879e-05, "loss": 1.9668, "step": 4411 }, { "epoch": 0.7118425298483382, "grad_norm": 4.634512424468994, "learning_rate": 2.04714033650918e-05, "loss": 1.6424, "step": 4412 }, { "epoch": 0.7120038722168441, "grad_norm": 4.253017425537109, "learning_rate": 2.0450322748992224e-05, "loss": 1.7382, "step": 4413 }, { "epoch": 0.7121652145853501, "grad_norm": 3.9589767456054688, "learning_rate": 2.0429250201796358e-05, "loss": 1.7899, "step": 4414 }, { "epoch": 0.712326556953856, "grad_norm": 4.6450629234313965, "learning_rate": 2.0408185729258343e-05, "loss": 1.6952, "step": 4415 }, { "epoch": 0.712487899322362, "grad_norm": 3.912813901901245, "learning_rate": 2.0387129337130083e-05, "loss": 1.7723, "step": 4416 }, { "epoch": 0.7126492416908681, "grad_norm": 3.898334264755249, "learning_rate": 2.0366081031161267e-05, "loss": 1.674, "step": 4417 }, { "epoch": 0.712810584059374, "grad_norm": 5.272930145263672, "learning_rate": 2.034504081709943e-05, "loss": 2.0234, "step": 4418 }, { "epoch": 0.71297192642788, "grad_norm": 4.26560640335083, "learning_rate": 2.0324008700689827e-05, "loss": 1.8778, "step": 4419 }, { "epoch": 0.7131332687963859, "grad_norm": 4.033371925354004, "learning_rate": 2.030298468767557e-05, "loss": 1.8857, "step": 4420 }, { "epoch": 0.7132946111648919, "grad_norm": 5.359758377075195, "learning_rate": 2.0281968783797488e-05, "loss": 1.8295, "step": 4421 }, { "epoch": 0.7134559535333979, "grad_norm": 4.577740669250488, "learning_rate": 2.0260960994794276e-05, "loss": 1.9281, "step": 4422 }, { "epoch": 0.7136172959019038, "grad_norm": 4.2320685386657715, "learning_rate": 2.0239961326402323e-05, "loss": 1.9201, "step": 4423 }, { "epoch": 0.7137786382704098, "grad_norm": 4.954527854919434, "learning_rate": 2.021896978435589e-05, "loss": 2.0447, "step": 4424 }, { "epoch": 0.7139399806389157, "grad_norm": 4.132653713226318, "learning_rate": 2.019798637438694e-05, "loss": 1.8366, "step": 4425 }, { "epoch": 0.7141013230074218, "grad_norm": 3.632702589035034, "learning_rate": 2.017701110222529e-05, "loss": 1.576, "step": 4426 }, { "epoch": 0.7142626653759278, "grad_norm": 4.073683261871338, "learning_rate": 2.0156043973598476e-05, "loss": 1.7663, "step": 4427 }, { "epoch": 0.7144240077444337, "grad_norm": 3.756171226501465, "learning_rate": 2.013508499423183e-05, "loss": 1.9047, "step": 4428 }, { "epoch": 0.7145853501129397, "grad_norm": 5.1460442543029785, "learning_rate": 2.011413416984846e-05, "loss": 2.045, "step": 4429 }, { "epoch": 0.7147466924814456, "grad_norm": 4.379173755645752, "learning_rate": 2.009319150616923e-05, "loss": 1.5792, "step": 4430 }, { "epoch": 0.7149080348499516, "grad_norm": 3.989184856414795, "learning_rate": 2.0072257008912826e-05, "loss": 1.9315, "step": 4431 }, { "epoch": 0.7150693772184575, "grad_norm": 4.4064507484436035, "learning_rate": 2.005133068379564e-05, "loss": 1.7277, "step": 4432 }, { "epoch": 0.7152307195869635, "grad_norm": 3.6643855571746826, "learning_rate": 2.0030412536531895e-05, "loss": 1.8192, "step": 4433 }, { "epoch": 0.7153920619554696, "grad_norm": 5.062724590301514, "learning_rate": 2.000950257283351e-05, "loss": 1.8298, "step": 4434 }, { "epoch": 0.7155534043239755, "grad_norm": 3.734053134918213, "learning_rate": 1.9988600798410258e-05, "loss": 1.949, "step": 4435 }, { "epoch": 0.7157147466924815, "grad_norm": 4.470686435699463, "learning_rate": 1.996770721896957e-05, "loss": 1.6318, "step": 4436 }, { "epoch": 0.7158760890609874, "grad_norm": 4.909954071044922, "learning_rate": 1.9946821840216752e-05, "loss": 1.5582, "step": 4437 }, { "epoch": 0.7160374314294934, "grad_norm": 5.325438499450684, "learning_rate": 1.9925944667854757e-05, "loss": 1.7757, "step": 4438 }, { "epoch": 0.7161987737979993, "grad_norm": 4.973210334777832, "learning_rate": 1.9905075707584407e-05, "loss": 2.0065, "step": 4439 }, { "epoch": 0.7163601161665053, "grad_norm": 4.6979594230651855, "learning_rate": 1.9884214965104194e-05, "loss": 1.7517, "step": 4440 }, { "epoch": 0.7165214585350113, "grad_norm": 4.94175386428833, "learning_rate": 1.9863362446110416e-05, "loss": 1.7648, "step": 4441 }, { "epoch": 0.7166828009035172, "grad_norm": 4.009958267211914, "learning_rate": 1.984251815629712e-05, "loss": 2.2666, "step": 4442 }, { "epoch": 0.7168441432720233, "grad_norm": 3.5523719787597656, "learning_rate": 1.982168210135606e-05, "loss": 1.8394, "step": 4443 }, { "epoch": 0.7170054856405292, "grad_norm": 3.77880859375, "learning_rate": 1.9800854286976815e-05, "loss": 2.2207, "step": 4444 }, { "epoch": 0.7171668280090352, "grad_norm": 5.544381618499756, "learning_rate": 1.978003471884665e-05, "loss": 1.87, "step": 4445 }, { "epoch": 0.7173281703775412, "grad_norm": 4.609133243560791, "learning_rate": 1.9759223402650635e-05, "loss": 1.7028, "step": 4446 }, { "epoch": 0.7174895127460471, "grad_norm": 4.625114440917969, "learning_rate": 1.973842034407154e-05, "loss": 1.9978, "step": 4447 }, { "epoch": 0.7176508551145531, "grad_norm": 5.1108269691467285, "learning_rate": 1.9717625548789893e-05, "loss": 2.0785, "step": 4448 }, { "epoch": 0.717812197483059, "grad_norm": 4.0849480628967285, "learning_rate": 1.969683902248395e-05, "loss": 1.8791, "step": 4449 }, { "epoch": 0.717973539851565, "grad_norm": 3.570565938949585, "learning_rate": 1.9676060770829774e-05, "loss": 1.7057, "step": 4450 }, { "epoch": 0.7181348822200709, "grad_norm": 4.450013160705566, "learning_rate": 1.9655290799501074e-05, "loss": 2.0126, "step": 4451 }, { "epoch": 0.718296224588577, "grad_norm": 5.133970260620117, "learning_rate": 1.9634529114169398e-05, "loss": 1.7061, "step": 4452 }, { "epoch": 0.718457566957083, "grad_norm": 5.0176167488098145, "learning_rate": 1.9613775720503928e-05, "loss": 1.8928, "step": 4453 }, { "epoch": 0.7186189093255889, "grad_norm": 3.944841146469116, "learning_rate": 1.9593030624171683e-05, "loss": 1.9382, "step": 4454 }, { "epoch": 0.7187802516940949, "grad_norm": 5.478379249572754, "learning_rate": 1.957229383083734e-05, "loss": 1.979, "step": 4455 }, { "epoch": 0.7189415940626008, "grad_norm": 4.344667434692383, "learning_rate": 1.9551565346163326e-05, "loss": 1.835, "step": 4456 }, { "epoch": 0.7191029364311068, "grad_norm": 5.0222697257995605, "learning_rate": 1.9530845175809836e-05, "loss": 2.0196, "step": 4457 }, { "epoch": 0.7192642787996127, "grad_norm": 3.5757687091827393, "learning_rate": 1.9510133325434742e-05, "loss": 1.7501, "step": 4458 }, { "epoch": 0.7194256211681187, "grad_norm": 3.818265438079834, "learning_rate": 1.94894298006937e-05, "loss": 1.9179, "step": 4459 }, { "epoch": 0.7195869635366248, "grad_norm": 4.43404483795166, "learning_rate": 1.946873460724003e-05, "loss": 1.8416, "step": 4460 }, { "epoch": 0.7197483059051307, "grad_norm": 4.358422756195068, "learning_rate": 1.944804775072484e-05, "loss": 2.1449, "step": 4461 }, { "epoch": 0.7199096482736367, "grad_norm": 4.856911659240723, "learning_rate": 1.9427369236796905e-05, "loss": 1.7152, "step": 4462 }, { "epoch": 0.7200709906421426, "grad_norm": 4.715914249420166, "learning_rate": 1.9406699071102774e-05, "loss": 1.8595, "step": 4463 }, { "epoch": 0.7202323330106486, "grad_norm": 4.349678993225098, "learning_rate": 1.9386037259286677e-05, "loss": 1.8718, "step": 4464 }, { "epoch": 0.7203936753791546, "grad_norm": 4.57448148727417, "learning_rate": 1.9365383806990562e-05, "loss": 1.9621, "step": 4465 }, { "epoch": 0.7205550177476605, "grad_norm": 4.83892297744751, "learning_rate": 1.9344738719854137e-05, "loss": 1.8222, "step": 4466 }, { "epoch": 0.7207163601161665, "grad_norm": 4.104072570800781, "learning_rate": 1.932410200351479e-05, "loss": 1.8106, "step": 4467 }, { "epoch": 0.7208777024846724, "grad_norm": 4.178228378295898, "learning_rate": 1.930347366360762e-05, "loss": 1.9169, "step": 4468 }, { "epoch": 0.7210390448531785, "grad_norm": 3.7050023078918457, "learning_rate": 1.9282853705765435e-05, "loss": 1.7201, "step": 4469 }, { "epoch": 0.7212003872216844, "grad_norm": 5.124715805053711, "learning_rate": 1.926224213561881e-05, "loss": 2.0132, "step": 4470 }, { "epoch": 0.7213617295901904, "grad_norm": 3.972278594970703, "learning_rate": 1.9241638958795942e-05, "loss": 2.0047, "step": 4471 }, { "epoch": 0.7215230719586964, "grad_norm": 5.051692485809326, "learning_rate": 1.922104418092283e-05, "loss": 1.9983, "step": 4472 }, { "epoch": 0.7216844143272023, "grad_norm": 4.551050186157227, "learning_rate": 1.920045780762309e-05, "loss": 1.8696, "step": 4473 }, { "epoch": 0.7218457566957083, "grad_norm": 4.42860746383667, "learning_rate": 1.917987984451812e-05, "loss": 1.8013, "step": 4474 }, { "epoch": 0.7220070990642142, "grad_norm": 4.822419166564941, "learning_rate": 1.9159310297226957e-05, "loss": 1.8173, "step": 4475 }, { "epoch": 0.7221684414327202, "grad_norm": 5.97450590133667, "learning_rate": 1.9138749171366398e-05, "loss": 1.7555, "step": 4476 }, { "epoch": 0.7223297838012263, "grad_norm": 3.7013676166534424, "learning_rate": 1.911819647255088e-05, "loss": 1.5018, "step": 4477 }, { "epoch": 0.7224911261697322, "grad_norm": 3.790663242340088, "learning_rate": 1.909765220639261e-05, "loss": 1.8122, "step": 4478 }, { "epoch": 0.7226524685382382, "grad_norm": 4.289879322052002, "learning_rate": 1.9077116378501424e-05, "loss": 1.7903, "step": 4479 }, { "epoch": 0.7228138109067441, "grad_norm": 6.576168537139893, "learning_rate": 1.9056588994484877e-05, "loss": 1.8691, "step": 4480 }, { "epoch": 0.7229751532752501, "grad_norm": 3.9365086555480957, "learning_rate": 1.9036070059948252e-05, "loss": 2.0826, "step": 4481 }, { "epoch": 0.723136495643756, "grad_norm": 4.5562310218811035, "learning_rate": 1.901555958049447e-05, "loss": 2.068, "step": 4482 }, { "epoch": 0.723297838012262, "grad_norm": 4.068548202514648, "learning_rate": 1.8995057561724193e-05, "loss": 1.9986, "step": 4483 }, { "epoch": 0.723459180380768, "grad_norm": 5.269343376159668, "learning_rate": 1.897456400923574e-05, "loss": 2.0491, "step": 4484 }, { "epoch": 0.7236205227492739, "grad_norm": 4.068410396575928, "learning_rate": 1.895407892862512e-05, "loss": 1.762, "step": 4485 }, { "epoch": 0.72378186511778, "grad_norm": 3.507288694381714, "learning_rate": 1.893360232548605e-05, "loss": 1.6844, "step": 4486 }, { "epoch": 0.7239432074862859, "grad_norm": 4.270358562469482, "learning_rate": 1.891313420540992e-05, "loss": 1.6996, "step": 4487 }, { "epoch": 0.7241045498547919, "grad_norm": 6.514777660369873, "learning_rate": 1.889267457398578e-05, "loss": 1.744, "step": 4488 }, { "epoch": 0.7242658922232978, "grad_norm": 3.457181692123413, "learning_rate": 1.887222343680041e-05, "loss": 1.6918, "step": 4489 }, { "epoch": 0.7244272345918038, "grad_norm": 3.9298248291015625, "learning_rate": 1.885178079943823e-05, "loss": 1.8281, "step": 4490 }, { "epoch": 0.7245885769603098, "grad_norm": 3.867527723312378, "learning_rate": 1.883134666748137e-05, "loss": 1.8823, "step": 4491 }, { "epoch": 0.7247499193288157, "grad_norm": 4.336234092712402, "learning_rate": 1.8810921046509617e-05, "loss": 1.8338, "step": 4492 }, { "epoch": 0.7249112616973217, "grad_norm": 4.065129280090332, "learning_rate": 1.8790503942100412e-05, "loss": 1.9443, "step": 4493 }, { "epoch": 0.7250726040658277, "grad_norm": 3.868798017501831, "learning_rate": 1.877009535982894e-05, "loss": 1.7561, "step": 4494 }, { "epoch": 0.7252339464343337, "grad_norm": 4.448511123657227, "learning_rate": 1.874969530526797e-05, "loss": 1.7878, "step": 4495 }, { "epoch": 0.7253952888028397, "grad_norm": 3.7267308235168457, "learning_rate": 1.872930378398804e-05, "loss": 1.8618, "step": 4496 }, { "epoch": 0.7255566311713456, "grad_norm": 3.7738239765167236, "learning_rate": 1.8708920801557257e-05, "loss": 1.7849, "step": 4497 }, { "epoch": 0.7257179735398516, "grad_norm": 5.435400485992432, "learning_rate": 1.8688546363541487e-05, "loss": 1.9727, "step": 4498 }, { "epoch": 0.7258793159083575, "grad_norm": 5.157015323638916, "learning_rate": 1.866818047550419e-05, "loss": 1.8461, "step": 4499 }, { "epoch": 0.7260406582768635, "grad_norm": 5.213715553283691, "learning_rate": 1.8647823143006542e-05, "loss": 2.0061, "step": 4500 }, { "epoch": 0.7262020006453694, "grad_norm": 4.5512518882751465, "learning_rate": 1.8627474371607347e-05, "loss": 2.0263, "step": 4501 }, { "epoch": 0.7263633430138754, "grad_norm": 5.680060386657715, "learning_rate": 1.8607134166863112e-05, "loss": 2.3281, "step": 4502 }, { "epoch": 0.7265246853823815, "grad_norm": 4.809805870056152, "learning_rate": 1.858680253432797e-05, "loss": 1.9999, "step": 4503 }, { "epoch": 0.7266860277508874, "grad_norm": 4.11499547958374, "learning_rate": 1.8566479479553715e-05, "loss": 1.7536, "step": 4504 }, { "epoch": 0.7268473701193934, "grad_norm": 4.847894668579102, "learning_rate": 1.8546165008089805e-05, "loss": 2.0412, "step": 4505 }, { "epoch": 0.7270087124878993, "grad_norm": 4.410421848297119, "learning_rate": 1.852585912548338e-05, "loss": 1.6655, "step": 4506 }, { "epoch": 0.7271700548564053, "grad_norm": 4.558352470397949, "learning_rate": 1.8505561837279195e-05, "loss": 1.7886, "step": 4507 }, { "epoch": 0.7273313972249112, "grad_norm": 3.6568939685821533, "learning_rate": 1.8485273149019655e-05, "loss": 1.6887, "step": 4508 }, { "epoch": 0.7274927395934172, "grad_norm": 4.80131196975708, "learning_rate": 1.8464993066244886e-05, "loss": 1.7628, "step": 4509 }, { "epoch": 0.7276540819619232, "grad_norm": 4.128368377685547, "learning_rate": 1.8444721594492558e-05, "loss": 1.886, "step": 4510 }, { "epoch": 0.7278154243304291, "grad_norm": 5.377955436706543, "learning_rate": 1.84244587392981e-05, "loss": 1.9814, "step": 4511 }, { "epoch": 0.7279767666989352, "grad_norm": 4.4759697914123535, "learning_rate": 1.840420450619449e-05, "loss": 1.7729, "step": 4512 }, { "epoch": 0.7281381090674411, "grad_norm": 4.838116645812988, "learning_rate": 1.8383958900712434e-05, "loss": 1.9534, "step": 4513 }, { "epoch": 0.7282994514359471, "grad_norm": 3.3044047355651855, "learning_rate": 1.8363721928380205e-05, "loss": 2.0678, "step": 4514 }, { "epoch": 0.728460793804453, "grad_norm": 4.035040378570557, "learning_rate": 1.8343493594723803e-05, "loss": 1.8531, "step": 4515 }, { "epoch": 0.728622136172959, "grad_norm": 4.368617534637451, "learning_rate": 1.832327390526678e-05, "loss": 1.5639, "step": 4516 }, { "epoch": 0.728783478541465, "grad_norm": 3.922774076461792, "learning_rate": 1.8303062865530406e-05, "loss": 1.9464, "step": 4517 }, { "epoch": 0.7289448209099709, "grad_norm": 3.48451828956604, "learning_rate": 1.8282860481033543e-05, "loss": 1.7091, "step": 4518 }, { "epoch": 0.7291061632784769, "grad_norm": 3.918025016784668, "learning_rate": 1.8262666757292674e-05, "loss": 1.9751, "step": 4519 }, { "epoch": 0.729267505646983, "grad_norm": 4.220199108123779, "learning_rate": 1.824248169982199e-05, "loss": 1.8982, "step": 4520 }, { "epoch": 0.7294288480154889, "grad_norm": 5.191842555999756, "learning_rate": 1.822230531413323e-05, "loss": 2.1131, "step": 4521 }, { "epoch": 0.7295901903839949, "grad_norm": 4.386340618133545, "learning_rate": 1.820213760573584e-05, "loss": 2.1085, "step": 4522 }, { "epoch": 0.7297515327525008, "grad_norm": 4.766170501708984, "learning_rate": 1.818197858013685e-05, "loss": 1.882, "step": 4523 }, { "epoch": 0.7299128751210068, "grad_norm": 4.039060115814209, "learning_rate": 1.8161828242840923e-05, "loss": 1.8015, "step": 4524 }, { "epoch": 0.7300742174895127, "grad_norm": 4.161253929138184, "learning_rate": 1.8141686599350337e-05, "loss": 1.6455, "step": 4525 }, { "epoch": 0.7302355598580187, "grad_norm": 5.185811996459961, "learning_rate": 1.8121553655165057e-05, "loss": 1.6829, "step": 4526 }, { "epoch": 0.7303969022265246, "grad_norm": 4.229522228240967, "learning_rate": 1.8101429415782594e-05, "loss": 2.0075, "step": 4527 }, { "epoch": 0.7305582445950306, "grad_norm": 3.822842597961426, "learning_rate": 1.808131388669816e-05, "loss": 1.7609, "step": 4528 }, { "epoch": 0.7307195869635367, "grad_norm": 4.920767784118652, "learning_rate": 1.8061207073404507e-05, "loss": 1.9016, "step": 4529 }, { "epoch": 0.7308809293320426, "grad_norm": 4.125351428985596, "learning_rate": 1.8041108981392086e-05, "loss": 1.9906, "step": 4530 }, { "epoch": 0.7310422717005486, "grad_norm": 4.810054302215576, "learning_rate": 1.802101961614891e-05, "loss": 1.627, "step": 4531 }, { "epoch": 0.7312036140690545, "grad_norm": 5.05076789855957, "learning_rate": 1.8000938983160608e-05, "loss": 2.0142, "step": 4532 }, { "epoch": 0.7313649564375605, "grad_norm": 4.283308029174805, "learning_rate": 1.7980867087910486e-05, "loss": 1.7554, "step": 4533 }, { "epoch": 0.7315262988060665, "grad_norm": 5.207695484161377, "learning_rate": 1.796080393587939e-05, "loss": 1.584, "step": 4534 }, { "epoch": 0.7316876411745724, "grad_norm": 5.01399564743042, "learning_rate": 1.794074953254583e-05, "loss": 2.0779, "step": 4535 }, { "epoch": 0.7318489835430784, "grad_norm": 3.69757080078125, "learning_rate": 1.7920703883385888e-05, "loss": 1.7488, "step": 4536 }, { "epoch": 0.7320103259115844, "grad_norm": 6.600658416748047, "learning_rate": 1.7900666993873305e-05, "loss": 1.8841, "step": 4537 }, { "epoch": 0.7321716682800904, "grad_norm": 4.896603107452393, "learning_rate": 1.7880638869479365e-05, "loss": 1.8702, "step": 4538 }, { "epoch": 0.7323330106485963, "grad_norm": 3.632988214492798, "learning_rate": 1.7860619515673033e-05, "loss": 1.617, "step": 4539 }, { "epoch": 0.7324943530171023, "grad_norm": 4.568796157836914, "learning_rate": 1.7840608937920804e-05, "loss": 1.8584, "step": 4540 }, { "epoch": 0.7326556953856083, "grad_norm": 3.9885737895965576, "learning_rate": 1.7820607141686846e-05, "loss": 1.8715, "step": 4541 }, { "epoch": 0.7328170377541142, "grad_norm": 5.049192428588867, "learning_rate": 1.780061413243288e-05, "loss": 2.1547, "step": 4542 }, { "epoch": 0.7329783801226202, "grad_norm": 6.493274211883545, "learning_rate": 1.778062991561824e-05, "loss": 2.0009, "step": 4543 }, { "epoch": 0.7331397224911261, "grad_norm": 5.213850975036621, "learning_rate": 1.7760654496699875e-05, "loss": 1.9638, "step": 4544 }, { "epoch": 0.7333010648596321, "grad_norm": 4.086864948272705, "learning_rate": 1.774068788113229e-05, "loss": 1.8733, "step": 4545 }, { "epoch": 0.7334624072281382, "grad_norm": 4.98647928237915, "learning_rate": 1.7720730074367646e-05, "loss": 1.7348, "step": 4546 }, { "epoch": 0.7336237495966441, "grad_norm": 4.155590057373047, "learning_rate": 1.770078108185565e-05, "loss": 1.7009, "step": 4547 }, { "epoch": 0.7337850919651501, "grad_norm": 5.692659378051758, "learning_rate": 1.7680840909043644e-05, "loss": 2.0436, "step": 4548 }, { "epoch": 0.733946434333656, "grad_norm": 3.6536619663238525, "learning_rate": 1.7660909561376504e-05, "loss": 1.9367, "step": 4549 }, { "epoch": 0.734107776702162, "grad_norm": 5.5632171630859375, "learning_rate": 1.764098704429677e-05, "loss": 1.6076, "step": 4550 }, { "epoch": 0.7342691190706679, "grad_norm": 5.216808795928955, "learning_rate": 1.7621073363244488e-05, "loss": 1.8999, "step": 4551 }, { "epoch": 0.7344304614391739, "grad_norm": 4.736422061920166, "learning_rate": 1.760116852365738e-05, "loss": 1.8347, "step": 4552 }, { "epoch": 0.7345918038076799, "grad_norm": 4.6815409660339355, "learning_rate": 1.7581272530970667e-05, "loss": 1.7078, "step": 4553 }, { "epoch": 0.7347531461761858, "grad_norm": 3.6447505950927734, "learning_rate": 1.7561385390617226e-05, "loss": 1.976, "step": 4554 }, { "epoch": 0.7349144885446919, "grad_norm": 3.5308313369750977, "learning_rate": 1.7541507108027466e-05, "loss": 1.7106, "step": 4555 }, { "epoch": 0.7350758309131978, "grad_norm": 9.241874694824219, "learning_rate": 1.7521637688629393e-05, "loss": 1.7049, "step": 4556 }, { "epoch": 0.7352371732817038, "grad_norm": 4.630099296569824, "learning_rate": 1.7501777137848625e-05, "loss": 1.8567, "step": 4557 }, { "epoch": 0.7353985156502097, "grad_norm": 4.028338432312012, "learning_rate": 1.7481925461108295e-05, "loss": 1.7505, "step": 4558 }, { "epoch": 0.7355598580187157, "grad_norm": 7.373607158660889, "learning_rate": 1.746208266382918e-05, "loss": 2.1431, "step": 4559 }, { "epoch": 0.7357212003872217, "grad_norm": 4.728418350219727, "learning_rate": 1.7442248751429574e-05, "loss": 1.8313, "step": 4560 }, { "epoch": 0.7358825427557276, "grad_norm": 4.013697147369385, "learning_rate": 1.7422423729325397e-05, "loss": 1.7527, "step": 4561 }, { "epoch": 0.7360438851242336, "grad_norm": 4.053626537322998, "learning_rate": 1.7402607602930104e-05, "loss": 1.8807, "step": 4562 }, { "epoch": 0.7362052274927396, "grad_norm": 4.765527248382568, "learning_rate": 1.7382800377654727e-05, "loss": 1.9369, "step": 4563 }, { "epoch": 0.7363665698612456, "grad_norm": 4.913549900054932, "learning_rate": 1.7363002058907867e-05, "loss": 1.7338, "step": 4564 }, { "epoch": 0.7365279122297516, "grad_norm": 4.462325572967529, "learning_rate": 1.734321265209572e-05, "loss": 2.0695, "step": 4565 }, { "epoch": 0.7366892545982575, "grad_norm": 4.088284969329834, "learning_rate": 1.7323432162622006e-05, "loss": 1.6925, "step": 4566 }, { "epoch": 0.7368505969667635, "grad_norm": 3.6851367950439453, "learning_rate": 1.730366059588805e-05, "loss": 1.7553, "step": 4567 }, { "epoch": 0.7370119393352694, "grad_norm": 4.4936418533325195, "learning_rate": 1.728389795729272e-05, "loss": 1.9055, "step": 4568 }, { "epoch": 0.7371732817037754, "grad_norm": 3.878669261932373, "learning_rate": 1.7264144252232422e-05, "loss": 2.0015, "step": 4569 }, { "epoch": 0.7373346240722813, "grad_norm": 4.058463096618652, "learning_rate": 1.724439948610119e-05, "loss": 1.846, "step": 4570 }, { "epoch": 0.7374959664407873, "grad_norm": 4.332618713378906, "learning_rate": 1.7224663664290536e-05, "loss": 2.031, "step": 4571 }, { "epoch": 0.7376573088092934, "grad_norm": 3.5738584995269775, "learning_rate": 1.7204936792189607e-05, "loss": 1.8451, "step": 4572 }, { "epoch": 0.7378186511777993, "grad_norm": 5.251870632171631, "learning_rate": 1.7185218875185035e-05, "loss": 1.9784, "step": 4573 }, { "epoch": 0.7379799935463053, "grad_norm": 4.949958324432373, "learning_rate": 1.7165509918661067e-05, "loss": 1.998, "step": 4574 }, { "epoch": 0.7381413359148112, "grad_norm": 4.119620323181152, "learning_rate": 1.7145809927999447e-05, "loss": 1.8491, "step": 4575 }, { "epoch": 0.7383026782833172, "grad_norm": 5.099935054779053, "learning_rate": 1.7126118908579535e-05, "loss": 1.8617, "step": 4576 }, { "epoch": 0.7384640206518231, "grad_norm": 4.226795196533203, "learning_rate": 1.710643686577818e-05, "loss": 1.9372, "step": 4577 }, { "epoch": 0.7386253630203291, "grad_norm": 4.3429107666015625, "learning_rate": 1.708676380496983e-05, "loss": 1.9013, "step": 4578 }, { "epoch": 0.7387867053888351, "grad_norm": 4.148284435272217, "learning_rate": 1.7067099731526444e-05, "loss": 1.939, "step": 4579 }, { "epoch": 0.7389480477573411, "grad_norm": 5.373454570770264, "learning_rate": 1.7047444650817517e-05, "loss": 1.8943, "step": 4580 }, { "epoch": 0.7391093901258471, "grad_norm": 4.9623260498046875, "learning_rate": 1.7027798568210156e-05, "loss": 1.8009, "step": 4581 }, { "epoch": 0.739270732494353, "grad_norm": 3.955017328262329, "learning_rate": 1.700816148906894e-05, "loss": 1.8899, "step": 4582 }, { "epoch": 0.739432074862859, "grad_norm": 4.221274375915527, "learning_rate": 1.698853341875602e-05, "loss": 1.7815, "step": 4583 }, { "epoch": 0.739593417231365, "grad_norm": 3.688870668411255, "learning_rate": 1.6968914362631065e-05, "loss": 1.7886, "step": 4584 }, { "epoch": 0.7397547595998709, "grad_norm": 4.286644458770752, "learning_rate": 1.6949304326051335e-05, "loss": 1.8917, "step": 4585 }, { "epoch": 0.7399161019683769, "grad_norm": 4.146304130554199, "learning_rate": 1.692970331437155e-05, "loss": 1.8755, "step": 4586 }, { "epoch": 0.7400774443368828, "grad_norm": 3.87103533744812, "learning_rate": 1.6910111332944056e-05, "loss": 1.9856, "step": 4587 }, { "epoch": 0.7402387867053888, "grad_norm": 4.180723190307617, "learning_rate": 1.689052838711864e-05, "loss": 1.9044, "step": 4588 }, { "epoch": 0.7404001290738949, "grad_norm": 3.6202878952026367, "learning_rate": 1.6870954482242707e-05, "loss": 1.6294, "step": 4589 }, { "epoch": 0.7405614714424008, "grad_norm": 4.394001483917236, "learning_rate": 1.685138962366112e-05, "loss": 2.2767, "step": 4590 }, { "epoch": 0.7407228138109068, "grad_norm": 4.077112197875977, "learning_rate": 1.683183381671633e-05, "loss": 1.8177, "step": 4591 }, { "epoch": 0.7408841561794127, "grad_norm": 4.718794345855713, "learning_rate": 1.6812287066748262e-05, "loss": 1.889, "step": 4592 }, { "epoch": 0.7410454985479187, "grad_norm": 4.432754993438721, "learning_rate": 1.6792749379094437e-05, "loss": 1.9168, "step": 4593 }, { "epoch": 0.7412068409164246, "grad_norm": 3.764059066772461, "learning_rate": 1.6773220759089835e-05, "loss": 1.9586, "step": 4594 }, { "epoch": 0.7413681832849306, "grad_norm": 4.602880001068115, "learning_rate": 1.6753701212066975e-05, "loss": 1.8579, "step": 4595 }, { "epoch": 0.7415295256534365, "grad_norm": 5.309647083282471, "learning_rate": 1.6734190743355944e-05, "loss": 1.8077, "step": 4596 }, { "epoch": 0.7416908680219426, "grad_norm": 4.698686599731445, "learning_rate": 1.671468935828428e-05, "loss": 1.7992, "step": 4597 }, { "epoch": 0.7418522103904486, "grad_norm": 3.6300902366638184, "learning_rate": 1.6695197062177108e-05, "loss": 1.8038, "step": 4598 }, { "epoch": 0.7420135527589545, "grad_norm": 4.105565547943115, "learning_rate": 1.6675713860357036e-05, "loss": 1.7547, "step": 4599 }, { "epoch": 0.7421748951274605, "grad_norm": 3.8115761280059814, "learning_rate": 1.665623975814416e-05, "loss": 1.9306, "step": 4600 }, { "epoch": 0.7423362374959664, "grad_norm": 3.8606653213500977, "learning_rate": 1.663677476085616e-05, "loss": 1.61, "step": 4601 }, { "epoch": 0.7424975798644724, "grad_norm": 4.54530143737793, "learning_rate": 1.6617318873808184e-05, "loss": 1.9141, "step": 4602 }, { "epoch": 0.7426589222329784, "grad_norm": 4.4852681159973145, "learning_rate": 1.6597872102312885e-05, "loss": 1.7797, "step": 4603 }, { "epoch": 0.7428202646014843, "grad_norm": 5.656156063079834, "learning_rate": 1.6578434451680468e-05, "loss": 1.9747, "step": 4604 }, { "epoch": 0.7429816069699903, "grad_norm": 3.343182325363159, "learning_rate": 1.6559005927218614e-05, "loss": 1.6285, "step": 4605 }, { "epoch": 0.7431429493384963, "grad_norm": 4.80542516708374, "learning_rate": 1.6539586534232504e-05, "loss": 2.0188, "step": 4606 }, { "epoch": 0.7433042917070023, "grad_norm": 3.5260865688323975, "learning_rate": 1.652017627802487e-05, "loss": 1.6938, "step": 4607 }, { "epoch": 0.7434656340755083, "grad_norm": 4.347525596618652, "learning_rate": 1.6500775163895893e-05, "loss": 1.7483, "step": 4608 }, { "epoch": 0.7436269764440142, "grad_norm": 3.326824426651001, "learning_rate": 1.6481383197143325e-05, "loss": 1.8742, "step": 4609 }, { "epoch": 0.7437883188125202, "grad_norm": 4.1977033615112305, "learning_rate": 1.646200038306234e-05, "loss": 1.7538, "step": 4610 }, { "epoch": 0.7439496611810261, "grad_norm": 3.4120020866394043, "learning_rate": 1.6442626726945687e-05, "loss": 1.8996, "step": 4611 }, { "epoch": 0.7441110035495321, "grad_norm": 4.192962646484375, "learning_rate": 1.6423262234083557e-05, "loss": 1.7158, "step": 4612 }, { "epoch": 0.744272345918038, "grad_norm": 4.311139106750488, "learning_rate": 1.640390690976369e-05, "loss": 2.0214, "step": 4613 }, { "epoch": 0.744433688286544, "grad_norm": 4.723755359649658, "learning_rate": 1.6384560759271267e-05, "loss": 1.837, "step": 4614 }, { "epoch": 0.7445950306550501, "grad_norm": 3.802628755569458, "learning_rate": 1.6365223787889017e-05, "loss": 1.8493, "step": 4615 }, { "epoch": 0.744756373023556, "grad_norm": 4.033158779144287, "learning_rate": 1.634589600089712e-05, "loss": 1.8956, "step": 4616 }, { "epoch": 0.744917715392062, "grad_norm": 5.609979629516602, "learning_rate": 1.6326577403573284e-05, "loss": 1.9424, "step": 4617 }, { "epoch": 0.7450790577605679, "grad_norm": 3.737351894378662, "learning_rate": 1.6307268001192688e-05, "loss": 1.8098, "step": 4618 }, { "epoch": 0.7452404001290739, "grad_norm": 4.970017433166504, "learning_rate": 1.6287967799027975e-05, "loss": 1.7239, "step": 4619 }, { "epoch": 0.7454017424975798, "grad_norm": 4.549147605895996, "learning_rate": 1.626867680234934e-05, "loss": 2.1699, "step": 4620 }, { "epoch": 0.7455630848660858, "grad_norm": 5.983554363250732, "learning_rate": 1.6249395016424416e-05, "loss": 1.7461, "step": 4621 }, { "epoch": 0.7457244272345918, "grad_norm": 4.125341892242432, "learning_rate": 1.6230122446518327e-05, "loss": 1.9705, "step": 4622 }, { "epoch": 0.7458857696030978, "grad_norm": 4.720033645629883, "learning_rate": 1.6210859097893667e-05, "loss": 1.8833, "step": 4623 }, { "epoch": 0.7460471119716038, "grad_norm": 3.916238784790039, "learning_rate": 1.6191604975810565e-05, "loss": 1.9698, "step": 4624 }, { "epoch": 0.7462084543401097, "grad_norm": 6.627416133880615, "learning_rate": 1.6172360085526565e-05, "loss": 1.772, "step": 4625 }, { "epoch": 0.7463697967086157, "grad_norm": 4.832714080810547, "learning_rate": 1.615312443229676e-05, "loss": 1.8113, "step": 4626 }, { "epoch": 0.7465311390771217, "grad_norm": 4.001484394073486, "learning_rate": 1.6133898021373646e-05, "loss": 2.0775, "step": 4627 }, { "epoch": 0.7466924814456276, "grad_norm": 3.636345148086548, "learning_rate": 1.6114680858007257e-05, "loss": 1.9637, "step": 4628 }, { "epoch": 0.7468538238141336, "grad_norm": 3.999509572982788, "learning_rate": 1.609547294744505e-05, "loss": 1.8473, "step": 4629 }, { "epoch": 0.7470151661826395, "grad_norm": 5.523262977600098, "learning_rate": 1.6076274294932013e-05, "loss": 1.828, "step": 4630 }, { "epoch": 0.7471765085511455, "grad_norm": 3.6508684158325195, "learning_rate": 1.605708490571056e-05, "loss": 1.7955, "step": 4631 }, { "epoch": 0.7473378509196515, "grad_norm": 4.2339701652526855, "learning_rate": 1.603790478502058e-05, "loss": 1.7679, "step": 4632 }, { "epoch": 0.7474991932881575, "grad_norm": 4.072476387023926, "learning_rate": 1.6018733938099462e-05, "loss": 1.8788, "step": 4633 }, { "epoch": 0.7476605356566635, "grad_norm": 3.755317449569702, "learning_rate": 1.5999572370182016e-05, "loss": 1.6835, "step": 4634 }, { "epoch": 0.7478218780251694, "grad_norm": 4.9306535720825195, "learning_rate": 1.5980420086500575e-05, "loss": 1.844, "step": 4635 }, { "epoch": 0.7479832203936754, "grad_norm": 3.9784042835235596, "learning_rate": 1.5961277092284876e-05, "loss": 1.9675, "step": 4636 }, { "epoch": 0.7481445627621813, "grad_norm": 4.35151481628418, "learning_rate": 1.5942143392762176e-05, "loss": 1.8588, "step": 4637 }, { "epoch": 0.7483059051306873, "grad_norm": 3.947786331176758, "learning_rate": 1.592301899315716e-05, "loss": 2.0314, "step": 4638 }, { "epoch": 0.7484672474991932, "grad_norm": 3.782907724380493, "learning_rate": 1.5903903898691962e-05, "loss": 1.7101, "step": 4639 }, { "epoch": 0.7486285898676993, "grad_norm": 3.448230266571045, "learning_rate": 1.5884798114586226e-05, "loss": 1.7833, "step": 4640 }, { "epoch": 0.7487899322362053, "grad_norm": 4.434009552001953, "learning_rate": 1.5865701646057002e-05, "loss": 1.9052, "step": 4641 }, { "epoch": 0.7489512746047112, "grad_norm": 3.897461414337158, "learning_rate": 1.58466144983188e-05, "loss": 1.7225, "step": 4642 }, { "epoch": 0.7491126169732172, "grad_norm": 3.770840644836426, "learning_rate": 1.5827536676583642e-05, "loss": 1.7474, "step": 4643 }, { "epoch": 0.7492739593417231, "grad_norm": 3.8453481197357178, "learning_rate": 1.5808468186060936e-05, "loss": 1.7936, "step": 4644 }, { "epoch": 0.7494353017102291, "grad_norm": 4.0624680519104, "learning_rate": 1.5789409031957563e-05, "loss": 1.8487, "step": 4645 }, { "epoch": 0.749596644078735, "grad_norm": 3.979088068008423, "learning_rate": 1.5770359219477887e-05, "loss": 1.6695, "step": 4646 }, { "epoch": 0.749757986447241, "grad_norm": 4.707864284515381, "learning_rate": 1.5751318753823658e-05, "loss": 1.9013, "step": 4647 }, { "epoch": 0.749919328815747, "grad_norm": 4.127801895141602, "learning_rate": 1.573228764019415e-05, "loss": 1.6308, "step": 4648 }, { "epoch": 0.750080671184253, "grad_norm": 4.5204572677612305, "learning_rate": 1.5713265883786e-05, "loss": 1.6404, "step": 4649 }, { "epoch": 0.750242013552759, "grad_norm": 5.12779426574707, "learning_rate": 1.5694253489793374e-05, "loss": 2.094, "step": 4650 }, { "epoch": 0.750403355921265, "grad_norm": 3.957977056503296, "learning_rate": 1.5675250463407807e-05, "loss": 1.878, "step": 4651 }, { "epoch": 0.7505646982897709, "grad_norm": 4.045324802398682, "learning_rate": 1.5656256809818342e-05, "loss": 1.8236, "step": 4652 }, { "epoch": 0.7507260406582769, "grad_norm": 4.513662338256836, "learning_rate": 1.56372725342114e-05, "loss": 1.6848, "step": 4653 }, { "epoch": 0.7508873830267828, "grad_norm": 4.850427627563477, "learning_rate": 1.5618297641770895e-05, "loss": 1.5516, "step": 4654 }, { "epoch": 0.7510487253952888, "grad_norm": 3.8891425132751465, "learning_rate": 1.5599332137678137e-05, "loss": 1.8541, "step": 4655 }, { "epoch": 0.7512100677637947, "grad_norm": 4.624578952789307, "learning_rate": 1.558037602711191e-05, "loss": 1.9117, "step": 4656 }, { "epoch": 0.7513714101323007, "grad_norm": 4.53134822845459, "learning_rate": 1.5561429315248406e-05, "loss": 1.9477, "step": 4657 }, { "epoch": 0.7515327525008068, "grad_norm": 5.0841264724731445, "learning_rate": 1.554249200726125e-05, "loss": 1.7465, "step": 4658 }, { "epoch": 0.7516940948693127, "grad_norm": 4.755808353424072, "learning_rate": 1.5523564108321497e-05, "loss": 1.8322, "step": 4659 }, { "epoch": 0.7518554372378187, "grad_norm": 5.515625, "learning_rate": 1.550464562359768e-05, "loss": 1.7347, "step": 4660 }, { "epoch": 0.7520167796063246, "grad_norm": 3.8248002529144287, "learning_rate": 1.5485736558255697e-05, "loss": 1.8377, "step": 4661 }, { "epoch": 0.7521781219748306, "grad_norm": 5.635315418243408, "learning_rate": 1.5466836917458893e-05, "loss": 1.8682, "step": 4662 }, { "epoch": 0.7523394643433365, "grad_norm": 5.133936882019043, "learning_rate": 1.5447946706368084e-05, "loss": 1.8941, "step": 4663 }, { "epoch": 0.7525008067118425, "grad_norm": 3.916292667388916, "learning_rate": 1.5429065930141433e-05, "loss": 1.8327, "step": 4664 }, { "epoch": 0.7526621490803485, "grad_norm": 4.06002950668335, "learning_rate": 1.5410194593934607e-05, "loss": 1.9623, "step": 4665 }, { "epoch": 0.7528234914488545, "grad_norm": 4.704767227172852, "learning_rate": 1.5391332702900625e-05, "loss": 1.8785, "step": 4666 }, { "epoch": 0.7529848338173605, "grad_norm": 3.932387113571167, "learning_rate": 1.5372480262189986e-05, "loss": 1.924, "step": 4667 }, { "epoch": 0.7531461761858664, "grad_norm": 5.115250587463379, "learning_rate": 1.535363727695055e-05, "loss": 1.9074, "step": 4668 }, { "epoch": 0.7533075185543724, "grad_norm": 4.762061595916748, "learning_rate": 1.5334803752327663e-05, "loss": 1.7341, "step": 4669 }, { "epoch": 0.7534688609228783, "grad_norm": 3.636070489883423, "learning_rate": 1.5315979693464037e-05, "loss": 1.466, "step": 4670 }, { "epoch": 0.7536302032913843, "grad_norm": 6.28103494644165, "learning_rate": 1.5297165105499794e-05, "loss": 1.6176, "step": 4671 }, { "epoch": 0.7537915456598903, "grad_norm": 5.284980773925781, "learning_rate": 1.5278359993572517e-05, "loss": 2.0502, "step": 4672 }, { "epoch": 0.7539528880283962, "grad_norm": 4.022558689117432, "learning_rate": 1.5259564362817148e-05, "loss": 1.9754, "step": 4673 }, { "epoch": 0.7541142303969022, "grad_norm": 4.9722185134887695, "learning_rate": 1.5240778218366098e-05, "loss": 1.813, "step": 4674 }, { "epoch": 0.7542755727654082, "grad_norm": 4.032649517059326, "learning_rate": 1.5222001565349114e-05, "loss": 1.7048, "step": 4675 }, { "epoch": 0.7544369151339142, "grad_norm": 4.553777694702148, "learning_rate": 1.5203234408893436e-05, "loss": 2.0578, "step": 4676 }, { "epoch": 0.7545982575024202, "grad_norm": 3.5396218299865723, "learning_rate": 1.5184476754123644e-05, "loss": 1.9123, "step": 4677 }, { "epoch": 0.7547595998709261, "grad_norm": 4.456936359405518, "learning_rate": 1.516572860616175e-05, "loss": 1.8527, "step": 4678 }, { "epoch": 0.7549209422394321, "grad_norm": 4.161593437194824, "learning_rate": 1.5146989970127157e-05, "loss": 1.6447, "step": 4679 }, { "epoch": 0.755082284607938, "grad_norm": 4.463810443878174, "learning_rate": 1.51282608511367e-05, "loss": 1.6882, "step": 4680 }, { "epoch": 0.755243626976444, "grad_norm": 5.106133460998535, "learning_rate": 1.5109541254304587e-05, "loss": 1.8498, "step": 4681 }, { "epoch": 0.7554049693449499, "grad_norm": 5.957087993621826, "learning_rate": 1.5090831184742415e-05, "loss": 1.9037, "step": 4682 }, { "epoch": 0.755566311713456, "grad_norm": 3.988837480545044, "learning_rate": 1.507213064755924e-05, "loss": 1.8145, "step": 4683 }, { "epoch": 0.755727654081962, "grad_norm": 3.7715859413146973, "learning_rate": 1.5053439647861434e-05, "loss": 1.606, "step": 4684 }, { "epoch": 0.7558889964504679, "grad_norm": 5.431835651397705, "learning_rate": 1.5034758190752835e-05, "loss": 2.0316, "step": 4685 }, { "epoch": 0.7560503388189739, "grad_norm": 4.677783012390137, "learning_rate": 1.5016086281334624e-05, "loss": 1.8509, "step": 4686 }, { "epoch": 0.7562116811874798, "grad_norm": 4.996568202972412, "learning_rate": 1.4997423924705417e-05, "loss": 1.9228, "step": 4687 }, { "epoch": 0.7563730235559858, "grad_norm": 5.09377908706665, "learning_rate": 1.4978771125961177e-05, "loss": 1.8016, "step": 4688 }, { "epoch": 0.7565343659244917, "grad_norm": 5.763912677764893, "learning_rate": 1.4960127890195308e-05, "loss": 1.7141, "step": 4689 }, { "epoch": 0.7566957082929977, "grad_norm": 4.399535179138184, "learning_rate": 1.4941494222498543e-05, "loss": 1.7796, "step": 4690 }, { "epoch": 0.7568570506615037, "grad_norm": 3.825272798538208, "learning_rate": 1.4922870127959065e-05, "loss": 1.8581, "step": 4691 }, { "epoch": 0.7570183930300097, "grad_norm": 4.161649227142334, "learning_rate": 1.4904255611662387e-05, "loss": 1.9555, "step": 4692 }, { "epoch": 0.7571797353985157, "grad_norm": 4.9506707191467285, "learning_rate": 1.488565067869146e-05, "loss": 1.7525, "step": 4693 }, { "epoch": 0.7573410777670216, "grad_norm": 3.6513702869415283, "learning_rate": 1.4867055334126578e-05, "loss": 1.7729, "step": 4694 }, { "epoch": 0.7575024201355276, "grad_norm": 3.864866256713867, "learning_rate": 1.4848469583045404e-05, "loss": 1.4987, "step": 4695 }, { "epoch": 0.7576637625040336, "grad_norm": 4.677921772003174, "learning_rate": 1.4829893430523052e-05, "loss": 1.9189, "step": 4696 }, { "epoch": 0.7578251048725395, "grad_norm": 3.988673210144043, "learning_rate": 1.4811326881631937e-05, "loss": 2.0152, "step": 4697 }, { "epoch": 0.7579864472410455, "grad_norm": 5.229036331176758, "learning_rate": 1.4792769941441903e-05, "loss": 1.8561, "step": 4698 }, { "epoch": 0.7581477896095514, "grad_norm": 3.684943675994873, "learning_rate": 1.4774222615020122e-05, "loss": 1.7979, "step": 4699 }, { "epoch": 0.7583091319780574, "grad_norm": 3.7034828662872314, "learning_rate": 1.4755684907431205e-05, "loss": 1.8502, "step": 4700 }, { "epoch": 0.7584704743465635, "grad_norm": 4.172142028808594, "learning_rate": 1.473715682373707e-05, "loss": 1.8207, "step": 4701 }, { "epoch": 0.7586318167150694, "grad_norm": 5.415553092956543, "learning_rate": 1.4718638368997073e-05, "loss": 1.7674, "step": 4702 }, { "epoch": 0.7587931590835754, "grad_norm": 4.393552780151367, "learning_rate": 1.4700129548267872e-05, "loss": 1.844, "step": 4703 }, { "epoch": 0.7589545014520813, "grad_norm": 11.484271049499512, "learning_rate": 1.468163036660356e-05, "loss": 1.9213, "step": 4704 }, { "epoch": 0.7591158438205873, "grad_norm": 3.798029661178589, "learning_rate": 1.4663140829055533e-05, "loss": 1.8191, "step": 4705 }, { "epoch": 0.7592771861890932, "grad_norm": 4.819342136383057, "learning_rate": 1.4644660940672627e-05, "loss": 1.8311, "step": 4706 }, { "epoch": 0.7594385285575992, "grad_norm": 3.9851481914520264, "learning_rate": 1.462619070650098e-05, "loss": 1.7942, "step": 4707 }, { "epoch": 0.7595998709261051, "grad_norm": 5.322359561920166, "learning_rate": 1.4607730131584108e-05, "loss": 1.6912, "step": 4708 }, { "epoch": 0.7597612132946112, "grad_norm": 3.942171812057495, "learning_rate": 1.458927922096292e-05, "loss": 1.8307, "step": 4709 }, { "epoch": 0.7599225556631172, "grad_norm": 5.3993730545043945, "learning_rate": 1.4570837979675644e-05, "loss": 1.9579, "step": 4710 }, { "epoch": 0.7600838980316231, "grad_norm": 4.565049171447754, "learning_rate": 1.4552406412757913e-05, "loss": 1.7947, "step": 4711 }, { "epoch": 0.7602452404001291, "grad_norm": 3.4901652336120605, "learning_rate": 1.4533984525242667e-05, "loss": 1.8477, "step": 4712 }, { "epoch": 0.760406582768635, "grad_norm": 6.0585222244262695, "learning_rate": 1.4515572322160254e-05, "loss": 1.8443, "step": 4713 }, { "epoch": 0.760567925137141, "grad_norm": 3.3346543312072754, "learning_rate": 1.4497169808538325e-05, "loss": 1.8413, "step": 4714 }, { "epoch": 0.760729267505647, "grad_norm": 5.118963718414307, "learning_rate": 1.4478776989401949e-05, "loss": 1.8299, "step": 4715 }, { "epoch": 0.7608906098741529, "grad_norm": 3.9776089191436768, "learning_rate": 1.4460393869773492e-05, "loss": 1.8031, "step": 4716 }, { "epoch": 0.7610519522426589, "grad_norm": 4.204662799835205, "learning_rate": 1.444202045467269e-05, "loss": 1.8442, "step": 4717 }, { "epoch": 0.7612132946111649, "grad_norm": 3.9457521438598633, "learning_rate": 1.4423656749116621e-05, "loss": 1.8611, "step": 4718 }, { "epoch": 0.7613746369796709, "grad_norm": 3.5638539791107178, "learning_rate": 1.4405302758119743e-05, "loss": 1.8136, "step": 4719 }, { "epoch": 0.7615359793481769, "grad_norm": 4.715579986572266, "learning_rate": 1.4386958486693835e-05, "loss": 1.907, "step": 4720 }, { "epoch": 0.7616973217166828, "grad_norm": 4.191335678100586, "learning_rate": 1.4368623939848003e-05, "loss": 1.9415, "step": 4721 }, { "epoch": 0.7618586640851888, "grad_norm": 4.509104251861572, "learning_rate": 1.435029912258875e-05, "loss": 1.9443, "step": 4722 }, { "epoch": 0.7620200064536947, "grad_norm": 4.599697113037109, "learning_rate": 1.4331984039919877e-05, "loss": 1.9331, "step": 4723 }, { "epoch": 0.7621813488222007, "grad_norm": 4.296796798706055, "learning_rate": 1.4313678696842559e-05, "loss": 1.9413, "step": 4724 }, { "epoch": 0.7623426911907066, "grad_norm": 5.785429000854492, "learning_rate": 1.4295383098355264e-05, "loss": 2.0427, "step": 4725 }, { "epoch": 0.7625040335592127, "grad_norm": 5.768642425537109, "learning_rate": 1.4277097249453874e-05, "loss": 1.974, "step": 4726 }, { "epoch": 0.7626653759277187, "grad_norm": 4.165225982666016, "learning_rate": 1.425882115513153e-05, "loss": 1.6863, "step": 4727 }, { "epoch": 0.7628267182962246, "grad_norm": 4.48101282119751, "learning_rate": 1.4240554820378772e-05, "loss": 1.8847, "step": 4728 }, { "epoch": 0.7629880606647306, "grad_norm": 5.002747058868408, "learning_rate": 1.4222298250183413e-05, "loss": 1.7281, "step": 4729 }, { "epoch": 0.7631494030332365, "grad_norm": 4.607141017913818, "learning_rate": 1.4204051449530676e-05, "loss": 1.7632, "step": 4730 }, { "epoch": 0.7633107454017425, "grad_norm": 6.016432285308838, "learning_rate": 1.4185814423403038e-05, "loss": 2.0498, "step": 4731 }, { "epoch": 0.7634720877702484, "grad_norm": 4.647368907928467, "learning_rate": 1.4167587176780378e-05, "loss": 1.7946, "step": 4732 }, { "epoch": 0.7636334301387544, "grad_norm": 4.305753231048584, "learning_rate": 1.4149369714639853e-05, "loss": 1.8973, "step": 4733 }, { "epoch": 0.7637947725072604, "grad_norm": 4.50528621673584, "learning_rate": 1.4131162041955948e-05, "loss": 1.9, "step": 4734 }, { "epoch": 0.7639561148757664, "grad_norm": 5.1306562423706055, "learning_rate": 1.4112964163700527e-05, "loss": 2.0559, "step": 4735 }, { "epoch": 0.7641174572442724, "grad_norm": 4.5340576171875, "learning_rate": 1.4094776084842725e-05, "loss": 1.6647, "step": 4736 }, { "epoch": 0.7642787996127783, "grad_norm": 4.456101417541504, "learning_rate": 1.407659781034903e-05, "loss": 1.6651, "step": 4737 }, { "epoch": 0.7644401419812843, "grad_norm": 3.8839051723480225, "learning_rate": 1.405842934518322e-05, "loss": 1.9903, "step": 4738 }, { "epoch": 0.7646014843497903, "grad_norm": 4.24583101272583, "learning_rate": 1.4040270694306457e-05, "loss": 1.789, "step": 4739 }, { "epoch": 0.7647628267182962, "grad_norm": 4.413949012756348, "learning_rate": 1.402212186267714e-05, "loss": 1.8537, "step": 4740 }, { "epoch": 0.7649241690868022, "grad_norm": 5.955898761749268, "learning_rate": 1.400398285525108e-05, "loss": 1.8682, "step": 4741 }, { "epoch": 0.7650855114553081, "grad_norm": 4.545259475708008, "learning_rate": 1.3985853676981314e-05, "loss": 1.8026, "step": 4742 }, { "epoch": 0.7652468538238142, "grad_norm": 4.107945919036865, "learning_rate": 1.3967734332818266e-05, "loss": 1.6435, "step": 4743 }, { "epoch": 0.7654081961923201, "grad_norm": 4.146108150482178, "learning_rate": 1.394962482770964e-05, "loss": 2.089, "step": 4744 }, { "epoch": 0.7655695385608261, "grad_norm": 4.125722885131836, "learning_rate": 1.3931525166600446e-05, "loss": 1.9152, "step": 4745 }, { "epoch": 0.7657308809293321, "grad_norm": 3.9361684322357178, "learning_rate": 1.3913435354433036e-05, "loss": 1.7265, "step": 4746 }, { "epoch": 0.765892223297838, "grad_norm": 4.01522159576416, "learning_rate": 1.3895355396147041e-05, "loss": 2.022, "step": 4747 }, { "epoch": 0.766053565666344, "grad_norm": 4.31265115737915, "learning_rate": 1.3877285296679438e-05, "loss": 1.9056, "step": 4748 }, { "epoch": 0.7662149080348499, "grad_norm": 3.7925353050231934, "learning_rate": 1.3859225060964459e-05, "loss": 1.7249, "step": 4749 }, { "epoch": 0.7663762504033559, "grad_norm": 3.631206512451172, "learning_rate": 1.3841174693933712e-05, "loss": 1.9949, "step": 4750 }, { "epoch": 0.7665375927718618, "grad_norm": 4.69987154006958, "learning_rate": 1.382313420051604e-05, "loss": 1.7979, "step": 4751 }, { "epoch": 0.7666989351403679, "grad_norm": 4.0977091789245605, "learning_rate": 1.3805103585637647e-05, "loss": 1.701, "step": 4752 }, { "epoch": 0.7668602775088739, "grad_norm": 4.90214204788208, "learning_rate": 1.3787082854222005e-05, "loss": 1.676, "step": 4753 }, { "epoch": 0.7670216198773798, "grad_norm": 4.234584331512451, "learning_rate": 1.3769072011189876e-05, "loss": 1.7801, "step": 4754 }, { "epoch": 0.7671829622458858, "grad_norm": 4.4239912033081055, "learning_rate": 1.3751071061459381e-05, "loss": 2.016, "step": 4755 }, { "epoch": 0.7673443046143917, "grad_norm": 4.1023454666137695, "learning_rate": 1.373308000994588e-05, "loss": 1.9956, "step": 4756 }, { "epoch": 0.7675056469828977, "grad_norm": 4.720330238342285, "learning_rate": 1.3715098861562059e-05, "loss": 1.6989, "step": 4757 }, { "epoch": 0.7676669893514037, "grad_norm": 4.460195064544678, "learning_rate": 1.3697127621217865e-05, "loss": 2.011, "step": 4758 }, { "epoch": 0.7678283317199096, "grad_norm": 3.4943008422851562, "learning_rate": 1.3679166293820606e-05, "loss": 1.9777, "step": 4759 }, { "epoch": 0.7679896740884156, "grad_norm": 4.435558319091797, "learning_rate": 1.366121488427481e-05, "loss": 1.7261, "step": 4760 }, { "epoch": 0.7681510164569216, "grad_norm": 4.208644866943359, "learning_rate": 1.3643273397482365e-05, "loss": 1.7009, "step": 4761 }, { "epoch": 0.7683123588254276, "grad_norm": 4.223933696746826, "learning_rate": 1.3625341838342376e-05, "loss": 1.7992, "step": 4762 }, { "epoch": 0.7684737011939335, "grad_norm": 4.545698165893555, "learning_rate": 1.3607420211751321e-05, "loss": 1.8289, "step": 4763 }, { "epoch": 0.7686350435624395, "grad_norm": 3.726844549179077, "learning_rate": 1.3589508522602873e-05, "loss": 1.8346, "step": 4764 }, { "epoch": 0.7687963859309455, "grad_norm": 4.714771270751953, "learning_rate": 1.3571606775788087e-05, "loss": 1.5316, "step": 4765 }, { "epoch": 0.7689577282994514, "grad_norm": 4.0585103034973145, "learning_rate": 1.3553714976195214e-05, "loss": 1.7228, "step": 4766 }, { "epoch": 0.7691190706679574, "grad_norm": 4.517932415008545, "learning_rate": 1.3535833128709869e-05, "loss": 1.8709, "step": 4767 }, { "epoch": 0.7692804130364633, "grad_norm": 5.770853519439697, "learning_rate": 1.351796123821487e-05, "loss": 1.9515, "step": 4768 }, { "epoch": 0.7694417554049694, "grad_norm": 4.701601982116699, "learning_rate": 1.3500099309590397e-05, "loss": 1.92, "step": 4769 }, { "epoch": 0.7696030977734754, "grad_norm": 4.671560287475586, "learning_rate": 1.348224734771385e-05, "loss": 1.8667, "step": 4770 }, { "epoch": 0.7697644401419813, "grad_norm": 4.16002893447876, "learning_rate": 1.346440535745992e-05, "loss": 1.7221, "step": 4771 }, { "epoch": 0.7699257825104873, "grad_norm": 4.713809013366699, "learning_rate": 1.3446573343700597e-05, "loss": 1.7392, "step": 4772 }, { "epoch": 0.7700871248789932, "grad_norm": 4.215362071990967, "learning_rate": 1.3428751311305132e-05, "loss": 1.9662, "step": 4773 }, { "epoch": 0.7702484672474992, "grad_norm": 4.4590253829956055, "learning_rate": 1.3410939265140027e-05, "loss": 1.7877, "step": 4774 }, { "epoch": 0.7704098096160051, "grad_norm": 3.882617950439453, "learning_rate": 1.3393137210069118e-05, "loss": 1.6606, "step": 4775 }, { "epoch": 0.7705711519845111, "grad_norm": 3.123530626296997, "learning_rate": 1.337534515095345e-05, "loss": 1.9039, "step": 4776 }, { "epoch": 0.770732494353017, "grad_norm": 5.224033832550049, "learning_rate": 1.335756309265136e-05, "loss": 1.7497, "step": 4777 }, { "epoch": 0.7708938367215231, "grad_norm": 4.352362155914307, "learning_rate": 1.3339791040018479e-05, "loss": 1.8906, "step": 4778 }, { "epoch": 0.7710551790900291, "grad_norm": 4.679987907409668, "learning_rate": 1.3322028997907666e-05, "loss": 1.9451, "step": 4779 }, { "epoch": 0.771216521458535, "grad_norm": 4.4323883056640625, "learning_rate": 1.3304276971169088e-05, "loss": 1.9463, "step": 4780 }, { "epoch": 0.771377863827041, "grad_norm": 4.972047805786133, "learning_rate": 1.3286534964650121e-05, "loss": 1.8902, "step": 4781 }, { "epoch": 0.771539206195547, "grad_norm": 4.959163665771484, "learning_rate": 1.3268802983195484e-05, "loss": 1.9566, "step": 4782 }, { "epoch": 0.7717005485640529, "grad_norm": 4.445774078369141, "learning_rate": 1.3251081031647078e-05, "loss": 1.8361, "step": 4783 }, { "epoch": 0.7718618909325589, "grad_norm": 5.686622142791748, "learning_rate": 1.3233369114844101e-05, "loss": 1.9, "step": 4784 }, { "epoch": 0.7720232333010648, "grad_norm": 4.390459060668945, "learning_rate": 1.3215667237623036e-05, "loss": 1.6565, "step": 4785 }, { "epoch": 0.7721845756695709, "grad_norm": 4.644404888153076, "learning_rate": 1.3197975404817564e-05, "loss": 1.7143, "step": 4786 }, { "epoch": 0.7723459180380768, "grad_norm": 4.473944664001465, "learning_rate": 1.3180293621258694e-05, "loss": 1.9453, "step": 4787 }, { "epoch": 0.7725072604065828, "grad_norm": 5.86933708190918, "learning_rate": 1.3162621891774617e-05, "loss": 1.7406, "step": 4788 }, { "epoch": 0.7726686027750888, "grad_norm": 3.9441027641296387, "learning_rate": 1.3144960221190861e-05, "loss": 1.6576, "step": 4789 }, { "epoch": 0.7728299451435947, "grad_norm": 4.592226028442383, "learning_rate": 1.3127308614330119e-05, "loss": 1.7783, "step": 4790 }, { "epoch": 0.7729912875121007, "grad_norm": 3.8888795375823975, "learning_rate": 1.3109667076012417e-05, "loss": 1.8405, "step": 4791 }, { "epoch": 0.7731526298806066, "grad_norm": 8.0985689163208, "learning_rate": 1.3092035611054976e-05, "loss": 2.0734, "step": 4792 }, { "epoch": 0.7733139722491126, "grad_norm": 4.424942970275879, "learning_rate": 1.3074414224272286e-05, "loss": 1.8247, "step": 4793 }, { "epoch": 0.7734753146176185, "grad_norm": 4.277080535888672, "learning_rate": 1.3056802920476075e-05, "loss": 1.8728, "step": 4794 }, { "epoch": 0.7736366569861246, "grad_norm": 6.436392784118652, "learning_rate": 1.3039201704475345e-05, "loss": 1.9738, "step": 4795 }, { "epoch": 0.7737979993546306, "grad_norm": 4.508710861206055, "learning_rate": 1.3021610581076316e-05, "loss": 2.0282, "step": 4796 }, { "epoch": 0.7739593417231365, "grad_norm": 5.0215582847595215, "learning_rate": 1.3004029555082453e-05, "loss": 1.5631, "step": 4797 }, { "epoch": 0.7741206840916425, "grad_norm": 4.453874588012695, "learning_rate": 1.2986458631294491e-05, "loss": 1.8881, "step": 4798 }, { "epoch": 0.7742820264601484, "grad_norm": 4.145251750946045, "learning_rate": 1.296889781451036e-05, "loss": 1.8948, "step": 4799 }, { "epoch": 0.7744433688286544, "grad_norm": 3.951270818710327, "learning_rate": 1.2951347109525291e-05, "loss": 1.8586, "step": 4800 }, { "epoch": 0.7746047111971603, "grad_norm": 5.916871070861816, "learning_rate": 1.2933806521131692e-05, "loss": 2.2301, "step": 4801 }, { "epoch": 0.7747660535656663, "grad_norm": 4.65308952331543, "learning_rate": 1.2916276054119259e-05, "loss": 1.8263, "step": 4802 }, { "epoch": 0.7749273959341723, "grad_norm": 4.195945739746094, "learning_rate": 1.2898755713274879e-05, "loss": 1.8826, "step": 4803 }, { "epoch": 0.7750887383026783, "grad_norm": 5.570291996002197, "learning_rate": 1.2881245503382722e-05, "loss": 1.7518, "step": 4804 }, { "epoch": 0.7752500806711843, "grad_norm": 4.299945831298828, "learning_rate": 1.2863745429224144e-05, "loss": 1.8924, "step": 4805 }, { "epoch": 0.7754114230396902, "grad_norm": 3.5652012825012207, "learning_rate": 1.2846255495577774e-05, "loss": 1.6966, "step": 4806 }, { "epoch": 0.7755727654081962, "grad_norm": 4.118142127990723, "learning_rate": 1.2828775707219442e-05, "loss": 1.6356, "step": 4807 }, { "epoch": 0.7757341077767022, "grad_norm": 3.89166522026062, "learning_rate": 1.281130606892223e-05, "loss": 1.9345, "step": 4808 }, { "epoch": 0.7758954501452081, "grad_norm": 5.561777591705322, "learning_rate": 1.2793846585456437e-05, "loss": 2.08, "step": 4809 }, { "epoch": 0.7760567925137141, "grad_norm": 3.5589771270751953, "learning_rate": 1.2776397261589573e-05, "loss": 1.6954, "step": 4810 }, { "epoch": 0.77621813488222, "grad_norm": 4.224887847900391, "learning_rate": 1.2758958102086416e-05, "loss": 1.7887, "step": 4811 }, { "epoch": 0.7763794772507261, "grad_norm": 4.228940010070801, "learning_rate": 1.2741529111708934e-05, "loss": 1.904, "step": 4812 }, { "epoch": 0.776540819619232, "grad_norm": 5.163037300109863, "learning_rate": 1.2724110295216301e-05, "loss": 1.9206, "step": 4813 }, { "epoch": 0.776702161987738, "grad_norm": 5.459940433502197, "learning_rate": 1.2706701657364988e-05, "loss": 1.79, "step": 4814 }, { "epoch": 0.776863504356244, "grad_norm": 5.294158935546875, "learning_rate": 1.2689303202908608e-05, "loss": 1.907, "step": 4815 }, { "epoch": 0.7770248467247499, "grad_norm": 4.8452348709106445, "learning_rate": 1.2671914936598018e-05, "loss": 1.9486, "step": 4816 }, { "epoch": 0.7771861890932559, "grad_norm": 4.328494548797607, "learning_rate": 1.2654536863181326e-05, "loss": 1.8546, "step": 4817 }, { "epoch": 0.7773475314617618, "grad_norm": 4.225079536437988, "learning_rate": 1.2637168987403797e-05, "loss": 1.8669, "step": 4818 }, { "epoch": 0.7775088738302678, "grad_norm": 6.0057244300842285, "learning_rate": 1.2619811314007974e-05, "loss": 1.8974, "step": 4819 }, { "epoch": 0.7776702161987737, "grad_norm": 6.11793851852417, "learning_rate": 1.260246384773357e-05, "loss": 1.9355, "step": 4820 }, { "epoch": 0.7778315585672798, "grad_norm": 4.3321075439453125, "learning_rate": 1.258512659331751e-05, "loss": 1.9473, "step": 4821 }, { "epoch": 0.7779929009357858, "grad_norm": 4.267263412475586, "learning_rate": 1.256779955549397e-05, "loss": 1.7181, "step": 4822 }, { "epoch": 0.7781542433042917, "grad_norm": 5.465700149536133, "learning_rate": 1.2550482738994285e-05, "loss": 1.9786, "step": 4823 }, { "epoch": 0.7783155856727977, "grad_norm": 4.487921714782715, "learning_rate": 1.253317614854706e-05, "loss": 1.9194, "step": 4824 }, { "epoch": 0.7784769280413036, "grad_norm": 5.506918907165527, "learning_rate": 1.2515879788878038e-05, "loss": 2.0618, "step": 4825 }, { "epoch": 0.7786382704098096, "grad_norm": 4.063748836517334, "learning_rate": 1.2498593664710234e-05, "loss": 1.7781, "step": 4826 }, { "epoch": 0.7787996127783156, "grad_norm": 4.7127909660339355, "learning_rate": 1.2481317780763802e-05, "loss": 2.0239, "step": 4827 }, { "epoch": 0.7789609551468215, "grad_norm": 4.559621334075928, "learning_rate": 1.2464052141756177e-05, "loss": 2.127, "step": 4828 }, { "epoch": 0.7791222975153276, "grad_norm": 4.137547016143799, "learning_rate": 1.2446796752401913e-05, "loss": 1.9814, "step": 4829 }, { "epoch": 0.7792836398838335, "grad_norm": 5.029731273651123, "learning_rate": 1.2429551617412844e-05, "loss": 1.7936, "step": 4830 }, { "epoch": 0.7794449822523395, "grad_norm": 5.414242267608643, "learning_rate": 1.2412316741497953e-05, "loss": 2.129, "step": 4831 }, { "epoch": 0.7796063246208454, "grad_norm": 3.9555225372314453, "learning_rate": 1.2395092129363428e-05, "loss": 2.1969, "step": 4832 }, { "epoch": 0.7797676669893514, "grad_norm": 3.4828078746795654, "learning_rate": 1.2377877785712649e-05, "loss": 1.8161, "step": 4833 }, { "epoch": 0.7799290093578574, "grad_norm": 3.443568706512451, "learning_rate": 1.236067371524624e-05, "loss": 1.5697, "step": 4834 }, { "epoch": 0.7800903517263633, "grad_norm": 4.887693405151367, "learning_rate": 1.2343479922661965e-05, "loss": 1.835, "step": 4835 }, { "epoch": 0.7802516940948693, "grad_norm": 4.772017478942871, "learning_rate": 1.2326296412654787e-05, "loss": 1.9179, "step": 4836 }, { "epoch": 0.7804130364633752, "grad_norm": 5.858239650726318, "learning_rate": 1.2309123189916904e-05, "loss": 1.7846, "step": 4837 }, { "epoch": 0.7805743788318813, "grad_norm": 4.29452657699585, "learning_rate": 1.2291960259137647e-05, "loss": 1.9617, "step": 4838 }, { "epoch": 0.7807357212003873, "grad_norm": 4.060075283050537, "learning_rate": 1.2274807625003598e-05, "loss": 1.8724, "step": 4839 }, { "epoch": 0.7808970635688932, "grad_norm": 4.429147720336914, "learning_rate": 1.2257665292198461e-05, "loss": 2.0031, "step": 4840 }, { "epoch": 0.7810584059373992, "grad_norm": 3.9962515830993652, "learning_rate": 1.2240533265403198e-05, "loss": 1.7474, "step": 4841 }, { "epoch": 0.7812197483059051, "grad_norm": 3.854008197784424, "learning_rate": 1.2223411549295888e-05, "loss": 1.8632, "step": 4842 }, { "epoch": 0.7813810906744111, "grad_norm": 3.9905378818511963, "learning_rate": 1.2206300148551848e-05, "loss": 1.8312, "step": 4843 }, { "epoch": 0.781542433042917, "grad_norm": 4.155596733093262, "learning_rate": 1.2189199067843538e-05, "loss": 2.0119, "step": 4844 }, { "epoch": 0.781703775411423, "grad_norm": 3.753019332885742, "learning_rate": 1.2172108311840641e-05, "loss": 1.9925, "step": 4845 }, { "epoch": 0.7818651177799291, "grad_norm": 4.133620262145996, "learning_rate": 1.2155027885209991e-05, "loss": 1.7772, "step": 4846 }, { "epoch": 0.782026460148435, "grad_norm": 4.011654853820801, "learning_rate": 1.213795779261559e-05, "loss": 1.6683, "step": 4847 }, { "epoch": 0.782187802516941, "grad_norm": 4.1891679763793945, "learning_rate": 1.212089803871867e-05, "loss": 1.8676, "step": 4848 }, { "epoch": 0.7823491448854469, "grad_norm": 4.765373706817627, "learning_rate": 1.2103848628177573e-05, "loss": 1.952, "step": 4849 }, { "epoch": 0.7825104872539529, "grad_norm": 4.962911128997803, "learning_rate": 1.2086809565647878e-05, "loss": 1.8573, "step": 4850 }, { "epoch": 0.7826718296224588, "grad_norm": 3.755892276763916, "learning_rate": 1.2069780855782304e-05, "loss": 1.7528, "step": 4851 }, { "epoch": 0.7828331719909648, "grad_norm": 5.934349060058594, "learning_rate": 1.2052762503230746e-05, "loss": 1.6815, "step": 4852 }, { "epoch": 0.7829945143594708, "grad_norm": 4.053897857666016, "learning_rate": 1.2035754512640262e-05, "loss": 1.7697, "step": 4853 }, { "epoch": 0.7831558567279767, "grad_norm": 4.091436386108398, "learning_rate": 1.2018756888655125e-05, "loss": 1.9912, "step": 4854 }, { "epoch": 0.7833171990964828, "grad_norm": 4.620340347290039, "learning_rate": 1.200176963591671e-05, "loss": 1.8034, "step": 4855 }, { "epoch": 0.7834785414649887, "grad_norm": 4.518535137176514, "learning_rate": 1.198479275906363e-05, "loss": 1.6941, "step": 4856 }, { "epoch": 0.7836398838334947, "grad_norm": 6.105302810668945, "learning_rate": 1.1967826262731602e-05, "loss": 1.884, "step": 4857 }, { "epoch": 0.7838012262020007, "grad_norm": 5.046043395996094, "learning_rate": 1.1950870151553561e-05, "loss": 2.0027, "step": 4858 }, { "epoch": 0.7839625685705066, "grad_norm": 4.713179588317871, "learning_rate": 1.1933924430159572e-05, "loss": 1.767, "step": 4859 }, { "epoch": 0.7841239109390126, "grad_norm": 5.794550895690918, "learning_rate": 1.1916989103176856e-05, "loss": 1.9028, "step": 4860 }, { "epoch": 0.7842852533075185, "grad_norm": 4.4063825607299805, "learning_rate": 1.1900064175229847e-05, "loss": 1.7676, "step": 4861 }, { "epoch": 0.7844465956760245, "grad_norm": 4.139499664306641, "learning_rate": 1.1883149650940074e-05, "loss": 1.9148, "step": 4862 }, { "epoch": 0.7846079380445304, "grad_norm": 5.17821741104126, "learning_rate": 1.186624553492628e-05, "loss": 1.7091, "step": 4863 }, { "epoch": 0.7847692804130365, "grad_norm": 3.906259059906006, "learning_rate": 1.1849351831804318e-05, "loss": 1.6162, "step": 4864 }, { "epoch": 0.7849306227815425, "grad_norm": 4.134382724761963, "learning_rate": 1.1832468546187247e-05, "loss": 1.6765, "step": 4865 }, { "epoch": 0.7850919651500484, "grad_norm": 4.796937465667725, "learning_rate": 1.1815595682685237e-05, "loss": 1.8752, "step": 4866 }, { "epoch": 0.7852533075185544, "grad_norm": 5.201277732849121, "learning_rate": 1.1798733245905651e-05, "loss": 1.7716, "step": 4867 }, { "epoch": 0.7854146498870603, "grad_norm": 4.218733310699463, "learning_rate": 1.1781881240452958e-05, "loss": 1.9781, "step": 4868 }, { "epoch": 0.7855759922555663, "grad_norm": 4.711889266967773, "learning_rate": 1.176503967092884e-05, "loss": 1.8243, "step": 4869 }, { "epoch": 0.7857373346240722, "grad_norm": 3.912487030029297, "learning_rate": 1.1748208541932077e-05, "loss": 1.7795, "step": 4870 }, { "epoch": 0.7858986769925782, "grad_norm": 5.722700595855713, "learning_rate": 1.1731387858058613e-05, "loss": 1.7087, "step": 4871 }, { "epoch": 0.7860600193610843, "grad_norm": 4.365345478057861, "learning_rate": 1.1714577623901547e-05, "loss": 1.8558, "step": 4872 }, { "epoch": 0.7862213617295902, "grad_norm": 4.097417831420898, "learning_rate": 1.1697777844051105e-05, "loss": 1.823, "step": 4873 }, { "epoch": 0.7863827040980962, "grad_norm": 5.112419605255127, "learning_rate": 1.1680988523094705e-05, "loss": 1.6119, "step": 4874 }, { "epoch": 0.7865440464666021, "grad_norm": 5.568664073944092, "learning_rate": 1.1664209665616849e-05, "loss": 1.9551, "step": 4875 }, { "epoch": 0.7867053888351081, "grad_norm": 3.6490912437438965, "learning_rate": 1.1647441276199233e-05, "loss": 1.8411, "step": 4876 }, { "epoch": 0.7868667312036141, "grad_norm": 4.620687484741211, "learning_rate": 1.1630683359420652e-05, "loss": 1.8486, "step": 4877 }, { "epoch": 0.78702807357212, "grad_norm": 3.4812748432159424, "learning_rate": 1.1613935919857094e-05, "loss": 1.9814, "step": 4878 }, { "epoch": 0.787189415940626, "grad_norm": 4.480321407318115, "learning_rate": 1.1597198962081612e-05, "loss": 1.9138, "step": 4879 }, { "epoch": 0.7873507583091319, "grad_norm": 4.20350456237793, "learning_rate": 1.1580472490664474e-05, "loss": 2.0126, "step": 4880 }, { "epoch": 0.787512100677638, "grad_norm": 4.852173805236816, "learning_rate": 1.1563756510173024e-05, "loss": 1.748, "step": 4881 }, { "epoch": 0.787673443046144, "grad_norm": 4.446038722991943, "learning_rate": 1.154705102517179e-05, "loss": 1.9694, "step": 4882 }, { "epoch": 0.7878347854146499, "grad_norm": 7.211885452270508, "learning_rate": 1.1530356040222402e-05, "loss": 2.2675, "step": 4883 }, { "epoch": 0.7879961277831559, "grad_norm": 4.495987415313721, "learning_rate": 1.151367155988361e-05, "loss": 1.9904, "step": 4884 }, { "epoch": 0.7881574701516618, "grad_norm": 4.462154388427734, "learning_rate": 1.149699758871135e-05, "loss": 1.7895, "step": 4885 }, { "epoch": 0.7883188125201678, "grad_norm": 4.5518622398376465, "learning_rate": 1.1480334131258625e-05, "loss": 1.7954, "step": 4886 }, { "epoch": 0.7884801548886737, "grad_norm": 4.522104740142822, "learning_rate": 1.1463681192075632e-05, "loss": 1.914, "step": 4887 }, { "epoch": 0.7886414972571797, "grad_norm": 6.0603251457214355, "learning_rate": 1.1447038775709623e-05, "loss": 1.6616, "step": 4888 }, { "epoch": 0.7888028396256858, "grad_norm": 4.378868103027344, "learning_rate": 1.1430406886705053e-05, "loss": 2.0277, "step": 4889 }, { "epoch": 0.7889641819941917, "grad_norm": 4.344919204711914, "learning_rate": 1.1413785529603438e-05, "loss": 1.9149, "step": 4890 }, { "epoch": 0.7891255243626977, "grad_norm": 4.162896156311035, "learning_rate": 1.1397174708943458e-05, "loss": 1.6958, "step": 4891 }, { "epoch": 0.7892868667312036, "grad_norm": 3.8431289196014404, "learning_rate": 1.1380574429260881e-05, "loss": 1.8051, "step": 4892 }, { "epoch": 0.7894482090997096, "grad_norm": 4.48696231842041, "learning_rate": 1.1363984695088653e-05, "loss": 1.7665, "step": 4893 }, { "epoch": 0.7896095514682155, "grad_norm": 5.2174811363220215, "learning_rate": 1.1347405510956765e-05, "loss": 1.773, "step": 4894 }, { "epoch": 0.7897708938367215, "grad_norm": 4.319388389587402, "learning_rate": 1.1330836881392404e-05, "loss": 1.9333, "step": 4895 }, { "epoch": 0.7899322362052275, "grad_norm": 4.041973114013672, "learning_rate": 1.1314278810919826e-05, "loss": 1.8259, "step": 4896 }, { "epoch": 0.7900935785737334, "grad_norm": 4.26777982711792, "learning_rate": 1.12977313040604e-05, "loss": 1.8426, "step": 4897 }, { "epoch": 0.7902549209422395, "grad_norm": 3.559138536453247, "learning_rate": 1.1281194365332649e-05, "loss": 1.8108, "step": 4898 }, { "epoch": 0.7904162633107454, "grad_norm": 4.491950988769531, "learning_rate": 1.1264667999252171e-05, "loss": 2.0104, "step": 4899 }, { "epoch": 0.7905776056792514, "grad_norm": 4.133874893188477, "learning_rate": 1.1248152210331714e-05, "loss": 1.7412, "step": 4900 }, { "epoch": 0.7907389480477574, "grad_norm": 3.8438262939453125, "learning_rate": 1.1231647003081092e-05, "loss": 2.0872, "step": 4901 }, { "epoch": 0.7909002904162633, "grad_norm": 4.686543941497803, "learning_rate": 1.1215152382007283e-05, "loss": 1.9076, "step": 4902 }, { "epoch": 0.7910616327847693, "grad_norm": 4.718398094177246, "learning_rate": 1.1198668351614323e-05, "loss": 1.7459, "step": 4903 }, { "epoch": 0.7912229751532752, "grad_norm": 4.179304599761963, "learning_rate": 1.1182194916403399e-05, "loss": 2.0392, "step": 4904 }, { "epoch": 0.7913843175217812, "grad_norm": 4.1944899559021, "learning_rate": 1.1165732080872766e-05, "loss": 1.8769, "step": 4905 }, { "epoch": 0.7915456598902871, "grad_norm": 4.269710063934326, "learning_rate": 1.114927984951783e-05, "loss": 1.5916, "step": 4906 }, { "epoch": 0.7917070022587932, "grad_norm": 3.848909854888916, "learning_rate": 1.1132838226831054e-05, "loss": 1.916, "step": 4907 }, { "epoch": 0.7918683446272992, "grad_norm": 4.461495399475098, "learning_rate": 1.1116407217302027e-05, "loss": 1.8363, "step": 4908 }, { "epoch": 0.7920296869958051, "grad_norm": 3.3846654891967773, "learning_rate": 1.1099986825417453e-05, "loss": 1.9127, "step": 4909 }, { "epoch": 0.7921910293643111, "grad_norm": 3.9858438968658447, "learning_rate": 1.1083577055661116e-05, "loss": 1.8888, "step": 4910 }, { "epoch": 0.792352371732817, "grad_norm": 3.759572982788086, "learning_rate": 1.1067177912513898e-05, "loss": 2.0451, "step": 4911 }, { "epoch": 0.792513714101323, "grad_norm": 4.379895210266113, "learning_rate": 1.1050789400453782e-05, "loss": 1.7487, "step": 4912 }, { "epoch": 0.7926750564698289, "grad_norm": 9.255057334899902, "learning_rate": 1.103441152395588e-05, "loss": 1.7979, "step": 4913 }, { "epoch": 0.7928363988383349, "grad_norm": 4.533437252044678, "learning_rate": 1.1018044287492341e-05, "loss": 1.9864, "step": 4914 }, { "epoch": 0.792997741206841, "grad_norm": 3.9128003120422363, "learning_rate": 1.100168769553247e-05, "loss": 1.9298, "step": 4915 }, { "epoch": 0.7931590835753469, "grad_norm": 4.127525806427002, "learning_rate": 1.098534175254261e-05, "loss": 1.7195, "step": 4916 }, { "epoch": 0.7933204259438529, "grad_norm": 5.006974697113037, "learning_rate": 1.0969006462986253e-05, "loss": 1.998, "step": 4917 }, { "epoch": 0.7934817683123588, "grad_norm": 4.356435775756836, "learning_rate": 1.0952681831323914e-05, "loss": 1.8028, "step": 4918 }, { "epoch": 0.7936431106808648, "grad_norm": 4.146892070770264, "learning_rate": 1.093636786201327e-05, "loss": 1.8735, "step": 4919 }, { "epoch": 0.7938044530493708, "grad_norm": 3.9896233081817627, "learning_rate": 1.0920064559509025e-05, "loss": 1.7779, "step": 4920 }, { "epoch": 0.7939657954178767, "grad_norm": 5.546407222747803, "learning_rate": 1.0903771928263018e-05, "loss": 1.8785, "step": 4921 }, { "epoch": 0.7941271377863827, "grad_norm": 4.734492301940918, "learning_rate": 1.0887489972724141e-05, "loss": 1.9074, "step": 4922 }, { "epoch": 0.7942884801548886, "grad_norm": 5.046846866607666, "learning_rate": 1.0871218697338376e-05, "loss": 1.9277, "step": 4923 }, { "epoch": 0.7944498225233947, "grad_norm": 5.657299518585205, "learning_rate": 1.0854958106548812e-05, "loss": 2.0755, "step": 4924 }, { "epoch": 0.7946111648919006, "grad_norm": 6.1893630027771, "learning_rate": 1.0838708204795584e-05, "loss": 1.9284, "step": 4925 }, { "epoch": 0.7947725072604066, "grad_norm": 5.536571025848389, "learning_rate": 1.082246899651595e-05, "loss": 2.0, "step": 4926 }, { "epoch": 0.7949338496289126, "grad_norm": 4.260076522827148, "learning_rate": 1.080624048614422e-05, "loss": 1.7571, "step": 4927 }, { "epoch": 0.7950951919974185, "grad_norm": 4.106614112854004, "learning_rate": 1.0790022678111772e-05, "loss": 1.7714, "step": 4928 }, { "epoch": 0.7952565343659245, "grad_norm": 5.757980823516846, "learning_rate": 1.0773815576847095e-05, "loss": 1.6824, "step": 4929 }, { "epoch": 0.7954178767344304, "grad_norm": 4.206118106842041, "learning_rate": 1.075761918677574e-05, "loss": 1.7748, "step": 4930 }, { "epoch": 0.7955792191029364, "grad_norm": 4.439728260040283, "learning_rate": 1.0741433512320315e-05, "loss": 1.8324, "step": 4931 }, { "epoch": 0.7957405614714425, "grad_norm": 3.572453022003174, "learning_rate": 1.0725258557900537e-05, "loss": 1.6651, "step": 4932 }, { "epoch": 0.7959019038399484, "grad_norm": 4.678240776062012, "learning_rate": 1.0709094327933155e-05, "loss": 1.7703, "step": 4933 }, { "epoch": 0.7960632462084544, "grad_norm": 5.155104637145996, "learning_rate": 1.0692940826832038e-05, "loss": 1.879, "step": 4934 }, { "epoch": 0.7962245885769603, "grad_norm": 4.731137752532959, "learning_rate": 1.0676798059008081e-05, "loss": 1.6497, "step": 4935 }, { "epoch": 0.7963859309454663, "grad_norm": 5.3584089279174805, "learning_rate": 1.0660666028869254e-05, "loss": 1.9967, "step": 4936 }, { "epoch": 0.7965472733139722, "grad_norm": 4.699703216552734, "learning_rate": 1.0644544740820638e-05, "loss": 2.2507, "step": 4937 }, { "epoch": 0.7967086156824782, "grad_norm": 4.3637776374816895, "learning_rate": 1.062843419926432e-05, "loss": 2.0056, "step": 4938 }, { "epoch": 0.7968699580509842, "grad_norm": 4.8707594871521, "learning_rate": 1.0612334408599512e-05, "loss": 1.9799, "step": 4939 }, { "epoch": 0.7970313004194901, "grad_norm": 5.19387674331665, "learning_rate": 1.0596245373222424e-05, "loss": 1.8095, "step": 4940 }, { "epoch": 0.7971926427879962, "grad_norm": 5.282740116119385, "learning_rate": 1.05801670975264e-05, "loss": 1.7478, "step": 4941 }, { "epoch": 0.7973539851565021, "grad_norm": 7.607383728027344, "learning_rate": 1.0564099585901788e-05, "loss": 1.9698, "step": 4942 }, { "epoch": 0.7975153275250081, "grad_norm": 5.037977695465088, "learning_rate": 1.0548042842736038e-05, "loss": 1.8196, "step": 4943 }, { "epoch": 0.797676669893514, "grad_norm": 4.223226547241211, "learning_rate": 1.0531996872413618e-05, "loss": 1.752, "step": 4944 }, { "epoch": 0.79783801226202, "grad_norm": 6.1297221183776855, "learning_rate": 1.0515961679316111e-05, "loss": 2.0072, "step": 4945 }, { "epoch": 0.797999354630526, "grad_norm": 4.56374454498291, "learning_rate": 1.0499937267822101e-05, "loss": 1.8715, "step": 4946 }, { "epoch": 0.7981606969990319, "grad_norm": 4.783345699310303, "learning_rate": 1.0483923642307258e-05, "loss": 1.8331, "step": 4947 }, { "epoch": 0.7983220393675379, "grad_norm": 4.3292765617370605, "learning_rate": 1.0467920807144282e-05, "loss": 1.7773, "step": 4948 }, { "epoch": 0.7984833817360439, "grad_norm": 5.089447498321533, "learning_rate": 1.0451928766702979e-05, "loss": 1.9119, "step": 4949 }, { "epoch": 0.7986447241045499, "grad_norm": 4.106072902679443, "learning_rate": 1.0435947525350149e-05, "loss": 1.7819, "step": 4950 }, { "epoch": 0.7988060664730559, "grad_norm": 4.993464469909668, "learning_rate": 1.0419977087449656e-05, "loss": 1.9956, "step": 4951 }, { "epoch": 0.7989674088415618, "grad_norm": 5.2668304443359375, "learning_rate": 1.0404017457362459e-05, "loss": 1.8632, "step": 4952 }, { "epoch": 0.7991287512100678, "grad_norm": 5.383654594421387, "learning_rate": 1.0388068639446502e-05, "loss": 1.967, "step": 4953 }, { "epoch": 0.7992900935785737, "grad_norm": 4.778904914855957, "learning_rate": 1.0372130638056826e-05, "loss": 1.6598, "step": 4954 }, { "epoch": 0.7994514359470797, "grad_norm": 3.6620941162109375, "learning_rate": 1.0356203457545483e-05, "loss": 1.8743, "step": 4955 }, { "epoch": 0.7996127783155856, "grad_norm": 4.586091041564941, "learning_rate": 1.0340287102261603e-05, "loss": 1.988, "step": 4956 }, { "epoch": 0.7997741206840916, "grad_norm": 4.33573055267334, "learning_rate": 1.0324381576551323e-05, "loss": 1.7612, "step": 4957 }, { "epoch": 0.7999354630525977, "grad_norm": 6.598884105682373, "learning_rate": 1.0308486884757868e-05, "loss": 1.9919, "step": 4958 }, { "epoch": 0.8000968054211036, "grad_norm": 3.6663243770599365, "learning_rate": 1.0292603031221465e-05, "loss": 1.7532, "step": 4959 }, { "epoch": 0.8002581477896096, "grad_norm": 4.019326210021973, "learning_rate": 1.027673002027938e-05, "loss": 1.7217, "step": 4960 }, { "epoch": 0.8004194901581155, "grad_norm": 4.4263505935668945, "learning_rate": 1.0260867856265966e-05, "loss": 1.7163, "step": 4961 }, { "epoch": 0.8005808325266215, "grad_norm": 3.918682813644409, "learning_rate": 1.0245016543512553e-05, "loss": 1.7715, "step": 4962 }, { "epoch": 0.8007421748951274, "grad_norm": 5.8670148849487305, "learning_rate": 1.022917608634757e-05, "loss": 2.0226, "step": 4963 }, { "epoch": 0.8009035172636334, "grad_norm": 4.7948455810546875, "learning_rate": 1.0213346489096414e-05, "loss": 1.841, "step": 4964 }, { "epoch": 0.8010648596321394, "grad_norm": 4.6134209632873535, "learning_rate": 1.0197527756081582e-05, "loss": 1.7509, "step": 4965 }, { "epoch": 0.8012262020006453, "grad_norm": 4.026442050933838, "learning_rate": 1.018171989162256e-05, "loss": 1.789, "step": 4966 }, { "epoch": 0.8013875443691514, "grad_norm": 4.19586181640625, "learning_rate": 1.0165922900035885e-05, "loss": 1.7807, "step": 4967 }, { "epoch": 0.8015488867376573, "grad_norm": 4.882951736450195, "learning_rate": 1.0150136785635095e-05, "loss": 1.9988, "step": 4968 }, { "epoch": 0.8017102291061633, "grad_norm": 7.722875595092773, "learning_rate": 1.0134361552730825e-05, "loss": 1.7145, "step": 4969 }, { "epoch": 0.8018715714746693, "grad_norm": 4.069356918334961, "learning_rate": 1.0118597205630658e-05, "loss": 1.761, "step": 4970 }, { "epoch": 0.8020329138431752, "grad_norm": 4.985439300537109, "learning_rate": 1.010284374863928e-05, "loss": 2.0331, "step": 4971 }, { "epoch": 0.8021942562116812, "grad_norm": 4.771203517913818, "learning_rate": 1.0087101186058346e-05, "loss": 2.0332, "step": 4972 }, { "epoch": 0.8023555985801871, "grad_norm": 4.417211532592773, "learning_rate": 1.0071369522186547e-05, "loss": 1.7046, "step": 4973 }, { "epoch": 0.8025169409486931, "grad_norm": 3.9871044158935547, "learning_rate": 1.005564876131963e-05, "loss": 1.9312, "step": 4974 }, { "epoch": 0.8026782833171991, "grad_norm": 5.729042053222656, "learning_rate": 1.0039938907750323e-05, "loss": 1.8495, "step": 4975 }, { "epoch": 0.8028396256857051, "grad_norm": 4.6541266441345215, "learning_rate": 1.0024239965768418e-05, "loss": 1.7877, "step": 4976 }, { "epoch": 0.8030009680542111, "grad_norm": 4.373040199279785, "learning_rate": 1.0008551939660676e-05, "loss": 2.0013, "step": 4977 }, { "epoch": 0.803162310422717, "grad_norm": 3.792407989501953, "learning_rate": 9.992874833710936e-06, "loss": 1.7861, "step": 4978 }, { "epoch": 0.803323652791223, "grad_norm": 3.719904661178589, "learning_rate": 9.9772086522e-06, "loss": 1.6944, "step": 4979 }, { "epoch": 0.8034849951597289, "grad_norm": 3.9930500984191895, "learning_rate": 9.961553399405733e-06, "loss": 1.4582, "step": 4980 }, { "epoch": 0.8036463375282349, "grad_norm": 4.299803733825684, "learning_rate": 9.945909079602966e-06, "loss": 1.7848, "step": 4981 }, { "epoch": 0.8038076798967408, "grad_norm": 4.049663066864014, "learning_rate": 9.930275697063613e-06, "loss": 1.7776, "step": 4982 }, { "epoch": 0.8039690222652468, "grad_norm": 4.3470001220703125, "learning_rate": 9.914653256056522e-06, "loss": 2.0063, "step": 4983 }, { "epoch": 0.8041303646337529, "grad_norm": 3.8651134967803955, "learning_rate": 9.899041760847628e-06, "loss": 1.8212, "step": 4984 }, { "epoch": 0.8042917070022588, "grad_norm": 4.063155651092529, "learning_rate": 9.883441215699823e-06, "loss": 1.6792, "step": 4985 }, { "epoch": 0.8044530493707648, "grad_norm": 5.242466449737549, "learning_rate": 9.867851624873038e-06, "loss": 1.7351, "step": 4986 }, { "epoch": 0.8046143917392707, "grad_norm": 3.8467047214508057, "learning_rate": 9.85227299262419e-06, "loss": 1.8724, "step": 4987 }, { "epoch": 0.8047757341077767, "grad_norm": 6.567809581756592, "learning_rate": 9.836705323207207e-06, "loss": 1.7028, "step": 4988 }, { "epoch": 0.8049370764762827, "grad_norm": 5.676382064819336, "learning_rate": 9.821148620873071e-06, "loss": 1.9252, "step": 4989 }, { "epoch": 0.8050984188447886, "grad_norm": 4.816730976104736, "learning_rate": 9.805602889869692e-06, "loss": 1.7533, "step": 4990 }, { "epoch": 0.8052597612132946, "grad_norm": 4.336112022399902, "learning_rate": 9.790068134442049e-06, "loss": 1.957, "step": 4991 }, { "epoch": 0.8054211035818006, "grad_norm": 3.976856231689453, "learning_rate": 9.774544358832082e-06, "loss": 1.8226, "step": 4992 }, { "epoch": 0.8055824459503066, "grad_norm": 4.793515682220459, "learning_rate": 9.75903156727877e-06, "loss": 1.9546, "step": 4993 }, { "epoch": 0.8057437883188125, "grad_norm": 6.302753448486328, "learning_rate": 9.74352976401805e-06, "loss": 2.0595, "step": 4994 }, { "epoch": 0.8059051306873185, "grad_norm": 4.390596866607666, "learning_rate": 9.728038953282903e-06, "loss": 1.9161, "step": 4995 }, { "epoch": 0.8060664730558245, "grad_norm": 4.317444324493408, "learning_rate": 9.712559139303257e-06, "loss": 1.7779, "step": 4996 }, { "epoch": 0.8062278154243304, "grad_norm": 6.430931091308594, "learning_rate": 9.697090326306097e-06, "loss": 1.9661, "step": 4997 }, { "epoch": 0.8063891577928364, "grad_norm": 4.240243911743164, "learning_rate": 9.681632518515354e-06, "loss": 1.7169, "step": 4998 }, { "epoch": 0.8065505001613423, "grad_norm": 4.05634069442749, "learning_rate": 9.666185720151965e-06, "loss": 1.718, "step": 4999 }, { "epoch": 0.8067118425298483, "grad_norm": 5.419493198394775, "learning_rate": 9.6507499354339e-06, "loss": 1.7795, "step": 5000 }, { "epoch": 0.8068731848983544, "grad_norm": 3.757697820663452, "learning_rate": 9.635325168576054e-06, "loss": 1.6912, "step": 5001 }, { "epoch": 0.8070345272668603, "grad_norm": 5.66853141784668, "learning_rate": 9.619911423790378e-06, "loss": 1.7352, "step": 5002 }, { "epoch": 0.8071958696353663, "grad_norm": 5.9120306968688965, "learning_rate": 9.604508705285764e-06, "loss": 1.9152, "step": 5003 }, { "epoch": 0.8073572120038722, "grad_norm": 4.561873912811279, "learning_rate": 9.58911701726814e-06, "loss": 2.0036, "step": 5004 }, { "epoch": 0.8075185543723782, "grad_norm": 4.15148401260376, "learning_rate": 9.573736363940377e-06, "loss": 1.6839, "step": 5005 }, { "epoch": 0.8076798967408841, "grad_norm": 3.8810439109802246, "learning_rate": 9.558366749502357e-06, "loss": 1.7462, "step": 5006 }, { "epoch": 0.8078412391093901, "grad_norm": 4.477046489715576, "learning_rate": 9.543008178150931e-06, "loss": 1.7553, "step": 5007 }, { "epoch": 0.808002581477896, "grad_norm": 5.181619167327881, "learning_rate": 9.527660654079968e-06, "loss": 1.8165, "step": 5008 }, { "epoch": 0.808163923846402, "grad_norm": 4.163424015045166, "learning_rate": 9.51232418148027e-06, "loss": 1.8912, "step": 5009 }, { "epoch": 0.8083252662149081, "grad_norm": 3.726431369781494, "learning_rate": 9.496998764539684e-06, "loss": 1.8877, "step": 5010 }, { "epoch": 0.808486608583414, "grad_norm": 4.380525588989258, "learning_rate": 9.481684407442987e-06, "loss": 1.9208, "step": 5011 }, { "epoch": 0.80864795095192, "grad_norm": 4.851559638977051, "learning_rate": 9.466381114371941e-06, "loss": 1.6984, "step": 5012 }, { "epoch": 0.808809293320426, "grad_norm": 6.974335670471191, "learning_rate": 9.451088889505321e-06, "loss": 1.8413, "step": 5013 }, { "epoch": 0.8089706356889319, "grad_norm": 3.3473620414733887, "learning_rate": 9.435807737018842e-06, "loss": 1.6117, "step": 5014 }, { "epoch": 0.8091319780574379, "grad_norm": 5.172091007232666, "learning_rate": 9.42053766108522e-06, "loss": 1.9959, "step": 5015 }, { "epoch": 0.8092933204259438, "grad_norm": 3.5614988803863525, "learning_rate": 9.405278665874129e-06, "loss": 1.7952, "step": 5016 }, { "epoch": 0.8094546627944498, "grad_norm": 5.11036491394043, "learning_rate": 9.390030755552242e-06, "loss": 1.9466, "step": 5017 }, { "epoch": 0.8096160051629558, "grad_norm": 4.175196170806885, "learning_rate": 9.374793934283166e-06, "loss": 1.9538, "step": 5018 }, { "epoch": 0.8097773475314618, "grad_norm": 4.084559440612793, "learning_rate": 9.359568206227525e-06, "loss": 1.9314, "step": 5019 }, { "epoch": 0.8099386898999678, "grad_norm": 4.597757816314697, "learning_rate": 9.344353575542875e-06, "loss": 1.8681, "step": 5020 }, { "epoch": 0.8101000322684737, "grad_norm": 3.842285394668579, "learning_rate": 9.329150046383772e-06, "loss": 1.9994, "step": 5021 }, { "epoch": 0.8102613746369797, "grad_norm": 4.087825298309326, "learning_rate": 9.313957622901726e-06, "loss": 1.9577, "step": 5022 }, { "epoch": 0.8104227170054856, "grad_norm": 5.270120143890381, "learning_rate": 9.298776309245194e-06, "loss": 1.9583, "step": 5023 }, { "epoch": 0.8105840593739916, "grad_norm": 4.861396789550781, "learning_rate": 9.283606109559644e-06, "loss": 2.0954, "step": 5024 }, { "epoch": 0.8107454017424975, "grad_norm": 5.2622199058532715, "learning_rate": 9.268447027987488e-06, "loss": 1.9247, "step": 5025 }, { "epoch": 0.8109067441110035, "grad_norm": 4.728799343109131, "learning_rate": 9.253299068668086e-06, "loss": 1.838, "step": 5026 }, { "epoch": 0.8110680864795096, "grad_norm": 3.924114227294922, "learning_rate": 9.238162235737768e-06, "loss": 1.671, "step": 5027 }, { "epoch": 0.8112294288480155, "grad_norm": 6.050514221191406, "learning_rate": 9.22303653332986e-06, "loss": 1.9685, "step": 5028 }, { "epoch": 0.8113907712165215, "grad_norm": 4.385323524475098, "learning_rate": 9.207921965574594e-06, "loss": 1.8523, "step": 5029 }, { "epoch": 0.8115521135850274, "grad_norm": 3.637460470199585, "learning_rate": 9.192818536599213e-06, "loss": 1.7259, "step": 5030 }, { "epoch": 0.8117134559535334, "grad_norm": 4.110119819641113, "learning_rate": 9.177726250527868e-06, "loss": 1.8351, "step": 5031 }, { "epoch": 0.8118747983220393, "grad_norm": 4.287790775299072, "learning_rate": 9.162645111481727e-06, "loss": 2.0917, "step": 5032 }, { "epoch": 0.8120361406905453, "grad_norm": 4.925917625427246, "learning_rate": 9.147575123578844e-06, "loss": 1.7523, "step": 5033 }, { "epoch": 0.8121974830590513, "grad_norm": 7.13510274887085, "learning_rate": 9.132516290934301e-06, "loss": 2.1304, "step": 5034 }, { "epoch": 0.8123588254275573, "grad_norm": 5.606991291046143, "learning_rate": 9.11746861766008e-06, "loss": 1.7848, "step": 5035 }, { "epoch": 0.8125201677960633, "grad_norm": 6.008810997009277, "learning_rate": 9.102432107865121e-06, "loss": 1.9205, "step": 5036 }, { "epoch": 0.8126815101645692, "grad_norm": 4.687321186065674, "learning_rate": 9.087406765655355e-06, "loss": 1.7783, "step": 5037 }, { "epoch": 0.8128428525330752, "grad_norm": 4.49308967590332, "learning_rate": 9.07239259513361e-06, "loss": 1.7734, "step": 5038 }, { "epoch": 0.8130041949015812, "grad_norm": 5.762004852294922, "learning_rate": 9.057389600399719e-06, "loss": 1.9634, "step": 5039 }, { "epoch": 0.8131655372700871, "grad_norm": 4.135660171508789, "learning_rate": 9.042397785550405e-06, "loss": 1.6049, "step": 5040 }, { "epoch": 0.8133268796385931, "grad_norm": 4.049421310424805, "learning_rate": 9.027417154679396e-06, "loss": 1.7663, "step": 5041 }, { "epoch": 0.813488222007099, "grad_norm": 3.8373448848724365, "learning_rate": 9.012447711877332e-06, "loss": 1.7997, "step": 5042 }, { "epoch": 0.813649564375605, "grad_norm": 4.999689102172852, "learning_rate": 8.997489461231772e-06, "loss": 2.0029, "step": 5043 }, { "epoch": 0.813810906744111, "grad_norm": 4.012880802154541, "learning_rate": 8.9825424068273e-06, "loss": 1.7545, "step": 5044 }, { "epoch": 0.813972249112617, "grad_norm": 4.000550270080566, "learning_rate": 8.967606552745361e-06, "loss": 1.8365, "step": 5045 }, { "epoch": 0.814133591481123, "grad_norm": 4.201355934143066, "learning_rate": 8.952681903064374e-06, "loss": 1.9151, "step": 5046 }, { "epoch": 0.8142949338496289, "grad_norm": 7.912868499755859, "learning_rate": 8.937768461859714e-06, "loss": 1.6631, "step": 5047 }, { "epoch": 0.8144562762181349, "grad_norm": 3.9873435497283936, "learning_rate": 8.92286623320368e-06, "loss": 1.8191, "step": 5048 }, { "epoch": 0.8146176185866408, "grad_norm": 4.569661617279053, "learning_rate": 8.907975221165481e-06, "loss": 2.1193, "step": 5049 }, { "epoch": 0.8147789609551468, "grad_norm": 4.338842868804932, "learning_rate": 8.893095429811332e-06, "loss": 1.9049, "step": 5050 }, { "epoch": 0.8149403033236527, "grad_norm": 5.899425029754639, "learning_rate": 8.878226863204309e-06, "loss": 2.068, "step": 5051 }, { "epoch": 0.8151016456921588, "grad_norm": 5.389956951141357, "learning_rate": 8.863369525404485e-06, "loss": 1.6494, "step": 5052 }, { "epoch": 0.8152629880606648, "grad_norm": 5.630323886871338, "learning_rate": 8.848523420468818e-06, "loss": 1.9578, "step": 5053 }, { "epoch": 0.8154243304291707, "grad_norm": 4.566960334777832, "learning_rate": 8.833688552451236e-06, "loss": 1.8953, "step": 5054 }, { "epoch": 0.8155856727976767, "grad_norm": 5.83742094039917, "learning_rate": 8.818864925402564e-06, "loss": 1.7418, "step": 5055 }, { "epoch": 0.8157470151661826, "grad_norm": 3.897963285446167, "learning_rate": 8.8040525433706e-06, "loss": 2.1279, "step": 5056 }, { "epoch": 0.8159083575346886, "grad_norm": 4.441431999206543, "learning_rate": 8.789251410400023e-06, "loss": 1.7854, "step": 5057 }, { "epoch": 0.8160696999031946, "grad_norm": 6.147578716278076, "learning_rate": 8.77446153053249e-06, "loss": 1.9488, "step": 5058 }, { "epoch": 0.8162310422717005, "grad_norm": 4.624359130859375, "learning_rate": 8.759682907806537e-06, "loss": 1.6247, "step": 5059 }, { "epoch": 0.8163923846402065, "grad_norm": 5.683760643005371, "learning_rate": 8.744915546257671e-06, "loss": 2.0023, "step": 5060 }, { "epoch": 0.8165537270087125, "grad_norm": 4.435068130493164, "learning_rate": 8.730159449918285e-06, "loss": 2.0267, "step": 5061 }, { "epoch": 0.8167150693772185, "grad_norm": 5.430092811584473, "learning_rate": 8.715414622817708e-06, "loss": 1.8278, "step": 5062 }, { "epoch": 0.8168764117457245, "grad_norm": 3.4103713035583496, "learning_rate": 8.700681068982225e-06, "loss": 1.6731, "step": 5063 }, { "epoch": 0.8170377541142304, "grad_norm": 4.3300042152404785, "learning_rate": 8.685958792434989e-06, "loss": 1.9492, "step": 5064 }, { "epoch": 0.8171990964827364, "grad_norm": 4.567752361297607, "learning_rate": 8.671247797196113e-06, "loss": 1.8913, "step": 5065 }, { "epoch": 0.8173604388512423, "grad_norm": 4.536086559295654, "learning_rate": 8.65654808728259e-06, "loss": 2.0211, "step": 5066 }, { "epoch": 0.8175217812197483, "grad_norm": 4.3093438148498535, "learning_rate": 8.641859666708397e-06, "loss": 1.8431, "step": 5067 }, { "epoch": 0.8176831235882542, "grad_norm": 4.09118127822876, "learning_rate": 8.627182539484353e-06, "loss": 1.7744, "step": 5068 }, { "epoch": 0.8178444659567602, "grad_norm": 5.769215106964111, "learning_rate": 8.612516709618252e-06, "loss": 1.9448, "step": 5069 }, { "epoch": 0.8180058083252663, "grad_norm": 4.4861297607421875, "learning_rate": 8.597862181114764e-06, "loss": 1.4927, "step": 5070 }, { "epoch": 0.8181671506937722, "grad_norm": 4.781145095825195, "learning_rate": 8.583218957975504e-06, "loss": 1.849, "step": 5071 }, { "epoch": 0.8183284930622782, "grad_norm": 5.037804126739502, "learning_rate": 8.568587044198968e-06, "loss": 1.7988, "step": 5072 }, { "epoch": 0.8184898354307841, "grad_norm": 4.66380500793457, "learning_rate": 8.553966443780599e-06, "loss": 1.819, "step": 5073 }, { "epoch": 0.8186511777992901, "grad_norm": 5.495998859405518, "learning_rate": 8.539357160712718e-06, "loss": 2.0507, "step": 5074 }, { "epoch": 0.818812520167796, "grad_norm": 4.616162300109863, "learning_rate": 8.524759198984566e-06, "loss": 1.9237, "step": 5075 }, { "epoch": 0.818973862536302, "grad_norm": 4.727950572967529, "learning_rate": 8.51017256258232e-06, "loss": 2.0016, "step": 5076 }, { "epoch": 0.819135204904808, "grad_norm": 3.9767277240753174, "learning_rate": 8.495597255489007e-06, "loss": 2.0227, "step": 5077 }, { "epoch": 0.819296547273314, "grad_norm": 3.8664677143096924, "learning_rate": 8.481033281684631e-06, "loss": 1.7761, "step": 5078 }, { "epoch": 0.81945788964182, "grad_norm": 4.328319549560547, "learning_rate": 8.46648064514603e-06, "loss": 2.0187, "step": 5079 }, { "epoch": 0.8196192320103259, "grad_norm": 7.903204917907715, "learning_rate": 8.45193934984701e-06, "loss": 2.0693, "step": 5080 }, { "epoch": 0.8197805743788319, "grad_norm": 4.029341220855713, "learning_rate": 8.437409399758234e-06, "loss": 1.9574, "step": 5081 }, { "epoch": 0.8199419167473379, "grad_norm": 5.163949966430664, "learning_rate": 8.422890798847282e-06, "loss": 1.5796, "step": 5082 }, { "epoch": 0.8201032591158438, "grad_norm": 4.17691707611084, "learning_rate": 8.408383551078652e-06, "loss": 1.965, "step": 5083 }, { "epoch": 0.8202646014843498, "grad_norm": 4.767606735229492, "learning_rate": 8.393887660413719e-06, "loss": 2.0412, "step": 5084 }, { "epoch": 0.8204259438528557, "grad_norm": 4.345592498779297, "learning_rate": 8.379403130810764e-06, "loss": 1.6822, "step": 5085 }, { "epoch": 0.8205872862213617, "grad_norm": 4.43311882019043, "learning_rate": 8.364929966224955e-06, "loss": 1.7265, "step": 5086 }, { "epoch": 0.8207486285898677, "grad_norm": 3.951153516769409, "learning_rate": 8.350468170608394e-06, "loss": 1.7785, "step": 5087 }, { "epoch": 0.8209099709583737, "grad_norm": 4.090831756591797, "learning_rate": 8.336017747910019e-06, "loss": 1.7555, "step": 5088 }, { "epoch": 0.8210713133268797, "grad_norm": 4.850238800048828, "learning_rate": 8.321578702075733e-06, "loss": 1.5936, "step": 5089 }, { "epoch": 0.8212326556953856, "grad_norm": 4.753424167633057, "learning_rate": 8.30715103704826e-06, "loss": 1.7142, "step": 5090 }, { "epoch": 0.8213939980638916, "grad_norm": 4.926393508911133, "learning_rate": 8.292734756767284e-06, "loss": 1.8298, "step": 5091 }, { "epoch": 0.8215553404323975, "grad_norm": 5.0380940437316895, "learning_rate": 8.278329865169321e-06, "loss": 2.0694, "step": 5092 }, { "epoch": 0.8217166828009035, "grad_norm": 4.265560626983643, "learning_rate": 8.263936366187824e-06, "loss": 1.8107, "step": 5093 }, { "epoch": 0.8218780251694094, "grad_norm": 4.015749454498291, "learning_rate": 8.2495542637531e-06, "loss": 1.8572, "step": 5094 }, { "epoch": 0.8220393675379155, "grad_norm": 4.692224502563477, "learning_rate": 8.235183561792382e-06, "loss": 1.679, "step": 5095 }, { "epoch": 0.8222007099064215, "grad_norm": 3.4330830574035645, "learning_rate": 8.220824264229737e-06, "loss": 1.7406, "step": 5096 }, { "epoch": 0.8223620522749274, "grad_norm": 4.453279972076416, "learning_rate": 8.206476374986178e-06, "loss": 1.9318, "step": 5097 }, { "epoch": 0.8225233946434334, "grad_norm": 4.732417106628418, "learning_rate": 8.192139897979556e-06, "loss": 1.7594, "step": 5098 }, { "epoch": 0.8226847370119393, "grad_norm": 5.123412132263184, "learning_rate": 8.17781483712462e-06, "loss": 2.0592, "step": 5099 }, { "epoch": 0.8228460793804453, "grad_norm": 4.737793445587158, "learning_rate": 8.163501196333018e-06, "loss": 1.9146, "step": 5100 }, { "epoch": 0.8230074217489513, "grad_norm": 4.241943359375, "learning_rate": 8.149198979513257e-06, "loss": 1.8821, "step": 5101 }, { "epoch": 0.8231687641174572, "grad_norm": 4.252610206604004, "learning_rate": 8.134908190570723e-06, "loss": 1.8205, "step": 5102 }, { "epoch": 0.8233301064859632, "grad_norm": 4.118143081665039, "learning_rate": 8.120628833407717e-06, "loss": 1.6768, "step": 5103 }, { "epoch": 0.8234914488544692, "grad_norm": 5.339510917663574, "learning_rate": 8.106360911923382e-06, "loss": 2.15, "step": 5104 }, { "epoch": 0.8236527912229752, "grad_norm": 4.20247745513916, "learning_rate": 8.092104430013736e-06, "loss": 1.7785, "step": 5105 }, { "epoch": 0.8238141335914811, "grad_norm": 3.4559247493743896, "learning_rate": 8.077859391571712e-06, "loss": 1.865, "step": 5106 }, { "epoch": 0.8239754759599871, "grad_norm": 4.361405372619629, "learning_rate": 8.063625800487067e-06, "loss": 1.7921, "step": 5107 }, { "epoch": 0.8241368183284931, "grad_norm": 3.9835152626037598, "learning_rate": 8.049403660646487e-06, "loss": 1.9069, "step": 5108 }, { "epoch": 0.824298160696999, "grad_norm": 3.749183416366577, "learning_rate": 8.035192975933476e-06, "loss": 1.6726, "step": 5109 }, { "epoch": 0.824459503065505, "grad_norm": 4.866585731506348, "learning_rate": 8.020993750228461e-06, "loss": 1.7821, "step": 5110 }, { "epoch": 0.8246208454340109, "grad_norm": 5.902980804443359, "learning_rate": 8.006805987408705e-06, "loss": 1.8989, "step": 5111 }, { "epoch": 0.8247821878025169, "grad_norm": 4.18698787689209, "learning_rate": 7.992629691348335e-06, "loss": 1.848, "step": 5112 }, { "epoch": 0.824943530171023, "grad_norm": 4.467362880706787, "learning_rate": 7.978464865918395e-06, "loss": 1.9784, "step": 5113 }, { "epoch": 0.8251048725395289, "grad_norm": 4.428437232971191, "learning_rate": 7.964311514986733e-06, "loss": 1.9523, "step": 5114 }, { "epoch": 0.8252662149080349, "grad_norm": 5.125341415405273, "learning_rate": 7.950169642418126e-06, "loss": 1.8608, "step": 5115 }, { "epoch": 0.8254275572765408, "grad_norm": 4.956939220428467, "learning_rate": 7.936039252074157e-06, "loss": 1.6212, "step": 5116 }, { "epoch": 0.8255888996450468, "grad_norm": 5.354269504547119, "learning_rate": 7.921920347813333e-06, "loss": 1.9752, "step": 5117 }, { "epoch": 0.8257502420135527, "grad_norm": 3.843909502029419, "learning_rate": 7.907812933490971e-06, "loss": 1.8323, "step": 5118 }, { "epoch": 0.8259115843820587, "grad_norm": 4.785239219665527, "learning_rate": 7.893717012959296e-06, "loss": 1.8471, "step": 5119 }, { "epoch": 0.8260729267505647, "grad_norm": 5.060910224914551, "learning_rate": 7.879632590067353e-06, "loss": 1.9037, "step": 5120 }, { "epoch": 0.8262342691190707, "grad_norm": 3.5868513584136963, "learning_rate": 7.865559668661088e-06, "loss": 1.5919, "step": 5121 }, { "epoch": 0.8263956114875767, "grad_norm": 3.619961738586426, "learning_rate": 7.85149825258325e-06, "loss": 1.7938, "step": 5122 }, { "epoch": 0.8265569538560826, "grad_norm": 5.197341442108154, "learning_rate": 7.837448345673526e-06, "loss": 1.7578, "step": 5123 }, { "epoch": 0.8267182962245886, "grad_norm": 4.56782341003418, "learning_rate": 7.82340995176839e-06, "loss": 1.9851, "step": 5124 }, { "epoch": 0.8268796385930945, "grad_norm": 4.951657295227051, "learning_rate": 7.809383074701193e-06, "loss": 1.7909, "step": 5125 }, { "epoch": 0.8270409809616005, "grad_norm": 4.474319934844971, "learning_rate": 7.79536771830217e-06, "loss": 1.8335, "step": 5126 }, { "epoch": 0.8272023233301065, "grad_norm": 4.652566909790039, "learning_rate": 7.781363886398363e-06, "loss": 1.8465, "step": 5127 }, { "epoch": 0.8273636656986124, "grad_norm": 3.9626662731170654, "learning_rate": 7.767371582813715e-06, "loss": 1.6353, "step": 5128 }, { "epoch": 0.8275250080671184, "grad_norm": 5.283198356628418, "learning_rate": 7.753390811368971e-06, "loss": 1.5737, "step": 5129 }, { "epoch": 0.8276863504356244, "grad_norm": 4.049074649810791, "learning_rate": 7.739421575881783e-06, "loss": 2.0153, "step": 5130 }, { "epoch": 0.8278476928041304, "grad_norm": 5.280628204345703, "learning_rate": 7.725463880166589e-06, "loss": 2.0416, "step": 5131 }, { "epoch": 0.8280090351726364, "grad_norm": 4.2782769203186035, "learning_rate": 7.711517728034746e-06, "loss": 1.7204, "step": 5132 }, { "epoch": 0.8281703775411423, "grad_norm": 4.3173136711120605, "learning_rate": 7.697583123294388e-06, "loss": 1.8216, "step": 5133 }, { "epoch": 0.8283317199096483, "grad_norm": 4.055956840515137, "learning_rate": 7.683660069750559e-06, "loss": 1.7376, "step": 5134 }, { "epoch": 0.8284930622781542, "grad_norm": 4.467798709869385, "learning_rate": 7.6697485712051e-06, "loss": 1.9001, "step": 5135 }, { "epoch": 0.8286544046466602, "grad_norm": 5.5613813400268555, "learning_rate": 7.65584863145673e-06, "loss": 1.9246, "step": 5136 }, { "epoch": 0.8288157470151661, "grad_norm": 4.992427825927734, "learning_rate": 7.641960254301e-06, "loss": 1.8027, "step": 5137 }, { "epoch": 0.8289770893836722, "grad_norm": 3.728839874267578, "learning_rate": 7.6280834435302876e-06, "loss": 1.8116, "step": 5138 }, { "epoch": 0.8291384317521782, "grad_norm": 3.922977924346924, "learning_rate": 7.6142182029338424e-06, "loss": 1.7641, "step": 5139 }, { "epoch": 0.8292997741206841, "grad_norm": 4.391849994659424, "learning_rate": 7.600364536297738e-06, "loss": 1.9565, "step": 5140 }, { "epoch": 0.8294611164891901, "grad_norm": 4.4909515380859375, "learning_rate": 7.586522447404882e-06, "loss": 1.8532, "step": 5141 }, { "epoch": 0.829622458857696, "grad_norm": 4.394071578979492, "learning_rate": 7.57269194003502e-06, "loss": 1.5828, "step": 5142 }, { "epoch": 0.829783801226202, "grad_norm": 4.511841297149658, "learning_rate": 7.5588730179647625e-06, "loss": 1.8064, "step": 5143 }, { "epoch": 0.829945143594708, "grad_norm": 8.176545143127441, "learning_rate": 7.545065684967517e-06, "loss": 2.1051, "step": 5144 }, { "epoch": 0.8301064859632139, "grad_norm": 3.9797441959381104, "learning_rate": 7.531269944813568e-06, "loss": 1.9633, "step": 5145 }, { "epoch": 0.8302678283317199, "grad_norm": 4.101449012756348, "learning_rate": 7.517485801269986e-06, "loss": 1.6769, "step": 5146 }, { "epoch": 0.8304291707002259, "grad_norm": 3.9880740642547607, "learning_rate": 7.503713258100725e-06, "loss": 1.8013, "step": 5147 }, { "epoch": 0.8305905130687319, "grad_norm": 4.547024250030518, "learning_rate": 7.489952319066529e-06, "loss": 1.9486, "step": 5148 }, { "epoch": 0.8307518554372378, "grad_norm": 4.98875617980957, "learning_rate": 7.476202987925013e-06, "loss": 1.8601, "step": 5149 }, { "epoch": 0.8309131978057438, "grad_norm": 4.726477146148682, "learning_rate": 7.462465268430591e-06, "loss": 1.6014, "step": 5150 }, { "epoch": 0.8310745401742498, "grad_norm": 3.761723756790161, "learning_rate": 7.448739164334501e-06, "loss": 1.8683, "step": 5151 }, { "epoch": 0.8312358825427557, "grad_norm": 4.3507080078125, "learning_rate": 7.43502467938485e-06, "loss": 1.938, "step": 5152 }, { "epoch": 0.8313972249112617, "grad_norm": 4.776767730712891, "learning_rate": 7.421321817326526e-06, "loss": 1.711, "step": 5153 }, { "epoch": 0.8315585672797676, "grad_norm": 4.126363277435303, "learning_rate": 7.407630581901293e-06, "loss": 1.6088, "step": 5154 }, { "epoch": 0.8317199096482737, "grad_norm": 3.9798381328582764, "learning_rate": 7.393950976847674e-06, "loss": 1.6647, "step": 5155 }, { "epoch": 0.8318812520167796, "grad_norm": 4.233016490936279, "learning_rate": 7.380283005901084e-06, "loss": 1.758, "step": 5156 }, { "epoch": 0.8320425943852856, "grad_norm": 4.131165504455566, "learning_rate": 7.366626672793714e-06, "loss": 1.6231, "step": 5157 }, { "epoch": 0.8322039367537916, "grad_norm": 4.439856052398682, "learning_rate": 7.352981981254608e-06, "loss": 1.8005, "step": 5158 }, { "epoch": 0.8323652791222975, "grad_norm": 6.2160186767578125, "learning_rate": 7.339348935009616e-06, "loss": 1.7089, "step": 5159 }, { "epoch": 0.8325266214908035, "grad_norm": 5.1408796310424805, "learning_rate": 7.325727537781396e-06, "loss": 1.622, "step": 5160 }, { "epoch": 0.8326879638593094, "grad_norm": 3.8749024868011475, "learning_rate": 7.312117793289447e-06, "loss": 1.7168, "step": 5161 }, { "epoch": 0.8328493062278154, "grad_norm": 5.038416862487793, "learning_rate": 7.298519705250067e-06, "loss": 2.0763, "step": 5162 }, { "epoch": 0.8330106485963213, "grad_norm": 4.5244526863098145, "learning_rate": 7.284933277376404e-06, "loss": 1.8442, "step": 5163 }, { "epoch": 0.8331719909648274, "grad_norm": 5.41386079788208, "learning_rate": 7.271358513378368e-06, "loss": 1.6265, "step": 5164 }, { "epoch": 0.8333333333333334, "grad_norm": 4.516528129577637, "learning_rate": 7.257795416962753e-06, "loss": 1.7632, "step": 5165 }, { "epoch": 0.8334946757018393, "grad_norm": 4.514156341552734, "learning_rate": 7.244243991833094e-06, "loss": 1.7597, "step": 5166 }, { "epoch": 0.8336560180703453, "grad_norm": 5.281730651855469, "learning_rate": 7.230704241689806e-06, "loss": 2.0508, "step": 5167 }, { "epoch": 0.8338173604388512, "grad_norm": 5.620670318603516, "learning_rate": 7.217176170230056e-06, "loss": 1.9524, "step": 5168 }, { "epoch": 0.8339787028073572, "grad_norm": 4.294712543487549, "learning_rate": 7.203659781147881e-06, "loss": 2.0827, "step": 5169 }, { "epoch": 0.8341400451758632, "grad_norm": 3.571934461593628, "learning_rate": 7.190155078134064e-06, "loss": 1.8611, "step": 5170 }, { "epoch": 0.8343013875443691, "grad_norm": 4.268311023712158, "learning_rate": 7.1766620648762665e-06, "loss": 1.8018, "step": 5171 }, { "epoch": 0.8344627299128751, "grad_norm": 5.438915252685547, "learning_rate": 7.163180745058889e-06, "loss": 1.8312, "step": 5172 }, { "epoch": 0.8346240722813811, "grad_norm": 5.227361679077148, "learning_rate": 7.149711122363201e-06, "loss": 1.9746, "step": 5173 }, { "epoch": 0.8347854146498871, "grad_norm": 4.696291446685791, "learning_rate": 7.13625320046723e-06, "loss": 1.9036, "step": 5174 }, { "epoch": 0.834946757018393, "grad_norm": 4.491235256195068, "learning_rate": 7.1228069830458264e-06, "loss": 1.9708, "step": 5175 }, { "epoch": 0.835108099386899, "grad_norm": 4.827406883239746, "learning_rate": 7.109372473770659e-06, "loss": 1.7651, "step": 5176 }, { "epoch": 0.835269441755405, "grad_norm": 4.635233402252197, "learning_rate": 7.095949676310171e-06, "loss": 1.8846, "step": 5177 }, { "epoch": 0.8354307841239109, "grad_norm": 5.021635055541992, "learning_rate": 7.082538594329641e-06, "loss": 2.1273, "step": 5178 }, { "epoch": 0.8355921264924169, "grad_norm": 5.73133659362793, "learning_rate": 7.069139231491118e-06, "loss": 1.8384, "step": 5179 }, { "epoch": 0.8357534688609228, "grad_norm": 6.3099188804626465, "learning_rate": 7.055751591453469e-06, "loss": 2.0464, "step": 5180 }, { "epoch": 0.8359148112294289, "grad_norm": 4.243574142456055, "learning_rate": 7.0423756778723375e-06, "loss": 2.0335, "step": 5181 }, { "epoch": 0.8360761535979349, "grad_norm": 3.7860779762268066, "learning_rate": 7.0290114944002065e-06, "loss": 1.9782, "step": 5182 }, { "epoch": 0.8362374959664408, "grad_norm": 5.813986301422119, "learning_rate": 7.015659044686307e-06, "loss": 1.9711, "step": 5183 }, { "epoch": 0.8363988383349468, "grad_norm": 4.148378849029541, "learning_rate": 7.002318332376712e-06, "loss": 1.925, "step": 5184 }, { "epoch": 0.8365601807034527, "grad_norm": 3.4092843532562256, "learning_rate": 6.988989361114251e-06, "loss": 1.7628, "step": 5185 }, { "epoch": 0.8367215230719587, "grad_norm": 4.043959617614746, "learning_rate": 6.97567213453858e-06, "loss": 1.7128, "step": 5186 }, { "epoch": 0.8368828654404646, "grad_norm": 4.311399936676025, "learning_rate": 6.962366656286118e-06, "loss": 1.8537, "step": 5187 }, { "epoch": 0.8370442078089706, "grad_norm": 4.286233901977539, "learning_rate": 6.949072929990091e-06, "loss": 1.8384, "step": 5188 }, { "epoch": 0.8372055501774766, "grad_norm": 3.0191707611083984, "learning_rate": 6.935790959280525e-06, "loss": 1.804, "step": 5189 }, { "epoch": 0.8373668925459826, "grad_norm": 4.2001729011535645, "learning_rate": 6.922520747784206e-06, "loss": 1.655, "step": 5190 }, { "epoch": 0.8375282349144886, "grad_norm": 4.738022327423096, "learning_rate": 6.9092622991247576e-06, "loss": 1.5961, "step": 5191 }, { "epoch": 0.8376895772829945, "grad_norm": 5.298644065856934, "learning_rate": 6.896015616922535e-06, "loss": 1.7559, "step": 5192 }, { "epoch": 0.8378509196515005, "grad_norm": 4.534740924835205, "learning_rate": 6.882780704794734e-06, "loss": 1.5783, "step": 5193 }, { "epoch": 0.8380122620200064, "grad_norm": 4.212889671325684, "learning_rate": 6.869557566355284e-06, "loss": 1.9214, "step": 5194 }, { "epoch": 0.8381736043885124, "grad_norm": 4.8258256912231445, "learning_rate": 6.856346205214947e-06, "loss": 1.6304, "step": 5195 }, { "epoch": 0.8383349467570184, "grad_norm": 4.971568584442139, "learning_rate": 6.843146624981239e-06, "loss": 2.0498, "step": 5196 }, { "epoch": 0.8384962891255243, "grad_norm": 5.606351375579834, "learning_rate": 6.829958829258465e-06, "loss": 1.8753, "step": 5197 }, { "epoch": 0.8386576314940304, "grad_norm": 4.643892288208008, "learning_rate": 6.816782821647727e-06, "loss": 2.143, "step": 5198 }, { "epoch": 0.8388189738625363, "grad_norm": 4.603559494018555, "learning_rate": 6.8036186057468866e-06, "loss": 1.8819, "step": 5199 }, { "epoch": 0.8389803162310423, "grad_norm": 3.6417465209960938, "learning_rate": 6.790466185150596e-06, "loss": 1.7776, "step": 5200 }, { "epoch": 0.8391416585995483, "grad_norm": 3.8199212551116943, "learning_rate": 6.777325563450282e-06, "loss": 1.8418, "step": 5201 }, { "epoch": 0.8393030009680542, "grad_norm": 4.013891696929932, "learning_rate": 6.7641967442341635e-06, "loss": 1.8983, "step": 5202 }, { "epoch": 0.8394643433365602, "grad_norm": 3.892119884490967, "learning_rate": 6.751079731087217e-06, "loss": 1.7774, "step": 5203 }, { "epoch": 0.8396256857050661, "grad_norm": 5.950384140014648, "learning_rate": 6.737974527591212e-06, "loss": 1.9238, "step": 5204 }, { "epoch": 0.8397870280735721, "grad_norm": 4.6189398765563965, "learning_rate": 6.724881137324679e-06, "loss": 1.7152, "step": 5205 }, { "epoch": 0.839948370442078, "grad_norm": 5.0324835777282715, "learning_rate": 6.711799563862942e-06, "loss": 1.9298, "step": 5206 }, { "epoch": 0.8401097128105841, "grad_norm": 4.298720359802246, "learning_rate": 6.698729810778065e-06, "loss": 1.667, "step": 5207 }, { "epoch": 0.8402710551790901, "grad_norm": 4.028954029083252, "learning_rate": 6.685671881638933e-06, "loss": 1.6205, "step": 5208 }, { "epoch": 0.840432397547596, "grad_norm": 4.030002593994141, "learning_rate": 6.672625780011144e-06, "loss": 1.7649, "step": 5209 }, { "epoch": 0.840593739916102, "grad_norm": 4.125081539154053, "learning_rate": 6.659591509457125e-06, "loss": 1.9293, "step": 5210 }, { "epoch": 0.8407550822846079, "grad_norm": 3.785576105117798, "learning_rate": 6.6465690735360244e-06, "loss": 2.1153, "step": 5211 }, { "epoch": 0.8409164246531139, "grad_norm": 4.0753493309021, "learning_rate": 6.633558475803792e-06, "loss": 1.6938, "step": 5212 }, { "epoch": 0.8410777670216198, "grad_norm": 4.192071437835693, "learning_rate": 6.6205597198131295e-06, "loss": 1.8384, "step": 5213 }, { "epoch": 0.8412391093901258, "grad_norm": 5.801394939422607, "learning_rate": 6.607572809113488e-06, "loss": 1.8592, "step": 5214 }, { "epoch": 0.8414004517586318, "grad_norm": 4.262351036071777, "learning_rate": 6.594597747251136e-06, "loss": 1.713, "step": 5215 }, { "epoch": 0.8415617941271378, "grad_norm": 4.121074199676514, "learning_rate": 6.581634537769054e-06, "loss": 1.7004, "step": 5216 }, { "epoch": 0.8417231364956438, "grad_norm": 3.9050257205963135, "learning_rate": 6.568683184206997e-06, "loss": 1.6932, "step": 5217 }, { "epoch": 0.8418844788641497, "grad_norm": 7.386196136474609, "learning_rate": 6.555743690101523e-06, "loss": 1.8373, "step": 5218 }, { "epoch": 0.8420458212326557, "grad_norm": 4.8773627281188965, "learning_rate": 6.542816058985895e-06, "loss": 2.0369, "step": 5219 }, { "epoch": 0.8422071636011617, "grad_norm": 4.547236919403076, "learning_rate": 6.529900294390162e-06, "loss": 1.6388, "step": 5220 }, { "epoch": 0.8423685059696676, "grad_norm": 4.2048869132995605, "learning_rate": 6.516996399841152e-06, "loss": 1.7337, "step": 5221 }, { "epoch": 0.8425298483381736, "grad_norm": 3.9539923667907715, "learning_rate": 6.504104378862408e-06, "loss": 2.0345, "step": 5222 }, { "epoch": 0.8426911907066795, "grad_norm": 3.4182205200195312, "learning_rate": 6.49122423497428e-06, "loss": 1.7882, "step": 5223 }, { "epoch": 0.8428525330751856, "grad_norm": 6.698798656463623, "learning_rate": 6.478355971693834e-06, "loss": 1.9628, "step": 5224 }, { "epoch": 0.8430138754436916, "grad_norm": 4.609959602355957, "learning_rate": 6.465499592534902e-06, "loss": 1.651, "step": 5225 }, { "epoch": 0.8431752178121975, "grad_norm": 4.455406665802002, "learning_rate": 6.452655101008098e-06, "loss": 1.8349, "step": 5226 }, { "epoch": 0.8433365601807035, "grad_norm": 4.491783142089844, "learning_rate": 6.439822500620751e-06, "loss": 1.9411, "step": 5227 }, { "epoch": 0.8434979025492094, "grad_norm": 4.449477672576904, "learning_rate": 6.427001794876975e-06, "loss": 1.7111, "step": 5228 }, { "epoch": 0.8436592449177154, "grad_norm": 3.9713246822357178, "learning_rate": 6.414192987277601e-06, "loss": 1.789, "step": 5229 }, { "epoch": 0.8438205872862213, "grad_norm": 4.128500938415527, "learning_rate": 6.401396081320255e-06, "loss": 1.8, "step": 5230 }, { "epoch": 0.8439819296547273, "grad_norm": 4.807419300079346, "learning_rate": 6.388611080499274e-06, "loss": 1.7202, "step": 5231 }, { "epoch": 0.8441432720232332, "grad_norm": 3.9238312244415283, "learning_rate": 6.3758379883057714e-06, "loss": 2.1235, "step": 5232 }, { "epoch": 0.8443046143917393, "grad_norm": 3.4898500442504883, "learning_rate": 6.363076808227586e-06, "loss": 2.0414, "step": 5233 }, { "epoch": 0.8444659567602453, "grad_norm": 3.7841458320617676, "learning_rate": 6.350327543749329e-06, "loss": 1.7985, "step": 5234 }, { "epoch": 0.8446272991287512, "grad_norm": 5.285628318786621, "learning_rate": 6.337590198352339e-06, "loss": 1.9694, "step": 5235 }, { "epoch": 0.8447886414972572, "grad_norm": 3.4902424812316895, "learning_rate": 6.3248647755147e-06, "loss": 1.9331, "step": 5236 }, { "epoch": 0.8449499838657631, "grad_norm": 4.591045379638672, "learning_rate": 6.312151278711237e-06, "loss": 1.8746, "step": 5237 }, { "epoch": 0.8451113262342691, "grad_norm": 9.962109565734863, "learning_rate": 6.299449711413552e-06, "loss": 2.3339, "step": 5238 }, { "epoch": 0.8452726686027751, "grad_norm": 4.510113716125488, "learning_rate": 6.286760077089954e-06, "loss": 1.7606, "step": 5239 }, { "epoch": 0.845434010971281, "grad_norm": 4.234711647033691, "learning_rate": 6.274082379205487e-06, "loss": 1.8549, "step": 5240 }, { "epoch": 0.8455953533397871, "grad_norm": 6.012852191925049, "learning_rate": 6.261416621221977e-06, "loss": 2.0749, "step": 5241 }, { "epoch": 0.845756695708293, "grad_norm": 4.803767681121826, "learning_rate": 6.248762806597946e-06, "loss": 1.774, "step": 5242 }, { "epoch": 0.845918038076799, "grad_norm": 5.22224235534668, "learning_rate": 6.236120938788692e-06, "loss": 1.8564, "step": 5243 }, { "epoch": 0.846079380445305, "grad_norm": 3.6898679733276367, "learning_rate": 6.223491021246214e-06, "loss": 1.5177, "step": 5244 }, { "epoch": 0.8462407228138109, "grad_norm": 4.5900959968566895, "learning_rate": 6.2108730574192865e-06, "loss": 1.6623, "step": 5245 }, { "epoch": 0.8464020651823169, "grad_norm": 3.6462502479553223, "learning_rate": 6.198267050753387e-06, "loss": 1.8379, "step": 5246 }, { "epoch": 0.8465634075508228, "grad_norm": 4.66995906829834, "learning_rate": 6.185673004690745e-06, "loss": 1.8918, "step": 5247 }, { "epoch": 0.8467247499193288, "grad_norm": 4.3259100914001465, "learning_rate": 6.173090922670316e-06, "loss": 1.978, "step": 5248 }, { "epoch": 0.8468860922878347, "grad_norm": 3.9500341415405273, "learning_rate": 6.160520808127807e-06, "loss": 1.7887, "step": 5249 }, { "epoch": 0.8470474346563408, "grad_norm": 5.067325592041016, "learning_rate": 6.147962664495632e-06, "loss": 1.7481, "step": 5250 }, { "epoch": 0.8472087770248468, "grad_norm": 5.685298442840576, "learning_rate": 6.135416495202934e-06, "loss": 1.7633, "step": 5251 }, { "epoch": 0.8473701193933527, "grad_norm": 4.695085525512695, "learning_rate": 6.122882303675626e-06, "loss": 1.7703, "step": 5252 }, { "epoch": 0.8475314617618587, "grad_norm": 4.995589256286621, "learning_rate": 6.110360093336292e-06, "loss": 1.858, "step": 5253 }, { "epoch": 0.8476928041303646, "grad_norm": 3.847963571548462, "learning_rate": 6.097849867604311e-06, "loss": 1.657, "step": 5254 }, { "epoch": 0.8478541464988706, "grad_norm": 4.688514232635498, "learning_rate": 6.085351629895736e-06, "loss": 2.0851, "step": 5255 }, { "epoch": 0.8480154888673765, "grad_norm": 4.997213363647461, "learning_rate": 6.0728653836233555e-06, "loss": 1.8903, "step": 5256 }, { "epoch": 0.8481768312358825, "grad_norm": 4.4050726890563965, "learning_rate": 6.060391132196713e-06, "loss": 1.8936, "step": 5257 }, { "epoch": 0.8483381736043886, "grad_norm": 6.244439601898193, "learning_rate": 6.047928879022052e-06, "loss": 1.9428, "step": 5258 }, { "epoch": 0.8484995159728945, "grad_norm": 4.172328948974609, "learning_rate": 6.0354786275023224e-06, "loss": 1.958, "step": 5259 }, { "epoch": 0.8486608583414005, "grad_norm": 5.372505187988281, "learning_rate": 6.023040381037254e-06, "loss": 1.7281, "step": 5260 }, { "epoch": 0.8488222007099064, "grad_norm": 4.413226127624512, "learning_rate": 6.01061414302323e-06, "loss": 1.8378, "step": 5261 }, { "epoch": 0.8489835430784124, "grad_norm": 5.235836029052734, "learning_rate": 5.998199916853414e-06, "loss": 2.0496, "step": 5262 }, { "epoch": 0.8491448854469184, "grad_norm": 4.358976364135742, "learning_rate": 5.985797705917651e-06, "loss": 1.4936, "step": 5263 }, { "epoch": 0.8493062278154243, "grad_norm": 4.531014919281006, "learning_rate": 5.973407513602514e-06, "loss": 1.7078, "step": 5264 }, { "epoch": 0.8494675701839303, "grad_norm": 5.202716827392578, "learning_rate": 5.961029343291308e-06, "loss": 1.8907, "step": 5265 }, { "epoch": 0.8496289125524362, "grad_norm": 4.391210556030273, "learning_rate": 5.948663198364035e-06, "loss": 1.917, "step": 5266 }, { "epoch": 0.8497902549209423, "grad_norm": 4.849218845367432, "learning_rate": 5.936309082197439e-06, "loss": 2.2373, "step": 5267 }, { "epoch": 0.8499515972894482, "grad_norm": 4.686940670013428, "learning_rate": 5.923966998164937e-06, "loss": 1.9442, "step": 5268 }, { "epoch": 0.8501129396579542, "grad_norm": 4.684127330780029, "learning_rate": 5.911636949636718e-06, "loss": 1.9074, "step": 5269 }, { "epoch": 0.8502742820264602, "grad_norm": 4.158158302307129, "learning_rate": 5.8993189399796315e-06, "loss": 1.888, "step": 5270 }, { "epoch": 0.8504356243949661, "grad_norm": 4.792473793029785, "learning_rate": 5.887012972557276e-06, "loss": 1.8296, "step": 5271 }, { "epoch": 0.8505969667634721, "grad_norm": 3.8275701999664307, "learning_rate": 5.8747190507299375e-06, "loss": 1.7749, "step": 5272 }, { "epoch": 0.850758309131978, "grad_norm": 3.9070305824279785, "learning_rate": 5.86243717785463e-06, "loss": 1.8192, "step": 5273 }, { "epoch": 0.850919651500484, "grad_norm": 5.525460720062256, "learning_rate": 5.850167357285069e-06, "loss": 1.8901, "step": 5274 }, { "epoch": 0.8510809938689899, "grad_norm": 4.825296878814697, "learning_rate": 5.837909592371682e-06, "loss": 1.848, "step": 5275 }, { "epoch": 0.851242336237496, "grad_norm": 3.576374053955078, "learning_rate": 5.825663886461585e-06, "loss": 1.6513, "step": 5276 }, { "epoch": 0.851403678606002, "grad_norm": 4.482271671295166, "learning_rate": 5.813430242898649e-06, "loss": 1.8692, "step": 5277 }, { "epoch": 0.8515650209745079, "grad_norm": 5.846675872802734, "learning_rate": 5.8012086650234e-06, "loss": 1.9382, "step": 5278 }, { "epoch": 0.8517263633430139, "grad_norm": 5.3428544998168945, "learning_rate": 5.788999156173086e-06, "loss": 1.8543, "step": 5279 }, { "epoch": 0.8518877057115198, "grad_norm": 4.3809895515441895, "learning_rate": 5.776801719681691e-06, "loss": 1.993, "step": 5280 }, { "epoch": 0.8520490480800258, "grad_norm": 4.829232215881348, "learning_rate": 5.764616358879838e-06, "loss": 2.0357, "step": 5281 }, { "epoch": 0.8522103904485318, "grad_norm": 4.304603099822998, "learning_rate": 5.752443077094927e-06, "loss": 1.7668, "step": 5282 }, { "epoch": 0.8523717328170377, "grad_norm": 4.619054794311523, "learning_rate": 5.740281877650994e-06, "loss": 1.7929, "step": 5283 }, { "epoch": 0.8525330751855438, "grad_norm": 5.700281143188477, "learning_rate": 5.728132763868832e-06, "loss": 1.8248, "step": 5284 }, { "epoch": 0.8526944175540497, "grad_norm": 4.372552871704102, "learning_rate": 5.715995739065877e-06, "loss": 2.0041, "step": 5285 }, { "epoch": 0.8528557599225557, "grad_norm": 4.835000038146973, "learning_rate": 5.703870806556316e-06, "loss": 1.9382, "step": 5286 }, { "epoch": 0.8530171022910616, "grad_norm": 4.303082466125488, "learning_rate": 5.691757969651001e-06, "loss": 1.6725, "step": 5287 }, { "epoch": 0.8531784446595676, "grad_norm": 5.479561805725098, "learning_rate": 5.679657231657487e-06, "loss": 1.8091, "step": 5288 }, { "epoch": 0.8533397870280736, "grad_norm": 4.941107749938965, "learning_rate": 5.667568595880046e-06, "loss": 1.7673, "step": 5289 }, { "epoch": 0.8535011293965795, "grad_norm": 4.362493515014648, "learning_rate": 5.655492065619605e-06, "loss": 1.8743, "step": 5290 }, { "epoch": 0.8536624717650855, "grad_norm": 4.439315319061279, "learning_rate": 5.643427644173837e-06, "loss": 1.8494, "step": 5291 }, { "epoch": 0.8538238141335914, "grad_norm": 4.728001117706299, "learning_rate": 5.631375334837058e-06, "loss": 2.1247, "step": 5292 }, { "epoch": 0.8539851565020975, "grad_norm": 3.551706552505493, "learning_rate": 5.619335140900317e-06, "loss": 1.8912, "step": 5293 }, { "epoch": 0.8541464988706035, "grad_norm": 4.582133769989014, "learning_rate": 5.607307065651324e-06, "loss": 1.9535, "step": 5294 }, { "epoch": 0.8543078412391094, "grad_norm": 4.534191131591797, "learning_rate": 5.595291112374507e-06, "loss": 1.8993, "step": 5295 }, { "epoch": 0.8544691836076154, "grad_norm": 3.634624481201172, "learning_rate": 5.5832872843509465e-06, "loss": 1.7037, "step": 5296 }, { "epoch": 0.8546305259761213, "grad_norm": 3.549039840698242, "learning_rate": 5.571295584858466e-06, "loss": 1.9124, "step": 5297 }, { "epoch": 0.8547918683446273, "grad_norm": 3.984860420227051, "learning_rate": 5.559316017171518e-06, "loss": 1.8104, "step": 5298 }, { "epoch": 0.8549532107131332, "grad_norm": 4.351929187774658, "learning_rate": 5.547348584561296e-06, "loss": 1.6888, "step": 5299 }, { "epoch": 0.8551145530816392, "grad_norm": 4.291843414306641, "learning_rate": 5.535393290295643e-06, "loss": 1.8186, "step": 5300 }, { "epoch": 0.8552758954501453, "grad_norm": 4.084909439086914, "learning_rate": 5.523450137639091e-06, "loss": 1.5097, "step": 5301 }, { "epoch": 0.8554372378186512, "grad_norm": 4.398079872131348, "learning_rate": 5.5115191298528876e-06, "loss": 1.9224, "step": 5302 }, { "epoch": 0.8555985801871572, "grad_norm": 4.4744768142700195, "learning_rate": 5.499600270194921e-06, "loss": 1.9395, "step": 5303 }, { "epoch": 0.8557599225556631, "grad_norm": 4.3137431144714355, "learning_rate": 5.487693561919794e-06, "loss": 1.8131, "step": 5304 }, { "epoch": 0.8559212649241691, "grad_norm": 4.522165298461914, "learning_rate": 5.47579900827877e-06, "loss": 1.6565, "step": 5305 }, { "epoch": 0.856082607292675, "grad_norm": 4.212401866912842, "learning_rate": 5.463916612519821e-06, "loss": 1.8043, "step": 5306 }, { "epoch": 0.856243949661181, "grad_norm": 4.5480780601501465, "learning_rate": 5.45204637788756e-06, "loss": 1.9229, "step": 5307 }, { "epoch": 0.856405292029687, "grad_norm": 4.158749103546143, "learning_rate": 5.440188307623317e-06, "loss": 1.8929, "step": 5308 }, { "epoch": 0.8565666343981929, "grad_norm": 4.080484867095947, "learning_rate": 5.428342404965076e-06, "loss": 1.6823, "step": 5309 }, { "epoch": 0.856727976766699, "grad_norm": 3.7655515670776367, "learning_rate": 5.4165086731475186e-06, "loss": 1.7646, "step": 5310 }, { "epoch": 0.8568893191352049, "grad_norm": 4.867955207824707, "learning_rate": 5.404687115401969e-06, "loss": 2.0057, "step": 5311 }, { "epoch": 0.8570506615037109, "grad_norm": 3.499534845352173, "learning_rate": 5.392877734956475e-06, "loss": 1.7056, "step": 5312 }, { "epoch": 0.8572120038722169, "grad_norm": 3.72841477394104, "learning_rate": 5.3810805350357205e-06, "loss": 1.6767, "step": 5313 }, { "epoch": 0.8573733462407228, "grad_norm": 4.146420001983643, "learning_rate": 5.369295518861078e-06, "loss": 2.0991, "step": 5314 }, { "epoch": 0.8575346886092288, "grad_norm": 5.447978973388672, "learning_rate": 5.35752268965059e-06, "loss": 1.9444, "step": 5315 }, { "epoch": 0.8576960309777347, "grad_norm": 4.583520889282227, "learning_rate": 5.345762050618963e-06, "loss": 1.7592, "step": 5316 }, { "epoch": 0.8578573733462407, "grad_norm": 4.727061748504639, "learning_rate": 5.3340136049776055e-06, "loss": 1.9346, "step": 5317 }, { "epoch": 0.8580187157147466, "grad_norm": 6.0795578956604, "learning_rate": 5.322277355934558e-06, "loss": 1.6686, "step": 5318 }, { "epoch": 0.8581800580832527, "grad_norm": 5.296669006347656, "learning_rate": 5.3105533066945605e-06, "loss": 1.7791, "step": 5319 }, { "epoch": 0.8583414004517587, "grad_norm": 4.125102996826172, "learning_rate": 5.298841460458998e-06, "loss": 2.0862, "step": 5320 }, { "epoch": 0.8585027428202646, "grad_norm": 5.359317779541016, "learning_rate": 5.287141820425945e-06, "loss": 1.9058, "step": 5321 }, { "epoch": 0.8586640851887706, "grad_norm": 3.726865291595459, "learning_rate": 5.2754543897901184e-06, "loss": 1.6196, "step": 5322 }, { "epoch": 0.8588254275572765, "grad_norm": 6.377970218658447, "learning_rate": 5.263779171742933e-06, "loss": 1.981, "step": 5323 }, { "epoch": 0.8589867699257825, "grad_norm": 4.332647323608398, "learning_rate": 5.2521161694724375e-06, "loss": 1.8543, "step": 5324 }, { "epoch": 0.8591481122942884, "grad_norm": 3.9370486736297607, "learning_rate": 5.240465386163368e-06, "loss": 1.72, "step": 5325 }, { "epoch": 0.8593094546627944, "grad_norm": 4.345332145690918, "learning_rate": 5.2288268249971125e-06, "loss": 1.8494, "step": 5326 }, { "epoch": 0.8594707970313005, "grad_norm": 3.694758653640747, "learning_rate": 5.217200489151713e-06, "loss": 1.7929, "step": 5327 }, { "epoch": 0.8596321393998064, "grad_norm": 4.486896514892578, "learning_rate": 5.2055863818018965e-06, "loss": 1.9654, "step": 5328 }, { "epoch": 0.8597934817683124, "grad_norm": 4.784701347351074, "learning_rate": 5.193984506119032e-06, "loss": 1.9568, "step": 5329 }, { "epoch": 0.8599548241368183, "grad_norm": 4.010159015655518, "learning_rate": 5.1823948652711565e-06, "loss": 1.668, "step": 5330 }, { "epoch": 0.8601161665053243, "grad_norm": 4.728301048278809, "learning_rate": 5.170817462422961e-06, "loss": 1.9033, "step": 5331 }, { "epoch": 0.8602775088738303, "grad_norm": 4.136148929595947, "learning_rate": 5.159252300735812e-06, "loss": 1.6008, "step": 5332 }, { "epoch": 0.8604388512423362, "grad_norm": 3.8405919075012207, "learning_rate": 5.1476993833677045e-06, "loss": 2.0028, "step": 5333 }, { "epoch": 0.8606001936108422, "grad_norm": 5.300532341003418, "learning_rate": 5.13615871347331e-06, "loss": 1.6452, "step": 5334 }, { "epoch": 0.8607615359793481, "grad_norm": 4.644827842712402, "learning_rate": 5.124630294203942e-06, "loss": 1.5739, "step": 5335 }, { "epoch": 0.8609228783478542, "grad_norm": 4.183739185333252, "learning_rate": 5.113114128707591e-06, "loss": 2.0056, "step": 5336 }, { "epoch": 0.8610842207163601, "grad_norm": 5.444863796234131, "learning_rate": 5.1016102201288776e-06, "loss": 1.645, "step": 5337 }, { "epoch": 0.8612455630848661, "grad_norm": 6.546682834625244, "learning_rate": 5.090118571609098e-06, "loss": 1.9507, "step": 5338 }, { "epoch": 0.8614069054533721, "grad_norm": 4.524355888366699, "learning_rate": 5.078639186286177e-06, "loss": 2.0171, "step": 5339 }, { "epoch": 0.861568247821878, "grad_norm": 4.184797286987305, "learning_rate": 5.0671720672947064e-06, "loss": 1.6688, "step": 5340 }, { "epoch": 0.861729590190384, "grad_norm": 5.031257629394531, "learning_rate": 5.055717217765926e-06, "loss": 2.1551, "step": 5341 }, { "epoch": 0.8618909325588899, "grad_norm": 3.807931423187256, "learning_rate": 5.044274640827718e-06, "loss": 1.82, "step": 5342 }, { "epoch": 0.8620522749273959, "grad_norm": 4.028903007507324, "learning_rate": 5.032844339604631e-06, "loss": 1.5555, "step": 5343 }, { "epoch": 0.862213617295902, "grad_norm": 7.599312782287598, "learning_rate": 5.021426317217831e-06, "loss": 1.9882, "step": 5344 }, { "epoch": 0.8623749596644079, "grad_norm": 5.002162456512451, "learning_rate": 5.010020576785174e-06, "loss": 1.9644, "step": 5345 }, { "epoch": 0.8625363020329139, "grad_norm": 5.8590545654296875, "learning_rate": 4.998627121421112e-06, "loss": 1.7531, "step": 5346 }, { "epoch": 0.8626976444014198, "grad_norm": 4.679166316986084, "learning_rate": 4.987245954236791e-06, "loss": 1.6703, "step": 5347 }, { "epoch": 0.8628589867699258, "grad_norm": 3.858346700668335, "learning_rate": 4.975877078339964e-06, "loss": 1.6805, "step": 5348 }, { "epoch": 0.8630203291384317, "grad_norm": 3.771313190460205, "learning_rate": 4.964520496835057e-06, "loss": 2.1385, "step": 5349 }, { "epoch": 0.8631816715069377, "grad_norm": 4.188822269439697, "learning_rate": 4.953176212823113e-06, "loss": 1.5778, "step": 5350 }, { "epoch": 0.8633430138754437, "grad_norm": 6.0285325050354, "learning_rate": 4.941844229401821e-06, "loss": 1.7102, "step": 5351 }, { "epoch": 0.8635043562439496, "grad_norm": 4.193412780761719, "learning_rate": 4.930524549665538e-06, "loss": 1.7374, "step": 5352 }, { "epoch": 0.8636656986124557, "grad_norm": 5.323854446411133, "learning_rate": 4.919217176705238e-06, "loss": 1.6838, "step": 5353 }, { "epoch": 0.8638270409809616, "grad_norm": 4.328566074371338, "learning_rate": 4.9079221136085315e-06, "loss": 1.7497, "step": 5354 }, { "epoch": 0.8639883833494676, "grad_norm": 4.033364772796631, "learning_rate": 4.896639363459671e-06, "loss": 1.5371, "step": 5355 }, { "epoch": 0.8641497257179735, "grad_norm": 4.1560444831848145, "learning_rate": 4.885368929339562e-06, "loss": 2.0093, "step": 5356 }, { "epoch": 0.8643110680864795, "grad_norm": 3.9417884349823, "learning_rate": 4.8741108143257215e-06, "loss": 2.1247, "step": 5357 }, { "epoch": 0.8644724104549855, "grad_norm": 4.554046630859375, "learning_rate": 4.862865021492335e-06, "loss": 2.0201, "step": 5358 }, { "epoch": 0.8646337528234914, "grad_norm": 4.345934867858887, "learning_rate": 4.851631553910185e-06, "loss": 1.9753, "step": 5359 }, { "epoch": 0.8647950951919974, "grad_norm": 4.904932498931885, "learning_rate": 4.8404104146467284e-06, "loss": 2.0068, "step": 5360 }, { "epoch": 0.8649564375605033, "grad_norm": 4.793888092041016, "learning_rate": 4.8292016067660206e-06, "loss": 1.7089, "step": 5361 }, { "epoch": 0.8651177799290094, "grad_norm": 4.269280433654785, "learning_rate": 4.8180051333287735e-06, "loss": 1.8964, "step": 5362 }, { "epoch": 0.8652791222975154, "grad_norm": 3.4615707397460938, "learning_rate": 4.8068209973923255e-06, "loss": 1.5962, "step": 5363 }, { "epoch": 0.8654404646660213, "grad_norm": 3.4652092456817627, "learning_rate": 4.795649202010622e-06, "loss": 1.7917, "step": 5364 }, { "epoch": 0.8656018070345273, "grad_norm": 4.4619574546813965, "learning_rate": 4.784489750234283e-06, "loss": 1.859, "step": 5365 }, { "epoch": 0.8657631494030332, "grad_norm": 5.999852657318115, "learning_rate": 4.773342645110518e-06, "loss": 1.8824, "step": 5366 }, { "epoch": 0.8659244917715392, "grad_norm": 4.514030933380127, "learning_rate": 4.762207889683196e-06, "loss": 1.5664, "step": 5367 }, { "epoch": 0.8660858341400451, "grad_norm": 4.133525371551514, "learning_rate": 4.751085486992779e-06, "loss": 1.9823, "step": 5368 }, { "epoch": 0.8662471765085511, "grad_norm": 4.643326759338379, "learning_rate": 4.739975440076405e-06, "loss": 1.7787, "step": 5369 }, { "epoch": 0.8664085188770572, "grad_norm": 3.7829456329345703, "learning_rate": 4.728877751967786e-06, "loss": 1.7204, "step": 5370 }, { "epoch": 0.8665698612455631, "grad_norm": 3.434251546859741, "learning_rate": 4.717792425697288e-06, "loss": 1.8811, "step": 5371 }, { "epoch": 0.8667312036140691, "grad_norm": 3.654670476913452, "learning_rate": 4.706719464291903e-06, "loss": 1.7796, "step": 5372 }, { "epoch": 0.866892545982575, "grad_norm": 4.253548622131348, "learning_rate": 4.695658870775232e-06, "loss": 1.7287, "step": 5373 }, { "epoch": 0.867053888351081, "grad_norm": 5.27730655670166, "learning_rate": 4.684610648167503e-06, "loss": 1.7725, "step": 5374 }, { "epoch": 0.867215230719587, "grad_norm": 4.814943790435791, "learning_rate": 4.673574799485586e-06, "loss": 1.8509, "step": 5375 }, { "epoch": 0.8673765730880929, "grad_norm": 4.000126838684082, "learning_rate": 4.662551327742942e-06, "loss": 1.7523, "step": 5376 }, { "epoch": 0.8675379154565989, "grad_norm": 4.85685396194458, "learning_rate": 4.651540235949658e-06, "loss": 1.6614, "step": 5377 }, { "epoch": 0.8676992578251048, "grad_norm": 5.462538719177246, "learning_rate": 4.640541527112474e-06, "loss": 1.6222, "step": 5378 }, { "epoch": 0.8678606001936109, "grad_norm": 4.697770118713379, "learning_rate": 4.629555204234693e-06, "loss": 2.0384, "step": 5379 }, { "epoch": 0.8680219425621168, "grad_norm": 4.688658714294434, "learning_rate": 4.618581270316292e-06, "loss": 1.6785, "step": 5380 }, { "epoch": 0.8681832849306228, "grad_norm": 4.218990802764893, "learning_rate": 4.607619728353818e-06, "loss": 1.7295, "step": 5381 }, { "epoch": 0.8683446272991288, "grad_norm": 4.786597728729248, "learning_rate": 4.596670581340479e-06, "loss": 1.7838, "step": 5382 }, { "epoch": 0.8685059696676347, "grad_norm": 4.64976692199707, "learning_rate": 4.585733832266048e-06, "loss": 2.016, "step": 5383 }, { "epoch": 0.8686673120361407, "grad_norm": 5.516035079956055, "learning_rate": 4.57480948411696e-06, "loss": 1.9975, "step": 5384 }, { "epoch": 0.8688286544046466, "grad_norm": 4.395437240600586, "learning_rate": 4.563897539876228e-06, "loss": 1.8529, "step": 5385 }, { "epoch": 0.8689899967731526, "grad_norm": 4.031848907470703, "learning_rate": 4.552998002523512e-06, "loss": 1.7504, "step": 5386 }, { "epoch": 0.8691513391416587, "grad_norm": 4.178362846374512, "learning_rate": 4.542110875035038e-06, "loss": 2.1688, "step": 5387 }, { "epoch": 0.8693126815101646, "grad_norm": 3.6858437061309814, "learning_rate": 4.531236160383701e-06, "loss": 1.3889, "step": 5388 }, { "epoch": 0.8694740238786706, "grad_norm": 6.3474321365356445, "learning_rate": 4.520373861538951e-06, "loss": 1.845, "step": 5389 }, { "epoch": 0.8696353662471765, "grad_norm": 4.003809452056885, "learning_rate": 4.50952398146689e-06, "loss": 2.0094, "step": 5390 }, { "epoch": 0.8697967086156825, "grad_norm": 3.9384477138519287, "learning_rate": 4.498686523130191e-06, "loss": 1.9549, "step": 5391 }, { "epoch": 0.8699580509841884, "grad_norm": 4.913197040557861, "learning_rate": 4.487861489488177e-06, "loss": 2.1316, "step": 5392 }, { "epoch": 0.8701193933526944, "grad_norm": 4.362610816955566, "learning_rate": 4.4770488834967485e-06, "loss": 1.8528, "step": 5393 }, { "epoch": 0.8702807357212003, "grad_norm": 4.632915019989014, "learning_rate": 4.4662487081084115e-06, "loss": 2.0092, "step": 5394 }, { "epoch": 0.8704420780897063, "grad_norm": 4.530938625335693, "learning_rate": 4.455460966272307e-06, "loss": 2.0791, "step": 5395 }, { "epoch": 0.8706034204582124, "grad_norm": 3.6736834049224854, "learning_rate": 4.444685660934139e-06, "loss": 2.2223, "step": 5396 }, { "epoch": 0.8707647628267183, "grad_norm": 4.891629695892334, "learning_rate": 4.433922795036255e-06, "loss": 1.6743, "step": 5397 }, { "epoch": 0.8709261051952243, "grad_norm": 4.351646900177002, "learning_rate": 4.423172371517575e-06, "loss": 1.8404, "step": 5398 }, { "epoch": 0.8710874475637302, "grad_norm": 4.50725793838501, "learning_rate": 4.412434393313652e-06, "loss": 1.9123, "step": 5399 }, { "epoch": 0.8712487899322362, "grad_norm": 4.609782695770264, "learning_rate": 4.401708863356602e-06, "loss": 1.558, "step": 5400 }, { "epoch": 0.8714101323007422, "grad_norm": 4.794640064239502, "learning_rate": 4.39099578457518e-06, "loss": 1.5952, "step": 5401 }, { "epoch": 0.8715714746692481, "grad_norm": 5.642186641693115, "learning_rate": 4.38029515989472e-06, "loss": 2.0321, "step": 5402 }, { "epoch": 0.8717328170377541, "grad_norm": 4.8262152671813965, "learning_rate": 4.369606992237146e-06, "loss": 1.7114, "step": 5403 }, { "epoch": 0.8718941594062601, "grad_norm": 4.665055751800537, "learning_rate": 4.358931284521023e-06, "loss": 1.6008, "step": 5404 }, { "epoch": 0.8720555017747661, "grad_norm": 4.443180561065674, "learning_rate": 4.3482680396614516e-06, "loss": 1.7421, "step": 5405 }, { "epoch": 0.872216844143272, "grad_norm": 5.374730110168457, "learning_rate": 4.337617260570187e-06, "loss": 1.8272, "step": 5406 }, { "epoch": 0.872378186511778, "grad_norm": 5.673450469970703, "learning_rate": 4.326978950155536e-06, "loss": 1.9179, "step": 5407 }, { "epoch": 0.872539528880284, "grad_norm": 4.725485801696777, "learning_rate": 4.3163531113224465e-06, "loss": 1.6315, "step": 5408 }, { "epoch": 0.8727008712487899, "grad_norm": 4.904433250427246, "learning_rate": 4.305739746972415e-06, "loss": 2.2354, "step": 5409 }, { "epoch": 0.8728622136172959, "grad_norm": 3.5843193531036377, "learning_rate": 4.2951388600035555e-06, "loss": 1.7607, "step": 5410 }, { "epoch": 0.8730235559858018, "grad_norm": 4.858310699462891, "learning_rate": 4.28455045331056e-06, "loss": 2.0684, "step": 5411 }, { "epoch": 0.8731848983543078, "grad_norm": 4.717955589294434, "learning_rate": 4.273974529784747e-06, "loss": 2.0185, "step": 5412 }, { "epoch": 0.8733462407228139, "grad_norm": 4.557201862335205, "learning_rate": 4.2634110923139796e-06, "loss": 1.6191, "step": 5413 }, { "epoch": 0.8735075830913198, "grad_norm": 3.5318820476531982, "learning_rate": 4.252860143782761e-06, "loss": 1.7523, "step": 5414 }, { "epoch": 0.8736689254598258, "grad_norm": 8.056474685668945, "learning_rate": 4.242321687072137e-06, "loss": 1.8573, "step": 5415 }, { "epoch": 0.8738302678283317, "grad_norm": 4.431918144226074, "learning_rate": 4.231795725059756e-06, "loss": 1.8516, "step": 5416 }, { "epoch": 0.8739916101968377, "grad_norm": 4.269765377044678, "learning_rate": 4.221282260619891e-06, "loss": 1.7545, "step": 5417 }, { "epoch": 0.8741529525653436, "grad_norm": 3.9823522567749023, "learning_rate": 4.2107812966233395e-06, "loss": 1.6845, "step": 5418 }, { "epoch": 0.8743142949338496, "grad_norm": 5.325613498687744, "learning_rate": 4.200292835937553e-06, "loss": 1.9474, "step": 5419 }, { "epoch": 0.8744756373023556, "grad_norm": 3.8651812076568604, "learning_rate": 4.189816881426506e-06, "loss": 1.6847, "step": 5420 }, { "epoch": 0.8746369796708615, "grad_norm": 4.520545959472656, "learning_rate": 4.179353435950805e-06, "loss": 1.9354, "step": 5421 }, { "epoch": 0.8747983220393676, "grad_norm": 5.559961318969727, "learning_rate": 4.168902502367611e-06, "loss": 1.8653, "step": 5422 }, { "epoch": 0.8749596644078735, "grad_norm": 3.3139851093292236, "learning_rate": 4.1584640835306944e-06, "loss": 1.7976, "step": 5423 }, { "epoch": 0.8751210067763795, "grad_norm": 3.545987606048584, "learning_rate": 4.148038182290376e-06, "loss": 1.713, "step": 5424 }, { "epoch": 0.8752823491448855, "grad_norm": 3.64139986038208, "learning_rate": 4.1376248014935945e-06, "loss": 1.8402, "step": 5425 }, { "epoch": 0.8754436915133914, "grad_norm": 4.494565010070801, "learning_rate": 4.127223943983849e-06, "loss": 1.9319, "step": 5426 }, { "epoch": 0.8756050338818974, "grad_norm": 4.627253532409668, "learning_rate": 4.1168356126012055e-06, "loss": 1.6034, "step": 5427 }, { "epoch": 0.8757663762504033, "grad_norm": 3.559372901916504, "learning_rate": 4.106459810182345e-06, "loss": 1.7515, "step": 5428 }, { "epoch": 0.8759277186189093, "grad_norm": 5.645289421081543, "learning_rate": 4.096096539560501e-06, "loss": 1.7272, "step": 5429 }, { "epoch": 0.8760890609874153, "grad_norm": 5.593602657318115, "learning_rate": 4.0857458035654935e-06, "loss": 1.8373, "step": 5430 }, { "epoch": 0.8762504033559213, "grad_norm": 4.428093433380127, "learning_rate": 4.075407605023706e-06, "loss": 1.648, "step": 5431 }, { "epoch": 0.8764117457244273, "grad_norm": 4.232276916503906, "learning_rate": 4.0650819467581315e-06, "loss": 1.7091, "step": 5432 }, { "epoch": 0.8765730880929332, "grad_norm": 4.081578731536865, "learning_rate": 4.0547688315883015e-06, "loss": 1.6103, "step": 5433 }, { "epoch": 0.8767344304614392, "grad_norm": 5.288313865661621, "learning_rate": 4.044468262330353e-06, "loss": 1.8681, "step": 5434 }, { "epoch": 0.8768957728299451, "grad_norm": 4.759239196777344, "learning_rate": 4.03418024179697e-06, "loss": 1.819, "step": 5435 }, { "epoch": 0.8770571151984511, "grad_norm": 4.259239196777344, "learning_rate": 4.023904772797443e-06, "loss": 1.736, "step": 5436 }, { "epoch": 0.877218457566957, "grad_norm": 4.10651969909668, "learning_rate": 4.01364185813759e-06, "loss": 1.6239, "step": 5437 }, { "epoch": 0.877379799935463, "grad_norm": 5.2628302574157715, "learning_rate": 4.003391500619852e-06, "loss": 1.7202, "step": 5438 }, { "epoch": 0.8775411423039691, "grad_norm": 4.976608753204346, "learning_rate": 3.993153703043196e-06, "loss": 1.7757, "step": 5439 }, { "epoch": 0.877702484672475, "grad_norm": 3.359093427658081, "learning_rate": 3.9829284682031845e-06, "loss": 1.9446, "step": 5440 }, { "epoch": 0.877863827040981, "grad_norm": 4.429149627685547, "learning_rate": 3.972715798891952e-06, "loss": 1.6241, "step": 5441 }, { "epoch": 0.8780251694094869, "grad_norm": 5.290810585021973, "learning_rate": 3.962515697898173e-06, "loss": 1.8892, "step": 5442 }, { "epoch": 0.8781865117779929, "grad_norm": 3.9987807273864746, "learning_rate": 3.952328168007141e-06, "loss": 1.9139, "step": 5443 }, { "epoch": 0.8783478541464989, "grad_norm": 4.468580722808838, "learning_rate": 3.942153212000654e-06, "loss": 1.7947, "step": 5444 }, { "epoch": 0.8785091965150048, "grad_norm": 4.247889041900635, "learning_rate": 3.93199083265714e-06, "loss": 2.0586, "step": 5445 }, { "epoch": 0.8786705388835108, "grad_norm": 4.7147746086120605, "learning_rate": 3.9218410327515385e-06, "loss": 1.9079, "step": 5446 }, { "epoch": 0.8788318812520168, "grad_norm": 5.2043304443359375, "learning_rate": 3.911703815055395e-06, "loss": 1.8722, "step": 5447 }, { "epoch": 0.8789932236205228, "grad_norm": 5.258590221405029, "learning_rate": 3.901579182336796e-06, "loss": 1.6139, "step": 5448 }, { "epoch": 0.8791545659890287, "grad_norm": 5.239922523498535, "learning_rate": 3.891467137360388e-06, "loss": 1.8144, "step": 5449 }, { "epoch": 0.8793159083575347, "grad_norm": 4.352474212646484, "learning_rate": 3.881367682887393e-06, "loss": 1.5627, "step": 5450 }, { "epoch": 0.8794772507260407, "grad_norm": 4.475174903869629, "learning_rate": 3.871280821675605e-06, "loss": 1.6751, "step": 5451 }, { "epoch": 0.8796385930945466, "grad_norm": 4.773714542388916, "learning_rate": 3.861206556479352e-06, "loss": 2.0421, "step": 5452 }, { "epoch": 0.8797999354630526, "grad_norm": 4.278417587280273, "learning_rate": 3.851144890049535e-06, "loss": 1.8468, "step": 5453 }, { "epoch": 0.8799612778315585, "grad_norm": 4.038722515106201, "learning_rate": 3.841095825133623e-06, "loss": 1.956, "step": 5454 }, { "epoch": 0.8801226202000645, "grad_norm": 3.979750394821167, "learning_rate": 3.831059364475631e-06, "loss": 1.7624, "step": 5455 }, { "epoch": 0.8802839625685706, "grad_norm": 6.502319812774658, "learning_rate": 3.821035510816151e-06, "loss": 2.0544, "step": 5456 }, { "epoch": 0.8804453049370765, "grad_norm": 5.762221336364746, "learning_rate": 3.8110242668923045e-06, "loss": 1.8721, "step": 5457 }, { "epoch": 0.8806066473055825, "grad_norm": 4.8464436531066895, "learning_rate": 3.801025635437799e-06, "loss": 1.7393, "step": 5458 }, { "epoch": 0.8807679896740884, "grad_norm": 4.780818939208984, "learning_rate": 3.7910396191828677e-06, "loss": 2.0555, "step": 5459 }, { "epoch": 0.8809293320425944, "grad_norm": 3.8620622158050537, "learning_rate": 3.7810662208543348e-06, "loss": 1.544, "step": 5460 }, { "epoch": 0.8810906744111003, "grad_norm": 3.8314895629882812, "learning_rate": 3.771105443175543e-06, "loss": 1.7244, "step": 5461 }, { "epoch": 0.8812520167796063, "grad_norm": 3.7106902599334717, "learning_rate": 3.7611572888664183e-06, "loss": 1.6506, "step": 5462 }, { "epoch": 0.8814133591481123, "grad_norm": 3.627849578857422, "learning_rate": 3.751221760643414e-06, "loss": 1.7715, "step": 5463 }, { "epoch": 0.8815747015166182, "grad_norm": 4.333226680755615, "learning_rate": 3.741298861219561e-06, "loss": 2.0436, "step": 5464 }, { "epoch": 0.8817360438851243, "grad_norm": 5.117793083190918, "learning_rate": 3.7313885933044245e-06, "loss": 1.8617, "step": 5465 }, { "epoch": 0.8818973862536302, "grad_norm": 4.468812465667725, "learning_rate": 3.721490959604118e-06, "loss": 1.795, "step": 5466 }, { "epoch": 0.8820587286221362, "grad_norm": 4.807857990264893, "learning_rate": 3.711605962821324e-06, "loss": 1.6412, "step": 5467 }, { "epoch": 0.8822200709906421, "grad_norm": 4.753337860107422, "learning_rate": 3.7017336056552608e-06, "loss": 1.7431, "step": 5468 }, { "epoch": 0.8823814133591481, "grad_norm": 4.103842735290527, "learning_rate": 3.6918738908016948e-06, "loss": 1.8165, "step": 5469 }, { "epoch": 0.8825427557276541, "grad_norm": 4.501655101776123, "learning_rate": 3.6820268209529328e-06, "loss": 1.9522, "step": 5470 }, { "epoch": 0.88270409809616, "grad_norm": 3.9741098880767822, "learning_rate": 3.672192398797858e-06, "loss": 1.8075, "step": 5471 }, { "epoch": 0.882865440464666, "grad_norm": 4.133610248565674, "learning_rate": 3.662370627021855e-06, "loss": 1.9183, "step": 5472 }, { "epoch": 0.883026782833172, "grad_norm": 4.6065802574157715, "learning_rate": 3.652561508306912e-06, "loss": 1.9396, "step": 5473 }, { "epoch": 0.883188125201678, "grad_norm": 4.168022155761719, "learning_rate": 3.642765045331503e-06, "loss": 1.5395, "step": 5474 }, { "epoch": 0.883349467570184, "grad_norm": 3.8084218502044678, "learning_rate": 3.6329812407706885e-06, "loss": 1.7228, "step": 5475 }, { "epoch": 0.8835108099386899, "grad_norm": 5.351351261138916, "learning_rate": 3.6232100972960427e-06, "loss": 1.6885, "step": 5476 }, { "epoch": 0.8836721523071959, "grad_norm": 3.873673915863037, "learning_rate": 3.6134516175757193e-06, "loss": 1.7994, "step": 5477 }, { "epoch": 0.8838334946757018, "grad_norm": 4.273514270782471, "learning_rate": 3.603705804274371e-06, "loss": 1.8824, "step": 5478 }, { "epoch": 0.8839948370442078, "grad_norm": 4.087368965148926, "learning_rate": 3.593972660053219e-06, "loss": 2.0012, "step": 5479 }, { "epoch": 0.8841561794127137, "grad_norm": 4.9953107833862305, "learning_rate": 3.5842521875700197e-06, "loss": 2.0025, "step": 5480 }, { "epoch": 0.8843175217812197, "grad_norm": 4.712258338928223, "learning_rate": 3.574544389479062e-06, "loss": 1.9964, "step": 5481 }, { "epoch": 0.8844788641497258, "grad_norm": 5.13985538482666, "learning_rate": 3.564849268431192e-06, "loss": 1.8126, "step": 5482 }, { "epoch": 0.8846402065182317, "grad_norm": 7.525959491729736, "learning_rate": 3.5551668270737638e-06, "loss": 1.9318, "step": 5483 }, { "epoch": 0.8848015488867377, "grad_norm": 4.160379886627197, "learning_rate": 3.545497068050713e-06, "loss": 1.952, "step": 5484 }, { "epoch": 0.8849628912552436, "grad_norm": 3.685530662536621, "learning_rate": 3.5358399940024544e-06, "loss": 1.5698, "step": 5485 }, { "epoch": 0.8851242336237496, "grad_norm": 4.459519863128662, "learning_rate": 3.526195607566002e-06, "loss": 1.9705, "step": 5486 }, { "epoch": 0.8852855759922555, "grad_norm": 3.8491642475128174, "learning_rate": 3.516563911374865e-06, "loss": 1.824, "step": 5487 }, { "epoch": 0.8854469183607615, "grad_norm": 4.472646236419678, "learning_rate": 3.50694490805909e-06, "loss": 1.7509, "step": 5488 }, { "epoch": 0.8856082607292675, "grad_norm": 4.286990165710449, "learning_rate": 3.4973386002452535e-06, "loss": 1.8416, "step": 5489 }, { "epoch": 0.8857696030977735, "grad_norm": 3.676454782485962, "learning_rate": 3.487744990556502e-06, "loss": 1.8689, "step": 5490 }, { "epoch": 0.8859309454662795, "grad_norm": 4.483414173126221, "learning_rate": 3.478164081612478e-06, "loss": 1.7064, "step": 5491 }, { "epoch": 0.8860922878347854, "grad_norm": 3.92512845993042, "learning_rate": 3.468595876029357e-06, "loss": 1.8683, "step": 5492 }, { "epoch": 0.8862536302032914, "grad_norm": 3.9366745948791504, "learning_rate": 3.4590403764198753e-06, "loss": 1.6761, "step": 5493 }, { "epoch": 0.8864149725717974, "grad_norm": 4.361815929412842, "learning_rate": 3.4494975853932577e-06, "loss": 2.0082, "step": 5494 }, { "epoch": 0.8865763149403033, "grad_norm": 4.422248363494873, "learning_rate": 3.4399675055552973e-06, "loss": 1.8268, "step": 5495 }, { "epoch": 0.8867376573088093, "grad_norm": 4.271640777587891, "learning_rate": 3.43045013950829e-06, "loss": 1.8297, "step": 5496 }, { "epoch": 0.8868989996773152, "grad_norm": 3.809992551803589, "learning_rate": 3.420945489851085e-06, "loss": 2.0953, "step": 5497 }, { "epoch": 0.8870603420458212, "grad_norm": 4.003145694732666, "learning_rate": 3.411453559179023e-06, "loss": 1.6517, "step": 5498 }, { "epoch": 0.8872216844143272, "grad_norm": 4.99307918548584, "learning_rate": 3.4019743500840084e-06, "loss": 1.7525, "step": 5499 }, { "epoch": 0.8873830267828332, "grad_norm": 4.213025093078613, "learning_rate": 3.3925078651544486e-06, "loss": 1.8423, "step": 5500 }, { "epoch": 0.8875443691513392, "grad_norm": 4.226743698120117, "learning_rate": 3.383054106975292e-06, "loss": 1.7812, "step": 5501 }, { "epoch": 0.8877057115198451, "grad_norm": 5.146629333496094, "learning_rate": 3.373613078128002e-06, "loss": 2.0277, "step": 5502 }, { "epoch": 0.8878670538883511, "grad_norm": 5.330268859863281, "learning_rate": 3.364184781190549e-06, "loss": 1.8118, "step": 5503 }, { "epoch": 0.888028396256857, "grad_norm": 4.649377822875977, "learning_rate": 3.3547692187374747e-06, "loss": 1.7216, "step": 5504 }, { "epoch": 0.888189738625363, "grad_norm": 5.103134632110596, "learning_rate": 3.3453663933397938e-06, "loss": 1.893, "step": 5505 }, { "epoch": 0.888351080993869, "grad_norm": 4.233748912811279, "learning_rate": 3.335976307565075e-06, "loss": 1.7237, "step": 5506 }, { "epoch": 0.888512423362375, "grad_norm": 4.579054832458496, "learning_rate": 3.3265989639773953e-06, "loss": 1.8852, "step": 5507 }, { "epoch": 0.888673765730881, "grad_norm": 3.687269926071167, "learning_rate": 3.3172343651373504e-06, "loss": 1.7031, "step": 5508 }, { "epoch": 0.8888351080993869, "grad_norm": 4.788128852844238, "learning_rate": 3.307882513602051e-06, "loss": 2.1332, "step": 5509 }, { "epoch": 0.8889964504678929, "grad_norm": 4.407412528991699, "learning_rate": 3.298543411925159e-06, "loss": 2.054, "step": 5510 }, { "epoch": 0.8891577928363988, "grad_norm": 5.501307487487793, "learning_rate": 3.289217062656802e-06, "loss": 1.9332, "step": 5511 }, { "epoch": 0.8893191352049048, "grad_norm": 4.539480686187744, "learning_rate": 3.2799034683436815e-06, "loss": 1.6599, "step": 5512 }, { "epoch": 0.8894804775734108, "grad_norm": 3.469583511352539, "learning_rate": 3.2706026315289682e-06, "loss": 1.8929, "step": 5513 }, { "epoch": 0.8896418199419167, "grad_norm": 4.4009528160095215, "learning_rate": 3.2613145547523928e-06, "loss": 1.8845, "step": 5514 }, { "epoch": 0.8898031623104227, "grad_norm": 4.314871311187744, "learning_rate": 3.2520392405501644e-06, "loss": 1.7426, "step": 5515 }, { "epoch": 0.8899645046789287, "grad_norm": 3.7380380630493164, "learning_rate": 3.242776691455013e-06, "loss": 1.9838, "step": 5516 }, { "epoch": 0.8901258470474347, "grad_norm": 6.570891380310059, "learning_rate": 3.2335269099962096e-06, "loss": 1.7812, "step": 5517 }, { "epoch": 0.8902871894159406, "grad_norm": 3.98508882522583, "learning_rate": 3.2242898986995063e-06, "loss": 1.9104, "step": 5518 }, { "epoch": 0.8904485317844466, "grad_norm": 3.6875693798065186, "learning_rate": 3.215065660087202e-06, "loss": 1.7309, "step": 5519 }, { "epoch": 0.8906098741529526, "grad_norm": 4.245000839233398, "learning_rate": 3.205854196678071e-06, "loss": 1.9901, "step": 5520 }, { "epoch": 0.8907712165214585, "grad_norm": 4.279839992523193, "learning_rate": 3.1966555109874287e-06, "loss": 1.6667, "step": 5521 }, { "epoch": 0.8909325588899645, "grad_norm": 3.916095495223999, "learning_rate": 3.1874696055270715e-06, "loss": 2.0707, "step": 5522 }, { "epoch": 0.8910939012584704, "grad_norm": 4.966620445251465, "learning_rate": 3.178296482805354e-06, "loss": 1.7666, "step": 5523 }, { "epoch": 0.8912552436269764, "grad_norm": 3.7445385456085205, "learning_rate": 3.169136145327084e-06, "loss": 2.0391, "step": 5524 }, { "epoch": 0.8914165859954825, "grad_norm": 4.792840957641602, "learning_rate": 3.159988595593616e-06, "loss": 1.8792, "step": 5525 }, { "epoch": 0.8915779283639884, "grad_norm": 5.726657867431641, "learning_rate": 3.150853836102802e-06, "loss": 1.8947, "step": 5526 }, { "epoch": 0.8917392707324944, "grad_norm": 4.585726261138916, "learning_rate": 3.141731869348996e-06, "loss": 2.1522, "step": 5527 }, { "epoch": 0.8919006131010003, "grad_norm": 5.255412578582764, "learning_rate": 3.1326226978230678e-06, "loss": 2.0892, "step": 5528 }, { "epoch": 0.8920619554695063, "grad_norm": 4.796871662139893, "learning_rate": 3.1235263240123824e-06, "loss": 1.9708, "step": 5529 }, { "epoch": 0.8922232978380122, "grad_norm": 4.137606143951416, "learning_rate": 3.1144427504008254e-06, "loss": 1.7527, "step": 5530 }, { "epoch": 0.8923846402065182, "grad_norm": 4.873930931091309, "learning_rate": 3.105371979468763e-06, "loss": 1.7856, "step": 5531 }, { "epoch": 0.8925459825750242, "grad_norm": 5.601640701293945, "learning_rate": 3.096314013693108e-06, "loss": 1.8228, "step": 5532 }, { "epoch": 0.8927073249435302, "grad_norm": 5.193307399749756, "learning_rate": 3.087268855547221e-06, "loss": 1.9018, "step": 5533 }, { "epoch": 0.8928686673120362, "grad_norm": 4.120820999145508, "learning_rate": 3.0782365075010145e-06, "loss": 1.7555, "step": 5534 }, { "epoch": 0.8930300096805421, "grad_norm": 4.9706902503967285, "learning_rate": 3.069216972020866e-06, "loss": 2.0308, "step": 5535 }, { "epoch": 0.8931913520490481, "grad_norm": 4.47065544128418, "learning_rate": 3.0602102515696953e-06, "loss": 1.6246, "step": 5536 }, { "epoch": 0.893352694417554, "grad_norm": 3.629650354385376, "learning_rate": 3.0512163486068666e-06, "loss": 1.8044, "step": 5537 }, { "epoch": 0.89351403678606, "grad_norm": 5.262669086456299, "learning_rate": 3.0422352655883057e-06, "loss": 2.0888, "step": 5538 }, { "epoch": 0.893675379154566, "grad_norm": 4.949001312255859, "learning_rate": 3.0332670049663837e-06, "loss": 1.7839, "step": 5539 }, { "epoch": 0.8938367215230719, "grad_norm": 4.028216361999512, "learning_rate": 3.0243115691900136e-06, "loss": 1.7119, "step": 5540 }, { "epoch": 0.8939980638915779, "grad_norm": 4.334787845611572, "learning_rate": 3.0153689607045845e-06, "loss": 1.7027, "step": 5541 }, { "epoch": 0.8941594062600839, "grad_norm": 4.003933906555176, "learning_rate": 3.00643918195197e-06, "loss": 1.7413, "step": 5542 }, { "epoch": 0.8943207486285899, "grad_norm": 4.258213996887207, "learning_rate": 2.9975222353705756e-06, "loss": 1.6041, "step": 5543 }, { "epoch": 0.8944820909970959, "grad_norm": 4.005317687988281, "learning_rate": 2.988618123395276e-06, "loss": 1.9746, "step": 5544 }, { "epoch": 0.8946434333656018, "grad_norm": 5.412639617919922, "learning_rate": 2.979726848457437e-06, "loss": 1.7949, "step": 5545 }, { "epoch": 0.8948047757341078, "grad_norm": 3.905019760131836, "learning_rate": 2.9708484129849556e-06, "loss": 1.715, "step": 5546 }, { "epoch": 0.8949661181026137, "grad_norm": 4.405820846557617, "learning_rate": 2.9619828194021816e-06, "loss": 1.7162, "step": 5547 }, { "epoch": 0.8951274604711197, "grad_norm": 3.744107484817505, "learning_rate": 2.953130070129967e-06, "loss": 1.6591, "step": 5548 }, { "epoch": 0.8952888028396256, "grad_norm": 4.333800315856934, "learning_rate": 2.944290167585684e-06, "loss": 1.6746, "step": 5549 }, { "epoch": 0.8954501452081317, "grad_norm": 4.936779975891113, "learning_rate": 2.9354631141831623e-06, "loss": 1.8484, "step": 5550 }, { "epoch": 0.8956114875766377, "grad_norm": 3.6195812225341797, "learning_rate": 2.9266489123327468e-06, "loss": 1.9801, "step": 5551 }, { "epoch": 0.8957728299451436, "grad_norm": 6.064610004425049, "learning_rate": 2.9178475644412563e-06, "loss": 2.0025, "step": 5552 }, { "epoch": 0.8959341723136496, "grad_norm": 4.333761215209961, "learning_rate": 2.909059072912018e-06, "loss": 1.8259, "step": 5553 }, { "epoch": 0.8960955146821555, "grad_norm": 6.1533894538879395, "learning_rate": 2.9002834401448296e-06, "loss": 1.6982, "step": 5554 }, { "epoch": 0.8962568570506615, "grad_norm": 6.941343784332275, "learning_rate": 2.8915206685359798e-06, "loss": 1.8972, "step": 5555 }, { "epoch": 0.8964181994191674, "grad_norm": 6.984273910522461, "learning_rate": 2.8827707604782704e-06, "loss": 1.7878, "step": 5556 }, { "epoch": 0.8965795417876734, "grad_norm": 4.510810852050781, "learning_rate": 2.8740337183609466e-06, "loss": 1.9487, "step": 5557 }, { "epoch": 0.8967408841561794, "grad_norm": 4.577576160430908, "learning_rate": 2.865309544569794e-06, "loss": 2.0792, "step": 5558 }, { "epoch": 0.8969022265246854, "grad_norm": 3.6346867084503174, "learning_rate": 2.8565982414870297e-06, "loss": 1.8338, "step": 5559 }, { "epoch": 0.8970635688931914, "grad_norm": 4.3393449783325195, "learning_rate": 2.8478998114914004e-06, "loss": 1.8608, "step": 5560 }, { "epoch": 0.8972249112616973, "grad_norm": 4.22618293762207, "learning_rate": 2.839214256958106e-06, "loss": 1.7635, "step": 5561 }, { "epoch": 0.8973862536302033, "grad_norm": 5.2830023765563965, "learning_rate": 2.8305415802588608e-06, "loss": 1.6503, "step": 5562 }, { "epoch": 0.8975475959987093, "grad_norm": 4.610961437225342, "learning_rate": 2.8218817837618317e-06, "loss": 1.9631, "step": 5563 }, { "epoch": 0.8977089383672152, "grad_norm": 4.6389570236206055, "learning_rate": 2.8132348698316934e-06, "loss": 1.8174, "step": 5564 }, { "epoch": 0.8978702807357212, "grad_norm": 4.900999546051025, "learning_rate": 2.804600840829574e-06, "loss": 1.8821, "step": 5565 }, { "epoch": 0.8980316231042271, "grad_norm": 4.001184463500977, "learning_rate": 2.795979699113127e-06, "loss": 1.5665, "step": 5566 }, { "epoch": 0.8981929654727331, "grad_norm": 4.928239822387695, "learning_rate": 2.7873714470364466e-06, "loss": 1.8012, "step": 5567 }, { "epoch": 0.8983543078412392, "grad_norm": 4.311793804168701, "learning_rate": 2.7787760869501133e-06, "loss": 1.7901, "step": 5568 }, { "epoch": 0.8985156502097451, "grad_norm": 5.386144638061523, "learning_rate": 2.770193621201217e-06, "loss": 1.6103, "step": 5569 }, { "epoch": 0.8986769925782511, "grad_norm": 8.556854248046875, "learning_rate": 2.7616240521332882e-06, "loss": 1.7432, "step": 5570 }, { "epoch": 0.898838334946757, "grad_norm": 4.126765727996826, "learning_rate": 2.7530673820863715e-06, "loss": 2.3162, "step": 5571 }, { "epoch": 0.898999677315263, "grad_norm": 4.71306037902832, "learning_rate": 2.744523613396954e-06, "loss": 1.8745, "step": 5572 }, { "epoch": 0.8991610196837689, "grad_norm": 4.547961711883545, "learning_rate": 2.735992748398025e-06, "loss": 1.7934, "step": 5573 }, { "epoch": 0.8993223620522749, "grad_norm": 4.691380023956299, "learning_rate": 2.727474789419038e-06, "loss": 1.7966, "step": 5574 }, { "epoch": 0.8994837044207808, "grad_norm": 6.193465709686279, "learning_rate": 2.718969738785937e-06, "loss": 1.9246, "step": 5575 }, { "epoch": 0.8996450467892869, "grad_norm": 4.564996242523193, "learning_rate": 2.7104775988211205e-06, "loss": 1.7162, "step": 5576 }, { "epoch": 0.8998063891577929, "grad_norm": 4.405948162078857, "learning_rate": 2.701998371843478e-06, "loss": 1.8958, "step": 5577 }, { "epoch": 0.8999677315262988, "grad_norm": 4.4796624183654785, "learning_rate": 2.6935320601683634e-06, "loss": 1.7776, "step": 5578 }, { "epoch": 0.9001290738948048, "grad_norm": 4.329618453979492, "learning_rate": 2.6850786661076044e-06, "loss": 1.7769, "step": 5579 }, { "epoch": 0.9002904162633107, "grad_norm": 3.8581998348236084, "learning_rate": 2.676638191969516e-06, "loss": 1.7699, "step": 5580 }, { "epoch": 0.9004517586318167, "grad_norm": 4.222093105316162, "learning_rate": 2.6682106400588546e-06, "loss": 1.9077, "step": 5581 }, { "epoch": 0.9006131010003227, "grad_norm": 5.26242733001709, "learning_rate": 2.6597960126768906e-06, "loss": 1.9529, "step": 5582 }, { "epoch": 0.9007744433688286, "grad_norm": 4.077167510986328, "learning_rate": 2.65139431212133e-06, "loss": 1.9152, "step": 5583 }, { "epoch": 0.9009357857373346, "grad_norm": 4.203107833862305, "learning_rate": 2.6430055406863607e-06, "loss": 1.5174, "step": 5584 }, { "epoch": 0.9010971281058406, "grad_norm": 4.102605819702148, "learning_rate": 2.6346297006626274e-06, "loss": 1.9124, "step": 5585 }, { "epoch": 0.9012584704743466, "grad_norm": 4.139077186584473, "learning_rate": 2.6262667943372845e-06, "loss": 1.8356, "step": 5586 }, { "epoch": 0.9014198128428526, "grad_norm": 4.112833023071289, "learning_rate": 2.617916823993899e-06, "loss": 1.7165, "step": 5587 }, { "epoch": 0.9015811552113585, "grad_norm": 4.986573696136475, "learning_rate": 2.609579791912553e-06, "loss": 2.0159, "step": 5588 }, { "epoch": 0.9017424975798645, "grad_norm": 5.297725677490234, "learning_rate": 2.601255700369765e-06, "loss": 1.7165, "step": 5589 }, { "epoch": 0.9019038399483704, "grad_norm": 5.899139404296875, "learning_rate": 2.592944551638543e-06, "loss": 2.0275, "step": 5590 }, { "epoch": 0.9020651823168764, "grad_norm": 3.807053804397583, "learning_rate": 2.5846463479883344e-06, "loss": 1.9027, "step": 5591 }, { "epoch": 0.9022265246853823, "grad_norm": 4.398458003997803, "learning_rate": 2.57636109168507e-06, "loss": 1.7302, "step": 5592 }, { "epoch": 0.9023878670538884, "grad_norm": 4.744655609130859, "learning_rate": 2.5680887849911463e-06, "loss": 1.84, "step": 5593 }, { "epoch": 0.9025492094223944, "grad_norm": 3.9637608528137207, "learning_rate": 2.5598294301654114e-06, "loss": 1.8567, "step": 5594 }, { "epoch": 0.9027105517909003, "grad_norm": 4.358543395996094, "learning_rate": 2.5515830294631894e-06, "loss": 1.7194, "step": 5595 }, { "epoch": 0.9028718941594063, "grad_norm": 5.891459941864014, "learning_rate": 2.5433495851362567e-06, "loss": 1.7255, "step": 5596 }, { "epoch": 0.9030332365279122, "grad_norm": 4.939186096191406, "learning_rate": 2.53512909943287e-06, "loss": 1.8839, "step": 5597 }, { "epoch": 0.9031945788964182, "grad_norm": 4.955162525177002, "learning_rate": 2.5269215745977126e-06, "loss": 1.8185, "step": 5598 }, { "epoch": 0.9033559212649241, "grad_norm": 4.159775733947754, "learning_rate": 2.518727012871974e-06, "loss": 1.9379, "step": 5599 }, { "epoch": 0.9035172636334301, "grad_norm": 5.024981498718262, "learning_rate": 2.5105454164932594e-06, "loss": 1.8584, "step": 5600 }, { "epoch": 0.9036786060019361, "grad_norm": 3.6600778102874756, "learning_rate": 2.5023767876956704e-06, "loss": 1.907, "step": 5601 }, { "epoch": 0.9038399483704421, "grad_norm": 5.974171161651611, "learning_rate": 2.494221128709745e-06, "loss": 1.736, "step": 5602 }, { "epoch": 0.9040012907389481, "grad_norm": 5.048582553863525, "learning_rate": 2.4860784417624904e-06, "loss": 2.0173, "step": 5603 }, { "epoch": 0.904162633107454, "grad_norm": 5.431088924407959, "learning_rate": 2.4779487290773617e-06, "loss": 1.8394, "step": 5604 }, { "epoch": 0.90432397547596, "grad_norm": 4.85250997543335, "learning_rate": 2.469831992874272e-06, "loss": 1.8768, "step": 5605 }, { "epoch": 0.904485317844466, "grad_norm": 3.9703378677368164, "learning_rate": 2.4617282353696093e-06, "loss": 1.8982, "step": 5606 }, { "epoch": 0.9046466602129719, "grad_norm": 4.356378078460693, "learning_rate": 2.4536374587761924e-06, "loss": 1.6529, "step": 5607 }, { "epoch": 0.9048080025814779, "grad_norm": 4.12722635269165, "learning_rate": 2.445559665303321e-06, "loss": 1.7909, "step": 5608 }, { "epoch": 0.9049693449499838, "grad_norm": 4.801215648651123, "learning_rate": 2.4374948571567246e-06, "loss": 1.8611, "step": 5609 }, { "epoch": 0.9051306873184899, "grad_norm": 3.9699883460998535, "learning_rate": 2.429443036538609e-06, "loss": 1.9078, "step": 5610 }, { "epoch": 0.9052920296869958, "grad_norm": 4.241724491119385, "learning_rate": 2.4214042056476093e-06, "loss": 2.0308, "step": 5611 }, { "epoch": 0.9054533720555018, "grad_norm": 3.783005714416504, "learning_rate": 2.4133783666788424e-06, "loss": 1.8767, "step": 5612 }, { "epoch": 0.9056147144240078, "grad_norm": 5.081392288208008, "learning_rate": 2.4053655218238493e-06, "loss": 2.0387, "step": 5613 }, { "epoch": 0.9057760567925137, "grad_norm": 5.225015640258789, "learning_rate": 2.397365673270646e-06, "loss": 1.7805, "step": 5614 }, { "epoch": 0.9059373991610197, "grad_norm": 4.1674981117248535, "learning_rate": 2.389378823203681e-06, "loss": 1.8708, "step": 5615 }, { "epoch": 0.9060987415295256, "grad_norm": 3.9809725284576416, "learning_rate": 2.3814049738038744e-06, "loss": 1.7738, "step": 5616 }, { "epoch": 0.9062600838980316, "grad_norm": 3.365668296813965, "learning_rate": 2.373444127248581e-06, "loss": 1.8897, "step": 5617 }, { "epoch": 0.9064214262665375, "grad_norm": 3.308582067489624, "learning_rate": 2.3654962857115937e-06, "loss": 1.8113, "step": 5618 }, { "epoch": 0.9065827686350436, "grad_norm": 3.76804780960083, "learning_rate": 2.3575614513631884e-06, "loss": 1.7842, "step": 5619 }, { "epoch": 0.9067441110035496, "grad_norm": 3.941070079803467, "learning_rate": 2.3496396263700482e-06, "loss": 1.7475, "step": 5620 }, { "epoch": 0.9069054533720555, "grad_norm": 4.967513084411621, "learning_rate": 2.3417308128953485e-06, "loss": 1.9079, "step": 5621 }, { "epoch": 0.9070667957405615, "grad_norm": 4.02778434753418, "learning_rate": 2.333835013098673e-06, "loss": 1.6699, "step": 5622 }, { "epoch": 0.9072281381090674, "grad_norm": 3.6762659549713135, "learning_rate": 2.3259522291360747e-06, "loss": 1.8003, "step": 5623 }, { "epoch": 0.9073894804775734, "grad_norm": 3.978619337081909, "learning_rate": 2.318082463160032e-06, "loss": 1.6767, "step": 5624 }, { "epoch": 0.9075508228460794, "grad_norm": 5.992420196533203, "learning_rate": 2.3102257173194974e-06, "loss": 1.8987, "step": 5625 }, { "epoch": 0.9077121652145853, "grad_norm": 4.104101657867432, "learning_rate": 2.302381993759839e-06, "loss": 1.7226, "step": 5626 }, { "epoch": 0.9078735075830913, "grad_norm": 5.735623359680176, "learning_rate": 2.2945512946228984e-06, "loss": 2.1269, "step": 5627 }, { "epoch": 0.9080348499515973, "grad_norm": 4.954807758331299, "learning_rate": 2.286733622046927e-06, "loss": 1.8843, "step": 5628 }, { "epoch": 0.9081961923201033, "grad_norm": 4.75136661529541, "learning_rate": 2.27892897816665e-06, "loss": 1.5943, "step": 5629 }, { "epoch": 0.9083575346886092, "grad_norm": 3.810737371444702, "learning_rate": 2.271137365113213e-06, "loss": 1.6648, "step": 5630 }, { "epoch": 0.9085188770571152, "grad_norm": 3.4491994380950928, "learning_rate": 2.2633587850142133e-06, "loss": 1.7016, "step": 5631 }, { "epoch": 0.9086802194256212, "grad_norm": 5.240142345428467, "learning_rate": 2.2555932399936973e-06, "loss": 1.8236, "step": 5632 }, { "epoch": 0.9088415617941271, "grad_norm": 4.1295061111450195, "learning_rate": 2.2478407321721296e-06, "loss": 1.6504, "step": 5633 }, { "epoch": 0.9090029041626331, "grad_norm": 8.141188621520996, "learning_rate": 2.2401012636664387e-06, "loss": 1.9291, "step": 5634 }, { "epoch": 0.909164246531139, "grad_norm": 3.8959414958953857, "learning_rate": 2.2323748365899675e-06, "loss": 1.7571, "step": 5635 }, { "epoch": 0.9093255888996451, "grad_norm": 4.677822589874268, "learning_rate": 2.2246614530525346e-06, "loss": 1.8032, "step": 5636 }, { "epoch": 0.909486931268151, "grad_norm": 5.629193305969238, "learning_rate": 2.216961115160354e-06, "loss": 1.7274, "step": 5637 }, { "epoch": 0.909648273636657, "grad_norm": 5.022723197937012, "learning_rate": 2.2092738250161114e-06, "loss": 1.7807, "step": 5638 }, { "epoch": 0.909809616005163, "grad_norm": 8.506092071533203, "learning_rate": 2.2015995847189107e-06, "loss": 1.8648, "step": 5639 }, { "epoch": 0.9099709583736689, "grad_norm": 4.457287788391113, "learning_rate": 2.1939383963642867e-06, "loss": 1.7594, "step": 5640 }, { "epoch": 0.9101323007421749, "grad_norm": 4.704784393310547, "learning_rate": 2.1862902620442437e-06, "loss": 2.1164, "step": 5641 }, { "epoch": 0.9102936431106808, "grad_norm": 5.191957950592041, "learning_rate": 2.178655183847189e-06, "loss": 1.8367, "step": 5642 }, { "epoch": 0.9104549854791868, "grad_norm": 4.328970432281494, "learning_rate": 2.1710331638579717e-06, "loss": 1.8905, "step": 5643 }, { "epoch": 0.9106163278476928, "grad_norm": 5.440768718719482, "learning_rate": 2.1634242041578713e-06, "loss": 1.5234, "step": 5644 }, { "epoch": 0.9107776702161988, "grad_norm": 4.354081153869629, "learning_rate": 2.1558283068246253e-06, "loss": 2.1624, "step": 5645 }, { "epoch": 0.9109390125847048, "grad_norm": 4.221670627593994, "learning_rate": 2.1482454739323755e-06, "loss": 1.7389, "step": 5646 }, { "epoch": 0.9111003549532107, "grad_norm": 8.989388465881348, "learning_rate": 2.1406757075517147e-06, "loss": 1.8681, "step": 5647 }, { "epoch": 0.9112616973217167, "grad_norm": 6.524087905883789, "learning_rate": 2.133119009749651e-06, "loss": 1.8335, "step": 5648 }, { "epoch": 0.9114230396902226, "grad_norm": 4.2385711669921875, "learning_rate": 2.1255753825896453e-06, "loss": 1.5785, "step": 5649 }, { "epoch": 0.9115843820587286, "grad_norm": 5.5846147537231445, "learning_rate": 2.1180448281315657e-06, "loss": 1.9489, "step": 5650 }, { "epoch": 0.9117457244272346, "grad_norm": 5.0407304763793945, "learning_rate": 2.11052734843174e-06, "loss": 1.937, "step": 5651 }, { "epoch": 0.9119070667957405, "grad_norm": 4.131408214569092, "learning_rate": 2.1030229455428928e-06, "loss": 1.8004, "step": 5652 }, { "epoch": 0.9120684091642466, "grad_norm": 4.67261266708374, "learning_rate": 2.0955316215142074e-06, "loss": 2.0531, "step": 5653 }, { "epoch": 0.9122297515327525, "grad_norm": 3.847583532333374, "learning_rate": 2.088053378391269e-06, "loss": 1.6934, "step": 5654 }, { "epoch": 0.9123910939012585, "grad_norm": 3.829157829284668, "learning_rate": 2.0805882182161063e-06, "loss": 1.7926, "step": 5655 }, { "epoch": 0.9125524362697645, "grad_norm": 4.377475261688232, "learning_rate": 2.0731361430271877e-06, "loss": 1.7305, "step": 5656 }, { "epoch": 0.9127137786382704, "grad_norm": 4.612732887268066, "learning_rate": 2.065697154859375e-06, "loss": 2.0563, "step": 5657 }, { "epoch": 0.9128751210067764, "grad_norm": 4.503220558166504, "learning_rate": 2.0582712557439874e-06, "loss": 1.7823, "step": 5658 }, { "epoch": 0.9130364633752823, "grad_norm": 4.670096397399902, "learning_rate": 2.050858447708759e-06, "loss": 2.0674, "step": 5659 }, { "epoch": 0.9131978057437883, "grad_norm": 6.374212265014648, "learning_rate": 2.043458732777831e-06, "loss": 1.7429, "step": 5660 }, { "epoch": 0.9133591481122942, "grad_norm": 3.732630491256714, "learning_rate": 2.0360721129718152e-06, "loss": 1.9587, "step": 5661 }, { "epoch": 0.9135204904808003, "grad_norm": 4.829896926879883, "learning_rate": 2.028698590307698e-06, "loss": 1.8403, "step": 5662 }, { "epoch": 0.9136818328493063, "grad_norm": 3.5910189151763916, "learning_rate": 2.021338166798914e-06, "loss": 1.7132, "step": 5663 }, { "epoch": 0.9138431752178122, "grad_norm": 5.4957733154296875, "learning_rate": 2.0139908444553267e-06, "loss": 1.9427, "step": 5664 }, { "epoch": 0.9140045175863182, "grad_norm": 5.186479091644287, "learning_rate": 2.0066566252831986e-06, "loss": 2.1578, "step": 5665 }, { "epoch": 0.9141658599548241, "grad_norm": 3.928703546524048, "learning_rate": 1.999335511285244e-06, "loss": 1.8647, "step": 5666 }, { "epoch": 0.9143272023233301, "grad_norm": 4.604649066925049, "learning_rate": 1.992027504460575e-06, "loss": 1.753, "step": 5667 }, { "epoch": 0.914488544691836, "grad_norm": 4.960216522216797, "learning_rate": 1.984732606804729e-06, "loss": 1.8151, "step": 5668 }, { "epoch": 0.914649887060342, "grad_norm": 5.813380241394043, "learning_rate": 1.977450820309684e-06, "loss": 1.854, "step": 5669 }, { "epoch": 0.914811229428848, "grad_norm": 4.861239433288574, "learning_rate": 1.9701821469637948e-06, "loss": 1.8343, "step": 5670 }, { "epoch": 0.914972571797354, "grad_norm": 4.349196910858154, "learning_rate": 1.96292658875189e-06, "loss": 1.7532, "step": 5671 }, { "epoch": 0.91513391416586, "grad_norm": 4.05011510848999, "learning_rate": 1.9556841476551736e-06, "loss": 2.1045, "step": 5672 }, { "epoch": 0.9152952565343659, "grad_norm": 5.115226745605469, "learning_rate": 1.9484548256512912e-06, "loss": 2.2221, "step": 5673 }, { "epoch": 0.9154565989028719, "grad_norm": 4.569157600402832, "learning_rate": 1.9412386247142864e-06, "loss": 2.0785, "step": 5674 }, { "epoch": 0.9156179412713779, "grad_norm": 6.013679027557373, "learning_rate": 1.934035546814644e-06, "loss": 1.829, "step": 5675 }, { "epoch": 0.9157792836398838, "grad_norm": 4.638603210449219, "learning_rate": 1.9268455939192463e-06, "loss": 1.7385, "step": 5676 }, { "epoch": 0.9159406260083898, "grad_norm": 4.1503520011901855, "learning_rate": 1.9196687679914062e-06, "loss": 1.7304, "step": 5677 }, { "epoch": 0.9161019683768957, "grad_norm": 5.443770885467529, "learning_rate": 1.9125050709908387e-06, "loss": 1.9824, "step": 5678 }, { "epoch": 0.9162633107454018, "grad_norm": 4.665947437286377, "learning_rate": 1.9053545048736744e-06, "loss": 1.737, "step": 5679 }, { "epoch": 0.9164246531139078, "grad_norm": 4.867141246795654, "learning_rate": 1.8982170715924785e-06, "loss": 1.9976, "step": 5680 }, { "epoch": 0.9165859954824137, "grad_norm": 4.335331916809082, "learning_rate": 1.8910927730962036e-06, "loss": 1.8882, "step": 5681 }, { "epoch": 0.9167473378509197, "grad_norm": 3.68391489982605, "learning_rate": 1.8839816113302266e-06, "loss": 1.574, "step": 5682 }, { "epoch": 0.9169086802194256, "grad_norm": 4.229113578796387, "learning_rate": 1.8768835882363389e-06, "loss": 1.7778, "step": 5683 }, { "epoch": 0.9170700225879316, "grad_norm": 4.200973987579346, "learning_rate": 1.8697987057527566e-06, "loss": 1.8908, "step": 5684 }, { "epoch": 0.9172313649564375, "grad_norm": 4.999737739562988, "learning_rate": 1.8627269658140711e-06, "loss": 1.761, "step": 5685 }, { "epoch": 0.9173927073249435, "grad_norm": 4.386737823486328, "learning_rate": 1.8556683703513267e-06, "loss": 1.8375, "step": 5686 }, { "epoch": 0.9175540496934494, "grad_norm": 5.096742153167725, "learning_rate": 1.8486229212919481e-06, "loss": 1.6345, "step": 5687 }, { "epoch": 0.9177153920619555, "grad_norm": 5.353132724761963, "learning_rate": 1.841590620559791e-06, "loss": 1.7384, "step": 5688 }, { "epoch": 0.9178767344304615, "grad_norm": 5.973819732666016, "learning_rate": 1.8345714700751026e-06, "loss": 1.8727, "step": 5689 }, { "epoch": 0.9180380767989674, "grad_norm": 3.748918056488037, "learning_rate": 1.827565471754561e-06, "loss": 1.8188, "step": 5690 }, { "epoch": 0.9181994191674734, "grad_norm": 4.631107807159424, "learning_rate": 1.82057262751123e-06, "loss": 1.8604, "step": 5691 }, { "epoch": 0.9183607615359793, "grad_norm": 5.493433475494385, "learning_rate": 1.8135929392545993e-06, "loss": 2.1599, "step": 5692 }, { "epoch": 0.9185221039044853, "grad_norm": 3.2908809185028076, "learning_rate": 1.8066264088905548e-06, "loss": 1.9088, "step": 5693 }, { "epoch": 0.9186834462729913, "grad_norm": 4.060011863708496, "learning_rate": 1.7996730383213867e-06, "loss": 1.814, "step": 5694 }, { "epoch": 0.9188447886414972, "grad_norm": 3.9401357173919678, "learning_rate": 1.7927328294458146e-06, "loss": 1.7475, "step": 5695 }, { "epoch": 0.9190061310100033, "grad_norm": 5.0063796043396, "learning_rate": 1.785805784158928e-06, "loss": 1.9751, "step": 5696 }, { "epoch": 0.9191674733785092, "grad_norm": 4.859718322753906, "learning_rate": 1.7788919043522646e-06, "loss": 1.7047, "step": 5697 }, { "epoch": 0.9193288157470152, "grad_norm": 3.6432406902313232, "learning_rate": 1.77199119191373e-06, "loss": 1.5968, "step": 5698 }, { "epoch": 0.9194901581155212, "grad_norm": 4.3131022453308105, "learning_rate": 1.765103648727645e-06, "loss": 1.8461, "step": 5699 }, { "epoch": 0.9196515004840271, "grad_norm": 4.97066593170166, "learning_rate": 1.75822927667475e-06, "loss": 2.1464, "step": 5700 }, { "epoch": 0.9198128428525331, "grad_norm": 5.496631622314453, "learning_rate": 1.751368077632176e-06, "loss": 2.04, "step": 5701 }, { "epoch": 0.919974185221039, "grad_norm": 4.846836566925049, "learning_rate": 1.7445200534734474e-06, "loss": 1.9715, "step": 5702 }, { "epoch": 0.920135527589545, "grad_norm": 4.323795318603516, "learning_rate": 1.7376852060685123e-06, "loss": 1.8746, "step": 5703 }, { "epoch": 0.9202968699580509, "grad_norm": 3.578923225402832, "learning_rate": 1.7308635372837056e-06, "loss": 1.7163, "step": 5704 }, { "epoch": 0.920458212326557, "grad_norm": 3.9475107192993164, "learning_rate": 1.7240550489817653e-06, "loss": 1.9772, "step": 5705 }, { "epoch": 0.920619554695063, "grad_norm": 4.6500325202941895, "learning_rate": 1.717259743021843e-06, "loss": 1.6877, "step": 5706 }, { "epoch": 0.9207808970635689, "grad_norm": 3.917170763015747, "learning_rate": 1.7104776212594653e-06, "loss": 1.8975, "step": 5707 }, { "epoch": 0.9209422394320749, "grad_norm": 4.595219612121582, "learning_rate": 1.70370868554659e-06, "loss": 1.7851, "step": 5708 }, { "epoch": 0.9211035818005808, "grad_norm": 5.927668571472168, "learning_rate": 1.6969529377315441e-06, "loss": 1.9125, "step": 5709 }, { "epoch": 0.9212649241690868, "grad_norm": 4.408805847167969, "learning_rate": 1.6902103796590795e-06, "loss": 1.7602, "step": 5710 }, { "epoch": 0.9214262665375927, "grad_norm": 4.741026401519775, "learning_rate": 1.6834810131703293e-06, "loss": 1.557, "step": 5711 }, { "epoch": 0.9215876089060987, "grad_norm": 3.8772976398468018, "learning_rate": 1.6767648401028346e-06, "loss": 2.0056, "step": 5712 }, { "epoch": 0.9217489512746048, "grad_norm": 4.7736897468566895, "learning_rate": 1.6700618622905228e-06, "loss": 1.6526, "step": 5713 }, { "epoch": 0.9219102936431107, "grad_norm": 3.9673078060150146, "learning_rate": 1.66337208156373e-06, "loss": 1.7841, "step": 5714 }, { "epoch": 0.9220716360116167, "grad_norm": 3.6450281143188477, "learning_rate": 1.6566954997491723e-06, "loss": 1.6754, "step": 5715 }, { "epoch": 0.9222329783801226, "grad_norm": 4.685565948486328, "learning_rate": 1.6500321186699918e-06, "loss": 1.7412, "step": 5716 }, { "epoch": 0.9223943207486286, "grad_norm": 5.0415449142456055, "learning_rate": 1.6433819401456996e-06, "loss": 1.7575, "step": 5717 }, { "epoch": 0.9225556631171346, "grad_norm": 4.050350189208984, "learning_rate": 1.6367449659921986e-06, "loss": 1.5767, "step": 5718 }, { "epoch": 0.9227170054856405, "grad_norm": 7.504308223724365, "learning_rate": 1.6301211980218e-06, "loss": 1.6768, "step": 5719 }, { "epoch": 0.9228783478541465, "grad_norm": 3.741177558898926, "learning_rate": 1.6235106380432186e-06, "loss": 1.944, "step": 5720 }, { "epoch": 0.9230396902226524, "grad_norm": 4.127954483032227, "learning_rate": 1.6169132878615322e-06, "loss": 2.0029, "step": 5721 }, { "epoch": 0.9232010325911585, "grad_norm": 5.195093154907227, "learning_rate": 1.6103291492782391e-06, "loss": 1.7876, "step": 5722 }, { "epoch": 0.9233623749596644, "grad_norm": 6.398956775665283, "learning_rate": 1.6037582240912175e-06, "loss": 1.5098, "step": 5723 }, { "epoch": 0.9235237173281704, "grad_norm": 6.398956775665283, "learning_rate": 1.6037582240912175e-06, "loss": 1.9152, "step": 5724 }, { "epoch": 0.9236850596966764, "grad_norm": 4.0053815841674805, "learning_rate": 1.597200514094732e-06, "loss": 1.8956, "step": 5725 }, { "epoch": 0.9238464020651823, "grad_norm": 4.242628574371338, "learning_rate": 1.5906560210794562e-06, "loss": 1.6887, "step": 5726 }, { "epoch": 0.9240077444336883, "grad_norm": 4.242440700531006, "learning_rate": 1.5841247468324383e-06, "loss": 1.9101, "step": 5727 }, { "epoch": 0.9241690868021942, "grad_norm": 5.061077117919922, "learning_rate": 1.5776066931371348e-06, "loss": 1.6569, "step": 5728 }, { "epoch": 0.9243304291707002, "grad_norm": 4.0543413162231445, "learning_rate": 1.5711018617733607e-06, "loss": 1.7956, "step": 5729 }, { "epoch": 0.9244917715392061, "grad_norm": 4.903491497039795, "learning_rate": 1.5646102545173624e-06, "loss": 1.7056, "step": 5730 }, { "epoch": 0.9246531139077122, "grad_norm": 5.305523872375488, "learning_rate": 1.5581318731417383e-06, "loss": 1.6872, "step": 5731 }, { "epoch": 0.9248144562762182, "grad_norm": 4.152594566345215, "learning_rate": 1.55166671941549e-06, "loss": 1.7782, "step": 5732 }, { "epoch": 0.9249757986447241, "grad_norm": 5.141536712646484, "learning_rate": 1.5452147951040163e-06, "loss": 1.8871, "step": 5733 }, { "epoch": 0.9251371410132301, "grad_norm": 3.96051287651062, "learning_rate": 1.538776101969086e-06, "loss": 1.8092, "step": 5734 }, { "epoch": 0.925298483381736, "grad_norm": 4.150367259979248, "learning_rate": 1.532350641768876e-06, "loss": 1.8495, "step": 5735 }, { "epoch": 0.925459825750242, "grad_norm": 5.4383416175842285, "learning_rate": 1.5259384162579216e-06, "loss": 2.0074, "step": 5736 }, { "epoch": 0.925621168118748, "grad_norm": 4.244972229003906, "learning_rate": 1.519539427187172e-06, "loss": 1.5395, "step": 5737 }, { "epoch": 0.9257825104872539, "grad_norm": 4.363376140594482, "learning_rate": 1.5131536763039521e-06, "loss": 1.6113, "step": 5738 }, { "epoch": 0.92594385285576, "grad_norm": 4.801346302032471, "learning_rate": 1.5067811653519558e-06, "loss": 1.7692, "step": 5739 }, { "epoch": 0.9261051952242659, "grad_norm": 6.123290061950684, "learning_rate": 1.5004218960712802e-06, "loss": 1.7549, "step": 5740 }, { "epoch": 0.9262665375927719, "grad_norm": 5.035151958465576, "learning_rate": 1.4940758701984136e-06, "loss": 1.9065, "step": 5741 }, { "epoch": 0.9264278799612778, "grad_norm": 4.790508270263672, "learning_rate": 1.4877430894662036e-06, "loss": 1.9566, "step": 5742 }, { "epoch": 0.9265892223297838, "grad_norm": 4.044414043426514, "learning_rate": 1.4814235556039003e-06, "loss": 1.5918, "step": 5743 }, { "epoch": 0.9267505646982898, "grad_norm": 4.563947677612305, "learning_rate": 1.4751172703371342e-06, "loss": 1.8414, "step": 5744 }, { "epoch": 0.9269119070667957, "grad_norm": 4.41463565826416, "learning_rate": 1.4688242353879e-06, "loss": 1.8903, "step": 5745 }, { "epoch": 0.9270732494353017, "grad_norm": 5.345877170562744, "learning_rate": 1.4625444524746068e-06, "loss": 1.9152, "step": 5746 }, { "epoch": 0.9272345918038076, "grad_norm": 4.196035385131836, "learning_rate": 1.4562779233120105e-06, "loss": 1.6025, "step": 5747 }, { "epoch": 0.9273959341723137, "grad_norm": 4.328422546386719, "learning_rate": 1.4500246496112758e-06, "loss": 1.7882, "step": 5748 }, { "epoch": 0.9275572765408197, "grad_norm": 3.8723225593566895, "learning_rate": 1.4437846330799255e-06, "loss": 1.7593, "step": 5749 }, { "epoch": 0.9277186189093256, "grad_norm": 4.046195030212402, "learning_rate": 1.4375578754218855e-06, "loss": 1.9627, "step": 5750 }, { "epoch": 0.9278799612778316, "grad_norm": 5.210872650146484, "learning_rate": 1.4313443783374404e-06, "loss": 1.8299, "step": 5751 }, { "epoch": 0.9280413036463375, "grad_norm": 4.25077486038208, "learning_rate": 1.4251441435232659e-06, "loss": 1.6938, "step": 5752 }, { "epoch": 0.9282026460148435, "grad_norm": 3.886307716369629, "learning_rate": 1.4189571726724082e-06, "loss": 1.7097, "step": 5753 }, { "epoch": 0.9283639883833494, "grad_norm": 4.030497074127197, "learning_rate": 1.412783467474299e-06, "loss": 1.8167, "step": 5754 }, { "epoch": 0.9285253307518554, "grad_norm": 3.4684436321258545, "learning_rate": 1.4066230296147454e-06, "loss": 1.8662, "step": 5755 }, { "epoch": 0.9286866731203615, "grad_norm": 4.377821922302246, "learning_rate": 1.400475860775935e-06, "loss": 1.9748, "step": 5756 }, { "epoch": 0.9288480154888674, "grad_norm": 6.5896711349487305, "learning_rate": 1.3943419626364196e-06, "loss": 1.9087, "step": 5757 }, { "epoch": 0.9290093578573734, "grad_norm": 3.9209225177764893, "learning_rate": 1.3882213368711372e-06, "loss": 1.9682, "step": 5758 }, { "epoch": 0.9291707002258793, "grad_norm": 4.070976257324219, "learning_rate": 1.3821139851514064e-06, "loss": 2.043, "step": 5759 }, { "epoch": 0.9293320425943853, "grad_norm": 4.065196514129639, "learning_rate": 1.3760199091449044e-06, "loss": 1.6531, "step": 5760 }, { "epoch": 0.9294933849628912, "grad_norm": 5.823979377746582, "learning_rate": 1.3699391105157056e-06, "loss": 2.0396, "step": 5761 }, { "epoch": 0.9296547273313972, "grad_norm": 4.389406681060791, "learning_rate": 1.3638715909242316e-06, "loss": 1.7936, "step": 5762 }, { "epoch": 0.9298160696999032, "grad_norm": 4.885544776916504, "learning_rate": 1.357817352027313e-06, "loss": 1.8755, "step": 5763 }, { "epoch": 0.9299774120684091, "grad_norm": 4.0908427238464355, "learning_rate": 1.351776395478116e-06, "loss": 1.8113, "step": 5764 }, { "epoch": 0.9301387544369152, "grad_norm": 3.9577252864837646, "learning_rate": 1.3457487229262155e-06, "loss": 1.8165, "step": 5765 }, { "epoch": 0.9303000968054211, "grad_norm": 5.4702067375183105, "learning_rate": 1.3397343360175286e-06, "loss": 1.6826, "step": 5766 }, { "epoch": 0.9304614391739271, "grad_norm": 4.719128608703613, "learning_rate": 1.3337332363943634e-06, "loss": 1.9677, "step": 5767 }, { "epoch": 0.930622781542433, "grad_norm": 5.333267688751221, "learning_rate": 1.327745425695398e-06, "loss": 2.0296, "step": 5768 }, { "epoch": 0.930784123910939, "grad_norm": 4.493922233581543, "learning_rate": 1.3217709055556638e-06, "loss": 1.7187, "step": 5769 }, { "epoch": 0.930945466279445, "grad_norm": 4.137994766235352, "learning_rate": 1.3158096776065942e-06, "loss": 1.7014, "step": 5770 }, { "epoch": 0.9311068086479509, "grad_norm": 5.764809608459473, "learning_rate": 1.3098617434759596e-06, "loss": 1.9381, "step": 5771 }, { "epoch": 0.9312681510164569, "grad_norm": 4.776947021484375, "learning_rate": 1.3039271047879331e-06, "loss": 1.92, "step": 5772 }, { "epoch": 0.9314294933849628, "grad_norm": 4.990124225616455, "learning_rate": 1.2980057631630294e-06, "loss": 1.9291, "step": 5773 }, { "epoch": 0.9315908357534689, "grad_norm": 5.358241081237793, "learning_rate": 1.2920977202181494e-06, "loss": 1.8355, "step": 5774 }, { "epoch": 0.9317521781219749, "grad_norm": 5.3820295333862305, "learning_rate": 1.286202977566553e-06, "loss": 1.7486, "step": 5775 }, { "epoch": 0.9319135204904808, "grad_norm": 4.732501029968262, "learning_rate": 1.2803215368178745e-06, "loss": 1.606, "step": 5776 }, { "epoch": 0.9320748628589868, "grad_norm": 4.951090335845947, "learning_rate": 1.2744533995781183e-06, "loss": 1.4972, "step": 5777 }, { "epoch": 0.9322362052274927, "grad_norm": 3.594369649887085, "learning_rate": 1.268598567449647e-06, "loss": 1.8737, "step": 5778 }, { "epoch": 0.9323975475959987, "grad_norm": 4.86506462097168, "learning_rate": 1.2627570420311929e-06, "loss": 1.8318, "step": 5779 }, { "epoch": 0.9325588899645046, "grad_norm": 5.794149398803711, "learning_rate": 1.256928824917858e-06, "loss": 1.8681, "step": 5780 }, { "epoch": 0.9327202323330106, "grad_norm": 4.306437969207764, "learning_rate": 1.2511139177011133e-06, "loss": 1.7558, "step": 5781 }, { "epoch": 0.9328815747015167, "grad_norm": 5.511157512664795, "learning_rate": 1.2453123219687834e-06, "loss": 1.728, "step": 5782 }, { "epoch": 0.9330429170700226, "grad_norm": 5.119511127471924, "learning_rate": 1.2395240393050733e-06, "loss": 1.9274, "step": 5783 }, { "epoch": 0.9332042594385286, "grad_norm": 4.189093112945557, "learning_rate": 1.2337490712905352e-06, "loss": 1.6383, "step": 5784 }, { "epoch": 0.9333656018070345, "grad_norm": 5.592221260070801, "learning_rate": 1.2279874195021024e-06, "loss": 1.8266, "step": 5785 }, { "epoch": 0.9335269441755405, "grad_norm": 4.038226127624512, "learning_rate": 1.222239085513066e-06, "loss": 1.7665, "step": 5786 }, { "epoch": 0.9336882865440465, "grad_norm": 4.962045669555664, "learning_rate": 1.2165040708930765e-06, "loss": 1.6626, "step": 5787 }, { "epoch": 0.9338496289125524, "grad_norm": 4.691133499145508, "learning_rate": 1.2107823772081472e-06, "loss": 1.848, "step": 5788 }, { "epoch": 0.9340109712810584, "grad_norm": 4.325597286224365, "learning_rate": 1.2050740060206679e-06, "loss": 1.7769, "step": 5789 }, { "epoch": 0.9341723136495643, "grad_norm": 3.8352766036987305, "learning_rate": 1.1993789588893634e-06, "loss": 1.9731, "step": 5790 }, { "epoch": 0.9343336560180704, "grad_norm": 3.7644858360290527, "learning_rate": 1.1936972373693567e-06, "loss": 1.7886, "step": 5791 }, { "epoch": 0.9344949983865763, "grad_norm": 3.7606143951416016, "learning_rate": 1.1880288430120901e-06, "loss": 1.8622, "step": 5792 }, { "epoch": 0.9346563407550823, "grad_norm": 4.325725078582764, "learning_rate": 1.1823737773654087e-06, "loss": 1.7914, "step": 5793 }, { "epoch": 0.9348176831235883, "grad_norm": 4.37510347366333, "learning_rate": 1.1767320419734884e-06, "loss": 1.8447, "step": 5794 }, { "epoch": 0.9349790254920942, "grad_norm": 3.663802146911621, "learning_rate": 1.1711036383768693e-06, "loss": 1.8028, "step": 5795 }, { "epoch": 0.9351403678606002, "grad_norm": 4.456315040588379, "learning_rate": 1.165488568112466e-06, "loss": 2.0647, "step": 5796 }, { "epoch": 0.9353017102291061, "grad_norm": 6.146907806396484, "learning_rate": 1.1598868327135359e-06, "loss": 2.0919, "step": 5797 }, { "epoch": 0.9354630525976121, "grad_norm": 3.3174848556518555, "learning_rate": 1.1542984337097107e-06, "loss": 1.8917, "step": 5798 }, { "epoch": 0.9356243949661182, "grad_norm": 5.587307929992676, "learning_rate": 1.1487233726269585e-06, "loss": 1.7521, "step": 5799 }, { "epoch": 0.9357857373346241, "grad_norm": 4.982137680053711, "learning_rate": 1.1431616509876287e-06, "loss": 1.8063, "step": 5800 }, { "epoch": 0.9359470797031301, "grad_norm": 4.172671318054199, "learning_rate": 1.1376132703104115e-06, "loss": 1.6838, "step": 5801 }, { "epoch": 0.936108422071636, "grad_norm": 4.366324424743652, "learning_rate": 1.1320782321103673e-06, "loss": 1.9022, "step": 5802 }, { "epoch": 0.936269764440142, "grad_norm": 3.5740795135498047, "learning_rate": 1.1265565378989041e-06, "loss": 1.8462, "step": 5803 }, { "epoch": 0.9364311068086479, "grad_norm": 3.788661003112793, "learning_rate": 1.1210481891837877e-06, "loss": 2.0685, "step": 5804 }, { "epoch": 0.9365924491771539, "grad_norm": 4.398108005523682, "learning_rate": 1.1155531874691371e-06, "loss": 2.0232, "step": 5805 }, { "epoch": 0.9367537915456599, "grad_norm": 3.890148401260376, "learning_rate": 1.1100715342554357e-06, "loss": 1.7261, "step": 5806 }, { "epoch": 0.9369151339141658, "grad_norm": 4.149150848388672, "learning_rate": 1.1046032310395193e-06, "loss": 1.6467, "step": 5807 }, { "epoch": 0.9370764762826719, "grad_norm": 4.093803405761719, "learning_rate": 1.0991482793145657e-06, "loss": 1.8167, "step": 5808 }, { "epoch": 0.9372378186511778, "grad_norm": 4.130459308624268, "learning_rate": 1.0937066805701223e-06, "loss": 1.6255, "step": 5809 }, { "epoch": 0.9373991610196838, "grad_norm": 3.8121321201324463, "learning_rate": 1.088278436292084e-06, "loss": 1.9435, "step": 5810 }, { "epoch": 0.9375605033881897, "grad_norm": 5.062289714813232, "learning_rate": 1.0828635479627036e-06, "loss": 2.1098, "step": 5811 }, { "epoch": 0.9377218457566957, "grad_norm": 4.468748092651367, "learning_rate": 1.0774620170605764e-06, "loss": 1.7855, "step": 5812 }, { "epoch": 0.9378831881252017, "grad_norm": 4.119090557098389, "learning_rate": 1.0720738450606615e-06, "loss": 1.7658, "step": 5813 }, { "epoch": 0.9380445304937076, "grad_norm": 5.087363243103027, "learning_rate": 1.0666990334342707e-06, "loss": 1.7402, "step": 5814 }, { "epoch": 0.9382058728622136, "grad_norm": 4.254721164703369, "learning_rate": 1.0613375836490468e-06, "loss": 1.9288, "step": 5815 }, { "epoch": 0.9383672152307196, "grad_norm": 4.193010330200195, "learning_rate": 1.0559894971690132e-06, "loss": 1.8224, "step": 5816 }, { "epoch": 0.9385285575992256, "grad_norm": 4.436793804168701, "learning_rate": 1.0506547754545292e-06, "loss": 1.749, "step": 5817 }, { "epoch": 0.9386898999677316, "grad_norm": 5.846003532409668, "learning_rate": 1.0453334199623022e-06, "loss": 1.7162, "step": 5818 }, { "epoch": 0.9388512423362375, "grad_norm": 5.199612617492676, "learning_rate": 1.0400254321453974e-06, "loss": 1.8913, "step": 5819 }, { "epoch": 0.9390125847047435, "grad_norm": 3.845651626586914, "learning_rate": 1.0347308134532218e-06, "loss": 1.8086, "step": 5820 }, { "epoch": 0.9391739270732494, "grad_norm": 4.292616844177246, "learning_rate": 1.0294495653315418e-06, "loss": 1.8092, "step": 5821 }, { "epoch": 0.9393352694417554, "grad_norm": 5.048312664031982, "learning_rate": 1.0241816892224644e-06, "loss": 1.8648, "step": 5822 }, { "epoch": 0.9394966118102613, "grad_norm": 5.327896595001221, "learning_rate": 1.0189271865644446e-06, "loss": 1.9031, "step": 5823 }, { "epoch": 0.9396579541787673, "grad_norm": 4.470217227935791, "learning_rate": 1.0136860587923015e-06, "loss": 1.6775, "step": 5824 }, { "epoch": 0.9398192965472734, "grad_norm": 5.024284362792969, "learning_rate": 1.0084583073371733e-06, "loss": 1.6311, "step": 5825 }, { "epoch": 0.9399806389157793, "grad_norm": 7.035519599914551, "learning_rate": 1.0032439336265742e-06, "loss": 1.5856, "step": 5826 }, { "epoch": 0.9401419812842853, "grad_norm": 4.376884460449219, "learning_rate": 9.980429390843427e-07, "loss": 1.9039, "step": 5827 }, { "epoch": 0.9403033236527912, "grad_norm": 5.319319248199463, "learning_rate": 9.928553251306871e-07, "loss": 1.7658, "step": 5828 }, { "epoch": 0.9404646660212972, "grad_norm": 5.395619869232178, "learning_rate": 9.87681093182141e-07, "loss": 2.0921, "step": 5829 }, { "epoch": 0.9406260083898031, "grad_norm": 4.178811073303223, "learning_rate": 9.82520244651597e-07, "loss": 1.9595, "step": 5830 }, { "epoch": 0.9407873507583091, "grad_norm": 5.348474502563477, "learning_rate": 9.773727809482825e-07, "loss": 2.0595, "step": 5831 }, { "epoch": 0.9409486931268151, "grad_norm": 4.0664448738098145, "learning_rate": 9.722387034777847e-07, "loss": 1.9365, "step": 5832 }, { "epoch": 0.941110035495321, "grad_norm": 3.9825870990753174, "learning_rate": 9.671180136420154e-07, "loss": 1.6047, "step": 5833 }, { "epoch": 0.9412713778638271, "grad_norm": 4.089111804962158, "learning_rate": 9.620107128392563e-07, "loss": 1.7241, "step": 5834 }, { "epoch": 0.941432720232333, "grad_norm": 3.979795455932617, "learning_rate": 9.569168024640973e-07, "loss": 1.6304, "step": 5835 }, { "epoch": 0.941594062600839, "grad_norm": 4.714263439178467, "learning_rate": 9.518362839075145e-07, "loss": 1.9056, "step": 5836 }, { "epoch": 0.941755404969345, "grad_norm": 5.615983963012695, "learning_rate": 9.467691585568039e-07, "loss": 1.7513, "step": 5837 }, { "epoch": 0.9419167473378509, "grad_norm": 4.188431262969971, "learning_rate": 9.417154277955864e-07, "loss": 1.9914, "step": 5838 }, { "epoch": 0.9420780897063569, "grad_norm": 3.9508488178253174, "learning_rate": 9.366750930038748e-07, "loss": 1.7786, "step": 5839 }, { "epoch": 0.9422394320748628, "grad_norm": 4.674958229064941, "learning_rate": 9.316481555579681e-07, "loss": 1.8749, "step": 5840 }, { "epoch": 0.9424007744433688, "grad_norm": 4.925543785095215, "learning_rate": 9.266346168305517e-07, "loss": 1.7405, "step": 5841 }, { "epoch": 0.9425621168118749, "grad_norm": 5.0826544761657715, "learning_rate": 9.21634478190625e-07, "loss": 1.9369, "step": 5842 }, { "epoch": 0.9427234591803808, "grad_norm": 6.129978179931641, "learning_rate": 9.166477410035401e-07, "loss": 1.7705, "step": 5843 }, { "epoch": 0.9428848015488868, "grad_norm": 3.634263277053833, "learning_rate": 9.116744066309913e-07, "loss": 1.9332, "step": 5844 }, { "epoch": 0.9430461439173927, "grad_norm": 4.566090106964111, "learning_rate": 9.067144764309976e-07, "loss": 2.0883, "step": 5845 }, { "epoch": 0.9432074862858987, "grad_norm": 3.382263660430908, "learning_rate": 9.017679517579425e-07, "loss": 1.9461, "step": 5846 }, { "epoch": 0.9433688286544046, "grad_norm": 4.951333045959473, "learning_rate": 8.968348339625287e-07, "loss": 1.8161, "step": 5847 }, { "epoch": 0.9435301710229106, "grad_norm": 5.0788960456848145, "learning_rate": 8.919151243918067e-07, "loss": 1.8715, "step": 5848 }, { "epoch": 0.9436915133914165, "grad_norm": 4.868221759796143, "learning_rate": 8.870088243891572e-07, "loss": 1.9675, "step": 5849 }, { "epoch": 0.9438528557599225, "grad_norm": 3.2534308433532715, "learning_rate": 8.821159352943143e-07, "loss": 1.639, "step": 5850 }, { "epoch": 0.9440141981284286, "grad_norm": 3.6828863620758057, "learning_rate": 8.772364584433368e-07, "loss": 1.9083, "step": 5851 }, { "epoch": 0.9441755404969345, "grad_norm": 4.399649620056152, "learning_rate": 8.723703951686313e-07, "loss": 1.9833, "step": 5852 }, { "epoch": 0.9443368828654405, "grad_norm": 4.549874782562256, "learning_rate": 8.675177467989349e-07, "loss": 1.79, "step": 5853 }, { "epoch": 0.9444982252339464, "grad_norm": 4.242159843444824, "learning_rate": 8.626785146593208e-07, "loss": 1.8302, "step": 5854 }, { "epoch": 0.9446595676024524, "grad_norm": 6.748963356018066, "learning_rate": 8.578527000711989e-07, "loss": 1.8678, "step": 5855 }, { "epoch": 0.9448209099709584, "grad_norm": 3.9145658016204834, "learning_rate": 8.530403043523205e-07, "loss": 1.7411, "step": 5856 }, { "epoch": 0.9449822523394643, "grad_norm": 4.138341426849365, "learning_rate": 8.482413288167734e-07, "loss": 1.5438, "step": 5857 }, { "epoch": 0.9451435947079703, "grad_norm": 4.497691631317139, "learning_rate": 8.43455774774965e-07, "loss": 1.751, "step": 5858 }, { "epoch": 0.9453049370764763, "grad_norm": 3.659984588623047, "learning_rate": 8.386836435336609e-07, "loss": 1.9816, "step": 5859 }, { "epoch": 0.9454662794449823, "grad_norm": 3.222717523574829, "learning_rate": 8.339249363959411e-07, "loss": 1.6494, "step": 5860 }, { "epoch": 0.9456276218134883, "grad_norm": 3.8509652614593506, "learning_rate": 8.29179654661244e-07, "loss": 1.8249, "step": 5861 }, { "epoch": 0.9457889641819942, "grad_norm": 5.055091381072998, "learning_rate": 8.244477996253108e-07, "loss": 2.0407, "step": 5862 }, { "epoch": 0.9459503065505002, "grad_norm": 3.966942310333252, "learning_rate": 8.197293725802469e-07, "loss": 1.6762, "step": 5863 }, { "epoch": 0.9461116489190061, "grad_norm": 3.865260601043701, "learning_rate": 8.150243748144659e-07, "loss": 1.9668, "step": 5864 }, { "epoch": 0.9462729912875121, "grad_norm": 4.931168556213379, "learning_rate": 8.103328076127347e-07, "loss": 2.018, "step": 5865 }, { "epoch": 0.946434333656018, "grad_norm": 4.939724922180176, "learning_rate": 8.056546722561343e-07, "loss": 1.7499, "step": 5866 }, { "epoch": 0.946595676024524, "grad_norm": 4.040256500244141, "learning_rate": 8.00989970022098e-07, "loss": 1.9447, "step": 5867 }, { "epoch": 0.9467570183930301, "grad_norm": 3.61901593208313, "learning_rate": 7.963387021843683e-07, "loss": 1.7853, "step": 5868 }, { "epoch": 0.946918360761536, "grad_norm": 4.144516468048096, "learning_rate": 7.917008700130401e-07, "loss": 1.7975, "step": 5869 }, { "epoch": 0.947079703130042, "grad_norm": 4.960970401763916, "learning_rate": 7.870764747745285e-07, "loss": 1.882, "step": 5870 }, { "epoch": 0.9472410454985479, "grad_norm": 5.900622844696045, "learning_rate": 7.824655177315787e-07, "loss": 1.8373, "step": 5871 }, { "epoch": 0.9474023878670539, "grad_norm": 6.7140212059021, "learning_rate": 7.778680001432725e-07, "loss": 1.7062, "step": 5872 }, { "epoch": 0.9475637302355598, "grad_norm": 4.1263427734375, "learning_rate": 7.732839232650224e-07, "loss": 1.6811, "step": 5873 }, { "epoch": 0.9477250726040658, "grad_norm": 4.739447116851807, "learning_rate": 7.687132883485549e-07, "loss": 1.7811, "step": 5874 }, { "epoch": 0.9478864149725718, "grad_norm": 4.560645580291748, "learning_rate": 7.641560966419492e-07, "loss": 1.8688, "step": 5875 }, { "epoch": 0.9480477573410777, "grad_norm": 4.571281433105469, "learning_rate": 7.596123493895991e-07, "loss": 1.9427, "step": 5876 }, { "epoch": 0.9482090997095838, "grad_norm": 4.332674980163574, "learning_rate": 7.550820478322285e-07, "loss": 1.6717, "step": 5877 }, { "epoch": 0.9483704420780897, "grad_norm": 5.199446678161621, "learning_rate": 7.50565193206898e-07, "loss": 1.8095, "step": 5878 }, { "epoch": 0.9485317844465957, "grad_norm": 4.981563568115234, "learning_rate": 7.460617867469822e-07, "loss": 1.7219, "step": 5879 }, { "epoch": 0.9486931268151017, "grad_norm": 4.1707353591918945, "learning_rate": 7.415718296822028e-07, "loss": 1.7694, "step": 5880 }, { "epoch": 0.9488544691836076, "grad_norm": 5.002323627471924, "learning_rate": 7.370953232385902e-07, "loss": 1.6788, "step": 5881 }, { "epoch": 0.9490158115521136, "grad_norm": 5.204863548278809, "learning_rate": 7.326322686385112e-07, "loss": 1.6622, "step": 5882 }, { "epoch": 0.9491771539206195, "grad_norm": 4.420432090759277, "learning_rate": 7.281826671006576e-07, "loss": 1.7891, "step": 5883 }, { "epoch": 0.9493384962891255, "grad_norm": 4.139347553253174, "learning_rate": 7.237465198400461e-07, "loss": 1.7936, "step": 5884 }, { "epoch": 0.9494998386576315, "grad_norm": 4.050226211547852, "learning_rate": 7.193238280680248e-07, "loss": 1.7334, "step": 5885 }, { "epoch": 0.9496611810261375, "grad_norm": 4.120753765106201, "learning_rate": 7.149145929922607e-07, "loss": 1.8455, "step": 5886 }, { "epoch": 0.9498225233946435, "grad_norm": 5.404914379119873, "learning_rate": 7.105188158167575e-07, "loss": 1.7015, "step": 5887 }, { "epoch": 0.9499838657631494, "grad_norm": 4.45375919342041, "learning_rate": 7.061364977418217e-07, "loss": 1.7473, "step": 5888 }, { "epoch": 0.9501452081316554, "grad_norm": 5.266660213470459, "learning_rate": 7.017676399641182e-07, "loss": 2.0971, "step": 5889 }, { "epoch": 0.9503065505001613, "grad_norm": 5.51798677444458, "learning_rate": 6.974122436766039e-07, "loss": 2.1716, "step": 5890 }, { "epoch": 0.9504678928686673, "grad_norm": 4.540561199188232, "learning_rate": 6.930703100685775e-07, "loss": 1.6084, "step": 5891 }, { "epoch": 0.9506292352371732, "grad_norm": 3.912099838256836, "learning_rate": 6.887418403256574e-07, "loss": 1.7189, "step": 5892 }, { "epoch": 0.9507905776056792, "grad_norm": 4.840001583099365, "learning_rate": 6.844268356297867e-07, "loss": 1.677, "step": 5893 }, { "epoch": 0.9509519199741853, "grad_norm": 3.7052505016326904, "learning_rate": 6.801252971592287e-07, "loss": 1.8608, "step": 5894 }, { "epoch": 0.9511132623426912, "grad_norm": 5.620258808135986, "learning_rate": 6.758372260885715e-07, "loss": 1.7261, "step": 5895 }, { "epoch": 0.9512746047111972, "grad_norm": 4.253757953643799, "learning_rate": 6.715626235887341e-07, "loss": 1.696, "step": 5896 }, { "epoch": 0.9514359470797031, "grad_norm": 5.048199653625488, "learning_rate": 6.673014908269326e-07, "loss": 1.863, "step": 5897 }, { "epoch": 0.9515972894482091, "grad_norm": 4.374826431274414, "learning_rate": 6.630538289667366e-07, "loss": 1.8576, "step": 5898 }, { "epoch": 0.951758631816715, "grad_norm": 4.63386869430542, "learning_rate": 6.588196391680124e-07, "loss": 1.7793, "step": 5899 }, { "epoch": 0.951919974185221, "grad_norm": 3.973889112472534, "learning_rate": 6.545989225869631e-07, "loss": 1.6409, "step": 5900 }, { "epoch": 0.952081316553727, "grad_norm": 3.8073489665985107, "learning_rate": 6.503916803761057e-07, "loss": 1.4881, "step": 5901 }, { "epoch": 0.952242658922233, "grad_norm": 4.754162311553955, "learning_rate": 6.461979136842877e-07, "loss": 1.6191, "step": 5902 }, { "epoch": 0.952404001290739, "grad_norm": 4.1775221824646, "learning_rate": 6.420176236566544e-07, "loss": 1.7152, "step": 5903 }, { "epoch": 0.952565343659245, "grad_norm": 4.4393510818481445, "learning_rate": 6.378508114346982e-07, "loss": 1.8372, "step": 5904 }, { "epoch": 0.9527266860277509, "grad_norm": 4.985992431640625, "learning_rate": 6.336974781562088e-07, "loss": 1.928, "step": 5905 }, { "epoch": 0.9528880283962569, "grad_norm": 5.360131740570068, "learning_rate": 6.295576249553125e-07, "loss": 2.0691, "step": 5906 }, { "epoch": 0.9530493707647628, "grad_norm": 4.566859245300293, "learning_rate": 6.25431252962444e-07, "loss": 1.568, "step": 5907 }, { "epoch": 0.9532107131332688, "grad_norm": 4.333277702331543, "learning_rate": 6.213183633043574e-07, "loss": 1.8062, "step": 5908 }, { "epoch": 0.9533720555017747, "grad_norm": 4.303825378417969, "learning_rate": 6.172189571041376e-07, "loss": 1.9251, "step": 5909 }, { "epoch": 0.9535333978702807, "grad_norm": 3.563807487487793, "learning_rate": 6.131330354811615e-07, "loss": 1.8189, "step": 5910 }, { "epoch": 0.9536947402387868, "grad_norm": 3.7442002296447754, "learning_rate": 6.090605995511589e-07, "loss": 1.9291, "step": 5911 }, { "epoch": 0.9538560826072927, "grad_norm": 5.451288223266602, "learning_rate": 6.050016504261458e-07, "loss": 2.0853, "step": 5912 }, { "epoch": 0.9540174249757987, "grad_norm": 4.133009433746338, "learning_rate": 6.009561892144744e-07, "loss": 1.9121, "step": 5913 }, { "epoch": 0.9541787673443046, "grad_norm": 5.506205081939697, "learning_rate": 5.969242170208056e-07, "loss": 1.8173, "step": 5914 }, { "epoch": 0.9543401097128106, "grad_norm": 4.302342414855957, "learning_rate": 5.929057349461198e-07, "loss": 1.7966, "step": 5915 }, { "epoch": 0.9545014520813165, "grad_norm": 4.712862491607666, "learning_rate": 5.889007440877059e-07, "loss": 1.7672, "step": 5916 }, { "epoch": 0.9546627944498225, "grad_norm": 4.7828450202941895, "learning_rate": 5.849092455391892e-07, "loss": 1.8546, "step": 5917 }, { "epoch": 0.9548241368183285, "grad_norm": 4.673771381378174, "learning_rate": 5.809312403904921e-07, "loss": 1.6937, "step": 5918 }, { "epoch": 0.9549854791868344, "grad_norm": 6.275789737701416, "learning_rate": 5.769667297278513e-07, "loss": 1.6837, "step": 5919 }, { "epoch": 0.9551468215553405, "grad_norm": 4.317024230957031, "learning_rate": 5.730157146338399e-07, "loss": 1.6549, "step": 5920 }, { "epoch": 0.9553081639238464, "grad_norm": 4.117687702178955, "learning_rate": 5.690781961873115e-07, "loss": 1.9171, "step": 5921 }, { "epoch": 0.9554695062923524, "grad_norm": 5.107153415679932, "learning_rate": 5.651541754634726e-07, "loss": 1.8191, "step": 5922 }, { "epoch": 0.9556308486608583, "grad_norm": 4.189974784851074, "learning_rate": 5.612436535338106e-07, "loss": 1.7533, "step": 5923 }, { "epoch": 0.9557921910293643, "grad_norm": 5.197915077209473, "learning_rate": 5.573466314661546e-07, "loss": 2.0262, "step": 5924 }, { "epoch": 0.9559535333978703, "grad_norm": 4.776738166809082, "learning_rate": 5.534631103246257e-07, "loss": 1.9174, "step": 5925 }, { "epoch": 0.9561148757663762, "grad_norm": 4.674618721008301, "learning_rate": 5.495930911696757e-07, "loss": 1.7167, "step": 5926 }, { "epoch": 0.9562762181348822, "grad_norm": 4.881612300872803, "learning_rate": 5.457365750580534e-07, "loss": 1.9184, "step": 5927 }, { "epoch": 0.9564375605033882, "grad_norm": 5.173159122467041, "learning_rate": 5.418935630428279e-07, "loss": 1.7982, "step": 5928 }, { "epoch": 0.9565989028718942, "grad_norm": 4.905303001403809, "learning_rate": 5.380640561733819e-07, "loss": 1.8097, "step": 5929 }, { "epoch": 0.9567602452404002, "grad_norm": 5.457943439483643, "learning_rate": 5.342480554954177e-07, "loss": 1.9987, "step": 5930 }, { "epoch": 0.9569215876089061, "grad_norm": 5.061657905578613, "learning_rate": 5.304455620509297e-07, "loss": 1.7285, "step": 5931 }, { "epoch": 0.9570829299774121, "grad_norm": 4.368500709533691, "learning_rate": 5.266565768782427e-07, "loss": 1.9036, "step": 5932 }, { "epoch": 0.957244272345918, "grad_norm": 4.104846000671387, "learning_rate": 5.228811010119849e-07, "loss": 1.8644, "step": 5933 }, { "epoch": 0.957405614714424, "grad_norm": 4.591108798980713, "learning_rate": 5.191191354830926e-07, "loss": 2.1388, "step": 5934 }, { "epoch": 0.9575669570829299, "grad_norm": 4.90683650970459, "learning_rate": 5.15370681318822e-07, "loss": 2.1739, "step": 5935 }, { "epoch": 0.9577282994514359, "grad_norm": 5.190983295440674, "learning_rate": 5.116357395427262e-07, "loss": 1.7365, "step": 5936 }, { "epoch": 0.957889641819942, "grad_norm": 4.570216178894043, "learning_rate": 5.079143111746898e-07, "loss": 1.9492, "step": 5937 }, { "epoch": 0.9580509841884479, "grad_norm": 5.459639549255371, "learning_rate": 5.042063972308831e-07, "loss": 1.7176, "step": 5938 }, { "epoch": 0.9582123265569539, "grad_norm": 6.138484954833984, "learning_rate": 5.005119987238071e-07, "loss": 1.8889, "step": 5939 }, { "epoch": 0.9583736689254598, "grad_norm": 5.507758617401123, "learning_rate": 4.968311166622552e-07, "loss": 1.7188, "step": 5940 }, { "epoch": 0.9585350112939658, "grad_norm": 5.310621738433838, "learning_rate": 4.931637520513455e-07, "loss": 1.8531, "step": 5941 }, { "epoch": 0.9586963536624717, "grad_norm": 4.495138168334961, "learning_rate": 4.895099058924879e-07, "loss": 1.768, "step": 5942 }, { "epoch": 0.9588576960309777, "grad_norm": 6.398849964141846, "learning_rate": 4.858695791834178e-07, "loss": 1.8201, "step": 5943 }, { "epoch": 0.9590190383994837, "grad_norm": 4.211677551269531, "learning_rate": 4.822427729181678e-07, "loss": 2.0938, "step": 5944 }, { "epoch": 0.9591803807679897, "grad_norm": 4.224961757659912, "learning_rate": 4.786294880870845e-07, "loss": 1.8269, "step": 5945 }, { "epoch": 0.9593417231364957, "grad_norm": 4.252756118774414, "learning_rate": 4.750297256768177e-07, "loss": 1.8621, "step": 5946 }, { "epoch": 0.9595030655050016, "grad_norm": 3.9692041873931885, "learning_rate": 4.7144348667032545e-07, "loss": 1.8073, "step": 5947 }, { "epoch": 0.9596644078735076, "grad_norm": 4.611053466796875, "learning_rate": 4.6787077204687445e-07, "loss": 1.7518, "step": 5948 }, { "epoch": 0.9598257502420136, "grad_norm": 5.75453519821167, "learning_rate": 4.643115827820399e-07, "loss": 1.7066, "step": 5949 }, { "epoch": 0.9599870926105195, "grad_norm": 5.713967323303223, "learning_rate": 4.607659198477055e-07, "loss": 1.899, "step": 5950 }, { "epoch": 0.9601484349790255, "grad_norm": 3.9248242378234863, "learning_rate": 4.5723378421205776e-07, "loss": 1.8813, "step": 5951 }, { "epoch": 0.9603097773475314, "grad_norm": 3.939185857772827, "learning_rate": 4.537151768395864e-07, "loss": 1.7456, "step": 5952 }, { "epoch": 0.9604711197160374, "grad_norm": 3.6323013305664062, "learning_rate": 4.5021009869108957e-07, "loss": 1.8564, "step": 5953 }, { "epoch": 0.9606324620845434, "grad_norm": 5.585293292999268, "learning_rate": 4.4671855072367377e-07, "loss": 1.6519, "step": 5954 }, { "epoch": 0.9607938044530494, "grad_norm": 5.114597797393799, "learning_rate": 4.432405338907486e-07, "loss": 1.6841, "step": 5955 }, { "epoch": 0.9609551468215554, "grad_norm": 4.141439914703369, "learning_rate": 4.397760491420322e-07, "loss": 2.0173, "step": 5956 }, { "epoch": 0.9611164891900613, "grad_norm": 4.197299957275391, "learning_rate": 4.3632509742354553e-07, "loss": 1.8466, "step": 5957 }, { "epoch": 0.9612778315585673, "grad_norm": 3.91989803314209, "learning_rate": 4.3288767967760715e-07, "loss": 1.979, "step": 5958 }, { "epoch": 0.9614391739270732, "grad_norm": 3.3383967876434326, "learning_rate": 4.29463796842855e-07, "loss": 1.7008, "step": 5959 }, { "epoch": 0.9616005162955792, "grad_norm": 4.163468360900879, "learning_rate": 4.2605344985421346e-07, "loss": 1.9048, "step": 5960 }, { "epoch": 0.9617618586640851, "grad_norm": 3.660228967666626, "learning_rate": 4.226566396429266e-07, "loss": 1.7799, "step": 5961 }, { "epoch": 0.9619232010325912, "grad_norm": 3.3542301654815674, "learning_rate": 4.1927336713653007e-07, "loss": 1.8733, "step": 5962 }, { "epoch": 0.9620845434010972, "grad_norm": 4.129444599151611, "learning_rate": 4.159036332588739e-07, "loss": 1.7442, "step": 5963 }, { "epoch": 0.9622458857696031, "grad_norm": 3.580047607421875, "learning_rate": 4.125474389300998e-07, "loss": 1.7935, "step": 5964 }, { "epoch": 0.9624072281381091, "grad_norm": 4.628206729888916, "learning_rate": 4.092047850666636e-07, "loss": 2.0498, "step": 5965 }, { "epoch": 0.962568570506615, "grad_norm": 4.669541835784912, "learning_rate": 4.058756725813129e-07, "loss": 1.9091, "step": 5966 }, { "epoch": 0.962729912875121, "grad_norm": 5.048605918884277, "learning_rate": 4.025601023831094e-07, "loss": 1.9444, "step": 5967 }, { "epoch": 0.962891255243627, "grad_norm": 3.759253740310669, "learning_rate": 3.992580753774067e-07, "loss": 1.9328, "step": 5968 }, { "epoch": 0.9630525976121329, "grad_norm": 6.495770454406738, "learning_rate": 3.9596959246585575e-07, "loss": 1.6384, "step": 5969 }, { "epoch": 0.9632139399806389, "grad_norm": 9.589229583740234, "learning_rate": 3.926946545464327e-07, "loss": 1.9898, "step": 5970 }, { "epoch": 0.9633752823491449, "grad_norm": 5.594679832458496, "learning_rate": 3.894332625133945e-07, "loss": 1.6206, "step": 5971 }, { "epoch": 0.9635366247176509, "grad_norm": 3.989413261413574, "learning_rate": 3.861854172572954e-07, "loss": 1.8372, "step": 5972 }, { "epoch": 0.9636979670861568, "grad_norm": 4.727180004119873, "learning_rate": 3.829511196650093e-07, "loss": 1.9627, "step": 5973 }, { "epoch": 0.9638593094546628, "grad_norm": 4.975203037261963, "learning_rate": 3.797303706196964e-07, "loss": 1.8582, "step": 5974 }, { "epoch": 0.9640206518231688, "grad_norm": 3.692265272140503, "learning_rate": 3.7652317100082543e-07, "loss": 1.6858, "step": 5975 }, { "epoch": 0.9641819941916747, "grad_norm": 5.113863945007324, "learning_rate": 3.7332952168416257e-07, "loss": 1.7237, "step": 5976 }, { "epoch": 0.9643433365601807, "grad_norm": 6.761280536651611, "learning_rate": 3.7014942354176575e-07, "loss": 2.0606, "step": 5977 }, { "epoch": 0.9645046789286866, "grad_norm": 5.555169582366943, "learning_rate": 3.6698287744200697e-07, "loss": 1.9713, "step": 5978 }, { "epoch": 0.9646660212971926, "grad_norm": 5.829470634460449, "learning_rate": 3.638298842495502e-07, "loss": 1.7158, "step": 5979 }, { "epoch": 0.9648273636656987, "grad_norm": 4.175316333770752, "learning_rate": 3.6069044482535674e-07, "loss": 1.6679, "step": 5980 }, { "epoch": 0.9649887060342046, "grad_norm": 5.2124714851379395, "learning_rate": 3.5756456002668525e-07, "loss": 1.929, "step": 5981 }, { "epoch": 0.9651500484027106, "grad_norm": 5.597807884216309, "learning_rate": 3.5445223070710855e-07, "loss": 2.0152, "step": 5982 }, { "epoch": 0.9653113907712165, "grad_norm": 3.889457941055298, "learning_rate": 3.513534577164801e-07, "loss": 1.5768, "step": 5983 }, { "epoch": 0.9654727331397225, "grad_norm": 5.3268141746521, "learning_rate": 3.482682419009509e-07, "loss": 1.8987, "step": 5984 }, { "epoch": 0.9656340755082284, "grad_norm": 4.461936950683594, "learning_rate": 3.4519658410299136e-07, "loss": 1.7494, "step": 5985 }, { "epoch": 0.9657954178767344, "grad_norm": 4.557859897613525, "learning_rate": 3.4213848516134186e-07, "loss": 2.0838, "step": 5986 }, { "epoch": 0.9659567602452404, "grad_norm": 5.341065406799316, "learning_rate": 3.390939459110676e-07, "loss": 1.9941, "step": 5987 }, { "epoch": 0.9661181026137464, "grad_norm": 5.358587265014648, "learning_rate": 3.360629671835036e-07, "loss": 1.699, "step": 5988 }, { "epoch": 0.9662794449822524, "grad_norm": 3.979477882385254, "learning_rate": 3.330455498063045e-07, "loss": 1.7465, "step": 5989 }, { "epoch": 0.9664407873507583, "grad_norm": 3.8328661918640137, "learning_rate": 3.300416946034168e-07, "loss": 1.7958, "step": 5990 }, { "epoch": 0.9666021297192643, "grad_norm": 4.59087610244751, "learning_rate": 3.270514023950733e-07, "loss": 2.0609, "step": 5991 }, { "epoch": 0.9667634720877702, "grad_norm": 4.539885520935059, "learning_rate": 3.24074673997804e-07, "loss": 1.7699, "step": 5992 }, { "epoch": 0.9669248144562762, "grad_norm": 3.7460310459136963, "learning_rate": 3.2111151022445883e-07, "loss": 1.7859, "step": 5993 }, { "epoch": 0.9670861568247822, "grad_norm": 5.758821964263916, "learning_rate": 3.1816191188415166e-07, "loss": 1.7002, "step": 5994 }, { "epoch": 0.9672474991932881, "grad_norm": 3.9628663063049316, "learning_rate": 3.1522587978231045e-07, "loss": 1.7443, "step": 5995 }, { "epoch": 0.9674088415617941, "grad_norm": 5.32503080368042, "learning_rate": 3.123034147206605e-07, "loss": 1.7886, "step": 5996 }, { "epoch": 0.9675701839303001, "grad_norm": 5.373361587524414, "learning_rate": 3.0939451749720794e-07, "loss": 1.6785, "step": 5997 }, { "epoch": 0.9677315262988061, "grad_norm": 4.285488605499268, "learning_rate": 3.064991889062674e-07, "loss": 1.8405, "step": 5998 }, { "epoch": 0.9678928686673121, "grad_norm": 4.067709445953369, "learning_rate": 3.036174297384453e-07, "loss": 1.8125, "step": 5999 }, { "epoch": 0.968054211035818, "grad_norm": 4.507450103759766, "learning_rate": 3.007492407806456e-07, "loss": 1.8592, "step": 6000 }, { "epoch": 0.968215553404324, "grad_norm": 4.683064937591553, "learning_rate": 2.9789462281605284e-07, "loss": 1.4376, "step": 6001 }, { "epoch": 0.9683768957728299, "grad_norm": 3.8618106842041016, "learning_rate": 2.950535766241602e-07, "loss": 1.7857, "step": 6002 }, { "epoch": 0.9685382381413359, "grad_norm": 6.9576239585876465, "learning_rate": 2.9222610298074717e-07, "loss": 1.7491, "step": 6003 }, { "epoch": 0.9686995805098418, "grad_norm": 4.038752555847168, "learning_rate": 2.894122026579016e-07, "loss": 1.5902, "step": 6004 }, { "epoch": 0.9688609228783479, "grad_norm": 5.251195907592773, "learning_rate": 2.866118764239756e-07, "loss": 2.095, "step": 6005 }, { "epoch": 0.9690222652468539, "grad_norm": 5.801555633544922, "learning_rate": 2.8382512504365186e-07, "loss": 1.5683, "step": 6006 }, { "epoch": 0.9691836076153598, "grad_norm": 5.112753391265869, "learning_rate": 2.810519492778774e-07, "loss": 1.8896, "step": 6007 }, { "epoch": 0.9693449499838658, "grad_norm": 4.228761672973633, "learning_rate": 2.7829234988390184e-07, "loss": 1.6108, "step": 6008 }, { "epoch": 0.9695062923523717, "grad_norm": 4.433478355407715, "learning_rate": 2.7554632761526146e-07, "loss": 1.8418, "step": 6009 }, { "epoch": 0.9696676347208777, "grad_norm": 5.7169365882873535, "learning_rate": 2.7281388322180635e-07, "loss": 1.93, "step": 6010 }, { "epoch": 0.9698289770893836, "grad_norm": 5.642277240753174, "learning_rate": 2.700950174496564e-07, "loss": 1.7152, "step": 6011 }, { "epoch": 0.9699903194578896, "grad_norm": 5.4277448654174805, "learning_rate": 2.673897310412288e-07, "loss": 1.8661, "step": 6012 }, { "epoch": 0.9701516618263956, "grad_norm": 5.835224151611328, "learning_rate": 2.646980247352437e-07, "loss": 1.6669, "step": 6013 }, { "epoch": 0.9703130041949016, "grad_norm": 5.225020885467529, "learning_rate": 2.6201989926669115e-07, "loss": 1.8734, "step": 6014 }, { "epoch": 0.9704743465634076, "grad_norm": 4.355700492858887, "learning_rate": 2.5935535536688036e-07, "loss": 1.6864, "step": 6015 }, { "epoch": 0.9706356889319135, "grad_norm": 4.854405403137207, "learning_rate": 2.5670439376339063e-07, "loss": 1.8816, "step": 6016 }, { "epoch": 0.9707970313004195, "grad_norm": 5.25092887878418, "learning_rate": 2.5406701518009834e-07, "loss": 1.9376, "step": 6017 }, { "epoch": 0.9709583736689255, "grad_norm": 5.707984447479248, "learning_rate": 2.5144322033717747e-07, "loss": 1.5456, "step": 6018 }, { "epoch": 0.9711197160374314, "grad_norm": 4.779830455780029, "learning_rate": 2.488330099510883e-07, "loss": 1.7704, "step": 6019 }, { "epoch": 0.9712810584059374, "grad_norm": 3.8003082275390625, "learning_rate": 2.4623638473457167e-07, "loss": 1.6952, "step": 6020 }, { "epoch": 0.9714424007744433, "grad_norm": 5.577223300933838, "learning_rate": 2.436533453966772e-07, "loss": 1.8789, "step": 6021 }, { "epoch": 0.9716037431429493, "grad_norm": 4.0358123779296875, "learning_rate": 2.4108389264272947e-07, "loss": 1.846, "step": 6022 }, { "epoch": 0.9717650855114554, "grad_norm": 4.698176383972168, "learning_rate": 2.3852802717435617e-07, "loss": 1.8529, "step": 6023 }, { "epoch": 0.9719264278799613, "grad_norm": 4.503703594207764, "learning_rate": 2.359857496894602e-07, "loss": 1.613, "step": 6024 }, { "epoch": 0.9720877702484673, "grad_norm": 3.923738956451416, "learning_rate": 2.3345706088224729e-07, "loss": 1.9184, "step": 6025 }, { "epoch": 0.9722491126169732, "grad_norm": 5.014230728149414, "learning_rate": 2.3094196144320956e-07, "loss": 1.8012, "step": 6026 }, { "epoch": 0.9724104549854792, "grad_norm": 4.5727033615112305, "learning_rate": 2.284404520591199e-07, "loss": 2.0795, "step": 6027 }, { "epoch": 0.9725717973539851, "grad_norm": 4.497374534606934, "learning_rate": 2.25952533413043e-07, "loss": 1.8711, "step": 6028 }, { "epoch": 0.9727331397224911, "grad_norm": 3.5301945209503174, "learning_rate": 2.2347820618434657e-07, "loss": 1.7145, "step": 6029 }, { "epoch": 0.972894482090997, "grad_norm": 4.955672264099121, "learning_rate": 2.2101747104866788e-07, "loss": 1.5397, "step": 6030 }, { "epoch": 0.9730558244595031, "grad_norm": 3.6721248626708984, "learning_rate": 2.185703286779417e-07, "loss": 1.7076, "step": 6031 }, { "epoch": 0.9732171668280091, "grad_norm": 4.408209323883057, "learning_rate": 2.161367797403946e-07, "loss": 2.1241, "step": 6032 }, { "epoch": 0.973378509196515, "grad_norm": 3.461690902709961, "learning_rate": 2.137168249005339e-07, "loss": 1.9949, "step": 6033 }, { "epoch": 0.973539851565021, "grad_norm": 4.539180755615234, "learning_rate": 2.113104648191644e-07, "loss": 1.6892, "step": 6034 }, { "epoch": 0.9737011939335269, "grad_norm": 5.39754056930542, "learning_rate": 2.0891770015336044e-07, "loss": 1.7824, "step": 6035 }, { "epoch": 0.9738625363020329, "grad_norm": 3.8471028804779053, "learning_rate": 2.0653853155650492e-07, "loss": 1.8523, "step": 6036 }, { "epoch": 0.9740238786705389, "grad_norm": 3.98358416557312, "learning_rate": 2.0417295967825602e-07, "loss": 1.9759, "step": 6037 }, { "epoch": 0.9741852210390448, "grad_norm": 4.597020626068115, "learning_rate": 2.0182098516456362e-07, "loss": 1.7696, "step": 6038 }, { "epoch": 0.9743465634075508, "grad_norm": 4.354307174682617, "learning_rate": 1.9948260865766398e-07, "loss": 1.8683, "step": 6039 }, { "epoch": 0.9745079057760568, "grad_norm": 4.493190288543701, "learning_rate": 1.971578307960742e-07, "loss": 2.0457, "step": 6040 }, { "epoch": 0.9746692481445628, "grad_norm": 4.883968353271484, "learning_rate": 1.9484665221460861e-07, "loss": 1.9905, "step": 6041 }, { "epoch": 0.9748305905130688, "grad_norm": 4.397480010986328, "learning_rate": 1.9254907354436802e-07, "loss": 1.6917, "step": 6042 }, { "epoch": 0.9749919328815747, "grad_norm": 4.127442359924316, "learning_rate": 1.9026509541272275e-07, "loss": 1.7102, "step": 6043 }, { "epoch": 0.9751532752500807, "grad_norm": 3.7167837619781494, "learning_rate": 1.879947184433517e-07, "loss": 1.8229, "step": 6044 }, { "epoch": 0.9753146176185866, "grad_norm": 3.8155481815338135, "learning_rate": 1.8573794325620343e-07, "loss": 1.7235, "step": 6045 }, { "epoch": 0.9754759599870926, "grad_norm": 4.336913585662842, "learning_rate": 1.8349477046751828e-07, "loss": 1.7814, "step": 6046 }, { "epoch": 0.9756373023555985, "grad_norm": 4.289729595184326, "learning_rate": 1.81265200689823e-07, "loss": 1.972, "step": 6047 }, { "epoch": 0.9757986447241046, "grad_norm": 5.120328426361084, "learning_rate": 1.7904923453193056e-07, "loss": 1.966, "step": 6048 }, { "epoch": 0.9759599870926106, "grad_norm": 5.423091411590576, "learning_rate": 1.7684687259893473e-07, "loss": 1.7488, "step": 6049 }, { "epoch": 0.9761213294611165, "grad_norm": 4.753484725952148, "learning_rate": 1.7465811549222667e-07, "loss": 1.7431, "step": 6050 }, { "epoch": 0.9762826718296225, "grad_norm": 6.624080181121826, "learning_rate": 1.724829638094616e-07, "loss": 1.9048, "step": 6051 }, { "epoch": 0.9764440141981284, "grad_norm": 4.905503749847412, "learning_rate": 1.7032141814459778e-07, "loss": 1.6309, "step": 6052 }, { "epoch": 0.9766053565666344, "grad_norm": 4.667633056640625, "learning_rate": 1.6817347908786863e-07, "loss": 1.9415, "step": 6053 }, { "epoch": 0.9767666989351403, "grad_norm": 3.977940559387207, "learning_rate": 1.660391472257994e-07, "loss": 2.0148, "step": 6054 }, { "epoch": 0.9769280413036463, "grad_norm": 4.7679443359375, "learning_rate": 1.6391842314119054e-07, "loss": 1.6452, "step": 6055 }, { "epoch": 0.9770893836721523, "grad_norm": 3.8190910816192627, "learning_rate": 1.6181130741314e-07, "loss": 1.6309, "step": 6056 }, { "epoch": 0.9772507260406583, "grad_norm": 4.3463826179504395, "learning_rate": 1.5971780061701524e-07, "loss": 1.7745, "step": 6057 }, { "epoch": 0.9774120684091643, "grad_norm": 5.691588878631592, "learning_rate": 1.576379033244757e-07, "loss": 1.8893, "step": 6058 }, { "epoch": 0.9775734107776702, "grad_norm": 4.150592803955078, "learning_rate": 1.555716161034615e-07, "loss": 1.7366, "step": 6059 }, { "epoch": 0.9777347531461762, "grad_norm": 3.8583078384399414, "learning_rate": 1.5351893951819906e-07, "loss": 1.7324, "step": 6060 }, { "epoch": 0.9778960955146822, "grad_norm": 4.069317817687988, "learning_rate": 1.5147987412920116e-07, "loss": 1.7406, "step": 6061 }, { "epoch": 0.9780574378831881, "grad_norm": 4.734239101409912, "learning_rate": 1.494544204932502e-07, "loss": 1.8533, "step": 6062 }, { "epoch": 0.9782187802516941, "grad_norm": 4.880402565002441, "learning_rate": 1.4744257916343153e-07, "loss": 1.8194, "step": 6063 }, { "epoch": 0.9783801226202, "grad_norm": 3.40069317817688, "learning_rate": 1.4544435068909456e-07, "loss": 1.7689, "step": 6064 }, { "epoch": 0.9785414649887061, "grad_norm": 4.194741249084473, "learning_rate": 1.434597356158862e-07, "loss": 1.6788, "step": 6065 }, { "epoch": 0.978702807357212, "grad_norm": 5.724354267120361, "learning_rate": 1.4148873448573408e-07, "loss": 2.2099, "step": 6066 }, { "epoch": 0.978864149725718, "grad_norm": 4.644571304321289, "learning_rate": 1.3953134783682987e-07, "loss": 1.6492, "step": 6067 }, { "epoch": 0.979025492094224, "grad_norm": 4.232357978820801, "learning_rate": 1.375875762036738e-07, "loss": 1.6987, "step": 6068 }, { "epoch": 0.9791868344627299, "grad_norm": 7.880347728729248, "learning_rate": 1.3565742011703576e-07, "loss": 2.0461, "step": 6069 }, { "epoch": 0.9793481768312359, "grad_norm": 3.9191014766693115, "learning_rate": 1.3374088010396635e-07, "loss": 1.6321, "step": 6070 }, { "epoch": 0.9795095191997418, "grad_norm": 4.339367866516113, "learning_rate": 1.3183795668779697e-07, "loss": 1.8875, "step": 6071 }, { "epoch": 0.9796708615682478, "grad_norm": 4.509636878967285, "learning_rate": 1.2994865038815086e-07, "loss": 1.8603, "step": 6072 }, { "epoch": 0.9798322039367537, "grad_norm": 4.019570827484131, "learning_rate": 1.2807296172092086e-07, "loss": 1.9206, "step": 6073 }, { "epoch": 0.9799935463052598, "grad_norm": 5.26365852355957, "learning_rate": 1.2621089119829178e-07, "loss": 1.8456, "step": 6074 }, { "epoch": 0.9801548886737658, "grad_norm": 4.8168864250183105, "learning_rate": 1.243624393287235e-07, "loss": 1.7182, "step": 6075 }, { "epoch": 0.9803162310422717, "grad_norm": 3.4046692848205566, "learning_rate": 1.2252760661695672e-07, "loss": 1.9333, "step": 6076 }, { "epoch": 0.9804775734107777, "grad_norm": 4.598193645477295, "learning_rate": 1.2070639356401292e-07, "loss": 1.9252, "step": 6077 }, { "epoch": 0.9806389157792836, "grad_norm": 4.1954240798950195, "learning_rate": 1.1889880066720538e-07, "loss": 1.8775, "step": 6078 }, { "epoch": 0.9808002581477896, "grad_norm": 3.791823148727417, "learning_rate": 1.1710482842011151e-07, "loss": 1.7546, "step": 6079 }, { "epoch": 0.9809616005162956, "grad_norm": 4.0534868240356445, "learning_rate": 1.1532447731260054e-07, "loss": 1.9442, "step": 6080 }, { "epoch": 0.9811229428848015, "grad_norm": 4.838503360748291, "learning_rate": 1.1355774783081696e-07, "loss": 1.7617, "step": 6081 }, { "epoch": 0.9812842852533075, "grad_norm": 5.009237289428711, "learning_rate": 1.1180464045719708e-07, "loss": 1.7312, "step": 6082 }, { "epoch": 0.9814456276218135, "grad_norm": 4.203751087188721, "learning_rate": 1.1006515567043574e-07, "loss": 1.5578, "step": 6083 }, { "epoch": 0.9816069699903195, "grad_norm": 3.759047269821167, "learning_rate": 1.0833929394552522e-07, "loss": 1.7059, "step": 6084 }, { "epoch": 0.9817683123588254, "grad_norm": 3.4367129802703857, "learning_rate": 1.0662705575373855e-07, "loss": 1.3799, "step": 6085 }, { "epoch": 0.9819296547273314, "grad_norm": 4.999564170837402, "learning_rate": 1.0492844156262394e-07, "loss": 1.6221, "step": 6086 }, { "epoch": 0.9820909970958374, "grad_norm": 4.454932689666748, "learning_rate": 1.0324345183599926e-07, "loss": 1.9766, "step": 6087 }, { "epoch": 0.9822523394643433, "grad_norm": 4.134593963623047, "learning_rate": 1.0157208703397426e-07, "loss": 1.9193, "step": 6088 }, { "epoch": 0.9824136818328493, "grad_norm": 3.602125644683838, "learning_rate": 9.991434761293938e-08, "loss": 1.826, "step": 6089 }, { "epoch": 0.9825750242013552, "grad_norm": 4.12139892578125, "learning_rate": 9.827023402556035e-08, "loss": 1.8143, "step": 6090 }, { "epoch": 0.9827363665698613, "grad_norm": 4.671372890472412, "learning_rate": 9.663974672078912e-08, "loss": 1.6706, "step": 6091 }, { "epoch": 0.9828977089383673, "grad_norm": 4.771209239959717, "learning_rate": 9.502288614383625e-08, "loss": 1.8342, "step": 6092 }, { "epoch": 0.9830590513068732, "grad_norm": 4.081023216247559, "learning_rate": 9.341965273621522e-08, "loss": 1.7494, "step": 6093 }, { "epoch": 0.9832203936753792, "grad_norm": 4.775580406188965, "learning_rate": 9.183004693570363e-08, "loss": 1.8544, "step": 6094 }, { "epoch": 0.9833817360438851, "grad_norm": 5.823474884033203, "learning_rate": 9.025406917636537e-08, "loss": 2.0259, "step": 6095 }, { "epoch": 0.9835430784123911, "grad_norm": 3.8634917736053467, "learning_rate": 8.869171988854508e-08, "loss": 2.1488, "step": 6096 }, { "epoch": 0.983704420780897, "grad_norm": 5.826895236968994, "learning_rate": 8.714299949885707e-08, "loss": 1.7576, "step": 6097 }, { "epoch": 0.983865763149403, "grad_norm": 4.188059329986572, "learning_rate": 8.560790843019639e-08, "loss": 1.76, "step": 6098 }, { "epoch": 0.984027105517909, "grad_norm": 3.9793922901153564, "learning_rate": 8.408644710173886e-08, "loss": 1.8688, "step": 6099 }, { "epoch": 0.984188447886415, "grad_norm": 3.9827539920806885, "learning_rate": 8.2578615928941e-08, "loss": 1.6499, "step": 6100 }, { "epoch": 0.984349790254921, "grad_norm": 6.811621189117432, "learning_rate": 8.108441532353461e-08, "loss": 1.8214, "step": 6101 }, { "epoch": 0.9845111326234269, "grad_norm": 3.6228678226470947, "learning_rate": 7.960384569353219e-08, "loss": 1.8004, "step": 6102 }, { "epoch": 0.9846724749919329, "grad_norm": 3.770827531814575, "learning_rate": 7.813690744321033e-08, "loss": 1.9498, "step": 6103 }, { "epoch": 0.9848338173604388, "grad_norm": 5.747048854827881, "learning_rate": 7.668360097314864e-08, "loss": 1.8909, "step": 6104 }, { "epoch": 0.9849951597289448, "grad_norm": 4.43508243560791, "learning_rate": 7.524392668018521e-08, "loss": 1.5331, "step": 6105 }, { "epoch": 0.9851565020974508, "grad_norm": 4.186741352081299, "learning_rate": 7.381788495743891e-08, "loss": 1.9215, "step": 6106 }, { "epoch": 0.9853178444659567, "grad_norm": 3.6606838703155518, "learning_rate": 7.240547619430382e-08, "loss": 1.8836, "step": 6107 }, { "epoch": 0.9854791868344628, "grad_norm": 4.097906589508057, "learning_rate": 7.100670077646587e-08, "loss": 1.6784, "step": 6108 }, { "epoch": 0.9856405292029687, "grad_norm": 4.952674388885498, "learning_rate": 6.962155908586954e-08, "loss": 1.7973, "step": 6109 }, { "epoch": 0.9858018715714747, "grad_norm": 4.816624641418457, "learning_rate": 6.825005150075114e-08, "loss": 1.7826, "step": 6110 }, { "epoch": 0.9859632139399807, "grad_norm": 5.050424098968506, "learning_rate": 6.689217839561113e-08, "loss": 1.9938, "step": 6111 }, { "epoch": 0.9861245563084866, "grad_norm": 4.853276252746582, "learning_rate": 6.554794014124177e-08, "loss": 1.8858, "step": 6112 }, { "epoch": 0.9862858986769926, "grad_norm": 4.018308639526367, "learning_rate": 6.421733710469391e-08, "loss": 1.8462, "step": 6113 }, { "epoch": 0.9864472410454985, "grad_norm": 4.396552562713623, "learning_rate": 6.290036964931578e-08, "loss": 1.8676, "step": 6114 }, { "epoch": 0.9866085834140045, "grad_norm": 4.659566402435303, "learning_rate": 6.159703813471418e-08, "loss": 1.6639, "step": 6115 }, { "epoch": 0.9867699257825104, "grad_norm": 4.091868877410889, "learning_rate": 6.030734291677664e-08, "loss": 1.6306, "step": 6116 }, { "epoch": 0.9869312681510165, "grad_norm": 6.1804375648498535, "learning_rate": 5.903128434768257e-08, "loss": 2.1267, "step": 6117 }, { "epoch": 0.9870926105195225, "grad_norm": 4.831748008728027, "learning_rate": 5.7768862775864354e-08, "loss": 1.6718, "step": 6118 }, { "epoch": 0.9872539528880284, "grad_norm": 3.624600887298584, "learning_rate": 5.652007854605179e-08, "loss": 1.6628, "step": 6119 }, { "epoch": 0.9874152952565344, "grad_norm": 4.931962490081787, "learning_rate": 5.5284931999227685e-08, "loss": 1.7394, "step": 6120 }, { "epoch": 0.9875766376250403, "grad_norm": 5.466416835784912, "learning_rate": 5.4063423472672236e-08, "loss": 1.7392, "step": 6121 }, { "epoch": 0.9877379799935463, "grad_norm": 5.067384243011475, "learning_rate": 5.28555532999353e-08, "loss": 1.8295, "step": 6122 }, { "epoch": 0.9878993223620522, "grad_norm": 4.261351585388184, "learning_rate": 5.1661321810836385e-08, "loss": 2.1182, "step": 6123 }, { "epoch": 0.9880606647305582, "grad_norm": 3.832385540008545, "learning_rate": 5.048072933148129e-08, "loss": 1.8679, "step": 6124 }, { "epoch": 0.9882220070990642, "grad_norm": 4.070018768310547, "learning_rate": 4.9313776184234386e-08, "loss": 1.8629, "step": 6125 }, { "epoch": 0.9883833494675702, "grad_norm": 3.701960325241089, "learning_rate": 4.8160462687757425e-08, "loss": 1.8328, "step": 6126 }, { "epoch": 0.9885446918360762, "grad_norm": 5.63292121887207, "learning_rate": 4.7020789156965175e-08, "loss": 1.8521, "step": 6127 }, { "epoch": 0.9887060342045821, "grad_norm": 3.8325681686401367, "learning_rate": 4.5894755903075347e-08, "loss": 1.7439, "step": 6128 }, { "epoch": 0.9888673765730881, "grad_norm": 3.7844038009643555, "learning_rate": 4.478236323355311e-08, "loss": 1.6977, "step": 6129 }, { "epoch": 0.989028718941594, "grad_norm": 3.7526204586029053, "learning_rate": 4.368361145214994e-08, "loss": 2.0068, "step": 6130 }, { "epoch": 0.9891900613101, "grad_norm": 4.725560665130615, "learning_rate": 4.25985008589036e-08, "loss": 1.7959, "step": 6131 }, { "epoch": 0.989351403678606, "grad_norm": 4.374151706695557, "learning_rate": 4.152703175011041e-08, "loss": 1.8819, "step": 6132 }, { "epoch": 0.9895127460471119, "grad_norm": 3.925537347793579, "learning_rate": 4.046920441834745e-08, "loss": 1.9331, "step": 6133 }, { "epoch": 0.989674088415618, "grad_norm": 4.154895305633545, "learning_rate": 3.942501915247254e-08, "loss": 1.7509, "step": 6134 }, { "epoch": 0.989835430784124, "grad_norm": 4.7291998863220215, "learning_rate": 3.839447623760761e-08, "loss": 1.7866, "step": 6135 }, { "epoch": 0.9899967731526299, "grad_norm": 3.555507183074951, "learning_rate": 3.737757595515534e-08, "loss": 1.7108, "step": 6136 }, { "epoch": 0.9901581155211359, "grad_norm": 4.529835224151611, "learning_rate": 3.637431858279916e-08, "loss": 1.6214, "step": 6137 }, { "epoch": 0.9903194578896418, "grad_norm": 4.146393775939941, "learning_rate": 3.538470439448105e-08, "loss": 1.73, "step": 6138 }, { "epoch": 0.9904808002581478, "grad_norm": 5.236659526824951, "learning_rate": 3.4408733660440395e-08, "loss": 1.8729, "step": 6139 }, { "epoch": 0.9906421426266537, "grad_norm": 4.751691818237305, "learning_rate": 3.344640664716958e-08, "loss": 1.6228, "step": 6140 }, { "epoch": 0.9908034849951597, "grad_norm": 8.98833179473877, "learning_rate": 3.249772361744175e-08, "loss": 1.8316, "step": 6141 }, { "epoch": 0.9909648273636656, "grad_norm": 5.137564182281494, "learning_rate": 3.156268483031077e-08, "loss": 1.8294, "step": 6142 }, { "epoch": 0.9911261697321717, "grad_norm": 4.2446370124816895, "learning_rate": 3.064129054110021e-08, "loss": 1.7952, "step": 6143 }, { "epoch": 0.9912875121006777, "grad_norm": 4.958536148071289, "learning_rate": 2.9733541001408794e-08, "loss": 1.7894, "step": 6144 }, { "epoch": 0.9914488544691836, "grad_norm": 5.8110671043396, "learning_rate": 2.8839436459104918e-08, "loss": 2.0013, "step": 6145 }, { "epoch": 0.9916101968376896, "grad_norm": 3.97647762298584, "learning_rate": 2.795897715833773e-08, "loss": 2.0651, "step": 6146 }, { "epoch": 0.9917715392061955, "grad_norm": 3.5526373386383057, "learning_rate": 2.709216333952602e-08, "loss": 1.7351, "step": 6147 }, { "epoch": 0.9919328815747015, "grad_norm": 4.236942768096924, "learning_rate": 2.6238995239369346e-08, "loss": 1.6112, "step": 6148 }, { "epoch": 0.9920942239432075, "grad_norm": 3.8910460472106934, "learning_rate": 2.5399473090825798e-08, "loss": 1.5615, "step": 6149 }, { "epoch": 0.9922555663117134, "grad_norm": 5.299407482147217, "learning_rate": 2.4573597123145332e-08, "loss": 1.8359, "step": 6150 }, { "epoch": 0.9924169086802195, "grad_norm": 3.532557249069214, "learning_rate": 2.3761367561841998e-08, "loss": 1.9855, "step": 6151 }, { "epoch": 0.9925782510487254, "grad_norm": 4.3103437423706055, "learning_rate": 2.2962784628705046e-08, "loss": 1.7814, "step": 6152 }, { "epoch": 0.9927395934172314, "grad_norm": 4.5299973487854, "learning_rate": 2.2177848541793388e-08, "loss": 1.7005, "step": 6153 }, { "epoch": 0.9929009357857373, "grad_norm": 3.790895462036133, "learning_rate": 2.1406559515452228e-08, "loss": 1.4501, "step": 6154 }, { "epoch": 0.9930622781542433, "grad_norm": 4.620772361755371, "learning_rate": 2.0648917760279775e-08, "loss": 1.7938, "step": 6155 }, { "epoch": 0.9932236205227493, "grad_norm": 4.014719009399414, "learning_rate": 1.9904923483171635e-08, "loss": 2.0391, "step": 6156 }, { "epoch": 0.9933849628912552, "grad_norm": 4.058664321899414, "learning_rate": 1.9174576887276417e-08, "loss": 1.6082, "step": 6157 }, { "epoch": 0.9935463052597612, "grad_norm": 4.010701656341553, "learning_rate": 1.845787817202349e-08, "loss": 1.6187, "step": 6158 }, { "epoch": 0.9937076476282671, "grad_norm": 4.73102331161499, "learning_rate": 1.7754827533122964e-08, "loss": 1.9062, "step": 6159 }, { "epoch": 0.9938689899967732, "grad_norm": 4.531217575073242, "learning_rate": 1.7065425162549054e-08, "loss": 1.8403, "step": 6160 }, { "epoch": 0.9940303323652792, "grad_norm": 6.234742641448975, "learning_rate": 1.6389671248545623e-08, "loss": 1.7349, "step": 6161 }, { "epoch": 0.9941916747337851, "grad_norm": 3.9504058361053467, "learning_rate": 1.5727565975642844e-08, "loss": 1.9756, "step": 6162 }, { "epoch": 0.9943530171022911, "grad_norm": 5.519317150115967, "learning_rate": 1.5079109524634983e-08, "loss": 1.6333, "step": 6163 }, { "epoch": 0.994514359470797, "grad_norm": 3.721313238143921, "learning_rate": 1.4444302072591508e-08, "loss": 1.7705, "step": 6164 }, { "epoch": 0.994675701839303, "grad_norm": 4.5881757736206055, "learning_rate": 1.3823143792851545e-08, "loss": 1.7942, "step": 6165 }, { "epoch": 0.9948370442078089, "grad_norm": 4.943873405456543, "learning_rate": 1.3215634855029413e-08, "loss": 1.8513, "step": 6166 }, { "epoch": 0.9949983865763149, "grad_norm": 4.627598762512207, "learning_rate": 1.2621775425020189e-08, "loss": 1.6969, "step": 6167 }, { "epoch": 0.995159728944821, "grad_norm": 4.433756351470947, "learning_rate": 1.2041565664977494e-08, "loss": 1.9785, "step": 6168 }, { "epoch": 0.9953210713133269, "grad_norm": 3.788708448410034, "learning_rate": 1.1475005733335708e-08, "loss": 1.9499, "step": 6169 }, { "epoch": 0.9954824136818329, "grad_norm": 4.006746768951416, "learning_rate": 1.0922095784798858e-08, "loss": 1.9891, "step": 6170 }, { "epoch": 0.9956437560503388, "grad_norm": 4.220999240875244, "learning_rate": 1.0382835970357275e-08, "loss": 1.7248, "step": 6171 }, { "epoch": 0.9958050984188448, "grad_norm": 3.901395320892334, "learning_rate": 9.857226437248735e-09, "loss": 1.5894, "step": 6172 }, { "epoch": 0.9959664407873507, "grad_norm": 4.1000285148620605, "learning_rate": 9.34526732900287e-09, "loss": 1.8607, "step": 6173 }, { "epoch": 0.9961277831558567, "grad_norm": 3.8186614513397217, "learning_rate": 8.846958785418968e-09, "loss": 1.7024, "step": 6174 }, { "epoch": 0.9962891255243627, "grad_norm": 4.361697196960449, "learning_rate": 8.362300942560409e-09, "loss": 2.022, "step": 6175 }, { "epoch": 0.9964504678928686, "grad_norm": 4.559261322021484, "learning_rate": 7.891293932776878e-09, "loss": 1.8452, "step": 6176 }, { "epoch": 0.9966118102613747, "grad_norm": 3.8690664768218994, "learning_rate": 7.433937884676611e-09, "loss": 1.9764, "step": 6177 }, { "epoch": 0.9967731526298806, "grad_norm": 5.568852424621582, "learning_rate": 6.990232923148599e-09, "loss": 1.8318, "step": 6178 }, { "epoch": 0.9969344949983866, "grad_norm": 4.514222621917725, "learning_rate": 6.5601791693514766e-09, "loss": 1.6589, "step": 6179 }, { "epoch": 0.9970958373668926, "grad_norm": 5.722458839416504, "learning_rate": 6.143776740713536e-09, "loss": 1.8228, "step": 6180 }, { "epoch": 0.9972571797353985, "grad_norm": 4.524956703186035, "learning_rate": 5.741025750943818e-09, "loss": 1.8108, "step": 6181 }, { "epoch": 0.9974185221039045, "grad_norm": 5.531214237213135, "learning_rate": 5.351926310015465e-09, "loss": 1.9062, "step": 6182 }, { "epoch": 0.9975798644724104, "grad_norm": 5.001838684082031, "learning_rate": 4.97647852417682e-09, "loss": 1.7831, "step": 6183 }, { "epoch": 0.9977412068409164, "grad_norm": 4.544003963470459, "learning_rate": 4.614682495951428e-09, "loss": 1.811, "step": 6184 }, { "epoch": 0.9979025492094223, "grad_norm": 4.039700031280518, "learning_rate": 4.266538324132485e-09, "loss": 1.7955, "step": 6185 }, { "epoch": 0.9980638915779284, "grad_norm": 5.618652820587158, "learning_rate": 3.932046103777287e-09, "loss": 1.7273, "step": 6186 }, { "epoch": 0.9982252339464344, "grad_norm": 4.512746334075928, "learning_rate": 3.6112059262294327e-09, "loss": 1.7958, "step": 6187 }, { "epoch": 0.9983865763149403, "grad_norm": 4.33228874206543, "learning_rate": 3.3040178790966216e-09, "loss": 1.6066, "step": 6188 }, { "epoch": 0.9985479186834463, "grad_norm": 4.162201404571533, "learning_rate": 3.0104820462673044e-09, "loss": 1.8014, "step": 6189 }, { "epoch": 0.9987092610519522, "grad_norm": 4.221867084503174, "learning_rate": 2.730598507882931e-09, "loss": 1.8571, "step": 6190 }, { "epoch": 0.9988706034204582, "grad_norm": 4.4337053298950195, "learning_rate": 2.464367340376805e-09, "loss": 1.8666, "step": 6191 }, { "epoch": 0.9990319457889641, "grad_norm": 4.931772232055664, "learning_rate": 2.21178861644078e-09, "loss": 1.5379, "step": 6192 }, { "epoch": 0.9991932881574701, "grad_norm": 3.5758066177368164, "learning_rate": 1.9728624050530107e-09, "loss": 1.7382, "step": 6193 }, { "epoch": 0.9993546305259762, "grad_norm": 4.1605000495910645, "learning_rate": 1.7475887714502038e-09, "loss": 1.7858, "step": 6194 }, { "epoch": 0.9995159728944821, "grad_norm": 5.4196696281433105, "learning_rate": 1.535967777149816e-09, "loss": 1.801, "step": 6195 }, { "epoch": 0.9996773152629881, "grad_norm": 4.785623550415039, "learning_rate": 1.3379994799278538e-09, "loss": 1.6988, "step": 6196 }, { "epoch": 0.999838657631494, "grad_norm": 4.673151016235352, "learning_rate": 1.1536839338521787e-09, "loss": 1.6805, "step": 6197 }, { "epoch": 0.999838657631494, "step": 6197, "total_flos": 7.714147240515731e+18, "train_loss": 0.05741394475061547, "train_runtime": 1948.2586, "train_samples_per_second": 101.798, "train_steps_per_second": 3.181 } ], "logging_steps": 1.0, "max_steps": 6198, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.714147240515731e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }