{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 0, "global_step": 1479, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002028397565922921, "grad_norm": 0.56640625, "learning_rate": 9.993238674780258e-06, "loss": 2.0083, "step": 1 }, { "epoch": 0.004056795131845842, "grad_norm": 0.515625, "learning_rate": 9.986477349560515e-06, "loss": 1.8898, "step": 2 }, { "epoch": 0.006085192697768763, "grad_norm": 0.50390625, "learning_rate": 9.979716024340772e-06, "loss": 1.9586, "step": 3 }, { "epoch": 0.008113590263691683, "grad_norm": 0.46484375, "learning_rate": 9.972954699121028e-06, "loss": 1.8423, "step": 4 }, { "epoch": 0.010141987829614604, "grad_norm": 0.4609375, "learning_rate": 9.966193373901285e-06, "loss": 1.9374, "step": 5 }, { "epoch": 0.012170385395537525, "grad_norm": 0.42578125, "learning_rate": 9.959432048681542e-06, "loss": 1.856, "step": 6 }, { "epoch": 0.014198782961460446, "grad_norm": 0.400390625, "learning_rate": 9.9526707234618e-06, "loss": 1.827, "step": 7 }, { "epoch": 0.016227180527383367, "grad_norm": 0.40625, "learning_rate": 9.945909398242056e-06, "loss": 1.9399, "step": 8 }, { "epoch": 0.018255578093306288, "grad_norm": 0.396484375, "learning_rate": 9.939148073022313e-06, "loss": 1.8973, "step": 9 }, { "epoch": 0.02028397565922921, "grad_norm": 0.349609375, "learning_rate": 9.93238674780257e-06, "loss": 1.7955, "step": 10 }, { "epoch": 0.02231237322515213, "grad_norm": 0.34765625, "learning_rate": 9.925625422582827e-06, "loss": 1.7947, "step": 11 }, { "epoch": 0.02434077079107505, "grad_norm": 0.33203125, "learning_rate": 9.918864097363084e-06, "loss": 1.7894, "step": 12 }, { "epoch": 0.02636916835699797, "grad_norm": 0.33984375, "learning_rate": 9.91210277214334e-06, "loss": 1.7834, "step": 13 }, { "epoch": 0.028397565922920892, "grad_norm": 0.3125, "learning_rate": 9.905341446923598e-06, "loss": 1.6634, "step": 14 }, { "epoch": 0.030425963488843813, "grad_norm": 0.326171875, "learning_rate": 9.898580121703854e-06, "loss": 1.7655, "step": 15 }, { "epoch": 0.032454361054766734, "grad_norm": 0.318359375, "learning_rate": 9.891818796484111e-06, "loss": 1.6973, "step": 16 }, { "epoch": 0.034482758620689655, "grad_norm": 0.328125, "learning_rate": 9.885057471264368e-06, "loss": 1.7866, "step": 17 }, { "epoch": 0.036511156186612576, "grad_norm": 0.283203125, "learning_rate": 9.878296146044625e-06, "loss": 1.6943, "step": 18 }, { "epoch": 0.038539553752535496, "grad_norm": 0.3125, "learning_rate": 9.871534820824882e-06, "loss": 1.7, "step": 19 }, { "epoch": 0.04056795131845842, "grad_norm": 0.37109375, "learning_rate": 9.864773495605139e-06, "loss": 1.6642, "step": 20 }, { "epoch": 0.04259634888438134, "grad_norm": 0.28125, "learning_rate": 9.858012170385396e-06, "loss": 1.6852, "step": 21 }, { "epoch": 0.04462474645030426, "grad_norm": 0.291015625, "learning_rate": 9.851250845165653e-06, "loss": 1.6855, "step": 22 }, { "epoch": 0.04665314401622718, "grad_norm": 0.255859375, "learning_rate": 9.84448951994591e-06, "loss": 1.6641, "step": 23 }, { "epoch": 0.0486815415821501, "grad_norm": 0.259765625, "learning_rate": 9.837728194726167e-06, "loss": 1.6916, "step": 24 }, { "epoch": 0.05070993914807302, "grad_norm": 0.275390625, "learning_rate": 9.830966869506424e-06, "loss": 1.7211, "step": 25 }, { "epoch": 0.05273833671399594, "grad_norm": 0.240234375, "learning_rate": 9.82420554428668e-06, "loss": 1.5421, "step": 26 }, { "epoch": 0.05476673427991886, "grad_norm": 0.236328125, "learning_rate": 9.817444219066939e-06, "loss": 1.5551, "step": 27 }, { "epoch": 0.056795131845841784, "grad_norm": 0.2265625, "learning_rate": 9.810682893847194e-06, "loss": 1.5284, "step": 28 }, { "epoch": 0.058823529411764705, "grad_norm": 0.263671875, "learning_rate": 9.803921568627451e-06, "loss": 1.5221, "step": 29 }, { "epoch": 0.060851926977687626, "grad_norm": 0.240234375, "learning_rate": 9.797160243407708e-06, "loss": 1.6265, "step": 30 }, { "epoch": 0.06288032454361055, "grad_norm": 0.224609375, "learning_rate": 9.790398918187965e-06, "loss": 1.582, "step": 31 }, { "epoch": 0.06490872210953347, "grad_norm": 0.212890625, "learning_rate": 9.783637592968222e-06, "loss": 1.5373, "step": 32 }, { "epoch": 0.06693711967545639, "grad_norm": 0.2109375, "learning_rate": 9.776876267748479e-06, "loss": 1.4842, "step": 33 }, { "epoch": 0.06896551724137931, "grad_norm": 0.203125, "learning_rate": 9.770114942528738e-06, "loss": 1.4742, "step": 34 }, { "epoch": 0.07099391480730223, "grad_norm": 0.2158203125, "learning_rate": 9.763353617308994e-06, "loss": 1.5238, "step": 35 }, { "epoch": 0.07302231237322515, "grad_norm": 0.21484375, "learning_rate": 9.75659229208925e-06, "loss": 1.4861, "step": 36 }, { "epoch": 0.07505070993914807, "grad_norm": 0.369140625, "learning_rate": 9.749830966869507e-06, "loss": 1.475, "step": 37 }, { "epoch": 0.07707910750507099, "grad_norm": 0.2021484375, "learning_rate": 9.743069641649763e-06, "loss": 1.492, "step": 38 }, { "epoch": 0.07910750507099391, "grad_norm": 0.2890625, "learning_rate": 9.73630831643002e-06, "loss": 1.4861, "step": 39 }, { "epoch": 0.08113590263691683, "grad_norm": 0.19921875, "learning_rate": 9.729546991210277e-06, "loss": 1.4854, "step": 40 }, { "epoch": 0.08316430020283976, "grad_norm": 0.201171875, "learning_rate": 9.722785665990536e-06, "loss": 1.4827, "step": 41 }, { "epoch": 0.08519269776876268, "grad_norm": 0.1982421875, "learning_rate": 9.716024340770793e-06, "loss": 1.5104, "step": 42 }, { "epoch": 0.0872210953346856, "grad_norm": 0.1923828125, "learning_rate": 9.70926301555105e-06, "loss": 1.4619, "step": 43 }, { "epoch": 0.08924949290060852, "grad_norm": 0.1826171875, "learning_rate": 9.702501690331305e-06, "loss": 1.4507, "step": 44 }, { "epoch": 0.09127789046653144, "grad_norm": 0.1875, "learning_rate": 9.695740365111562e-06, "loss": 1.4821, "step": 45 }, { "epoch": 0.09330628803245436, "grad_norm": 0.1884765625, "learning_rate": 9.688979039891819e-06, "loss": 1.4255, "step": 46 }, { "epoch": 0.09533468559837728, "grad_norm": 0.1806640625, "learning_rate": 9.682217714672076e-06, "loss": 1.3824, "step": 47 }, { "epoch": 0.0973630831643002, "grad_norm": 0.19140625, "learning_rate": 9.675456389452334e-06, "loss": 1.3772, "step": 48 }, { "epoch": 0.09939148073022312, "grad_norm": 0.1953125, "learning_rate": 9.668695064232591e-06, "loss": 1.4889, "step": 49 }, { "epoch": 0.10141987829614604, "grad_norm": 0.1796875, "learning_rate": 9.661933739012848e-06, "loss": 1.4423, "step": 50 }, { "epoch": 0.10344827586206896, "grad_norm": 0.1728515625, "learning_rate": 9.655172413793105e-06, "loss": 1.415, "step": 51 }, { "epoch": 0.10547667342799188, "grad_norm": 0.169921875, "learning_rate": 9.64841108857336e-06, "loss": 1.3956, "step": 52 }, { "epoch": 0.1075050709939148, "grad_norm": 0.197265625, "learning_rate": 9.641649763353617e-06, "loss": 1.469, "step": 53 }, { "epoch": 0.10953346855983773, "grad_norm": 0.171875, "learning_rate": 9.634888438133874e-06, "loss": 1.4105, "step": 54 }, { "epoch": 0.11156186612576065, "grad_norm": 0.1767578125, "learning_rate": 9.628127112914133e-06, "loss": 1.4032, "step": 55 }, { "epoch": 0.11359026369168357, "grad_norm": 0.1650390625, "learning_rate": 9.62136578769439e-06, "loss": 1.2954, "step": 56 }, { "epoch": 0.11561866125760649, "grad_norm": 0.162109375, "learning_rate": 9.614604462474646e-06, "loss": 1.3745, "step": 57 }, { "epoch": 0.11764705882352941, "grad_norm": 1.578125, "learning_rate": 9.607843137254903e-06, "loss": 1.3565, "step": 58 }, { "epoch": 0.11967545638945233, "grad_norm": 0.1640625, "learning_rate": 9.60108181203516e-06, "loss": 1.3677, "step": 59 }, { "epoch": 0.12170385395537525, "grad_norm": 0.17578125, "learning_rate": 9.594320486815416e-06, "loss": 1.3665, "step": 60 }, { "epoch": 0.12373225152129817, "grad_norm": 0.197265625, "learning_rate": 9.587559161595672e-06, "loss": 1.3636, "step": 61 }, { "epoch": 0.1257606490872211, "grad_norm": 0.1640625, "learning_rate": 9.580797836375931e-06, "loss": 1.3645, "step": 62 }, { "epoch": 0.12778904665314403, "grad_norm": 0.1630859375, "learning_rate": 9.574036511156188e-06, "loss": 1.3666, "step": 63 }, { "epoch": 0.12981744421906694, "grad_norm": 0.1533203125, "learning_rate": 9.567275185936445e-06, "loss": 1.3332, "step": 64 }, { "epoch": 0.13184584178498987, "grad_norm": 0.1611328125, "learning_rate": 9.560513860716702e-06, "loss": 1.373, "step": 65 }, { "epoch": 0.13387423935091278, "grad_norm": 0.1748046875, "learning_rate": 9.553752535496959e-06, "loss": 1.3931, "step": 66 }, { "epoch": 0.1359026369168357, "grad_norm": 0.1787109375, "learning_rate": 9.546991210277216e-06, "loss": 1.2847, "step": 67 }, { "epoch": 0.13793103448275862, "grad_norm": 0.16796875, "learning_rate": 9.54022988505747e-06, "loss": 1.3494, "step": 68 }, { "epoch": 0.13995943204868155, "grad_norm": 0.1591796875, "learning_rate": 9.53346855983773e-06, "loss": 1.3461, "step": 69 }, { "epoch": 0.14198782961460446, "grad_norm": 0.2109375, "learning_rate": 9.526707234617986e-06, "loss": 1.3208, "step": 70 }, { "epoch": 0.1440162271805274, "grad_norm": 0.259765625, "learning_rate": 9.519945909398243e-06, "loss": 1.3241, "step": 71 }, { "epoch": 0.1460446247464503, "grad_norm": 0.1591796875, "learning_rate": 9.5131845841785e-06, "loss": 1.3235, "step": 72 }, { "epoch": 0.14807302231237324, "grad_norm": 0.1923828125, "learning_rate": 9.506423258958757e-06, "loss": 1.3221, "step": 73 }, { "epoch": 0.15010141987829614, "grad_norm": 0.197265625, "learning_rate": 9.499661933739014e-06, "loss": 1.3067, "step": 74 }, { "epoch": 0.15212981744421908, "grad_norm": 0.1669921875, "learning_rate": 9.492900608519271e-06, "loss": 1.3077, "step": 75 }, { "epoch": 0.15415821501014199, "grad_norm": 0.220703125, "learning_rate": 9.486139283299526e-06, "loss": 1.3585, "step": 76 }, { "epoch": 0.15618661257606492, "grad_norm": 0.1650390625, "learning_rate": 9.479377958079785e-06, "loss": 1.3229, "step": 77 }, { "epoch": 0.15821501014198783, "grad_norm": 0.173828125, "learning_rate": 9.472616632860042e-06, "loss": 1.3157, "step": 78 }, { "epoch": 0.16024340770791076, "grad_norm": 0.1904296875, "learning_rate": 9.465855307640299e-06, "loss": 1.2941, "step": 79 }, { "epoch": 0.16227180527383367, "grad_norm": 0.1591796875, "learning_rate": 9.459093982420555e-06, "loss": 1.3267, "step": 80 }, { "epoch": 0.1643002028397566, "grad_norm": 0.158203125, "learning_rate": 9.452332657200812e-06, "loss": 1.3108, "step": 81 }, { "epoch": 0.1663286004056795, "grad_norm": 0.1630859375, "learning_rate": 9.44557133198107e-06, "loss": 1.2806, "step": 82 }, { "epoch": 0.16835699797160245, "grad_norm": 0.15625, "learning_rate": 9.438810006761326e-06, "loss": 1.2928, "step": 83 }, { "epoch": 0.17038539553752535, "grad_norm": 0.169921875, "learning_rate": 9.432048681541583e-06, "loss": 1.3039, "step": 84 }, { "epoch": 0.1724137931034483, "grad_norm": 0.1826171875, "learning_rate": 9.42528735632184e-06, "loss": 1.3077, "step": 85 }, { "epoch": 0.1744421906693712, "grad_norm": 0.1513671875, "learning_rate": 9.418526031102097e-06, "loss": 1.3454, "step": 86 }, { "epoch": 0.17647058823529413, "grad_norm": 0.1513671875, "learning_rate": 9.411764705882354e-06, "loss": 1.3095, "step": 87 }, { "epoch": 0.17849898580121704, "grad_norm": 0.166015625, "learning_rate": 9.40500338066261e-06, "loss": 1.3264, "step": 88 }, { "epoch": 0.18052738336713997, "grad_norm": 0.1591796875, "learning_rate": 9.398242055442868e-06, "loss": 1.3138, "step": 89 }, { "epoch": 0.18255578093306288, "grad_norm": 0.1572265625, "learning_rate": 9.391480730223125e-06, "loss": 1.2476, "step": 90 }, { "epoch": 0.1845841784989858, "grad_norm": 0.1669921875, "learning_rate": 9.384719405003381e-06, "loss": 1.2699, "step": 91 }, { "epoch": 0.18661257606490872, "grad_norm": 0.1640625, "learning_rate": 9.377958079783638e-06, "loss": 1.3391, "step": 92 }, { "epoch": 0.18864097363083165, "grad_norm": 0.1591796875, "learning_rate": 9.371196754563895e-06, "loss": 1.3236, "step": 93 }, { "epoch": 0.19066937119675456, "grad_norm": 0.39453125, "learning_rate": 9.364435429344152e-06, "loss": 1.3209, "step": 94 }, { "epoch": 0.1926977687626775, "grad_norm": 0.326171875, "learning_rate": 9.357674104124409e-06, "loss": 1.3001, "step": 95 }, { "epoch": 0.1947261663286004, "grad_norm": 0.16796875, "learning_rate": 9.350912778904666e-06, "loss": 1.2758, "step": 96 }, { "epoch": 0.19675456389452334, "grad_norm": 0.166015625, "learning_rate": 9.344151453684923e-06, "loss": 1.2668, "step": 97 }, { "epoch": 0.19878296146044624, "grad_norm": 0.1748046875, "learning_rate": 9.33739012846518e-06, "loss": 1.2948, "step": 98 }, { "epoch": 0.20081135902636918, "grad_norm": 0.2099609375, "learning_rate": 9.330628803245437e-06, "loss": 1.3023, "step": 99 }, { "epoch": 0.2028397565922921, "grad_norm": 0.1650390625, "learning_rate": 9.323867478025694e-06, "loss": 1.2459, "step": 100 }, { "epoch": 0.20486815415821502, "grad_norm": 0.1669921875, "learning_rate": 9.31710615280595e-06, "loss": 1.2824, "step": 101 }, { "epoch": 0.20689655172413793, "grad_norm": 0.1845703125, "learning_rate": 9.310344827586207e-06, "loss": 1.2607, "step": 102 }, { "epoch": 0.20892494929006086, "grad_norm": 0.1708984375, "learning_rate": 9.303583502366464e-06, "loss": 1.2807, "step": 103 }, { "epoch": 0.21095334685598377, "grad_norm": 0.283203125, "learning_rate": 9.296822177146721e-06, "loss": 1.2656, "step": 104 }, { "epoch": 0.2129817444219067, "grad_norm": 0.236328125, "learning_rate": 9.290060851926978e-06, "loss": 1.2689, "step": 105 }, { "epoch": 0.2150101419878296, "grad_norm": 0.197265625, "learning_rate": 9.283299526707235e-06, "loss": 1.2439, "step": 106 }, { "epoch": 0.21703853955375255, "grad_norm": 0.24609375, "learning_rate": 9.276538201487492e-06, "loss": 1.2258, "step": 107 }, { "epoch": 0.21906693711967545, "grad_norm": 0.16796875, "learning_rate": 9.269776876267749e-06, "loss": 1.2496, "step": 108 }, { "epoch": 0.2210953346855984, "grad_norm": 0.162109375, "learning_rate": 9.263015551048006e-06, "loss": 1.2327, "step": 109 }, { "epoch": 0.2231237322515213, "grad_norm": 0.1748046875, "learning_rate": 9.256254225828263e-06, "loss": 1.2223, "step": 110 }, { "epoch": 0.22515212981744423, "grad_norm": 0.1943359375, "learning_rate": 9.24949290060852e-06, "loss": 1.2318, "step": 111 }, { "epoch": 0.22718052738336714, "grad_norm": 0.1796875, "learning_rate": 9.242731575388777e-06, "loss": 1.2651, "step": 112 }, { "epoch": 0.22920892494929007, "grad_norm": 0.2470703125, "learning_rate": 9.235970250169034e-06, "loss": 1.1937, "step": 113 }, { "epoch": 0.23123732251521298, "grad_norm": 0.1845703125, "learning_rate": 9.22920892494929e-06, "loss": 1.239, "step": 114 }, { "epoch": 0.2332657200811359, "grad_norm": 0.1796875, "learning_rate": 9.222447599729547e-06, "loss": 1.2483, "step": 115 }, { "epoch": 0.23529411764705882, "grad_norm": 0.1787109375, "learning_rate": 9.215686274509804e-06, "loss": 1.2162, "step": 116 }, { "epoch": 0.23732251521298176, "grad_norm": 0.2080078125, "learning_rate": 9.208924949290061e-06, "loss": 1.2844, "step": 117 }, { "epoch": 0.23935091277890466, "grad_norm": 0.1728515625, "learning_rate": 9.202163624070318e-06, "loss": 1.2785, "step": 118 }, { "epoch": 0.2413793103448276, "grad_norm": 0.1728515625, "learning_rate": 9.195402298850575e-06, "loss": 1.284, "step": 119 }, { "epoch": 0.2434077079107505, "grad_norm": 0.15625, "learning_rate": 9.188640973630832e-06, "loss": 1.2332, "step": 120 }, { "epoch": 0.24543610547667344, "grad_norm": 0.1611328125, "learning_rate": 9.181879648411089e-06, "loss": 1.2216, "step": 121 }, { "epoch": 0.24746450304259635, "grad_norm": 0.1572265625, "learning_rate": 9.175118323191346e-06, "loss": 1.2629, "step": 122 }, { "epoch": 0.24949290060851928, "grad_norm": 0.162109375, "learning_rate": 9.168356997971604e-06, "loss": 1.2498, "step": 123 }, { "epoch": 0.2515212981744422, "grad_norm": 0.2041015625, "learning_rate": 9.16159567275186e-06, "loss": 1.249, "step": 124 }, { "epoch": 0.2535496957403651, "grad_norm": 0.23046875, "learning_rate": 9.154834347532116e-06, "loss": 1.2158, "step": 125 }, { "epoch": 0.25557809330628806, "grad_norm": 0.1708984375, "learning_rate": 9.148073022312373e-06, "loss": 1.227, "step": 126 }, { "epoch": 0.25760649087221094, "grad_norm": 0.240234375, "learning_rate": 9.14131169709263e-06, "loss": 1.2769, "step": 127 }, { "epoch": 0.25963488843813387, "grad_norm": 0.201171875, "learning_rate": 9.134550371872887e-06, "loss": 1.2222, "step": 128 }, { "epoch": 0.2616632860040568, "grad_norm": 0.234375, "learning_rate": 9.127789046653144e-06, "loss": 1.2199, "step": 129 }, { "epoch": 0.26369168356997974, "grad_norm": 0.1748046875, "learning_rate": 9.121027721433403e-06, "loss": 1.2321, "step": 130 }, { "epoch": 0.2657200811359026, "grad_norm": 0.23046875, "learning_rate": 9.11426639621366e-06, "loss": 1.2217, "step": 131 }, { "epoch": 0.26774847870182555, "grad_norm": 0.181640625, "learning_rate": 9.107505070993915e-06, "loss": 1.2269, "step": 132 }, { "epoch": 0.2697768762677485, "grad_norm": 0.1708984375, "learning_rate": 9.100743745774172e-06, "loss": 1.2449, "step": 133 }, { "epoch": 0.2718052738336714, "grad_norm": 0.1728515625, "learning_rate": 9.093982420554429e-06, "loss": 1.2616, "step": 134 }, { "epoch": 0.2738336713995943, "grad_norm": 0.189453125, "learning_rate": 9.087221095334686e-06, "loss": 1.2719, "step": 135 }, { "epoch": 0.27586206896551724, "grad_norm": 0.1953125, "learning_rate": 9.080459770114942e-06, "loss": 1.217, "step": 136 }, { "epoch": 0.2778904665314402, "grad_norm": 0.1708984375, "learning_rate": 9.073698444895201e-06, "loss": 1.21, "step": 137 }, { "epoch": 0.2799188640973631, "grad_norm": 0.1806640625, "learning_rate": 9.066937119675458e-06, "loss": 1.1984, "step": 138 }, { "epoch": 0.281947261663286, "grad_norm": 0.189453125, "learning_rate": 9.060175794455715e-06, "loss": 1.2262, "step": 139 }, { "epoch": 0.2839756592292089, "grad_norm": 0.1708984375, "learning_rate": 9.05341446923597e-06, "loss": 1.2322, "step": 140 }, { "epoch": 0.28600405679513186, "grad_norm": 0.2451171875, "learning_rate": 9.046653144016227e-06, "loss": 1.2377, "step": 141 }, { "epoch": 0.2880324543610548, "grad_norm": 0.169921875, "learning_rate": 9.039891818796484e-06, "loss": 1.2473, "step": 142 }, { "epoch": 0.29006085192697767, "grad_norm": 0.177734375, "learning_rate": 9.033130493576741e-06, "loss": 1.2727, "step": 143 }, { "epoch": 0.2920892494929006, "grad_norm": 0.33203125, "learning_rate": 9.026369168357e-06, "loss": 1.1902, "step": 144 }, { "epoch": 0.29411764705882354, "grad_norm": 0.185546875, "learning_rate": 9.019607843137256e-06, "loss": 1.2436, "step": 145 }, { "epoch": 0.2961460446247465, "grad_norm": 0.185546875, "learning_rate": 9.012846517917513e-06, "loss": 1.1909, "step": 146 }, { "epoch": 0.29817444219066935, "grad_norm": 0.17578125, "learning_rate": 9.00608519269777e-06, "loss": 1.2215, "step": 147 }, { "epoch": 0.3002028397565923, "grad_norm": 0.2001953125, "learning_rate": 8.999323867478025e-06, "loss": 1.2217, "step": 148 }, { "epoch": 0.3022312373225152, "grad_norm": 0.2138671875, "learning_rate": 8.992562542258282e-06, "loss": 1.2267, "step": 149 }, { "epoch": 0.30425963488843816, "grad_norm": 0.16796875, "learning_rate": 8.98580121703854e-06, "loss": 1.2343, "step": 150 }, { "epoch": 0.30628803245436104, "grad_norm": 0.197265625, "learning_rate": 8.979039891818798e-06, "loss": 1.2193, "step": 151 }, { "epoch": 0.30831643002028397, "grad_norm": 0.1806640625, "learning_rate": 8.972278566599055e-06, "loss": 1.2008, "step": 152 }, { "epoch": 0.3103448275862069, "grad_norm": 0.171875, "learning_rate": 8.965517241379312e-06, "loss": 1.217, "step": 153 }, { "epoch": 0.31237322515212984, "grad_norm": 0.2158203125, "learning_rate": 8.958755916159569e-06, "loss": 1.2081, "step": 154 }, { "epoch": 0.3144016227180527, "grad_norm": 0.201171875, "learning_rate": 8.951994590939825e-06, "loss": 1.1466, "step": 155 }, { "epoch": 0.31643002028397565, "grad_norm": 0.173828125, "learning_rate": 8.94523326572008e-06, "loss": 1.1587, "step": 156 }, { "epoch": 0.3184584178498986, "grad_norm": 0.1728515625, "learning_rate": 8.938471940500338e-06, "loss": 1.1839, "step": 157 }, { "epoch": 0.3204868154158215, "grad_norm": 0.177734375, "learning_rate": 8.931710615280596e-06, "loss": 1.208, "step": 158 }, { "epoch": 0.3225152129817444, "grad_norm": 0.1728515625, "learning_rate": 8.924949290060853e-06, "loss": 1.1899, "step": 159 }, { "epoch": 0.32454361054766734, "grad_norm": 0.181640625, "learning_rate": 8.91818796484111e-06, "loss": 1.2019, "step": 160 }, { "epoch": 0.3265720081135903, "grad_norm": 0.1875, "learning_rate": 8.911426639621367e-06, "loss": 1.1336, "step": 161 }, { "epoch": 0.3286004056795132, "grad_norm": 0.177734375, "learning_rate": 8.904665314401624e-06, "loss": 1.1841, "step": 162 }, { "epoch": 0.3306288032454361, "grad_norm": 0.1953125, "learning_rate": 8.89790398918188e-06, "loss": 1.2169, "step": 163 }, { "epoch": 0.332657200811359, "grad_norm": 0.2041015625, "learning_rate": 8.891142663962136e-06, "loss": 1.1857, "step": 164 }, { "epoch": 0.33468559837728196, "grad_norm": 0.1865234375, "learning_rate": 8.884381338742395e-06, "loss": 1.1672, "step": 165 }, { "epoch": 0.3367139959432049, "grad_norm": 0.1884765625, "learning_rate": 8.877620013522652e-06, "loss": 1.2451, "step": 166 }, { "epoch": 0.33874239350912777, "grad_norm": 0.1953125, "learning_rate": 8.870858688302908e-06, "loss": 1.2504, "step": 167 }, { "epoch": 0.3407707910750507, "grad_norm": 0.23046875, "learning_rate": 8.864097363083165e-06, "loss": 1.194, "step": 168 }, { "epoch": 0.34279918864097364, "grad_norm": 0.2158203125, "learning_rate": 8.857336037863422e-06, "loss": 1.1883, "step": 169 }, { "epoch": 0.3448275862068966, "grad_norm": 0.1982421875, "learning_rate": 8.85057471264368e-06, "loss": 1.205, "step": 170 }, { "epoch": 0.34685598377281945, "grad_norm": 0.193359375, "learning_rate": 8.843813387423936e-06, "loss": 1.2313, "step": 171 }, { "epoch": 0.3488843813387424, "grad_norm": 0.22265625, "learning_rate": 8.837052062204193e-06, "loss": 1.163, "step": 172 }, { "epoch": 0.3509127789046653, "grad_norm": 0.2109375, "learning_rate": 8.83029073698445e-06, "loss": 1.1458, "step": 173 }, { "epoch": 0.35294117647058826, "grad_norm": 0.185546875, "learning_rate": 8.823529411764707e-06, "loss": 1.1783, "step": 174 }, { "epoch": 0.35496957403651114, "grad_norm": 0.181640625, "learning_rate": 8.816768086544964e-06, "loss": 1.1763, "step": 175 }, { "epoch": 0.35699797160243407, "grad_norm": 0.171875, "learning_rate": 8.81000676132522e-06, "loss": 1.1719, "step": 176 }, { "epoch": 0.359026369168357, "grad_norm": 0.1865234375, "learning_rate": 8.803245436105478e-06, "loss": 1.1703, "step": 177 }, { "epoch": 0.36105476673427994, "grad_norm": 0.1982421875, "learning_rate": 8.796484110885734e-06, "loss": 1.213, "step": 178 }, { "epoch": 0.3630831643002028, "grad_norm": 0.20703125, "learning_rate": 8.789722785665991e-06, "loss": 1.1656, "step": 179 }, { "epoch": 0.36511156186612576, "grad_norm": 0.17578125, "learning_rate": 8.782961460446248e-06, "loss": 1.1381, "step": 180 }, { "epoch": 0.3671399594320487, "grad_norm": 0.1904296875, "learning_rate": 8.776200135226505e-06, "loss": 1.213, "step": 181 }, { "epoch": 0.3691683569979716, "grad_norm": 0.1845703125, "learning_rate": 8.769438810006762e-06, "loss": 1.1966, "step": 182 }, { "epoch": 0.3711967545638945, "grad_norm": 0.259765625, "learning_rate": 8.762677484787019e-06, "loss": 1.2488, "step": 183 }, { "epoch": 0.37322515212981744, "grad_norm": 0.25390625, "learning_rate": 8.755916159567276e-06, "loss": 1.1837, "step": 184 }, { "epoch": 0.3752535496957404, "grad_norm": 0.1865234375, "learning_rate": 8.749154834347533e-06, "loss": 1.1915, "step": 185 }, { "epoch": 0.3772819472616633, "grad_norm": 0.19140625, "learning_rate": 8.74239350912779e-06, "loss": 1.187, "step": 186 }, { "epoch": 0.3793103448275862, "grad_norm": 0.18359375, "learning_rate": 8.735632183908047e-06, "loss": 1.189, "step": 187 }, { "epoch": 0.3813387423935091, "grad_norm": 0.1787109375, "learning_rate": 8.728870858688304e-06, "loss": 1.1898, "step": 188 }, { "epoch": 0.38336713995943206, "grad_norm": 0.1982421875, "learning_rate": 8.72210953346856e-06, "loss": 1.1981, "step": 189 }, { "epoch": 0.385395537525355, "grad_norm": 0.1865234375, "learning_rate": 8.715348208248817e-06, "loss": 1.1717, "step": 190 }, { "epoch": 0.38742393509127787, "grad_norm": 0.1943359375, "learning_rate": 8.708586883029074e-06, "loss": 1.1787, "step": 191 }, { "epoch": 0.3894523326572008, "grad_norm": 0.1943359375, "learning_rate": 8.701825557809331e-06, "loss": 1.1728, "step": 192 }, { "epoch": 0.39148073022312374, "grad_norm": 0.1845703125, "learning_rate": 8.695064232589588e-06, "loss": 1.1653, "step": 193 }, { "epoch": 0.3935091277890467, "grad_norm": 0.2734375, "learning_rate": 8.688302907369845e-06, "loss": 1.117, "step": 194 }, { "epoch": 0.39553752535496955, "grad_norm": 0.1806640625, "learning_rate": 8.681541582150102e-06, "loss": 1.1384, "step": 195 }, { "epoch": 0.3975659229208925, "grad_norm": 0.302734375, "learning_rate": 8.674780256930359e-06, "loss": 1.1527, "step": 196 }, { "epoch": 0.3995943204868154, "grad_norm": 0.1953125, "learning_rate": 8.668018931710616e-06, "loss": 1.1487, "step": 197 }, { "epoch": 0.40162271805273836, "grad_norm": 0.1982421875, "learning_rate": 8.661257606490873e-06, "loss": 1.2106, "step": 198 }, { "epoch": 0.40365111561866124, "grad_norm": 0.197265625, "learning_rate": 8.65449628127113e-06, "loss": 1.1738, "step": 199 }, { "epoch": 0.4056795131845842, "grad_norm": 1.890625, "learning_rate": 8.647734956051387e-06, "loss": 1.1729, "step": 200 }, { "epoch": 0.4077079107505071, "grad_norm": 0.2099609375, "learning_rate": 8.640973630831643e-06, "loss": 1.1877, "step": 201 }, { "epoch": 0.40973630831643004, "grad_norm": 0.2138671875, "learning_rate": 8.6342123056119e-06, "loss": 1.1108, "step": 202 }, { "epoch": 0.4117647058823529, "grad_norm": 0.185546875, "learning_rate": 8.627450980392157e-06, "loss": 1.1975, "step": 203 }, { "epoch": 0.41379310344827586, "grad_norm": 0.2021484375, "learning_rate": 8.620689655172414e-06, "loss": 1.1852, "step": 204 }, { "epoch": 0.4158215010141988, "grad_norm": 0.19140625, "learning_rate": 8.613928329952671e-06, "loss": 1.1958, "step": 205 }, { "epoch": 0.4178498985801217, "grad_norm": 0.265625, "learning_rate": 8.607167004732928e-06, "loss": 1.1645, "step": 206 }, { "epoch": 0.4198782961460446, "grad_norm": 0.1943359375, "learning_rate": 8.600405679513185e-06, "loss": 1.126, "step": 207 }, { "epoch": 0.42190669371196754, "grad_norm": 0.1796875, "learning_rate": 8.593644354293442e-06, "loss": 1.159, "step": 208 }, { "epoch": 0.4239350912778905, "grad_norm": 0.181640625, "learning_rate": 8.586883029073699e-06, "loss": 1.1864, "step": 209 }, { "epoch": 0.4259634888438134, "grad_norm": 0.224609375, "learning_rate": 8.580121703853956e-06, "loss": 1.1173, "step": 210 }, { "epoch": 0.4279918864097363, "grad_norm": 0.1875, "learning_rate": 8.573360378634214e-06, "loss": 1.1146, "step": 211 }, { "epoch": 0.4300202839756592, "grad_norm": 0.2275390625, "learning_rate": 8.56659905341447e-06, "loss": 1.2435, "step": 212 }, { "epoch": 0.43204868154158216, "grad_norm": 0.2021484375, "learning_rate": 8.559837728194726e-06, "loss": 1.1341, "step": 213 }, { "epoch": 0.4340770791075051, "grad_norm": 0.19140625, "learning_rate": 8.553076402974983e-06, "loss": 1.1846, "step": 214 }, { "epoch": 0.43610547667342797, "grad_norm": 0.2138671875, "learning_rate": 8.54631507775524e-06, "loss": 1.156, "step": 215 }, { "epoch": 0.4381338742393509, "grad_norm": 0.203125, "learning_rate": 8.539553752535497e-06, "loss": 1.1899, "step": 216 }, { "epoch": 0.44016227180527384, "grad_norm": 0.17578125, "learning_rate": 8.532792427315754e-06, "loss": 1.1491, "step": 217 }, { "epoch": 0.4421906693711968, "grad_norm": 0.185546875, "learning_rate": 8.526031102096013e-06, "loss": 1.116, "step": 218 }, { "epoch": 0.44421906693711966, "grad_norm": 0.2041015625, "learning_rate": 8.51926977687627e-06, "loss": 1.1333, "step": 219 }, { "epoch": 0.4462474645030426, "grad_norm": 0.2255859375, "learning_rate": 8.512508451656525e-06, "loss": 1.1806, "step": 220 }, { "epoch": 0.4482758620689655, "grad_norm": 0.357421875, "learning_rate": 8.505747126436782e-06, "loss": 1.1594, "step": 221 }, { "epoch": 0.45030425963488846, "grad_norm": 0.1982421875, "learning_rate": 8.498985801217039e-06, "loss": 1.1281, "step": 222 }, { "epoch": 0.45233265720081134, "grad_norm": 0.1845703125, "learning_rate": 8.492224475997295e-06, "loss": 1.1715, "step": 223 }, { "epoch": 0.4543610547667343, "grad_norm": 0.193359375, "learning_rate": 8.485463150777552e-06, "loss": 1.179, "step": 224 }, { "epoch": 0.4563894523326572, "grad_norm": 0.18359375, "learning_rate": 8.478701825557811e-06, "loss": 1.1349, "step": 225 }, { "epoch": 0.45841784989858014, "grad_norm": 0.201171875, "learning_rate": 8.471940500338068e-06, "loss": 1.165, "step": 226 }, { "epoch": 0.460446247464503, "grad_norm": 0.1875, "learning_rate": 8.465179175118325e-06, "loss": 1.1169, "step": 227 }, { "epoch": 0.46247464503042596, "grad_norm": 0.2021484375, "learning_rate": 8.45841784989858e-06, "loss": 1.17, "step": 228 }, { "epoch": 0.4645030425963489, "grad_norm": 0.212890625, "learning_rate": 8.451656524678837e-06, "loss": 1.0952, "step": 229 }, { "epoch": 0.4665314401622718, "grad_norm": 0.193359375, "learning_rate": 8.444895199459094e-06, "loss": 1.1216, "step": 230 }, { "epoch": 0.4685598377281947, "grad_norm": 0.203125, "learning_rate": 8.43813387423935e-06, "loss": 1.1362, "step": 231 }, { "epoch": 0.47058823529411764, "grad_norm": 0.203125, "learning_rate": 8.43137254901961e-06, "loss": 1.1633, "step": 232 }, { "epoch": 0.4726166328600406, "grad_norm": 0.197265625, "learning_rate": 8.424611223799866e-06, "loss": 1.1214, "step": 233 }, { "epoch": 0.4746450304259635, "grad_norm": 0.220703125, "learning_rate": 8.417849898580123e-06, "loss": 1.1316, "step": 234 }, { "epoch": 0.4766734279918864, "grad_norm": 0.1875, "learning_rate": 8.41108857336038e-06, "loss": 1.1439, "step": 235 }, { "epoch": 0.4787018255578093, "grad_norm": 0.205078125, "learning_rate": 8.404327248140635e-06, "loss": 1.1409, "step": 236 }, { "epoch": 0.48073022312373226, "grad_norm": 0.2041015625, "learning_rate": 8.397565922920892e-06, "loss": 1.1773, "step": 237 }, { "epoch": 0.4827586206896552, "grad_norm": 0.2265625, "learning_rate": 8.390804597701149e-06, "loss": 1.1738, "step": 238 }, { "epoch": 0.4847870182555781, "grad_norm": 0.1865234375, "learning_rate": 8.384043272481408e-06, "loss": 1.1175, "step": 239 }, { "epoch": 0.486815415821501, "grad_norm": 0.2001953125, "learning_rate": 8.377281947261665e-06, "loss": 1.1377, "step": 240 }, { "epoch": 0.48884381338742394, "grad_norm": 0.296875, "learning_rate": 8.370520622041922e-06, "loss": 1.19, "step": 241 }, { "epoch": 0.4908722109533469, "grad_norm": 0.2197265625, "learning_rate": 8.363759296822178e-06, "loss": 1.159, "step": 242 }, { "epoch": 0.49290060851926976, "grad_norm": 0.21875, "learning_rate": 8.356997971602435e-06, "loss": 1.1575, "step": 243 }, { "epoch": 0.4949290060851927, "grad_norm": 0.2080078125, "learning_rate": 8.35023664638269e-06, "loss": 1.1038, "step": 244 }, { "epoch": 0.4969574036511156, "grad_norm": 0.19140625, "learning_rate": 8.343475321162948e-06, "loss": 1.1418, "step": 245 }, { "epoch": 0.49898580121703856, "grad_norm": 0.2392578125, "learning_rate": 8.336713995943206e-06, "loss": 1.1345, "step": 246 }, { "epoch": 0.5010141987829615, "grad_norm": 0.2177734375, "learning_rate": 8.329952670723463e-06, "loss": 1.14, "step": 247 }, { "epoch": 0.5030425963488844, "grad_norm": 0.201171875, "learning_rate": 8.32319134550372e-06, "loss": 1.1234, "step": 248 }, { "epoch": 0.5050709939148073, "grad_norm": 0.265625, "learning_rate": 8.316430020283977e-06, "loss": 1.1479, "step": 249 }, { "epoch": 0.5070993914807302, "grad_norm": 0.205078125, "learning_rate": 8.309668695064234e-06, "loss": 1.1019, "step": 250 }, { "epoch": 0.5091277890466531, "grad_norm": 0.1865234375, "learning_rate": 8.30290736984449e-06, "loss": 1.1496, "step": 251 }, { "epoch": 0.5111561866125761, "grad_norm": 0.23046875, "learning_rate": 8.296146044624746e-06, "loss": 1.1473, "step": 252 }, { "epoch": 0.513184584178499, "grad_norm": 0.1962890625, "learning_rate": 8.289384719405005e-06, "loss": 1.153, "step": 253 }, { "epoch": 0.5152129817444219, "grad_norm": 0.20703125, "learning_rate": 8.282623394185261e-06, "loss": 1.1607, "step": 254 }, { "epoch": 0.5172413793103449, "grad_norm": 0.205078125, "learning_rate": 8.275862068965518e-06, "loss": 1.1626, "step": 255 }, { "epoch": 0.5192697768762677, "grad_norm": 0.1962890625, "learning_rate": 8.269100743745775e-06, "loss": 1.1553, "step": 256 }, { "epoch": 0.5212981744421906, "grad_norm": 0.2041015625, "learning_rate": 8.262339418526032e-06, "loss": 1.1893, "step": 257 }, { "epoch": 0.5233265720081136, "grad_norm": 0.1943359375, "learning_rate": 8.255578093306289e-06, "loss": 1.0982, "step": 258 }, { "epoch": 0.5253549695740365, "grad_norm": 0.2080078125, "learning_rate": 8.248816768086546e-06, "loss": 1.1448, "step": 259 }, { "epoch": 0.5273833671399595, "grad_norm": 0.22265625, "learning_rate": 8.242055442866801e-06, "loss": 1.1399, "step": 260 }, { "epoch": 0.5294117647058824, "grad_norm": 0.1845703125, "learning_rate": 8.23529411764706e-06, "loss": 1.0997, "step": 261 }, { "epoch": 0.5314401622718052, "grad_norm": 0.2060546875, "learning_rate": 8.228532792427317e-06, "loss": 1.128, "step": 262 }, { "epoch": 0.5334685598377282, "grad_norm": 0.1982421875, "learning_rate": 8.221771467207574e-06, "loss": 1.1279, "step": 263 }, { "epoch": 0.5354969574036511, "grad_norm": 0.212890625, "learning_rate": 8.21501014198783e-06, "loss": 1.1596, "step": 264 }, { "epoch": 0.537525354969574, "grad_norm": 0.203125, "learning_rate": 8.208248816768087e-06, "loss": 1.1058, "step": 265 }, { "epoch": 0.539553752535497, "grad_norm": 0.1953125, "learning_rate": 8.201487491548344e-06, "loss": 1.125, "step": 266 }, { "epoch": 0.5415821501014199, "grad_norm": 0.19921875, "learning_rate": 8.194726166328601e-06, "loss": 1.1207, "step": 267 }, { "epoch": 0.5436105476673428, "grad_norm": 0.193359375, "learning_rate": 8.187964841108858e-06, "loss": 1.1092, "step": 268 }, { "epoch": 0.5456389452332657, "grad_norm": 0.220703125, "learning_rate": 8.181203515889115e-06, "loss": 1.1074, "step": 269 }, { "epoch": 0.5476673427991886, "grad_norm": 0.2001953125, "learning_rate": 8.174442190669372e-06, "loss": 1.0965, "step": 270 }, { "epoch": 0.5496957403651116, "grad_norm": 0.21875, "learning_rate": 8.167680865449629e-06, "loss": 1.1247, "step": 271 }, { "epoch": 0.5517241379310345, "grad_norm": 0.203125, "learning_rate": 8.160919540229886e-06, "loss": 1.1494, "step": 272 }, { "epoch": 0.5537525354969574, "grad_norm": 0.1943359375, "learning_rate": 8.154158215010143e-06, "loss": 1.1079, "step": 273 }, { "epoch": 0.5557809330628803, "grad_norm": 0.255859375, "learning_rate": 8.1473968897904e-06, "loss": 1.1407, "step": 274 }, { "epoch": 0.5578093306288032, "grad_norm": 0.2001953125, "learning_rate": 8.140635564570657e-06, "loss": 1.1192, "step": 275 }, { "epoch": 0.5598377281947262, "grad_norm": 0.1923828125, "learning_rate": 8.133874239350913e-06, "loss": 1.1077, "step": 276 }, { "epoch": 0.5618661257606491, "grad_norm": 0.20703125, "learning_rate": 8.12711291413117e-06, "loss": 1.1626, "step": 277 }, { "epoch": 0.563894523326572, "grad_norm": 0.203125, "learning_rate": 8.120351588911427e-06, "loss": 1.0602, "step": 278 }, { "epoch": 0.565922920892495, "grad_norm": 0.23828125, "learning_rate": 8.113590263691684e-06, "loss": 1.0946, "step": 279 }, { "epoch": 0.5679513184584178, "grad_norm": 0.2119140625, "learning_rate": 8.106828938471941e-06, "loss": 1.1575, "step": 280 }, { "epoch": 0.5699797160243407, "grad_norm": 0.236328125, "learning_rate": 8.100067613252198e-06, "loss": 1.1317, "step": 281 }, { "epoch": 0.5720081135902637, "grad_norm": 0.1962890625, "learning_rate": 8.093306288032455e-06, "loss": 1.1557, "step": 282 }, { "epoch": 0.5740365111561866, "grad_norm": 0.205078125, "learning_rate": 8.086544962812712e-06, "loss": 1.1137, "step": 283 }, { "epoch": 0.5760649087221096, "grad_norm": 0.22265625, "learning_rate": 8.079783637592969e-06, "loss": 1.1279, "step": 284 }, { "epoch": 0.5780933062880325, "grad_norm": 0.2041015625, "learning_rate": 8.073022312373226e-06, "loss": 1.0888, "step": 285 }, { "epoch": 0.5801217038539553, "grad_norm": 0.203125, "learning_rate": 8.066260987153483e-06, "loss": 1.1286, "step": 286 }, { "epoch": 0.5821501014198783, "grad_norm": 0.201171875, "learning_rate": 8.05949966193374e-06, "loss": 1.1542, "step": 287 }, { "epoch": 0.5841784989858012, "grad_norm": 0.2060546875, "learning_rate": 8.052738336713996e-06, "loss": 1.1188, "step": 288 }, { "epoch": 0.5862068965517241, "grad_norm": 0.2080078125, "learning_rate": 8.045977011494253e-06, "loss": 1.1265, "step": 289 }, { "epoch": 0.5882352941176471, "grad_norm": 0.2177734375, "learning_rate": 8.03921568627451e-06, "loss": 1.1099, "step": 290 }, { "epoch": 0.59026369168357, "grad_norm": 0.279296875, "learning_rate": 8.032454361054767e-06, "loss": 1.088, "step": 291 }, { "epoch": 0.592292089249493, "grad_norm": 0.1953125, "learning_rate": 8.025693035835024e-06, "loss": 1.1426, "step": 292 }, { "epoch": 0.5943204868154158, "grad_norm": 0.236328125, "learning_rate": 8.018931710615281e-06, "loss": 1.099, "step": 293 }, { "epoch": 0.5963488843813387, "grad_norm": 0.259765625, "learning_rate": 8.012170385395538e-06, "loss": 1.1593, "step": 294 }, { "epoch": 0.5983772819472617, "grad_norm": 0.208984375, "learning_rate": 8.005409060175795e-06, "loss": 1.1478, "step": 295 }, { "epoch": 0.6004056795131846, "grad_norm": 0.26171875, "learning_rate": 7.998647734956052e-06, "loss": 1.1165, "step": 296 }, { "epoch": 0.6024340770791075, "grad_norm": 0.2109375, "learning_rate": 7.991886409736309e-06, "loss": 1.0772, "step": 297 }, { "epoch": 0.6044624746450304, "grad_norm": 0.2099609375, "learning_rate": 7.985125084516566e-06, "loss": 1.0881, "step": 298 }, { "epoch": 0.6064908722109533, "grad_norm": 0.2021484375, "learning_rate": 7.978363759296822e-06, "loss": 1.0898, "step": 299 }, { "epoch": 0.6085192697768763, "grad_norm": 0.314453125, "learning_rate": 7.97160243407708e-06, "loss": 1.095, "step": 300 }, { "epoch": 0.6105476673427992, "grad_norm": 0.20703125, "learning_rate": 7.964841108857336e-06, "loss": 1.1129, "step": 301 }, { "epoch": 0.6125760649087221, "grad_norm": 0.220703125, "learning_rate": 7.958079783637593e-06, "loss": 1.1106, "step": 302 }, { "epoch": 0.6146044624746451, "grad_norm": 0.2001953125, "learning_rate": 7.95131845841785e-06, "loss": 1.0957, "step": 303 }, { "epoch": 0.6166328600405679, "grad_norm": 0.296875, "learning_rate": 7.944557133198107e-06, "loss": 1.1653, "step": 304 }, { "epoch": 0.6186612576064908, "grad_norm": 0.2119140625, "learning_rate": 7.937795807978364e-06, "loss": 1.13, "step": 305 }, { "epoch": 0.6206896551724138, "grad_norm": 0.205078125, "learning_rate": 7.93103448275862e-06, "loss": 1.1194, "step": 306 }, { "epoch": 0.6227180527383367, "grad_norm": 0.232421875, "learning_rate": 7.92427315753888e-06, "loss": 1.1271, "step": 307 }, { "epoch": 0.6247464503042597, "grad_norm": 0.20703125, "learning_rate": 7.917511832319135e-06, "loss": 1.1328, "step": 308 }, { "epoch": 0.6267748478701826, "grad_norm": 0.26953125, "learning_rate": 7.910750507099392e-06, "loss": 1.1269, "step": 309 }, { "epoch": 0.6288032454361054, "grad_norm": 0.2041015625, "learning_rate": 7.903989181879648e-06, "loss": 1.094, "step": 310 }, { "epoch": 0.6308316430020284, "grad_norm": 0.232421875, "learning_rate": 7.897227856659905e-06, "loss": 1.0956, "step": 311 }, { "epoch": 0.6328600405679513, "grad_norm": 0.20703125, "learning_rate": 7.890466531440162e-06, "loss": 1.0989, "step": 312 }, { "epoch": 0.6348884381338742, "grad_norm": 0.216796875, "learning_rate": 7.88370520622042e-06, "loss": 1.1115, "step": 313 }, { "epoch": 0.6369168356997972, "grad_norm": 0.208984375, "learning_rate": 7.876943881000678e-06, "loss": 1.1377, "step": 314 }, { "epoch": 0.6389452332657201, "grad_norm": 0.21875, "learning_rate": 7.870182555780935e-06, "loss": 1.0829, "step": 315 }, { "epoch": 0.640973630831643, "grad_norm": 0.19921875, "learning_rate": 7.86342123056119e-06, "loss": 1.0775, "step": 316 }, { "epoch": 0.6430020283975659, "grad_norm": 0.21875, "learning_rate": 7.856659905341447e-06, "loss": 1.058, "step": 317 }, { "epoch": 0.6450304259634888, "grad_norm": 0.265625, "learning_rate": 7.849898580121704e-06, "loss": 1.0845, "step": 318 }, { "epoch": 0.6470588235294118, "grad_norm": 0.220703125, "learning_rate": 7.84313725490196e-06, "loss": 1.1295, "step": 319 }, { "epoch": 0.6490872210953347, "grad_norm": 0.2216796875, "learning_rate": 7.836375929682218e-06, "loss": 1.1128, "step": 320 }, { "epoch": 0.6511156186612576, "grad_norm": 0.2275390625, "learning_rate": 7.829614604462476e-06, "loss": 1.0901, "step": 321 }, { "epoch": 0.6531440162271805, "grad_norm": 0.197265625, "learning_rate": 7.822853279242733e-06, "loss": 1.0869, "step": 322 }, { "epoch": 0.6551724137931034, "grad_norm": 0.2099609375, "learning_rate": 7.81609195402299e-06, "loss": 1.1382, "step": 323 }, { "epoch": 0.6572008113590264, "grad_norm": 0.26953125, "learning_rate": 7.809330628803245e-06, "loss": 1.0594, "step": 324 }, { "epoch": 0.6592292089249493, "grad_norm": 0.205078125, "learning_rate": 7.802569303583502e-06, "loss": 1.1196, "step": 325 }, { "epoch": 0.6612576064908722, "grad_norm": 0.212890625, "learning_rate": 7.795807978363759e-06, "loss": 1.0247, "step": 326 }, { "epoch": 0.6632860040567952, "grad_norm": 0.2119140625, "learning_rate": 7.789046653144016e-06, "loss": 1.1146, "step": 327 }, { "epoch": 0.665314401622718, "grad_norm": 0.2109375, "learning_rate": 7.782285327924275e-06, "loss": 1.0828, "step": 328 }, { "epoch": 0.6673427991886409, "grad_norm": 0.2080078125, "learning_rate": 7.775524002704531e-06, "loss": 1.0726, "step": 329 }, { "epoch": 0.6693711967545639, "grad_norm": 0.3984375, "learning_rate": 7.768762677484788e-06, "loss": 1.1187, "step": 330 }, { "epoch": 0.6713995943204868, "grad_norm": 0.212890625, "learning_rate": 7.762001352265045e-06, "loss": 1.118, "step": 331 }, { "epoch": 0.6734279918864098, "grad_norm": 0.23046875, "learning_rate": 7.7552400270453e-06, "loss": 1.1304, "step": 332 }, { "epoch": 0.6754563894523327, "grad_norm": 0.2041015625, "learning_rate": 7.748478701825557e-06, "loss": 1.0665, "step": 333 }, { "epoch": 0.6774847870182555, "grad_norm": 0.2314453125, "learning_rate": 7.741717376605814e-06, "loss": 1.1892, "step": 334 }, { "epoch": 0.6795131845841785, "grad_norm": 0.3046875, "learning_rate": 7.734956051386073e-06, "loss": 1.1124, "step": 335 }, { "epoch": 0.6815415821501014, "grad_norm": 0.2216796875, "learning_rate": 7.72819472616633e-06, "loss": 1.1264, "step": 336 }, { "epoch": 0.6835699797160243, "grad_norm": 0.2255859375, "learning_rate": 7.721433400946587e-06, "loss": 1.1204, "step": 337 }, { "epoch": 0.6855983772819473, "grad_norm": 0.298828125, "learning_rate": 7.714672075726844e-06, "loss": 1.0857, "step": 338 }, { "epoch": 0.6876267748478702, "grad_norm": 0.220703125, "learning_rate": 7.7079107505071e-06, "loss": 1.0994, "step": 339 }, { "epoch": 0.6896551724137931, "grad_norm": 0.2216796875, "learning_rate": 7.701149425287356e-06, "loss": 1.1221, "step": 340 }, { "epoch": 0.691683569979716, "grad_norm": 0.216796875, "learning_rate": 7.694388100067613e-06, "loss": 1.1262, "step": 341 }, { "epoch": 0.6937119675456389, "grad_norm": 0.20703125, "learning_rate": 7.687626774847871e-06, "loss": 1.1352, "step": 342 }, { "epoch": 0.6957403651115619, "grad_norm": 0.2021484375, "learning_rate": 7.680865449628128e-06, "loss": 1.097, "step": 343 }, { "epoch": 0.6977687626774848, "grad_norm": 0.21484375, "learning_rate": 7.674104124408385e-06, "loss": 1.1079, "step": 344 }, { "epoch": 0.6997971602434077, "grad_norm": 0.220703125, "learning_rate": 7.667342799188642e-06, "loss": 1.0598, "step": 345 }, { "epoch": 0.7018255578093306, "grad_norm": 0.25, "learning_rate": 7.660581473968899e-06, "loss": 1.1034, "step": 346 }, { "epoch": 0.7038539553752535, "grad_norm": 0.22265625, "learning_rate": 7.653820148749156e-06, "loss": 1.0847, "step": 347 }, { "epoch": 0.7058823529411765, "grad_norm": 0.24609375, "learning_rate": 7.647058823529411e-06, "loss": 1.0882, "step": 348 }, { "epoch": 0.7079107505070994, "grad_norm": 0.80078125, "learning_rate": 7.64029749830967e-06, "loss": 1.1341, "step": 349 }, { "epoch": 0.7099391480730223, "grad_norm": 0.212890625, "learning_rate": 7.633536173089927e-06, "loss": 1.0863, "step": 350 }, { "epoch": 0.7119675456389453, "grad_norm": 0.2109375, "learning_rate": 7.626774847870183e-06, "loss": 1.0326, "step": 351 }, { "epoch": 0.7139959432048681, "grad_norm": 0.2333984375, "learning_rate": 7.6200135226504404e-06, "loss": 1.0598, "step": 352 }, { "epoch": 0.716024340770791, "grad_norm": 0.2119140625, "learning_rate": 7.613252197430697e-06, "loss": 1.0759, "step": 353 }, { "epoch": 0.718052738336714, "grad_norm": 0.2294921875, "learning_rate": 7.606490872210954e-06, "loss": 1.116, "step": 354 }, { "epoch": 0.7200811359026369, "grad_norm": 0.21875, "learning_rate": 7.599729546991211e-06, "loss": 1.1235, "step": 355 }, { "epoch": 0.7221095334685599, "grad_norm": 0.2578125, "learning_rate": 7.592968221771467e-06, "loss": 1.1006, "step": 356 }, { "epoch": 0.7241379310344828, "grad_norm": 0.216796875, "learning_rate": 7.586206896551724e-06, "loss": 1.1092, "step": 357 }, { "epoch": 0.7261663286004056, "grad_norm": 0.248046875, "learning_rate": 7.579445571331981e-06, "loss": 1.0582, "step": 358 }, { "epoch": 0.7281947261663286, "grad_norm": 0.2275390625, "learning_rate": 7.572684246112239e-06, "loss": 1.1501, "step": 359 }, { "epoch": 0.7302231237322515, "grad_norm": 0.2158203125, "learning_rate": 7.565922920892496e-06, "loss": 1.1079, "step": 360 }, { "epoch": 0.7322515212981744, "grad_norm": 0.251953125, "learning_rate": 7.559161595672753e-06, "loss": 1.0929, "step": 361 }, { "epoch": 0.7342799188640974, "grad_norm": 0.279296875, "learning_rate": 7.5524002704530095e-06, "loss": 1.0586, "step": 362 }, { "epoch": 0.7363083164300203, "grad_norm": 0.2119140625, "learning_rate": 7.5456389452332665e-06, "loss": 1.1174, "step": 363 }, { "epoch": 0.7383367139959433, "grad_norm": 0.2255859375, "learning_rate": 7.5388776200135225e-06, "loss": 1.1049, "step": 364 }, { "epoch": 0.7403651115618661, "grad_norm": 0.271484375, "learning_rate": 7.5321162947937794e-06, "loss": 1.1212, "step": 365 }, { "epoch": 0.742393509127789, "grad_norm": 0.2177734375, "learning_rate": 7.525354969574037e-06, "loss": 1.1117, "step": 366 }, { "epoch": 0.744421906693712, "grad_norm": 0.2490234375, "learning_rate": 7.518593644354294e-06, "loss": 1.0885, "step": 367 }, { "epoch": 0.7464503042596349, "grad_norm": 0.224609375, "learning_rate": 7.511832319134551e-06, "loss": 1.1178, "step": 368 }, { "epoch": 0.7484787018255578, "grad_norm": 0.21484375, "learning_rate": 7.505070993914808e-06, "loss": 1.0965, "step": 369 }, { "epoch": 0.7505070993914807, "grad_norm": 0.21875, "learning_rate": 7.498309668695065e-06, "loss": 1.0961, "step": 370 }, { "epoch": 0.7525354969574036, "grad_norm": 0.369140625, "learning_rate": 7.491548343475323e-06, "loss": 1.0683, "step": 371 }, { "epoch": 0.7545638945233266, "grad_norm": 0.2109375, "learning_rate": 7.484787018255578e-06, "loss": 1.0914, "step": 372 }, { "epoch": 0.7565922920892495, "grad_norm": 0.216796875, "learning_rate": 7.4780256930358356e-06, "loss": 1.0738, "step": 373 }, { "epoch": 0.7586206896551724, "grad_norm": 0.1982421875, "learning_rate": 7.4712643678160925e-06, "loss": 1.0488, "step": 374 }, { "epoch": 0.7606490872210954, "grad_norm": 0.2216796875, "learning_rate": 7.464503042596349e-06, "loss": 1.1491, "step": 375 }, { "epoch": 0.7626774847870182, "grad_norm": 0.22265625, "learning_rate": 7.457741717376606e-06, "loss": 1.095, "step": 376 }, { "epoch": 0.7647058823529411, "grad_norm": 0.2177734375, "learning_rate": 7.450980392156863e-06, "loss": 1.112, "step": 377 }, { "epoch": 0.7667342799188641, "grad_norm": 0.216796875, "learning_rate": 7.444219066937121e-06, "loss": 1.0972, "step": 378 }, { "epoch": 0.768762677484787, "grad_norm": 0.26953125, "learning_rate": 7.437457741717378e-06, "loss": 1.1215, "step": 379 }, { "epoch": 0.77079107505071, "grad_norm": 0.216796875, "learning_rate": 7.430696416497634e-06, "loss": 1.1147, "step": 380 }, { "epoch": 0.7728194726166329, "grad_norm": 0.345703125, "learning_rate": 7.423935091277891e-06, "loss": 1.0614, "step": 381 }, { "epoch": 0.7748478701825557, "grad_norm": 0.2197265625, "learning_rate": 7.417173766058148e-06, "loss": 1.1024, "step": 382 }, { "epoch": 0.7768762677484787, "grad_norm": 0.255859375, "learning_rate": 7.410412440838405e-06, "loss": 1.0603, "step": 383 }, { "epoch": 0.7789046653144016, "grad_norm": 0.2255859375, "learning_rate": 7.403651115618662e-06, "loss": 1.1042, "step": 384 }, { "epoch": 0.7809330628803245, "grad_norm": 0.2314453125, "learning_rate": 7.396889790398919e-06, "loss": 1.1058, "step": 385 }, { "epoch": 0.7829614604462475, "grad_norm": 0.2158203125, "learning_rate": 7.390128465179176e-06, "loss": 1.1207, "step": 386 }, { "epoch": 0.7849898580121704, "grad_norm": 0.359375, "learning_rate": 7.383367139959433e-06, "loss": 1.0499, "step": 387 }, { "epoch": 0.7870182555780934, "grad_norm": 0.2119140625, "learning_rate": 7.376605814739689e-06, "loss": 1.1111, "step": 388 }, { "epoch": 0.7890466531440162, "grad_norm": 0.2216796875, "learning_rate": 7.369844489519946e-06, "loss": 1.1469, "step": 389 }, { "epoch": 0.7910750507099391, "grad_norm": 0.25390625, "learning_rate": 7.363083164300203e-06, "loss": 1.0579, "step": 390 }, { "epoch": 0.7931034482758621, "grad_norm": 0.2431640625, "learning_rate": 7.35632183908046e-06, "loss": 1.0904, "step": 391 }, { "epoch": 0.795131845841785, "grad_norm": 0.208984375, "learning_rate": 7.349560513860718e-06, "loss": 1.1116, "step": 392 }, { "epoch": 0.7971602434077079, "grad_norm": 0.236328125, "learning_rate": 7.342799188640975e-06, "loss": 1.0659, "step": 393 }, { "epoch": 0.7991886409736308, "grad_norm": 0.2275390625, "learning_rate": 7.3360378634212316e-06, "loss": 1.0893, "step": 394 }, { "epoch": 0.8012170385395537, "grad_norm": 0.23828125, "learning_rate": 7.3292765382014885e-06, "loss": 1.0927, "step": 395 }, { "epoch": 0.8032454361054767, "grad_norm": 0.2021484375, "learning_rate": 7.3225152129817445e-06, "loss": 1.0579, "step": 396 }, { "epoch": 0.8052738336713996, "grad_norm": 0.2080078125, "learning_rate": 7.3157538877620015e-06, "loss": 1.0357, "step": 397 }, { "epoch": 0.8073022312373225, "grad_norm": 0.2158203125, "learning_rate": 7.308992562542258e-06, "loss": 1.0872, "step": 398 }, { "epoch": 0.8093306288032455, "grad_norm": 0.2119140625, "learning_rate": 7.302231237322516e-06, "loss": 1.0981, "step": 399 }, { "epoch": 0.8113590263691683, "grad_norm": 0.330078125, "learning_rate": 7.295469912102773e-06, "loss": 1.1117, "step": 400 }, { "epoch": 0.8133874239350912, "grad_norm": 0.2197265625, "learning_rate": 7.28870858688303e-06, "loss": 1.0784, "step": 401 }, { "epoch": 0.8154158215010142, "grad_norm": 0.265625, "learning_rate": 7.281947261663287e-06, "loss": 1.1133, "step": 402 }, { "epoch": 0.8174442190669371, "grad_norm": 0.2470703125, "learning_rate": 7.275185936443544e-06, "loss": 1.0285, "step": 403 }, { "epoch": 0.8194726166328601, "grad_norm": 0.23828125, "learning_rate": 7.2684246112238e-06, "loss": 1.0703, "step": 404 }, { "epoch": 0.821501014198783, "grad_norm": 0.2353515625, "learning_rate": 7.261663286004057e-06, "loss": 1.091, "step": 405 }, { "epoch": 0.8235294117647058, "grad_norm": 0.2216796875, "learning_rate": 7.2549019607843145e-06, "loss": 1.0706, "step": 406 }, { "epoch": 0.8255578093306288, "grad_norm": 0.25, "learning_rate": 7.248140635564571e-06, "loss": 1.0761, "step": 407 }, { "epoch": 0.8275862068965517, "grad_norm": 0.23828125, "learning_rate": 7.241379310344828e-06, "loss": 1.1163, "step": 408 }, { "epoch": 0.8296146044624746, "grad_norm": 0.2158203125, "learning_rate": 7.234617985125085e-06, "loss": 1.1036, "step": 409 }, { "epoch": 0.8316430020283976, "grad_norm": 0.22265625, "learning_rate": 7.227856659905342e-06, "loss": 1.096, "step": 410 }, { "epoch": 0.8336713995943205, "grad_norm": 0.220703125, "learning_rate": 7.221095334685599e-06, "loss": 1.0663, "step": 411 }, { "epoch": 0.8356997971602435, "grad_norm": 0.228515625, "learning_rate": 7.214334009465855e-06, "loss": 1.119, "step": 412 }, { "epoch": 0.8377281947261663, "grad_norm": 0.22265625, "learning_rate": 7.207572684246112e-06, "loss": 1.1057, "step": 413 }, { "epoch": 0.8397565922920892, "grad_norm": 0.23046875, "learning_rate": 7.20081135902637e-06, "loss": 1.0777, "step": 414 }, { "epoch": 0.8417849898580122, "grad_norm": 0.2275390625, "learning_rate": 7.194050033806627e-06, "loss": 1.1091, "step": 415 }, { "epoch": 0.8438133874239351, "grad_norm": 0.2353515625, "learning_rate": 7.187288708586884e-06, "loss": 1.055, "step": 416 }, { "epoch": 0.845841784989858, "grad_norm": 0.2109375, "learning_rate": 7.1805273833671405e-06, "loss": 1.1213, "step": 417 }, { "epoch": 0.847870182555781, "grad_norm": 0.2314453125, "learning_rate": 7.1737660581473974e-06, "loss": 1.0516, "step": 418 }, { "epoch": 0.8498985801217038, "grad_norm": 0.2236328125, "learning_rate": 7.167004732927655e-06, "loss": 1.1068, "step": 419 }, { "epoch": 0.8519269776876268, "grad_norm": 0.232421875, "learning_rate": 7.16024340770791e-06, "loss": 1.1144, "step": 420 }, { "epoch": 0.8539553752535497, "grad_norm": 0.2216796875, "learning_rate": 7.153482082488168e-06, "loss": 1.0661, "step": 421 }, { "epoch": 0.8559837728194726, "grad_norm": 0.27734375, "learning_rate": 7.146720757268425e-06, "loss": 1.1194, "step": 422 }, { "epoch": 0.8580121703853956, "grad_norm": 0.2255859375, "learning_rate": 7.139959432048682e-06, "loss": 1.0732, "step": 423 }, { "epoch": 0.8600405679513184, "grad_norm": 0.2216796875, "learning_rate": 7.133198106828939e-06, "loss": 1.0919, "step": 424 }, { "epoch": 0.8620689655172413, "grad_norm": 0.2265625, "learning_rate": 7.126436781609196e-06, "loss": 1.0936, "step": 425 }, { "epoch": 0.8640973630831643, "grad_norm": 0.349609375, "learning_rate": 7.119675456389454e-06, "loss": 1.1049, "step": 426 }, { "epoch": 0.8661257606490872, "grad_norm": 0.228515625, "learning_rate": 7.1129141311697105e-06, "loss": 1.1039, "step": 427 }, { "epoch": 0.8681541582150102, "grad_norm": 0.296875, "learning_rate": 7.1061528059499666e-06, "loss": 1.034, "step": 428 }, { "epoch": 0.8701825557809331, "grad_norm": 0.2314453125, "learning_rate": 7.0993914807302235e-06, "loss": 1.1177, "step": 429 }, { "epoch": 0.8722109533468559, "grad_norm": 0.33984375, "learning_rate": 7.09263015551048e-06, "loss": 1.0703, "step": 430 }, { "epoch": 0.8742393509127789, "grad_norm": 0.21875, "learning_rate": 7.085868830290737e-06, "loss": 1.0331, "step": 431 }, { "epoch": 0.8762677484787018, "grad_norm": 0.220703125, "learning_rate": 7.079107505070994e-06, "loss": 1.0849, "step": 432 }, { "epoch": 0.8782961460446247, "grad_norm": 0.2265625, "learning_rate": 7.072346179851252e-06, "loss": 1.0612, "step": 433 }, { "epoch": 0.8803245436105477, "grad_norm": 0.2177734375, "learning_rate": 7.065584854631509e-06, "loss": 1.0692, "step": 434 }, { "epoch": 0.8823529411764706, "grad_norm": 0.2197265625, "learning_rate": 7.058823529411766e-06, "loss": 1.0678, "step": 435 }, { "epoch": 0.8843813387423936, "grad_norm": 0.2216796875, "learning_rate": 7.052062204192022e-06, "loss": 1.0662, "step": 436 }, { "epoch": 0.8864097363083164, "grad_norm": 0.2158203125, "learning_rate": 7.045300878972279e-06, "loss": 1.08, "step": 437 }, { "epoch": 0.8884381338742393, "grad_norm": 0.3046875, "learning_rate": 7.038539553752536e-06, "loss": 1.1165, "step": 438 }, { "epoch": 0.8904665314401623, "grad_norm": 0.2216796875, "learning_rate": 7.031778228532793e-06, "loss": 1.0933, "step": 439 }, { "epoch": 0.8924949290060852, "grad_norm": 0.228515625, "learning_rate": 7.02501690331305e-06, "loss": 1.1124, "step": 440 }, { "epoch": 0.8945233265720081, "grad_norm": 0.2314453125, "learning_rate": 7.018255578093307e-06, "loss": 1.1189, "step": 441 }, { "epoch": 0.896551724137931, "grad_norm": 0.228515625, "learning_rate": 7.011494252873564e-06, "loss": 1.0761, "step": 442 }, { "epoch": 0.8985801217038539, "grad_norm": 0.2294921875, "learning_rate": 7.004732927653821e-06, "loss": 1.0929, "step": 443 }, { "epoch": 0.9006085192697769, "grad_norm": 0.2392578125, "learning_rate": 6.997971602434077e-06, "loss": 1.0499, "step": 444 }, { "epoch": 0.9026369168356998, "grad_norm": 0.3515625, "learning_rate": 6.991210277214334e-06, "loss": 1.0665, "step": 445 }, { "epoch": 0.9046653144016227, "grad_norm": 0.2216796875, "learning_rate": 6.984448951994591e-06, "loss": 1.0624, "step": 446 }, { "epoch": 0.9066937119675457, "grad_norm": 0.240234375, "learning_rate": 6.977687626774849e-06, "loss": 1.1199, "step": 447 }, { "epoch": 0.9087221095334685, "grad_norm": 0.28125, "learning_rate": 6.970926301555106e-06, "loss": 1.0736, "step": 448 }, { "epoch": 0.9107505070993914, "grad_norm": 0.228515625, "learning_rate": 6.9641649763353625e-06, "loss": 1.1005, "step": 449 }, { "epoch": 0.9127789046653144, "grad_norm": 0.248046875, "learning_rate": 6.9574036511156195e-06, "loss": 1.1016, "step": 450 }, { "epoch": 0.9148073022312373, "grad_norm": 0.337890625, "learning_rate": 6.950642325895876e-06, "loss": 1.0351, "step": 451 }, { "epoch": 0.9168356997971603, "grad_norm": 0.30078125, "learning_rate": 6.9438810006761324e-06, "loss": 1.055, "step": 452 }, { "epoch": 0.9188640973630832, "grad_norm": 0.2177734375, "learning_rate": 6.937119675456389e-06, "loss": 1.0669, "step": 453 }, { "epoch": 0.920892494929006, "grad_norm": 0.220703125, "learning_rate": 6.930358350236647e-06, "loss": 1.1177, "step": 454 }, { "epoch": 0.922920892494929, "grad_norm": 0.2294921875, "learning_rate": 6.923597025016904e-06, "loss": 1.1167, "step": 455 }, { "epoch": 0.9249492900608519, "grad_norm": 0.2578125, "learning_rate": 6.916835699797161e-06, "loss": 1.0982, "step": 456 }, { "epoch": 0.9269776876267748, "grad_norm": 0.2421875, "learning_rate": 6.910074374577418e-06, "loss": 1.0828, "step": 457 }, { "epoch": 0.9290060851926978, "grad_norm": 0.228515625, "learning_rate": 6.903313049357675e-06, "loss": 1.0629, "step": 458 }, { "epoch": 0.9310344827586207, "grad_norm": 0.625, "learning_rate": 6.896551724137932e-06, "loss": 1.107, "step": 459 }, { "epoch": 0.9330628803245437, "grad_norm": 0.23828125, "learning_rate": 6.889790398918188e-06, "loss": 1.0501, "step": 460 }, { "epoch": 0.9350912778904665, "grad_norm": 0.2333984375, "learning_rate": 6.8830290736984455e-06, "loss": 1.0976, "step": 461 }, { "epoch": 0.9371196754563894, "grad_norm": 0.259765625, "learning_rate": 6.876267748478702e-06, "loss": 1.1372, "step": 462 }, { "epoch": 0.9391480730223124, "grad_norm": 0.2197265625, "learning_rate": 6.869506423258959e-06, "loss": 1.0403, "step": 463 }, { "epoch": 0.9411764705882353, "grad_norm": 0.2216796875, "learning_rate": 6.862745098039216e-06, "loss": 1.0414, "step": 464 }, { "epoch": 0.9432048681541582, "grad_norm": 0.22265625, "learning_rate": 6.855983772819473e-06, "loss": 1.104, "step": 465 }, { "epoch": 0.9452332657200812, "grad_norm": 0.234375, "learning_rate": 6.84922244759973e-06, "loss": 1.1283, "step": 466 }, { "epoch": 0.947261663286004, "grad_norm": 0.234375, "learning_rate": 6.842461122379988e-06, "loss": 1.0765, "step": 467 }, { "epoch": 0.949290060851927, "grad_norm": 0.279296875, "learning_rate": 6.835699797160244e-06, "loss": 1.0378, "step": 468 }, { "epoch": 0.9513184584178499, "grad_norm": 0.234375, "learning_rate": 6.828938471940501e-06, "loss": 1.1038, "step": 469 }, { "epoch": 0.9533468559837728, "grad_norm": 0.234375, "learning_rate": 6.822177146720758e-06, "loss": 1.0895, "step": 470 }, { "epoch": 0.9553752535496958, "grad_norm": 0.2451171875, "learning_rate": 6.815415821501015e-06, "loss": 1.0341, "step": 471 }, { "epoch": 0.9574036511156186, "grad_norm": 0.322265625, "learning_rate": 6.8086544962812715e-06, "loss": 1.0862, "step": 472 }, { "epoch": 0.9594320486815415, "grad_norm": 0.2421875, "learning_rate": 6.801893171061528e-06, "loss": 1.1023, "step": 473 }, { "epoch": 0.9614604462474645, "grad_norm": 0.23046875, "learning_rate": 6.795131845841786e-06, "loss": 1.0123, "step": 474 }, { "epoch": 0.9634888438133874, "grad_norm": 0.228515625, "learning_rate": 6.788370520622043e-06, "loss": 1.1256, "step": 475 }, { "epoch": 0.9655172413793104, "grad_norm": 0.2353515625, "learning_rate": 6.781609195402299e-06, "loss": 1.0688, "step": 476 }, { "epoch": 0.9675456389452333, "grad_norm": 0.2373046875, "learning_rate": 6.774847870182556e-06, "loss": 1.1208, "step": 477 }, { "epoch": 0.9695740365111561, "grad_norm": 0.23828125, "learning_rate": 6.768086544962813e-06, "loss": 1.087, "step": 478 }, { "epoch": 0.9716024340770791, "grad_norm": 0.2490234375, "learning_rate": 6.76132521974307e-06, "loss": 1.099, "step": 479 }, { "epoch": 0.973630831643002, "grad_norm": 0.2490234375, "learning_rate": 6.754563894523327e-06, "loss": 1.1076, "step": 480 }, { "epoch": 0.9756592292089249, "grad_norm": 0.2255859375, "learning_rate": 6.7478025693035846e-06, "loss": 1.0634, "step": 481 }, { "epoch": 0.9776876267748479, "grad_norm": 0.22265625, "learning_rate": 6.7410412440838415e-06, "loss": 1.0835, "step": 482 }, { "epoch": 0.9797160243407708, "grad_norm": 0.2353515625, "learning_rate": 6.734279918864098e-06, "loss": 1.0837, "step": 483 }, { "epoch": 0.9817444219066938, "grad_norm": 0.228515625, "learning_rate": 6.7275185936443544e-06, "loss": 1.0663, "step": 484 }, { "epoch": 0.9837728194726166, "grad_norm": 0.2255859375, "learning_rate": 6.720757268424611e-06, "loss": 1.0664, "step": 485 }, { "epoch": 0.9858012170385395, "grad_norm": 0.2490234375, "learning_rate": 6.713995943204868e-06, "loss": 1.0677, "step": 486 }, { "epoch": 0.9878296146044625, "grad_norm": 0.2216796875, "learning_rate": 6.707234617985125e-06, "loss": 1.0646, "step": 487 }, { "epoch": 0.9898580121703854, "grad_norm": 0.2255859375, "learning_rate": 6.700473292765383e-06, "loss": 1.0106, "step": 488 }, { "epoch": 0.9918864097363083, "grad_norm": 0.2265625, "learning_rate": 6.69371196754564e-06, "loss": 1.1186, "step": 489 }, { "epoch": 0.9939148073022313, "grad_norm": 0.287109375, "learning_rate": 6.686950642325897e-06, "loss": 1.0706, "step": 490 }, { "epoch": 0.9959432048681541, "grad_norm": 0.2451171875, "learning_rate": 6.680189317106154e-06, "loss": 1.065, "step": 491 }, { "epoch": 0.9979716024340771, "grad_norm": 0.2421875, "learning_rate": 6.67342799188641e-06, "loss": 1.0931, "step": 492 }, { "epoch": 1.0, "grad_norm": 0.244140625, "learning_rate": 6.666666666666667e-06, "loss": 1.0794, "step": 493 }, { "epoch": 1.002028397565923, "grad_norm": 0.28125, "learning_rate": 6.6599053414469236e-06, "loss": 1.087, "step": 494 }, { "epoch": 1.0040567951318458, "grad_norm": 0.23828125, "learning_rate": 6.653144016227181e-06, "loss": 1.0326, "step": 495 }, { "epoch": 1.0060851926977687, "grad_norm": 0.248046875, "learning_rate": 6.646382691007438e-06, "loss": 1.056, "step": 496 }, { "epoch": 1.0081135902636917, "grad_norm": 0.2255859375, "learning_rate": 6.639621365787695e-06, "loss": 1.0488, "step": 497 }, { "epoch": 1.0101419878296145, "grad_norm": 0.2275390625, "learning_rate": 6.632860040567952e-06, "loss": 1.0683, "step": 498 }, { "epoch": 1.0121703853955375, "grad_norm": 0.294921875, "learning_rate": 6.626098715348209e-06, "loss": 1.0659, "step": 499 }, { "epoch": 1.0141987829614605, "grad_norm": 0.2333984375, "learning_rate": 6.619337390128465e-06, "loss": 1.0606, "step": 500 }, { "epoch": 1.0162271805273835, "grad_norm": 0.30859375, "learning_rate": 6.612576064908722e-06, "loss": 1.0074, "step": 501 }, { "epoch": 1.0182555780933062, "grad_norm": 0.283203125, "learning_rate": 6.60581473968898e-06, "loss": 1.0893, "step": 502 }, { "epoch": 1.0202839756592292, "grad_norm": 0.22265625, "learning_rate": 6.599053414469237e-06, "loss": 1.079, "step": 503 }, { "epoch": 1.0223123732251522, "grad_norm": 0.322265625, "learning_rate": 6.5922920892494935e-06, "loss": 1.0616, "step": 504 }, { "epoch": 1.024340770791075, "grad_norm": 0.2333984375, "learning_rate": 6.5855307640297504e-06, "loss": 1.0783, "step": 505 }, { "epoch": 1.026369168356998, "grad_norm": 0.2314453125, "learning_rate": 6.578769438810007e-06, "loss": 1.0645, "step": 506 }, { "epoch": 1.028397565922921, "grad_norm": 0.2490234375, "learning_rate": 6.572008113590265e-06, "loss": 1.0546, "step": 507 }, { "epoch": 1.0304259634888437, "grad_norm": 0.2333984375, "learning_rate": 6.56524678837052e-06, "loss": 1.088, "step": 508 }, { "epoch": 1.0324543610547667, "grad_norm": 0.2412109375, "learning_rate": 6.558485463150778e-06, "loss": 1.1028, "step": 509 }, { "epoch": 1.0344827586206897, "grad_norm": 0.2314453125, "learning_rate": 6.551724137931035e-06, "loss": 1.0477, "step": 510 }, { "epoch": 1.0365111561866125, "grad_norm": 0.2294921875, "learning_rate": 6.544962812711292e-06, "loss": 1.0453, "step": 511 }, { "epoch": 1.0385395537525355, "grad_norm": 0.2275390625, "learning_rate": 6.538201487491549e-06, "loss": 1.0321, "step": 512 }, { "epoch": 1.0405679513184585, "grad_norm": 0.2314453125, "learning_rate": 6.531440162271806e-06, "loss": 1.0569, "step": 513 }, { "epoch": 1.0425963488843812, "grad_norm": 0.234375, "learning_rate": 6.5246788370520635e-06, "loss": 1.0619, "step": 514 }, { "epoch": 1.0446247464503042, "grad_norm": 0.251953125, "learning_rate": 6.51791751183232e-06, "loss": 1.0559, "step": 515 }, { "epoch": 1.0466531440162272, "grad_norm": 0.2431640625, "learning_rate": 6.5111561866125765e-06, "loss": 1.0531, "step": 516 }, { "epoch": 1.04868154158215, "grad_norm": 0.22265625, "learning_rate": 6.504394861392833e-06, "loss": 1.0724, "step": 517 }, { "epoch": 1.050709939148073, "grad_norm": 0.2451171875, "learning_rate": 6.49763353617309e-06, "loss": 1.0478, "step": 518 }, { "epoch": 1.052738336713996, "grad_norm": 0.21875, "learning_rate": 6.490872210953347e-06, "loss": 1.0659, "step": 519 }, { "epoch": 1.054766734279919, "grad_norm": 0.2216796875, "learning_rate": 6.484110885733604e-06, "loss": 1.0672, "step": 520 }, { "epoch": 1.0567951318458417, "grad_norm": 0.2734375, "learning_rate": 6.477349560513861e-06, "loss": 1.0845, "step": 521 }, { "epoch": 1.0588235294117647, "grad_norm": 0.310546875, "learning_rate": 6.470588235294119e-06, "loss": 1.0633, "step": 522 }, { "epoch": 1.0608519269776877, "grad_norm": 0.26171875, "learning_rate": 6.463826910074376e-06, "loss": 1.1189, "step": 523 }, { "epoch": 1.0628803245436105, "grad_norm": 0.296875, "learning_rate": 6.457065584854632e-06, "loss": 1.0884, "step": 524 }, { "epoch": 1.0649087221095335, "grad_norm": 0.2255859375, "learning_rate": 6.450304259634889e-06, "loss": 1.0337, "step": 525 }, { "epoch": 1.0669371196754565, "grad_norm": 0.2255859375, "learning_rate": 6.4435429344151456e-06, "loss": 1.0333, "step": 526 }, { "epoch": 1.0689655172413792, "grad_norm": 0.2333984375, "learning_rate": 6.4367816091954025e-06, "loss": 1.0785, "step": 527 }, { "epoch": 1.0709939148073022, "grad_norm": 0.2255859375, "learning_rate": 6.430020283975659e-06, "loss": 1.0, "step": 528 }, { "epoch": 1.0730223123732252, "grad_norm": 0.2392578125, "learning_rate": 6.423258958755917e-06, "loss": 1.0664, "step": 529 }, { "epoch": 1.075050709939148, "grad_norm": 0.29296875, "learning_rate": 6.416497633536174e-06, "loss": 1.1181, "step": 530 }, { "epoch": 1.077079107505071, "grad_norm": 0.255859375, "learning_rate": 6.409736308316431e-06, "loss": 1.0471, "step": 531 }, { "epoch": 1.079107505070994, "grad_norm": 0.2421875, "learning_rate": 6.402974983096687e-06, "loss": 1.0518, "step": 532 }, { "epoch": 1.081135902636917, "grad_norm": 0.2470703125, "learning_rate": 6.396213657876944e-06, "loss": 1.0784, "step": 533 }, { "epoch": 1.0831643002028397, "grad_norm": 0.32421875, "learning_rate": 6.389452332657201e-06, "loss": 1.0695, "step": 534 }, { "epoch": 1.0851926977687627, "grad_norm": 0.234375, "learning_rate": 6.382691007437458e-06, "loss": 1.044, "step": 535 }, { "epoch": 1.0872210953346857, "grad_norm": 0.337890625, "learning_rate": 6.3759296822177155e-06, "loss": 1.049, "step": 536 }, { "epoch": 1.0892494929006085, "grad_norm": 0.287109375, "learning_rate": 6.3691683569979724e-06, "loss": 1.0108, "step": 537 }, { "epoch": 1.0912778904665315, "grad_norm": 0.234375, "learning_rate": 6.362407031778229e-06, "loss": 1.02, "step": 538 }, { "epoch": 1.0933062880324544, "grad_norm": 0.26953125, "learning_rate": 6.355645706558486e-06, "loss": 1.0442, "step": 539 }, { "epoch": 1.0953346855983772, "grad_norm": 0.302734375, "learning_rate": 6.348884381338742e-06, "loss": 1.0711, "step": 540 }, { "epoch": 1.0973630831643002, "grad_norm": 0.35546875, "learning_rate": 6.342123056118999e-06, "loss": 1.0747, "step": 541 }, { "epoch": 1.0993914807302232, "grad_norm": 0.234375, "learning_rate": 6.335361730899256e-06, "loss": 1.0477, "step": 542 }, { "epoch": 1.101419878296146, "grad_norm": 0.2314453125, "learning_rate": 6.328600405679514e-06, "loss": 1.023, "step": 543 }, { "epoch": 1.103448275862069, "grad_norm": 0.234375, "learning_rate": 6.321839080459771e-06, "loss": 1.0482, "step": 544 }, { "epoch": 1.105476673427992, "grad_norm": 0.2470703125, "learning_rate": 6.315077755240028e-06, "loss": 1.1157, "step": 545 }, { "epoch": 1.1075050709939147, "grad_norm": 0.2451171875, "learning_rate": 6.308316430020285e-06, "loss": 1.0663, "step": 546 }, { "epoch": 1.1095334685598377, "grad_norm": 0.2294921875, "learning_rate": 6.3015551048005416e-06, "loss": 1.0461, "step": 547 }, { "epoch": 1.1115618661257607, "grad_norm": 0.2412109375, "learning_rate": 6.294793779580798e-06, "loss": 1.0951, "step": 548 }, { "epoch": 1.1135902636916835, "grad_norm": 0.2431640625, "learning_rate": 6.2880324543610545e-06, "loss": 1.0817, "step": 549 }, { "epoch": 1.1156186612576064, "grad_norm": 0.2578125, "learning_rate": 6.281271129141312e-06, "loss": 1.0373, "step": 550 }, { "epoch": 1.1176470588235294, "grad_norm": 0.26953125, "learning_rate": 6.274509803921569e-06, "loss": 1.0159, "step": 551 }, { "epoch": 1.1196754563894524, "grad_norm": 0.263671875, "learning_rate": 6.267748478701826e-06, "loss": 1.0672, "step": 552 }, { "epoch": 1.1217038539553752, "grad_norm": 0.23046875, "learning_rate": 6.260987153482083e-06, "loss": 1.0657, "step": 553 }, { "epoch": 1.1237322515212982, "grad_norm": 0.2275390625, "learning_rate": 6.25422582826234e-06, "loss": 1.0105, "step": 554 }, { "epoch": 1.1257606490872212, "grad_norm": 0.2412109375, "learning_rate": 6.247464503042598e-06, "loss": 1.0405, "step": 555 }, { "epoch": 1.127789046653144, "grad_norm": 0.2490234375, "learning_rate": 6.240703177822853e-06, "loss": 1.0467, "step": 556 }, { "epoch": 1.129817444219067, "grad_norm": 0.2255859375, "learning_rate": 6.233941852603111e-06, "loss": 1.0506, "step": 557 }, { "epoch": 1.13184584178499, "grad_norm": 0.2392578125, "learning_rate": 6.227180527383368e-06, "loss": 1.0681, "step": 558 }, { "epoch": 1.1338742393509127, "grad_norm": 0.2421875, "learning_rate": 6.2204192021636245e-06, "loss": 1.126, "step": 559 }, { "epoch": 1.1359026369168357, "grad_norm": 0.2451171875, "learning_rate": 6.213657876943881e-06, "loss": 1.0295, "step": 560 }, { "epoch": 1.1379310344827587, "grad_norm": 0.28515625, "learning_rate": 6.206896551724138e-06, "loss": 1.076, "step": 561 }, { "epoch": 1.1399594320486814, "grad_norm": 0.240234375, "learning_rate": 6.200135226504396e-06, "loss": 1.0124, "step": 562 }, { "epoch": 1.1419878296146044, "grad_norm": 0.2490234375, "learning_rate": 6.193373901284653e-06, "loss": 1.1096, "step": 563 }, { "epoch": 1.1440162271805274, "grad_norm": 0.296875, "learning_rate": 6.186612576064909e-06, "loss": 1.0413, "step": 564 }, { "epoch": 1.1460446247464504, "grad_norm": 0.2314453125, "learning_rate": 6.179851250845166e-06, "loss": 1.0461, "step": 565 }, { "epoch": 1.1480730223123732, "grad_norm": 0.330078125, "learning_rate": 6.173089925625423e-06, "loss": 1.0517, "step": 566 }, { "epoch": 1.1501014198782962, "grad_norm": 0.2451171875, "learning_rate": 6.16632860040568e-06, "loss": 1.0945, "step": 567 }, { "epoch": 1.1521298174442192, "grad_norm": 0.251953125, "learning_rate": 6.159567275185937e-06, "loss": 1.1018, "step": 568 }, { "epoch": 1.154158215010142, "grad_norm": 0.275390625, "learning_rate": 6.1528059499661945e-06, "loss": 0.9737, "step": 569 }, { "epoch": 1.156186612576065, "grad_norm": 0.2431640625, "learning_rate": 6.146044624746451e-06, "loss": 1.0363, "step": 570 }, { "epoch": 1.158215010141988, "grad_norm": 0.61328125, "learning_rate": 6.139283299526708e-06, "loss": 1.071, "step": 571 }, { "epoch": 1.1602434077079107, "grad_norm": 0.2392578125, "learning_rate": 6.132521974306964e-06, "loss": 1.0491, "step": 572 }, { "epoch": 1.1622718052738337, "grad_norm": 0.2451171875, "learning_rate": 6.125760649087221e-06, "loss": 1.0269, "step": 573 }, { "epoch": 1.1643002028397567, "grad_norm": 0.2490234375, "learning_rate": 6.118999323867478e-06, "loss": 1.0467, "step": 574 }, { "epoch": 1.1663286004056794, "grad_norm": 0.25390625, "learning_rate": 6.112237998647735e-06, "loss": 1.0575, "step": 575 }, { "epoch": 1.1683569979716024, "grad_norm": 0.2392578125, "learning_rate": 6.105476673427993e-06, "loss": 1.0519, "step": 576 }, { "epoch": 1.1703853955375254, "grad_norm": 0.259765625, "learning_rate": 6.09871534820825e-06, "loss": 1.0848, "step": 577 }, { "epoch": 1.1724137931034484, "grad_norm": 0.2333984375, "learning_rate": 6.091954022988507e-06, "loss": 1.0639, "step": 578 }, { "epoch": 1.1744421906693712, "grad_norm": 0.23828125, "learning_rate": 6.0851926977687636e-06, "loss": 1.0619, "step": 579 }, { "epoch": 1.1764705882352942, "grad_norm": 0.2392578125, "learning_rate": 6.07843137254902e-06, "loss": 1.0422, "step": 580 }, { "epoch": 1.178498985801217, "grad_norm": 0.2294921875, "learning_rate": 6.0716700473292766e-06, "loss": 1.0245, "step": 581 }, { "epoch": 1.18052738336714, "grad_norm": 0.2255859375, "learning_rate": 6.0649087221095335e-06, "loss": 1.051, "step": 582 }, { "epoch": 1.182555780933063, "grad_norm": 0.298828125, "learning_rate": 6.058147396889791e-06, "loss": 1.0893, "step": 583 }, { "epoch": 1.184584178498986, "grad_norm": 0.2412109375, "learning_rate": 6.051386071670048e-06, "loss": 1.0841, "step": 584 }, { "epoch": 1.1866125760649087, "grad_norm": 0.2451171875, "learning_rate": 6.044624746450305e-06, "loss": 1.0444, "step": 585 }, { "epoch": 1.1886409736308317, "grad_norm": 0.2490234375, "learning_rate": 6.037863421230562e-06, "loss": 1.0222, "step": 586 }, { "epoch": 1.1906693711967546, "grad_norm": 0.26171875, "learning_rate": 6.031102096010819e-06, "loss": 1.0554, "step": 587 }, { "epoch": 1.1926977687626774, "grad_norm": 0.25, "learning_rate": 6.024340770791075e-06, "loss": 1.0436, "step": 588 }, { "epoch": 1.1947261663286004, "grad_norm": 0.255859375, "learning_rate": 6.017579445571332e-06, "loss": 1.0239, "step": 589 }, { "epoch": 1.1967545638945234, "grad_norm": 0.2373046875, "learning_rate": 6.01081812035159e-06, "loss": 1.0412, "step": 590 }, { "epoch": 1.1987829614604462, "grad_norm": 0.2431640625, "learning_rate": 6.0040567951318465e-06, "loss": 1.0504, "step": 591 }, { "epoch": 1.2008113590263692, "grad_norm": 0.236328125, "learning_rate": 5.9972954699121034e-06, "loss": 1.0352, "step": 592 }, { "epoch": 1.2028397565922921, "grad_norm": 0.2353515625, "learning_rate": 5.99053414469236e-06, "loss": 1.0506, "step": 593 }, { "epoch": 1.204868154158215, "grad_norm": 0.2412109375, "learning_rate": 5.983772819472617e-06, "loss": 1.0723, "step": 594 }, { "epoch": 1.206896551724138, "grad_norm": 0.251953125, "learning_rate": 5.977011494252874e-06, "loss": 1.0563, "step": 595 }, { "epoch": 1.208924949290061, "grad_norm": 0.244140625, "learning_rate": 5.97025016903313e-06, "loss": 1.0346, "step": 596 }, { "epoch": 1.2109533468559839, "grad_norm": 0.2451171875, "learning_rate": 5.963488843813387e-06, "loss": 1.0636, "step": 597 }, { "epoch": 1.2129817444219066, "grad_norm": 0.236328125, "learning_rate": 5.956727518593645e-06, "loss": 1.0694, "step": 598 }, { "epoch": 1.2150101419878296, "grad_norm": 0.251953125, "learning_rate": 5.949966193373902e-06, "loss": 1.0332, "step": 599 }, { "epoch": 1.2170385395537526, "grad_norm": 0.234375, "learning_rate": 5.943204868154159e-06, "loss": 1.0194, "step": 600 }, { "epoch": 1.2190669371196754, "grad_norm": 0.37890625, "learning_rate": 5.936443542934416e-06, "loss": 1.0426, "step": 601 }, { "epoch": 1.2210953346855984, "grad_norm": 0.2412109375, "learning_rate": 5.9296822177146725e-06, "loss": 1.0372, "step": 602 }, { "epoch": 1.2231237322515214, "grad_norm": 0.50390625, "learning_rate": 5.92292089249493e-06, "loss": 0.9994, "step": 603 }, { "epoch": 1.2251521298174441, "grad_norm": 0.263671875, "learning_rate": 5.9161595672751855e-06, "loss": 1.0621, "step": 604 }, { "epoch": 1.2271805273833671, "grad_norm": 0.236328125, "learning_rate": 5.909398242055443e-06, "loss": 1.0593, "step": 605 }, { "epoch": 1.2292089249492901, "grad_norm": 0.25, "learning_rate": 5.9026369168357e-06, "loss": 1.0912, "step": 606 }, { "epoch": 1.231237322515213, "grad_norm": 0.23828125, "learning_rate": 5.895875591615957e-06, "loss": 1.0958, "step": 607 }, { "epoch": 1.2332657200811359, "grad_norm": 0.2392578125, "learning_rate": 5.889114266396214e-06, "loss": 0.997, "step": 608 }, { "epoch": 1.2352941176470589, "grad_norm": 0.248046875, "learning_rate": 5.882352941176471e-06, "loss": 1.1018, "step": 609 }, { "epoch": 1.2373225152129819, "grad_norm": 0.244140625, "learning_rate": 5.875591615956729e-06, "loss": 1.0352, "step": 610 }, { "epoch": 1.2393509127789046, "grad_norm": 0.265625, "learning_rate": 5.868830290736986e-06, "loss": 1.0638, "step": 611 }, { "epoch": 1.2413793103448276, "grad_norm": 0.24609375, "learning_rate": 5.862068965517242e-06, "loss": 1.0008, "step": 612 }, { "epoch": 1.2434077079107504, "grad_norm": 0.251953125, "learning_rate": 5.8553076402974986e-06, "loss": 1.0329, "step": 613 }, { "epoch": 1.2454361054766734, "grad_norm": 0.2412109375, "learning_rate": 5.8485463150777555e-06, "loss": 1.0629, "step": 614 }, { "epoch": 1.2474645030425964, "grad_norm": 0.240234375, "learning_rate": 5.841784989858012e-06, "loss": 1.0493, "step": 615 }, { "epoch": 1.2494929006085194, "grad_norm": 0.25, "learning_rate": 5.835023664638269e-06, "loss": 1.0285, "step": 616 }, { "epoch": 1.2515212981744421, "grad_norm": 0.2392578125, "learning_rate": 5.828262339418527e-06, "loss": 1.0356, "step": 617 }, { "epoch": 1.2535496957403651, "grad_norm": 0.31640625, "learning_rate": 5.821501014198784e-06, "loss": 1.0757, "step": 618 }, { "epoch": 1.2555780933062881, "grad_norm": 0.267578125, "learning_rate": 5.814739688979041e-06, "loss": 1.0564, "step": 619 }, { "epoch": 1.2576064908722109, "grad_norm": 0.25, "learning_rate": 5.807978363759297e-06, "loss": 1.0453, "step": 620 }, { "epoch": 1.2596348884381339, "grad_norm": 0.24609375, "learning_rate": 5.801217038539554e-06, "loss": 1.0489, "step": 621 }, { "epoch": 1.2616632860040569, "grad_norm": 0.25390625, "learning_rate": 5.794455713319811e-06, "loss": 1.0171, "step": 622 }, { "epoch": 1.2636916835699799, "grad_norm": 0.263671875, "learning_rate": 5.787694388100068e-06, "loss": 1.0695, "step": 623 }, { "epoch": 1.2657200811359026, "grad_norm": 0.2431640625, "learning_rate": 5.7809330628803254e-06, "loss": 1.0492, "step": 624 }, { "epoch": 1.2677484787018256, "grad_norm": 0.3984375, "learning_rate": 5.774171737660582e-06, "loss": 1.0291, "step": 625 }, { "epoch": 1.2697768762677484, "grad_norm": 0.318359375, "learning_rate": 5.767410412440839e-06, "loss": 1.0349, "step": 626 }, { "epoch": 1.2718052738336714, "grad_norm": 0.23046875, "learning_rate": 5.760649087221096e-06, "loss": 1.0465, "step": 627 }, { "epoch": 1.2738336713995944, "grad_norm": 0.26171875, "learning_rate": 5.753887762001352e-06, "loss": 1.0746, "step": 628 }, { "epoch": 1.2758620689655173, "grad_norm": 0.2578125, "learning_rate": 5.747126436781609e-06, "loss": 1.058, "step": 629 }, { "epoch": 1.2778904665314401, "grad_norm": 0.255859375, "learning_rate": 5.740365111561866e-06, "loss": 1.0603, "step": 630 }, { "epoch": 1.279918864097363, "grad_norm": 0.251953125, "learning_rate": 5.733603786342124e-06, "loss": 1.0767, "step": 631 }, { "epoch": 1.2819472616632859, "grad_norm": 0.248046875, "learning_rate": 5.726842461122381e-06, "loss": 1.0272, "step": 632 }, { "epoch": 1.2839756592292089, "grad_norm": 0.2373046875, "learning_rate": 5.720081135902638e-06, "loss": 1.0608, "step": 633 }, { "epoch": 1.2860040567951319, "grad_norm": 0.2421875, "learning_rate": 5.7133198106828946e-06, "loss": 1.0151, "step": 634 }, { "epoch": 1.2880324543610548, "grad_norm": 0.287109375, "learning_rate": 5.7065584854631515e-06, "loss": 1.0519, "step": 635 }, { "epoch": 1.2900608519269776, "grad_norm": 0.23828125, "learning_rate": 5.6997971602434075e-06, "loss": 1.0587, "step": 636 }, { "epoch": 1.2920892494929006, "grad_norm": 0.2353515625, "learning_rate": 5.6930358350236644e-06, "loss": 1.0706, "step": 637 }, { "epoch": 1.2941176470588236, "grad_norm": 0.2314453125, "learning_rate": 5.686274509803922e-06, "loss": 1.0084, "step": 638 }, { "epoch": 1.2961460446247464, "grad_norm": 0.25, "learning_rate": 5.679513184584179e-06, "loss": 1.0433, "step": 639 }, { "epoch": 1.2981744421906694, "grad_norm": 0.2353515625, "learning_rate": 5.672751859364436e-06, "loss": 1.0044, "step": 640 }, { "epoch": 1.3002028397565923, "grad_norm": 0.248046875, "learning_rate": 5.665990534144693e-06, "loss": 0.9463, "step": 641 }, { "epoch": 1.3022312373225153, "grad_norm": 0.23828125, "learning_rate": 5.65922920892495e-06, "loss": 1.0042, "step": 642 }, { "epoch": 1.304259634888438, "grad_norm": 0.322265625, "learning_rate": 5.652467883705207e-06, "loss": 1.0487, "step": 643 }, { "epoch": 1.306288032454361, "grad_norm": 0.2412109375, "learning_rate": 5.645706558485463e-06, "loss": 1.0599, "step": 644 }, { "epoch": 1.3083164300202839, "grad_norm": 0.240234375, "learning_rate": 5.638945233265721e-06, "loss": 1.0294, "step": 645 }, { "epoch": 1.3103448275862069, "grad_norm": 0.24609375, "learning_rate": 5.6321839080459775e-06, "loss": 1.0401, "step": 646 }, { "epoch": 1.3123732251521298, "grad_norm": 0.248046875, "learning_rate": 5.625422582826234e-06, "loss": 1.0143, "step": 647 }, { "epoch": 1.3144016227180528, "grad_norm": 0.2314453125, "learning_rate": 5.618661257606491e-06, "loss": 1.0506, "step": 648 }, { "epoch": 1.3164300202839756, "grad_norm": 0.23828125, "learning_rate": 5.611899932386748e-06, "loss": 1.0314, "step": 649 }, { "epoch": 1.3184584178498986, "grad_norm": 0.2333984375, "learning_rate": 5.605138607167005e-06, "loss": 1.0434, "step": 650 }, { "epoch": 1.3204868154158216, "grad_norm": 0.2890625, "learning_rate": 5.598377281947263e-06, "loss": 1.0598, "step": 651 }, { "epoch": 1.3225152129817443, "grad_norm": 0.28515625, "learning_rate": 5.591615956727519e-06, "loss": 1.011, "step": 652 }, { "epoch": 1.3245436105476673, "grad_norm": 0.25390625, "learning_rate": 5.584854631507776e-06, "loss": 1.0506, "step": 653 }, { "epoch": 1.3265720081135903, "grad_norm": 0.24609375, "learning_rate": 5.578093306288033e-06, "loss": 1.0387, "step": 654 }, { "epoch": 1.3286004056795133, "grad_norm": 0.2490234375, "learning_rate": 5.57133198106829e-06, "loss": 1.0571, "step": 655 }, { "epoch": 1.330628803245436, "grad_norm": 0.26953125, "learning_rate": 5.564570655848547e-06, "loss": 1.0351, "step": 656 }, { "epoch": 1.332657200811359, "grad_norm": 0.2431640625, "learning_rate": 5.5578093306288035e-06, "loss": 1.0454, "step": 657 }, { "epoch": 1.3346855983772818, "grad_norm": 0.248046875, "learning_rate": 5.551048005409061e-06, "loss": 1.0374, "step": 658 }, { "epoch": 1.3367139959432048, "grad_norm": 0.279296875, "learning_rate": 5.544286680189318e-06, "loss": 1.0395, "step": 659 }, { "epoch": 1.3387423935091278, "grad_norm": 0.2490234375, "learning_rate": 5.537525354969574e-06, "loss": 0.989, "step": 660 }, { "epoch": 1.3407707910750508, "grad_norm": 0.267578125, "learning_rate": 5.530764029749831e-06, "loss": 1.0449, "step": 661 }, { "epoch": 1.3427991886409736, "grad_norm": 0.23828125, "learning_rate": 5.524002704530088e-06, "loss": 0.9985, "step": 662 }, { "epoch": 1.3448275862068966, "grad_norm": 0.275390625, "learning_rate": 5.517241379310345e-06, "loss": 1.0594, "step": 663 }, { "epoch": 1.3468559837728193, "grad_norm": 0.2353515625, "learning_rate": 5.510480054090602e-06, "loss": 1.0711, "step": 664 }, { "epoch": 1.3488843813387423, "grad_norm": 0.251953125, "learning_rate": 5.50371872887086e-06, "loss": 1.0372, "step": 665 }, { "epoch": 1.3509127789046653, "grad_norm": 0.25390625, "learning_rate": 5.4969574036511166e-06, "loss": 1.0782, "step": 666 }, { "epoch": 1.3529411764705883, "grad_norm": 0.26171875, "learning_rate": 5.4901960784313735e-06, "loss": 1.0556, "step": 667 }, { "epoch": 1.354969574036511, "grad_norm": 0.30078125, "learning_rate": 5.4834347532116295e-06, "loss": 1.0984, "step": 668 }, { "epoch": 1.356997971602434, "grad_norm": 0.2431640625, "learning_rate": 5.4766734279918865e-06, "loss": 0.9965, "step": 669 }, { "epoch": 1.359026369168357, "grad_norm": 0.279296875, "learning_rate": 5.469912102772143e-06, "loss": 1.0513, "step": 670 }, { "epoch": 1.3610547667342798, "grad_norm": 0.259765625, "learning_rate": 5.4631507775524e-06, "loss": 1.0406, "step": 671 }, { "epoch": 1.3630831643002028, "grad_norm": 0.2431640625, "learning_rate": 5.456389452332658e-06, "loss": 1.0864, "step": 672 }, { "epoch": 1.3651115618661258, "grad_norm": 0.275390625, "learning_rate": 5.449628127112915e-06, "loss": 1.0379, "step": 673 }, { "epoch": 1.3671399594320488, "grad_norm": 0.51171875, "learning_rate": 5.442866801893172e-06, "loss": 1.0279, "step": 674 }, { "epoch": 1.3691683569979716, "grad_norm": 0.796875, "learning_rate": 5.436105476673429e-06, "loss": 1.0537, "step": 675 }, { "epoch": 1.3711967545638946, "grad_norm": 0.248046875, "learning_rate": 5.429344151453685e-06, "loss": 1.022, "step": 676 }, { "epoch": 1.3732251521298173, "grad_norm": 0.271484375, "learning_rate": 5.422582826233942e-06, "loss": 1.0154, "step": 677 }, { "epoch": 1.3752535496957403, "grad_norm": 0.25390625, "learning_rate": 5.415821501014199e-06, "loss": 1.0449, "step": 678 }, { "epoch": 1.3772819472616633, "grad_norm": 0.24609375, "learning_rate": 5.409060175794456e-06, "loss": 1.0143, "step": 679 }, { "epoch": 1.3793103448275863, "grad_norm": 0.2314453125, "learning_rate": 5.402298850574713e-06, "loss": 1.047, "step": 680 }, { "epoch": 1.381338742393509, "grad_norm": 0.2431640625, "learning_rate": 5.39553752535497e-06, "loss": 1.036, "step": 681 }, { "epoch": 1.383367139959432, "grad_norm": 0.236328125, "learning_rate": 5.388776200135227e-06, "loss": 1.076, "step": 682 }, { "epoch": 1.385395537525355, "grad_norm": 0.2392578125, "learning_rate": 5.382014874915484e-06, "loss": 1.0543, "step": 683 }, { "epoch": 1.3874239350912778, "grad_norm": 0.28125, "learning_rate": 5.37525354969574e-06, "loss": 1.0474, "step": 684 }, { "epoch": 1.3894523326572008, "grad_norm": 0.283203125, "learning_rate": 5.368492224475997e-06, "loss": 1.0287, "step": 685 }, { "epoch": 1.3914807302231238, "grad_norm": 0.36328125, "learning_rate": 5.361730899256255e-06, "loss": 1.0412, "step": 686 }, { "epoch": 1.3935091277890468, "grad_norm": 0.26171875, "learning_rate": 5.354969574036512e-06, "loss": 1.1062, "step": 687 }, { "epoch": 1.3955375253549696, "grad_norm": 0.23046875, "learning_rate": 5.348208248816769e-06, "loss": 1.0338, "step": 688 }, { "epoch": 1.3975659229208925, "grad_norm": 0.251953125, "learning_rate": 5.3414469235970255e-06, "loss": 1.0185, "step": 689 }, { "epoch": 1.3995943204868153, "grad_norm": 0.251953125, "learning_rate": 5.3346855983772824e-06, "loss": 1.0041, "step": 690 }, { "epoch": 1.4016227180527383, "grad_norm": 0.263671875, "learning_rate": 5.32792427315754e-06, "loss": 1.0337, "step": 691 }, { "epoch": 1.4036511156186613, "grad_norm": 0.2470703125, "learning_rate": 5.321162947937795e-06, "loss": 1.0103, "step": 692 }, { "epoch": 1.4056795131845843, "grad_norm": 0.302734375, "learning_rate": 5.314401622718053e-06, "loss": 1.0466, "step": 693 }, { "epoch": 1.407707910750507, "grad_norm": 0.27734375, "learning_rate": 5.30764029749831e-06, "loss": 1.0092, "step": 694 }, { "epoch": 1.40973630831643, "grad_norm": 0.25, "learning_rate": 5.300878972278567e-06, "loss": 1.0347, "step": 695 }, { "epoch": 1.4117647058823528, "grad_norm": 0.24609375, "learning_rate": 5.294117647058824e-06, "loss": 1.0402, "step": 696 }, { "epoch": 1.4137931034482758, "grad_norm": 0.255859375, "learning_rate": 5.287356321839081e-06, "loss": 1.0706, "step": 697 }, { "epoch": 1.4158215010141988, "grad_norm": 0.248046875, "learning_rate": 5.280594996619339e-06, "loss": 1.0361, "step": 698 }, { "epoch": 1.4178498985801218, "grad_norm": 0.251953125, "learning_rate": 5.2738336713995955e-06, "loss": 1.0189, "step": 699 }, { "epoch": 1.4198782961460445, "grad_norm": 0.30859375, "learning_rate": 5.2670723461798516e-06, "loss": 1.0324, "step": 700 }, { "epoch": 1.4219066937119675, "grad_norm": 0.248046875, "learning_rate": 5.2603110209601085e-06, "loss": 1.0408, "step": 701 }, { "epoch": 1.4239350912778905, "grad_norm": 0.271484375, "learning_rate": 5.253549695740365e-06, "loss": 1.0557, "step": 702 }, { "epoch": 1.4259634888438133, "grad_norm": 0.2578125, "learning_rate": 5.246788370520622e-06, "loss": 1.0519, "step": 703 }, { "epoch": 1.4279918864097363, "grad_norm": 0.365234375, "learning_rate": 5.240027045300879e-06, "loss": 0.9978, "step": 704 }, { "epoch": 1.4300202839756593, "grad_norm": 0.2578125, "learning_rate": 5.233265720081136e-06, "loss": 0.9964, "step": 705 }, { "epoch": 1.4320486815415823, "grad_norm": 0.25390625, "learning_rate": 5.226504394861394e-06, "loss": 1.0495, "step": 706 }, { "epoch": 1.434077079107505, "grad_norm": 0.259765625, "learning_rate": 5.219743069641651e-06, "loss": 1.0334, "step": 707 }, { "epoch": 1.436105476673428, "grad_norm": 0.310546875, "learning_rate": 5.212981744421907e-06, "loss": 1.0414, "step": 708 }, { "epoch": 1.4381338742393508, "grad_norm": 0.275390625, "learning_rate": 5.206220419202164e-06, "loss": 1.0865, "step": 709 }, { "epoch": 1.4401622718052738, "grad_norm": 0.255859375, "learning_rate": 5.199459093982421e-06, "loss": 1.0293, "step": 710 }, { "epoch": 1.4421906693711968, "grad_norm": 0.25390625, "learning_rate": 5.192697768762678e-06, "loss": 1.0371, "step": 711 }, { "epoch": 1.4442190669371198, "grad_norm": 0.267578125, "learning_rate": 5.1859364435429345e-06, "loss": 1.0549, "step": 712 }, { "epoch": 1.4462474645030425, "grad_norm": 0.2490234375, "learning_rate": 5.179175118323192e-06, "loss": 1.0262, "step": 713 }, { "epoch": 1.4482758620689655, "grad_norm": 0.240234375, "learning_rate": 5.172413793103449e-06, "loss": 1.0362, "step": 714 }, { "epoch": 1.4503042596348885, "grad_norm": 0.259765625, "learning_rate": 5.165652467883706e-06, "loss": 1.0823, "step": 715 }, { "epoch": 1.4523326572008113, "grad_norm": 0.2333984375, "learning_rate": 5.158891142663962e-06, "loss": 1.0068, "step": 716 }, { "epoch": 1.4543610547667343, "grad_norm": 0.263671875, "learning_rate": 5.152129817444219e-06, "loss": 1.0176, "step": 717 }, { "epoch": 1.4563894523326573, "grad_norm": 0.27734375, "learning_rate": 5.145368492224476e-06, "loss": 1.0531, "step": 718 }, { "epoch": 1.4584178498985803, "grad_norm": 0.2470703125, "learning_rate": 5.138607167004733e-06, "loss": 1.003, "step": 719 }, { "epoch": 1.460446247464503, "grad_norm": 0.37109375, "learning_rate": 5.131845841784991e-06, "loss": 1.097, "step": 720 }, { "epoch": 1.462474645030426, "grad_norm": 0.255859375, "learning_rate": 5.1250845165652475e-06, "loss": 1.0326, "step": 721 }, { "epoch": 1.4645030425963488, "grad_norm": 0.259765625, "learning_rate": 5.1183231913455045e-06, "loss": 1.0651, "step": 722 }, { "epoch": 1.4665314401622718, "grad_norm": 0.234375, "learning_rate": 5.111561866125761e-06, "loss": 1.0116, "step": 723 }, { "epoch": 1.4685598377281948, "grad_norm": 0.2734375, "learning_rate": 5.1048005409060174e-06, "loss": 1.0215, "step": 724 }, { "epoch": 1.4705882352941178, "grad_norm": 0.2451171875, "learning_rate": 5.098039215686274e-06, "loss": 1.0207, "step": 725 }, { "epoch": 1.4726166328600405, "grad_norm": 0.26171875, "learning_rate": 5.091277890466531e-06, "loss": 1.0264, "step": 726 }, { "epoch": 1.4746450304259635, "grad_norm": 0.251953125, "learning_rate": 5.084516565246789e-06, "loss": 1.0648, "step": 727 }, { "epoch": 1.4766734279918863, "grad_norm": 0.275390625, "learning_rate": 5.077755240027046e-06, "loss": 1.0485, "step": 728 }, { "epoch": 1.4787018255578093, "grad_norm": 0.2490234375, "learning_rate": 5.070993914807303e-06, "loss": 1.0618, "step": 729 }, { "epoch": 1.4807302231237323, "grad_norm": 0.28125, "learning_rate": 5.06423258958756e-06, "loss": 1.0572, "step": 730 }, { "epoch": 1.4827586206896552, "grad_norm": 0.259765625, "learning_rate": 5.057471264367817e-06, "loss": 1.0321, "step": 731 }, { "epoch": 1.484787018255578, "grad_norm": 0.259765625, "learning_rate": 5.050709939148073e-06, "loss": 1.0498, "step": 732 }, { "epoch": 1.486815415821501, "grad_norm": 0.5, "learning_rate": 5.04394861392833e-06, "loss": 1.0393, "step": 733 }, { "epoch": 1.488843813387424, "grad_norm": 0.24609375, "learning_rate": 5.037187288708587e-06, "loss": 1.047, "step": 734 }, { "epoch": 1.4908722109533468, "grad_norm": 0.267578125, "learning_rate": 5.030425963488844e-06, "loss": 1.128, "step": 735 }, { "epoch": 1.4929006085192698, "grad_norm": 0.24609375, "learning_rate": 5.023664638269101e-06, "loss": 1.0581, "step": 736 }, { "epoch": 1.4949290060851927, "grad_norm": 0.2294921875, "learning_rate": 5.016903313049358e-06, "loss": 0.9735, "step": 737 }, { "epoch": 1.4969574036511157, "grad_norm": 0.333984375, "learning_rate": 5.010141987829615e-06, "loss": 1.0535, "step": 738 }, { "epoch": 1.4989858012170385, "grad_norm": 0.2392578125, "learning_rate": 5.003380662609873e-06, "loss": 1.0524, "step": 739 }, { "epoch": 1.5010141987829615, "grad_norm": 0.2392578125, "learning_rate": 4.996619337390129e-06, "loss": 1.0575, "step": 740 }, { "epoch": 1.5030425963488843, "grad_norm": 0.255859375, "learning_rate": 4.989858012170386e-06, "loss": 1.008, "step": 741 }, { "epoch": 1.5050709939148073, "grad_norm": 0.244140625, "learning_rate": 4.983096686950643e-06, "loss": 1.0194, "step": 742 }, { "epoch": 1.5070993914807302, "grad_norm": 0.2451171875, "learning_rate": 4.9763353617309e-06, "loss": 1.0579, "step": 743 }, { "epoch": 1.5091277890466532, "grad_norm": 0.3359375, "learning_rate": 4.9695740365111565e-06, "loss": 1.0465, "step": 744 }, { "epoch": 1.5111561866125762, "grad_norm": 0.2431640625, "learning_rate": 4.962812711291413e-06, "loss": 1.044, "step": 745 }, { "epoch": 1.513184584178499, "grad_norm": 0.248046875, "learning_rate": 4.95605138607167e-06, "loss": 1.048, "step": 746 }, { "epoch": 1.5152129817444218, "grad_norm": 0.2412109375, "learning_rate": 4.949290060851927e-06, "loss": 1.0386, "step": 747 }, { "epoch": 1.5172413793103448, "grad_norm": 0.267578125, "learning_rate": 4.942528735632184e-06, "loss": 1.0929, "step": 748 }, { "epoch": 1.5192697768762677, "grad_norm": 0.337890625, "learning_rate": 4.935767410412441e-06, "loss": 1.0158, "step": 749 }, { "epoch": 1.5212981744421907, "grad_norm": 0.248046875, "learning_rate": 4.929006085192698e-06, "loss": 1.0559, "step": 750 }, { "epoch": 1.5233265720081137, "grad_norm": 0.263671875, "learning_rate": 4.922244759972955e-06, "loss": 1.0256, "step": 751 }, { "epoch": 1.5253549695740365, "grad_norm": 0.294921875, "learning_rate": 4.915483434753212e-06, "loss": 1.0062, "step": 752 }, { "epoch": 1.5273833671399595, "grad_norm": 0.251953125, "learning_rate": 4.9087221095334696e-06, "loss": 1.0676, "step": 753 }, { "epoch": 1.5294117647058822, "grad_norm": 0.2392578125, "learning_rate": 4.901960784313726e-06, "loss": 1.0193, "step": 754 }, { "epoch": 1.5314401622718052, "grad_norm": 0.2421875, "learning_rate": 4.8951994590939825e-06, "loss": 1.0317, "step": 755 }, { "epoch": 1.5334685598377282, "grad_norm": 0.2353515625, "learning_rate": 4.8884381338742394e-06, "loss": 1.0093, "step": 756 }, { "epoch": 1.5354969574036512, "grad_norm": 0.283203125, "learning_rate": 4.881676808654497e-06, "loss": 1.0976, "step": 757 }, { "epoch": 1.537525354969574, "grad_norm": 0.236328125, "learning_rate": 4.874915483434753e-06, "loss": 1.0135, "step": 758 }, { "epoch": 1.539553752535497, "grad_norm": 0.2421875, "learning_rate": 4.86815415821501e-06, "loss": 1.02, "step": 759 }, { "epoch": 1.5415821501014197, "grad_norm": 0.259765625, "learning_rate": 4.861392832995268e-06, "loss": 1.0506, "step": 760 }, { "epoch": 1.5436105476673427, "grad_norm": 0.28515625, "learning_rate": 4.854631507775525e-06, "loss": 1.0188, "step": 761 }, { "epoch": 1.5456389452332657, "grad_norm": 0.26171875, "learning_rate": 4.847870182555781e-06, "loss": 1.0333, "step": 762 }, { "epoch": 1.5476673427991887, "grad_norm": 0.248046875, "learning_rate": 4.841108857336038e-06, "loss": 1.0798, "step": 763 }, { "epoch": 1.5496957403651117, "grad_norm": 0.240234375, "learning_rate": 4.834347532116296e-06, "loss": 1.01, "step": 764 }, { "epoch": 1.5517241379310345, "grad_norm": 0.333984375, "learning_rate": 4.8275862068965525e-06, "loss": 1.081, "step": 765 }, { "epoch": 1.5537525354969572, "grad_norm": 0.248046875, "learning_rate": 4.8208248816768086e-06, "loss": 1.0455, "step": 766 }, { "epoch": 1.5557809330628802, "grad_norm": 0.240234375, "learning_rate": 4.814063556457066e-06, "loss": 1.0435, "step": 767 }, { "epoch": 1.5578093306288032, "grad_norm": 0.369140625, "learning_rate": 4.807302231237323e-06, "loss": 1.0402, "step": 768 }, { "epoch": 1.5598377281947262, "grad_norm": 0.3046875, "learning_rate": 4.80054090601758e-06, "loss": 1.0597, "step": 769 }, { "epoch": 1.5618661257606492, "grad_norm": 0.248046875, "learning_rate": 4.793779580797836e-06, "loss": 1.0579, "step": 770 }, { "epoch": 1.563894523326572, "grad_norm": 0.251953125, "learning_rate": 4.787018255578094e-06, "loss": 1.0255, "step": 771 }, { "epoch": 1.565922920892495, "grad_norm": 0.2470703125, "learning_rate": 4.780256930358351e-06, "loss": 1.0427, "step": 772 }, { "epoch": 1.5679513184584177, "grad_norm": 0.24609375, "learning_rate": 4.773495605138608e-06, "loss": 1.0013, "step": 773 }, { "epoch": 1.5699797160243407, "grad_norm": 0.33984375, "learning_rate": 4.766734279918865e-06, "loss": 1.0138, "step": 774 }, { "epoch": 1.5720081135902637, "grad_norm": 0.275390625, "learning_rate": 4.759972954699122e-06, "loss": 1.0689, "step": 775 }, { "epoch": 1.5740365111561867, "grad_norm": 0.2392578125, "learning_rate": 4.7532116294793785e-06, "loss": 1.0476, "step": 776 }, { "epoch": 1.5760649087221097, "grad_norm": 0.2578125, "learning_rate": 4.7464503042596354e-06, "loss": 1.0279, "step": 777 }, { "epoch": 1.5780933062880325, "grad_norm": 0.255859375, "learning_rate": 4.739688979039892e-06, "loss": 1.0612, "step": 778 }, { "epoch": 1.5801217038539552, "grad_norm": 0.255859375, "learning_rate": 4.732927653820149e-06, "loss": 1.0301, "step": 779 }, { "epoch": 1.5821501014198782, "grad_norm": 0.302734375, "learning_rate": 4.726166328600406e-06, "loss": 1.0758, "step": 780 }, { "epoch": 1.5841784989858012, "grad_norm": 0.306640625, "learning_rate": 4.719405003380663e-06, "loss": 1.0069, "step": 781 }, { "epoch": 1.5862068965517242, "grad_norm": 0.271484375, "learning_rate": 4.71264367816092e-06, "loss": 1.0242, "step": 782 }, { "epoch": 1.5882352941176472, "grad_norm": 0.265625, "learning_rate": 4.705882352941177e-06, "loss": 1.109, "step": 783 }, { "epoch": 1.59026369168357, "grad_norm": 0.259765625, "learning_rate": 4.699121027721434e-06, "loss": 0.9922, "step": 784 }, { "epoch": 1.592292089249493, "grad_norm": 0.267578125, "learning_rate": 4.692359702501691e-06, "loss": 1.0403, "step": 785 }, { "epoch": 1.5943204868154157, "grad_norm": 0.26953125, "learning_rate": 4.685598377281948e-06, "loss": 0.9909, "step": 786 }, { "epoch": 1.5963488843813387, "grad_norm": 0.267578125, "learning_rate": 4.6788370520622046e-06, "loss": 1.0451, "step": 787 }, { "epoch": 1.5983772819472617, "grad_norm": 0.2431640625, "learning_rate": 4.6720757268424615e-06, "loss": 1.0213, "step": 788 }, { "epoch": 1.6004056795131847, "grad_norm": 0.236328125, "learning_rate": 4.665314401622718e-06, "loss": 0.9739, "step": 789 }, { "epoch": 1.6024340770791075, "grad_norm": 0.271484375, "learning_rate": 4.658553076402975e-06, "loss": 1.0517, "step": 790 }, { "epoch": 1.6044624746450304, "grad_norm": 0.26171875, "learning_rate": 4.651791751183232e-06, "loss": 1.0491, "step": 791 }, { "epoch": 1.6064908722109532, "grad_norm": 0.2490234375, "learning_rate": 4.645030425963489e-06, "loss": 1.0499, "step": 792 }, { "epoch": 1.6085192697768762, "grad_norm": 0.255859375, "learning_rate": 4.638269100743746e-06, "loss": 1.0303, "step": 793 }, { "epoch": 1.6105476673427992, "grad_norm": 0.2734375, "learning_rate": 4.631507775524003e-06, "loss": 1.0469, "step": 794 }, { "epoch": 1.6125760649087222, "grad_norm": 0.2734375, "learning_rate": 4.62474645030426e-06, "loss": 1.0545, "step": 795 }, { "epoch": 1.6146044624746452, "grad_norm": 0.255859375, "learning_rate": 4.617985125084517e-06, "loss": 1.0506, "step": 796 }, { "epoch": 1.616632860040568, "grad_norm": 0.3984375, "learning_rate": 4.611223799864774e-06, "loss": 1.0621, "step": 797 }, { "epoch": 1.6186612576064907, "grad_norm": 0.373046875, "learning_rate": 4.604462474645031e-06, "loss": 1.0528, "step": 798 }, { "epoch": 1.6206896551724137, "grad_norm": 0.265625, "learning_rate": 4.5977011494252875e-06, "loss": 1.0781, "step": 799 }, { "epoch": 1.6227180527383367, "grad_norm": 0.27734375, "learning_rate": 4.590939824205544e-06, "loss": 1.053, "step": 800 }, { "epoch": 1.6247464503042597, "grad_norm": 0.25390625, "learning_rate": 4.584178498985802e-06, "loss": 1.0392, "step": 801 }, { "epoch": 1.6267748478701827, "grad_norm": 0.25390625, "learning_rate": 4.577417173766058e-06, "loss": 1.0657, "step": 802 }, { "epoch": 1.6288032454361054, "grad_norm": 0.25390625, "learning_rate": 4.570655848546315e-06, "loss": 1.0228, "step": 803 }, { "epoch": 1.6308316430020284, "grad_norm": 0.265625, "learning_rate": 4.563894523326572e-06, "loss": 1.0226, "step": 804 }, { "epoch": 1.6328600405679512, "grad_norm": 0.259765625, "learning_rate": 4.55713319810683e-06, "loss": 1.0523, "step": 805 }, { "epoch": 1.6348884381338742, "grad_norm": 0.255859375, "learning_rate": 4.550371872887086e-06, "loss": 1.011, "step": 806 }, { "epoch": 1.6369168356997972, "grad_norm": 0.2578125, "learning_rate": 4.543610547667343e-06, "loss": 1.0638, "step": 807 }, { "epoch": 1.6389452332657202, "grad_norm": 0.26953125, "learning_rate": 4.5368492224476005e-06, "loss": 1.0666, "step": 808 }, { "epoch": 1.6409736308316432, "grad_norm": 0.2578125, "learning_rate": 4.5300878972278575e-06, "loss": 1.0697, "step": 809 }, { "epoch": 1.643002028397566, "grad_norm": 0.25, "learning_rate": 4.5233265720081135e-06, "loss": 1.0536, "step": 810 }, { "epoch": 1.6450304259634887, "grad_norm": 0.328125, "learning_rate": 4.5165652467883704e-06, "loss": 1.0281, "step": 811 }, { "epoch": 1.6470588235294117, "grad_norm": 0.3984375, "learning_rate": 4.509803921568628e-06, "loss": 1.0882, "step": 812 }, { "epoch": 1.6490872210953347, "grad_norm": 0.255859375, "learning_rate": 4.503042596348885e-06, "loss": 1.0889, "step": 813 }, { "epoch": 1.6511156186612577, "grad_norm": 0.357421875, "learning_rate": 4.496281271129141e-06, "loss": 1.0441, "step": 814 }, { "epoch": 1.6531440162271807, "grad_norm": 0.25, "learning_rate": 4.489519945909399e-06, "loss": 1.075, "step": 815 }, { "epoch": 1.6551724137931034, "grad_norm": 0.255859375, "learning_rate": 4.482758620689656e-06, "loss": 1.0429, "step": 816 }, { "epoch": 1.6572008113590264, "grad_norm": 0.2490234375, "learning_rate": 4.475997295469913e-06, "loss": 1.0555, "step": 817 }, { "epoch": 1.6592292089249492, "grad_norm": 0.279296875, "learning_rate": 4.469235970250169e-06, "loss": 1.0478, "step": 818 }, { "epoch": 1.6612576064908722, "grad_norm": 0.265625, "learning_rate": 4.4624746450304266e-06, "loss": 1.0178, "step": 819 }, { "epoch": 1.6632860040567952, "grad_norm": 0.28125, "learning_rate": 4.4557133198106835e-06, "loss": 1.009, "step": 820 }, { "epoch": 1.6653144016227182, "grad_norm": 0.25390625, "learning_rate": 4.44895199459094e-06, "loss": 1.008, "step": 821 }, { "epoch": 1.667342799188641, "grad_norm": 0.25390625, "learning_rate": 4.442190669371197e-06, "loss": 1.0553, "step": 822 }, { "epoch": 1.669371196754564, "grad_norm": 0.263671875, "learning_rate": 4.435429344151454e-06, "loss": 1.0913, "step": 823 }, { "epoch": 1.6713995943204867, "grad_norm": 0.26953125, "learning_rate": 4.428668018931711e-06, "loss": 1.0635, "step": 824 }, { "epoch": 1.6734279918864097, "grad_norm": 0.2451171875, "learning_rate": 4.421906693711968e-06, "loss": 1.0251, "step": 825 }, { "epoch": 1.6754563894523327, "grad_norm": 0.2373046875, "learning_rate": 4.415145368492225e-06, "loss": 1.0472, "step": 826 }, { "epoch": 1.6774847870182557, "grad_norm": 0.25390625, "learning_rate": 4.408384043272482e-06, "loss": 1.0504, "step": 827 }, { "epoch": 1.6795131845841786, "grad_norm": 0.244140625, "learning_rate": 4.401622718052739e-06, "loss": 1.0581, "step": 828 }, { "epoch": 1.6815415821501014, "grad_norm": 0.244140625, "learning_rate": 4.394861392832996e-06, "loss": 1.0142, "step": 829 }, { "epoch": 1.6835699797160242, "grad_norm": 0.2451171875, "learning_rate": 4.388100067613253e-06, "loss": 1.0157, "step": 830 }, { "epoch": 1.6855983772819472, "grad_norm": 0.25390625, "learning_rate": 4.3813387423935095e-06, "loss": 1.0668, "step": 831 }, { "epoch": 1.6876267748478702, "grad_norm": 0.296875, "learning_rate": 4.374577417173766e-06, "loss": 1.009, "step": 832 }, { "epoch": 1.6896551724137931, "grad_norm": 0.2412109375, "learning_rate": 4.367816091954023e-06, "loss": 1.0159, "step": 833 }, { "epoch": 1.6916835699797161, "grad_norm": 0.2578125, "learning_rate": 4.36105476673428e-06, "loss": 1.0275, "step": 834 }, { "epoch": 1.693711967545639, "grad_norm": 0.240234375, "learning_rate": 4.354293441514537e-06, "loss": 0.9794, "step": 835 }, { "epoch": 1.695740365111562, "grad_norm": 0.2578125, "learning_rate": 4.347532116294794e-06, "loss": 1.0311, "step": 836 }, { "epoch": 1.6977687626774847, "grad_norm": 0.2431640625, "learning_rate": 4.340770791075051e-06, "loss": 1.0232, "step": 837 }, { "epoch": 1.6997971602434077, "grad_norm": 0.265625, "learning_rate": 4.334009465855308e-06, "loss": 1.0875, "step": 838 }, { "epoch": 1.7018255578093306, "grad_norm": 0.2451171875, "learning_rate": 4.327248140635565e-06, "loss": 1.0545, "step": 839 }, { "epoch": 1.7038539553752536, "grad_norm": 0.2578125, "learning_rate": 4.320486815415822e-06, "loss": 1.1009, "step": 840 }, { "epoch": 1.7058823529411766, "grad_norm": 0.240234375, "learning_rate": 4.313725490196079e-06, "loss": 1.0431, "step": 841 }, { "epoch": 1.7079107505070994, "grad_norm": 0.30078125, "learning_rate": 4.3069641649763355e-06, "loss": 1.0407, "step": 842 }, { "epoch": 1.7099391480730222, "grad_norm": 0.265625, "learning_rate": 4.3002028397565924e-06, "loss": 1.0143, "step": 843 }, { "epoch": 1.7119675456389452, "grad_norm": 0.287109375, "learning_rate": 4.293441514536849e-06, "loss": 1.0071, "step": 844 }, { "epoch": 1.7139959432048681, "grad_norm": 0.25390625, "learning_rate": 4.286680189317107e-06, "loss": 1.0152, "step": 845 }, { "epoch": 1.7160243407707911, "grad_norm": 0.25, "learning_rate": 4.279918864097363e-06, "loss": 1.0049, "step": 846 }, { "epoch": 1.7180527383367141, "grad_norm": 0.251953125, "learning_rate": 4.27315753887762e-06, "loss": 1.048, "step": 847 }, { "epoch": 1.720081135902637, "grad_norm": 0.265625, "learning_rate": 4.266396213657877e-06, "loss": 1.0403, "step": 848 }, { "epoch": 1.7221095334685599, "grad_norm": 0.2431640625, "learning_rate": 4.259634888438135e-06, "loss": 1.0488, "step": 849 }, { "epoch": 1.7241379310344827, "grad_norm": 0.2451171875, "learning_rate": 4.252873563218391e-06, "loss": 1.063, "step": 850 }, { "epoch": 1.7261663286004056, "grad_norm": 0.265625, "learning_rate": 4.246112237998648e-06, "loss": 1.0591, "step": 851 }, { "epoch": 1.7281947261663286, "grad_norm": 0.234375, "learning_rate": 4.2393509127789055e-06, "loss": 1.0191, "step": 852 }, { "epoch": 1.7302231237322516, "grad_norm": 0.275390625, "learning_rate": 4.232589587559162e-06, "loss": 1.0538, "step": 853 }, { "epoch": 1.7322515212981744, "grad_norm": 0.349609375, "learning_rate": 4.2258282623394185e-06, "loss": 1.07, "step": 854 }, { "epoch": 1.7342799188640974, "grad_norm": 0.24609375, "learning_rate": 4.219066937119675e-06, "loss": 1.0387, "step": 855 }, { "epoch": 1.7363083164300201, "grad_norm": 0.24609375, "learning_rate": 4.212305611899933e-06, "loss": 1.0549, "step": 856 }, { "epoch": 1.7383367139959431, "grad_norm": 0.3671875, "learning_rate": 4.20554428668019e-06, "loss": 1.0035, "step": 857 }, { "epoch": 1.7403651115618661, "grad_norm": 0.2490234375, "learning_rate": 4.198782961460446e-06, "loss": 1.0304, "step": 858 }, { "epoch": 1.7423935091277891, "grad_norm": 0.251953125, "learning_rate": 4.192021636240704e-06, "loss": 1.0472, "step": 859 }, { "epoch": 1.744421906693712, "grad_norm": 0.291015625, "learning_rate": 4.185260311020961e-06, "loss": 1.0034, "step": 860 }, { "epoch": 1.7464503042596349, "grad_norm": 0.271484375, "learning_rate": 4.178498985801218e-06, "loss": 1.0649, "step": 861 }, { "epoch": 1.7484787018255576, "grad_norm": 0.2431640625, "learning_rate": 4.171737660581474e-06, "loss": 1.0257, "step": 862 }, { "epoch": 1.7505070993914806, "grad_norm": 0.37109375, "learning_rate": 4.1649763353617315e-06, "loss": 1.0053, "step": 863 }, { "epoch": 1.7525354969574036, "grad_norm": 0.2470703125, "learning_rate": 4.1582150101419884e-06, "loss": 1.0393, "step": 864 }, { "epoch": 1.7545638945233266, "grad_norm": 0.26171875, "learning_rate": 4.151453684922245e-06, "loss": 1.0857, "step": 865 }, { "epoch": 1.7565922920892496, "grad_norm": 0.263671875, "learning_rate": 4.144692359702502e-06, "loss": 1.0793, "step": 866 }, { "epoch": 1.7586206896551724, "grad_norm": 0.30859375, "learning_rate": 4.137931034482759e-06, "loss": 1.0013, "step": 867 }, { "epoch": 1.7606490872210954, "grad_norm": 0.25390625, "learning_rate": 4.131169709263016e-06, "loss": 1.0325, "step": 868 }, { "epoch": 1.7626774847870181, "grad_norm": 0.2470703125, "learning_rate": 4.124408384043273e-06, "loss": 1.0407, "step": 869 }, { "epoch": 1.7647058823529411, "grad_norm": 0.2421875, "learning_rate": 4.11764705882353e-06, "loss": 1.0443, "step": 870 }, { "epoch": 1.7667342799188641, "grad_norm": 0.298828125, "learning_rate": 4.110885733603787e-06, "loss": 1.0305, "step": 871 }, { "epoch": 1.768762677484787, "grad_norm": 0.25390625, "learning_rate": 4.104124408384044e-06, "loss": 1.0855, "step": 872 }, { "epoch": 1.77079107505071, "grad_norm": 0.24609375, "learning_rate": 4.097363083164301e-06, "loss": 1.0202, "step": 873 }, { "epoch": 1.7728194726166329, "grad_norm": 0.2431640625, "learning_rate": 4.0906017579445575e-06, "loss": 1.0145, "step": 874 }, { "epoch": 1.7748478701825556, "grad_norm": 0.2578125, "learning_rate": 4.0838404327248145e-06, "loss": 1.0533, "step": 875 }, { "epoch": 1.7768762677484786, "grad_norm": 0.40234375, "learning_rate": 4.077079107505071e-06, "loss": 1.0158, "step": 876 }, { "epoch": 1.7789046653144016, "grad_norm": 0.263671875, "learning_rate": 4.070317782285328e-06, "loss": 0.9991, "step": 877 }, { "epoch": 1.7809330628803246, "grad_norm": 0.255859375, "learning_rate": 4.063556457065585e-06, "loss": 1.0217, "step": 878 }, { "epoch": 1.7829614604462476, "grad_norm": 0.24609375, "learning_rate": 4.056795131845842e-06, "loss": 1.0227, "step": 879 }, { "epoch": 1.7849898580121704, "grad_norm": 0.29296875, "learning_rate": 4.050033806626099e-06, "loss": 1.0078, "step": 880 }, { "epoch": 1.7870182555780934, "grad_norm": 0.271484375, "learning_rate": 4.043272481406356e-06, "loss": 1.0662, "step": 881 }, { "epoch": 1.7890466531440161, "grad_norm": 0.263671875, "learning_rate": 4.036511156186613e-06, "loss": 1.0436, "step": 882 }, { "epoch": 1.791075050709939, "grad_norm": 0.326171875, "learning_rate": 4.02974983096687e-06, "loss": 1.0535, "step": 883 }, { "epoch": 1.793103448275862, "grad_norm": 0.265625, "learning_rate": 4.022988505747127e-06, "loss": 1.0301, "step": 884 }, { "epoch": 1.795131845841785, "grad_norm": 0.25, "learning_rate": 4.0162271805273836e-06, "loss": 1.0391, "step": 885 }, { "epoch": 1.7971602434077079, "grad_norm": 0.3671875, "learning_rate": 4.0094658553076405e-06, "loss": 1.0251, "step": 886 }, { "epoch": 1.7991886409736308, "grad_norm": 0.251953125, "learning_rate": 4.002704530087897e-06, "loss": 1.0301, "step": 887 }, { "epoch": 1.8012170385395536, "grad_norm": 0.267578125, "learning_rate": 3.995943204868154e-06, "loss": 1.021, "step": 888 }, { "epoch": 1.8032454361054766, "grad_norm": 0.361328125, "learning_rate": 3.989181879648411e-06, "loss": 1.051, "step": 889 }, { "epoch": 1.8052738336713996, "grad_norm": 0.25390625, "learning_rate": 3.982420554428668e-06, "loss": 0.9883, "step": 890 }, { "epoch": 1.8073022312373226, "grad_norm": 0.93359375, "learning_rate": 3.975659229208925e-06, "loss": 1.055, "step": 891 }, { "epoch": 1.8093306288032456, "grad_norm": 0.2392578125, "learning_rate": 3.968897903989182e-06, "loss": 1.0091, "step": 892 }, { "epoch": 1.8113590263691683, "grad_norm": 0.25, "learning_rate": 3.96213657876944e-06, "loss": 1.0557, "step": 893 }, { "epoch": 1.8133874239350911, "grad_norm": 0.2578125, "learning_rate": 3.955375253549696e-06, "loss": 1.0326, "step": 894 }, { "epoch": 1.815415821501014, "grad_norm": 0.275390625, "learning_rate": 3.948613928329953e-06, "loss": 1.0162, "step": 895 }, { "epoch": 1.817444219066937, "grad_norm": 0.26171875, "learning_rate": 3.94185260311021e-06, "loss": 1.0341, "step": 896 }, { "epoch": 1.81947261663286, "grad_norm": 0.25390625, "learning_rate": 3.935091277890467e-06, "loss": 1.0521, "step": 897 }, { "epoch": 1.821501014198783, "grad_norm": 0.25390625, "learning_rate": 3.928329952670723e-06, "loss": 1.0544, "step": 898 }, { "epoch": 1.8235294117647058, "grad_norm": 0.287109375, "learning_rate": 3.92156862745098e-06, "loss": 0.9953, "step": 899 }, { "epoch": 1.8255578093306288, "grad_norm": 0.244140625, "learning_rate": 3.914807302231238e-06, "loss": 1.019, "step": 900 }, { "epoch": 1.8275862068965516, "grad_norm": 0.26171875, "learning_rate": 3.908045977011495e-06, "loss": 0.9932, "step": 901 }, { "epoch": 1.8296146044624746, "grad_norm": 0.251953125, "learning_rate": 3.901284651791751e-06, "loss": 1.0182, "step": 902 }, { "epoch": 1.8316430020283976, "grad_norm": 0.27734375, "learning_rate": 3.894523326572008e-06, "loss": 1.0809, "step": 903 }, { "epoch": 1.8336713995943206, "grad_norm": 0.275390625, "learning_rate": 3.887762001352266e-06, "loss": 1.0352, "step": 904 }, { "epoch": 1.8356997971602436, "grad_norm": 0.337890625, "learning_rate": 3.881000676132523e-06, "loss": 1.0161, "step": 905 }, { "epoch": 1.8377281947261663, "grad_norm": 0.2578125, "learning_rate": 3.874239350912779e-06, "loss": 1.0188, "step": 906 }, { "epoch": 1.839756592292089, "grad_norm": 0.2412109375, "learning_rate": 3.8674780256930365e-06, "loss": 1.0352, "step": 907 }, { "epoch": 1.841784989858012, "grad_norm": 0.27734375, "learning_rate": 3.860716700473293e-06, "loss": 1.0515, "step": 908 }, { "epoch": 1.843813387423935, "grad_norm": 0.265625, "learning_rate": 3.85395537525355e-06, "loss": 1.074, "step": 909 }, { "epoch": 1.845841784989858, "grad_norm": 0.310546875, "learning_rate": 3.847194050033806e-06, "loss": 1.0543, "step": 910 }, { "epoch": 1.847870182555781, "grad_norm": 0.27734375, "learning_rate": 3.840432724814064e-06, "loss": 1.0243, "step": 911 }, { "epoch": 1.8498985801217038, "grad_norm": 0.255859375, "learning_rate": 3.833671399594321e-06, "loss": 1.0708, "step": 912 }, { "epoch": 1.8519269776876268, "grad_norm": 0.263671875, "learning_rate": 3.826910074374578e-06, "loss": 1.0486, "step": 913 }, { "epoch": 1.8539553752535496, "grad_norm": 0.24609375, "learning_rate": 3.820148749154835e-06, "loss": 1.0197, "step": 914 }, { "epoch": 1.8559837728194726, "grad_norm": 0.25390625, "learning_rate": 3.8133874239350913e-06, "loss": 1.0298, "step": 915 }, { "epoch": 1.8580121703853956, "grad_norm": 0.2578125, "learning_rate": 3.8066260987153487e-06, "loss": 1.0123, "step": 916 }, { "epoch": 1.8600405679513186, "grad_norm": 0.2578125, "learning_rate": 3.7998647734956056e-06, "loss": 0.9994, "step": 917 }, { "epoch": 1.8620689655172413, "grad_norm": 0.24609375, "learning_rate": 3.793103448275862e-06, "loss": 1.0284, "step": 918 }, { "epoch": 1.8640973630831643, "grad_norm": 0.287109375, "learning_rate": 3.7863421230561194e-06, "loss": 1.0786, "step": 919 }, { "epoch": 1.866125760649087, "grad_norm": 0.318359375, "learning_rate": 3.7795807978363763e-06, "loss": 1.0081, "step": 920 }, { "epoch": 1.86815415821501, "grad_norm": 0.251953125, "learning_rate": 3.7728194726166332e-06, "loss": 0.9898, "step": 921 }, { "epoch": 1.870182555780933, "grad_norm": 0.2470703125, "learning_rate": 3.7660581473968897e-06, "loss": 1.0628, "step": 922 }, { "epoch": 1.872210953346856, "grad_norm": 0.32421875, "learning_rate": 3.759296822177147e-06, "loss": 1.0391, "step": 923 }, { "epoch": 1.874239350912779, "grad_norm": 0.240234375, "learning_rate": 3.752535496957404e-06, "loss": 1.0142, "step": 924 }, { "epoch": 1.8762677484787018, "grad_norm": 0.265625, "learning_rate": 3.7457741717376613e-06, "loss": 1.046, "step": 925 }, { "epoch": 1.8782961460446246, "grad_norm": 0.25, "learning_rate": 3.7390128465179178e-06, "loss": 1.0367, "step": 926 }, { "epoch": 1.8803245436105476, "grad_norm": 0.26953125, "learning_rate": 3.7322515212981747e-06, "loss": 0.9889, "step": 927 }, { "epoch": 1.8823529411764706, "grad_norm": 0.259765625, "learning_rate": 3.7254901960784316e-06, "loss": 1.0335, "step": 928 }, { "epoch": 1.8843813387423936, "grad_norm": 0.27734375, "learning_rate": 3.718728870858689e-06, "loss": 1.0742, "step": 929 }, { "epoch": 1.8864097363083165, "grad_norm": 0.248046875, "learning_rate": 3.7119675456389454e-06, "loss": 1.0103, "step": 930 }, { "epoch": 1.8884381338742393, "grad_norm": 0.2470703125, "learning_rate": 3.7052062204192023e-06, "loss": 1.029, "step": 931 }, { "epoch": 1.8904665314401623, "grad_norm": 0.25390625, "learning_rate": 3.6984448951994597e-06, "loss": 1.0185, "step": 932 }, { "epoch": 1.892494929006085, "grad_norm": 0.349609375, "learning_rate": 3.6916835699797166e-06, "loss": 1.0273, "step": 933 }, { "epoch": 1.894523326572008, "grad_norm": 0.279296875, "learning_rate": 3.684922244759973e-06, "loss": 1.0552, "step": 934 }, { "epoch": 1.896551724137931, "grad_norm": 0.251953125, "learning_rate": 3.67816091954023e-06, "loss": 0.9835, "step": 935 }, { "epoch": 1.898580121703854, "grad_norm": 0.251953125, "learning_rate": 3.6713995943204873e-06, "loss": 1.0271, "step": 936 }, { "epoch": 1.900608519269777, "grad_norm": 0.2431640625, "learning_rate": 3.6646382691007442e-06, "loss": 1.0514, "step": 937 }, { "epoch": 1.9026369168356998, "grad_norm": 0.263671875, "learning_rate": 3.6578769438810007e-06, "loss": 1.0763, "step": 938 }, { "epoch": 1.9046653144016226, "grad_norm": 0.255859375, "learning_rate": 3.651115618661258e-06, "loss": 1.0356, "step": 939 }, { "epoch": 1.9066937119675456, "grad_norm": 0.314453125, "learning_rate": 3.644354293441515e-06, "loss": 1.0892, "step": 940 }, { "epoch": 1.9087221095334685, "grad_norm": 0.25, "learning_rate": 3.637592968221772e-06, "loss": 1.0789, "step": 941 }, { "epoch": 1.9107505070993915, "grad_norm": 0.255859375, "learning_rate": 3.6308316430020284e-06, "loss": 1.0647, "step": 942 }, { "epoch": 1.9127789046653145, "grad_norm": 0.26953125, "learning_rate": 3.6240703177822857e-06, "loss": 1.0863, "step": 943 }, { "epoch": 1.9148073022312373, "grad_norm": 0.2470703125, "learning_rate": 3.6173089925625426e-06, "loss": 0.9983, "step": 944 }, { "epoch": 1.9168356997971603, "grad_norm": 0.2578125, "learning_rate": 3.6105476673427995e-06, "loss": 0.9991, "step": 945 }, { "epoch": 1.918864097363083, "grad_norm": 0.263671875, "learning_rate": 3.603786342123056e-06, "loss": 1.1201, "step": 946 }, { "epoch": 1.920892494929006, "grad_norm": 0.265625, "learning_rate": 3.5970250169033134e-06, "loss": 1.0471, "step": 947 }, { "epoch": 1.922920892494929, "grad_norm": 0.375, "learning_rate": 3.5902636916835703e-06, "loss": 0.9676, "step": 948 }, { "epoch": 1.924949290060852, "grad_norm": 0.251953125, "learning_rate": 3.5835023664638276e-06, "loss": 1.0699, "step": 949 }, { "epoch": 1.9269776876267748, "grad_norm": 0.263671875, "learning_rate": 3.576741041244084e-06, "loss": 1.0475, "step": 950 }, { "epoch": 1.9290060851926978, "grad_norm": 0.255859375, "learning_rate": 3.569979716024341e-06, "loss": 1.0266, "step": 951 }, { "epoch": 1.9310344827586206, "grad_norm": 0.2451171875, "learning_rate": 3.563218390804598e-06, "loss": 0.9755, "step": 952 }, { "epoch": 1.9330628803245435, "grad_norm": 0.251953125, "learning_rate": 3.5564570655848552e-06, "loss": 1.0245, "step": 953 }, { "epoch": 1.9350912778904665, "grad_norm": 0.25, "learning_rate": 3.5496957403651117e-06, "loss": 1.052, "step": 954 }, { "epoch": 1.9371196754563895, "grad_norm": 0.341796875, "learning_rate": 3.5429344151453686e-06, "loss": 1.0402, "step": 955 }, { "epoch": 1.9391480730223125, "grad_norm": 0.291015625, "learning_rate": 3.536173089925626e-06, "loss": 1.0447, "step": 956 }, { "epoch": 1.9411764705882353, "grad_norm": 0.26171875, "learning_rate": 3.529411764705883e-06, "loss": 1.0743, "step": 957 }, { "epoch": 1.943204868154158, "grad_norm": 0.26171875, "learning_rate": 3.5226504394861394e-06, "loss": 1.066, "step": 958 }, { "epoch": 1.945233265720081, "grad_norm": 0.25390625, "learning_rate": 3.5158891142663963e-06, "loss": 1.0455, "step": 959 }, { "epoch": 1.947261663286004, "grad_norm": 0.267578125, "learning_rate": 3.5091277890466536e-06, "loss": 1.0448, "step": 960 }, { "epoch": 1.949290060851927, "grad_norm": 0.30859375, "learning_rate": 3.5023664638269105e-06, "loss": 1.012, "step": 961 }, { "epoch": 1.95131845841785, "grad_norm": 0.376953125, "learning_rate": 3.495605138607167e-06, "loss": 0.9866, "step": 962 }, { "epoch": 1.9533468559837728, "grad_norm": 0.267578125, "learning_rate": 3.4888438133874244e-06, "loss": 1.0504, "step": 963 }, { "epoch": 1.9553752535496958, "grad_norm": 0.263671875, "learning_rate": 3.4820824881676813e-06, "loss": 1.0103, "step": 964 }, { "epoch": 1.9574036511156185, "grad_norm": 0.265625, "learning_rate": 3.475321162947938e-06, "loss": 1.0262, "step": 965 }, { "epoch": 1.9594320486815415, "grad_norm": 0.259765625, "learning_rate": 3.4685598377281947e-06, "loss": 1.0183, "step": 966 }, { "epoch": 1.9614604462474645, "grad_norm": 0.2451171875, "learning_rate": 3.461798512508452e-06, "loss": 1.0283, "step": 967 }, { "epoch": 1.9634888438133875, "grad_norm": 0.25390625, "learning_rate": 3.455037187288709e-06, "loss": 1.0364, "step": 968 }, { "epoch": 1.9655172413793105, "grad_norm": 0.251953125, "learning_rate": 3.448275862068966e-06, "loss": 1.0066, "step": 969 }, { "epoch": 1.9675456389452333, "grad_norm": 0.24609375, "learning_rate": 3.4415145368492227e-06, "loss": 1.083, "step": 970 }, { "epoch": 1.969574036511156, "grad_norm": 0.28515625, "learning_rate": 3.4347532116294797e-06, "loss": 1.0155, "step": 971 }, { "epoch": 1.971602434077079, "grad_norm": 0.263671875, "learning_rate": 3.4279918864097366e-06, "loss": 1.0288, "step": 972 }, { "epoch": 1.973630831643002, "grad_norm": 0.2578125, "learning_rate": 3.421230561189994e-06, "loss": 1.0395, "step": 973 }, { "epoch": 1.975659229208925, "grad_norm": 0.25, "learning_rate": 3.4144692359702504e-06, "loss": 1.0201, "step": 974 }, { "epoch": 1.977687626774848, "grad_norm": 0.248046875, "learning_rate": 3.4077079107505073e-06, "loss": 1.0418, "step": 975 }, { "epoch": 1.9797160243407708, "grad_norm": 0.25390625, "learning_rate": 3.400946585530764e-06, "loss": 1.0185, "step": 976 }, { "epoch": 1.9817444219066938, "grad_norm": 0.25, "learning_rate": 3.3941852603110215e-06, "loss": 1.0386, "step": 977 }, { "epoch": 1.9837728194726165, "grad_norm": 0.255859375, "learning_rate": 3.387423935091278e-06, "loss": 1.0045, "step": 978 }, { "epoch": 1.9858012170385395, "grad_norm": 0.2470703125, "learning_rate": 3.380662609871535e-06, "loss": 1.0222, "step": 979 }, { "epoch": 1.9878296146044625, "grad_norm": 0.251953125, "learning_rate": 3.3739012846517923e-06, "loss": 1.0219, "step": 980 }, { "epoch": 1.9898580121703855, "grad_norm": 0.365234375, "learning_rate": 3.367139959432049e-06, "loss": 1.0264, "step": 981 }, { "epoch": 1.9918864097363083, "grad_norm": 0.259765625, "learning_rate": 3.3603786342123057e-06, "loss": 1.0746, "step": 982 }, { "epoch": 1.9939148073022313, "grad_norm": 0.2490234375, "learning_rate": 3.3536173089925626e-06, "loss": 1.0116, "step": 983 }, { "epoch": 1.995943204868154, "grad_norm": 0.255859375, "learning_rate": 3.34685598377282e-06, "loss": 1.02, "step": 984 }, { "epoch": 1.997971602434077, "grad_norm": 0.291015625, "learning_rate": 3.340094658553077e-06, "loss": 1.0196, "step": 985 }, { "epoch": 2.0, "grad_norm": 0.248046875, "learning_rate": 3.3333333333333333e-06, "loss": 1.0513, "step": 986 }, { "epoch": 2.002028397565923, "grad_norm": 0.279296875, "learning_rate": 3.3265720081135907e-06, "loss": 0.9924, "step": 987 }, { "epoch": 2.004056795131846, "grad_norm": 0.25, "learning_rate": 3.3198106828938476e-06, "loss": 1.0519, "step": 988 }, { "epoch": 2.006085192697769, "grad_norm": 0.275390625, "learning_rate": 3.3130493576741045e-06, "loss": 1.0348, "step": 989 }, { "epoch": 2.0081135902636915, "grad_norm": 0.314453125, "learning_rate": 3.306288032454361e-06, "loss": 0.9656, "step": 990 }, { "epoch": 2.0101419878296145, "grad_norm": 0.27734375, "learning_rate": 3.2995267072346183e-06, "loss": 1.0121, "step": 991 }, { "epoch": 2.0121703853955375, "grad_norm": 0.28125, "learning_rate": 3.2927653820148752e-06, "loss": 1.0166, "step": 992 }, { "epoch": 2.0141987829614605, "grad_norm": 0.5625, "learning_rate": 3.2860040567951326e-06, "loss": 1.0339, "step": 993 }, { "epoch": 2.0162271805273835, "grad_norm": 0.2734375, "learning_rate": 3.279242731575389e-06, "loss": 1.0445, "step": 994 }, { "epoch": 2.0182555780933065, "grad_norm": 0.2578125, "learning_rate": 3.272481406355646e-06, "loss": 1.0673, "step": 995 }, { "epoch": 2.020283975659229, "grad_norm": 0.255859375, "learning_rate": 3.265720081135903e-06, "loss": 0.9861, "step": 996 }, { "epoch": 2.022312373225152, "grad_norm": 0.267578125, "learning_rate": 3.25895875591616e-06, "loss": 1.0428, "step": 997 }, { "epoch": 2.024340770791075, "grad_norm": 0.259765625, "learning_rate": 3.2521974306964167e-06, "loss": 1.0235, "step": 998 }, { "epoch": 2.026369168356998, "grad_norm": 0.259765625, "learning_rate": 3.2454361054766736e-06, "loss": 1.0356, "step": 999 }, { "epoch": 2.028397565922921, "grad_norm": 0.25, "learning_rate": 3.2386747802569305e-06, "loss": 1.0242, "step": 1000 }, { "epoch": 2.030425963488844, "grad_norm": 0.255859375, "learning_rate": 3.231913455037188e-06, "loss": 1.0575, "step": 1001 }, { "epoch": 2.032454361054767, "grad_norm": 0.25390625, "learning_rate": 3.2251521298174443e-06, "loss": 1.0393, "step": 1002 }, { "epoch": 2.0344827586206895, "grad_norm": 0.30078125, "learning_rate": 3.2183908045977012e-06, "loss": 0.9703, "step": 1003 }, { "epoch": 2.0365111561866125, "grad_norm": 0.251953125, "learning_rate": 3.2116294793779586e-06, "loss": 1.0081, "step": 1004 }, { "epoch": 2.0385395537525355, "grad_norm": 0.271484375, "learning_rate": 3.2048681541582155e-06, "loss": 1.0917, "step": 1005 }, { "epoch": 2.0405679513184585, "grad_norm": 0.25390625, "learning_rate": 3.198106828938472e-06, "loss": 1.0668, "step": 1006 }, { "epoch": 2.0425963488843815, "grad_norm": 0.2578125, "learning_rate": 3.191345503718729e-06, "loss": 1.036, "step": 1007 }, { "epoch": 2.0446247464503045, "grad_norm": 0.267578125, "learning_rate": 3.1845841784989862e-06, "loss": 1.0184, "step": 1008 }, { "epoch": 2.046653144016227, "grad_norm": 0.248046875, "learning_rate": 3.177822853279243e-06, "loss": 1.0415, "step": 1009 }, { "epoch": 2.04868154158215, "grad_norm": 0.2490234375, "learning_rate": 3.1710615280594996e-06, "loss": 1.0184, "step": 1010 }, { "epoch": 2.050709939148073, "grad_norm": 0.255859375, "learning_rate": 3.164300202839757e-06, "loss": 1.0386, "step": 1011 }, { "epoch": 2.052738336713996, "grad_norm": 0.2470703125, "learning_rate": 3.157538877620014e-06, "loss": 1.0234, "step": 1012 }, { "epoch": 2.054766734279919, "grad_norm": 0.25, "learning_rate": 3.1507775524002708e-06, "loss": 1.0344, "step": 1013 }, { "epoch": 2.056795131845842, "grad_norm": 0.3203125, "learning_rate": 3.1440162271805273e-06, "loss": 1.0343, "step": 1014 }, { "epoch": 2.0588235294117645, "grad_norm": 0.25390625, "learning_rate": 3.1372549019607846e-06, "loss": 1.034, "step": 1015 }, { "epoch": 2.0608519269776875, "grad_norm": 0.259765625, "learning_rate": 3.1304935767410415e-06, "loss": 1.0121, "step": 1016 }, { "epoch": 2.0628803245436105, "grad_norm": 0.31640625, "learning_rate": 3.123732251521299e-06, "loss": 1.022, "step": 1017 }, { "epoch": 2.0649087221095335, "grad_norm": 0.388671875, "learning_rate": 3.1169709263015553e-06, "loss": 1.1013, "step": 1018 }, { "epoch": 2.0669371196754565, "grad_norm": 0.2451171875, "learning_rate": 3.1102096010818122e-06, "loss": 0.994, "step": 1019 }, { "epoch": 2.0689655172413794, "grad_norm": 0.275390625, "learning_rate": 3.103448275862069e-06, "loss": 1.0659, "step": 1020 }, { "epoch": 2.0709939148073024, "grad_norm": 0.2470703125, "learning_rate": 3.0966869506423265e-06, "loss": 1.0322, "step": 1021 }, { "epoch": 2.073022312373225, "grad_norm": 0.3203125, "learning_rate": 3.089925625422583e-06, "loss": 1.0494, "step": 1022 }, { "epoch": 2.075050709939148, "grad_norm": 0.263671875, "learning_rate": 3.08316430020284e-06, "loss": 1.0761, "step": 1023 }, { "epoch": 2.077079107505071, "grad_norm": 0.296875, "learning_rate": 3.0764029749830972e-06, "loss": 1.0021, "step": 1024 }, { "epoch": 2.079107505070994, "grad_norm": 0.255859375, "learning_rate": 3.069641649763354e-06, "loss": 0.9927, "step": 1025 }, { "epoch": 2.081135902636917, "grad_norm": 0.2578125, "learning_rate": 3.0628803245436106e-06, "loss": 0.999, "step": 1026 }, { "epoch": 2.08316430020284, "grad_norm": 0.25, "learning_rate": 3.0561189993238675e-06, "loss": 1.0619, "step": 1027 }, { "epoch": 2.0851926977687625, "grad_norm": 0.2470703125, "learning_rate": 3.049357674104125e-06, "loss": 1.0605, "step": 1028 }, { "epoch": 2.0872210953346855, "grad_norm": 0.3046875, "learning_rate": 3.0425963488843818e-06, "loss": 0.971, "step": 1029 }, { "epoch": 2.0892494929006085, "grad_norm": 0.26171875, "learning_rate": 3.0358350236646383e-06, "loss": 1.0451, "step": 1030 }, { "epoch": 2.0912778904665315, "grad_norm": 0.26171875, "learning_rate": 3.0290736984448956e-06, "loss": 1.0517, "step": 1031 }, { "epoch": 2.0933062880324544, "grad_norm": 0.271484375, "learning_rate": 3.0223123732251525e-06, "loss": 1.0314, "step": 1032 }, { "epoch": 2.0953346855983774, "grad_norm": 0.26171875, "learning_rate": 3.0155510480054094e-06, "loss": 1.0495, "step": 1033 }, { "epoch": 2.0973630831643, "grad_norm": 0.251953125, "learning_rate": 3.008789722785666e-06, "loss": 1.0287, "step": 1034 }, { "epoch": 2.099391480730223, "grad_norm": 0.26953125, "learning_rate": 3.0020283975659233e-06, "loss": 1.0541, "step": 1035 }, { "epoch": 2.101419878296146, "grad_norm": 0.259765625, "learning_rate": 2.99526707234618e-06, "loss": 1.045, "step": 1036 }, { "epoch": 2.103448275862069, "grad_norm": 0.2578125, "learning_rate": 2.988505747126437e-06, "loss": 1.0201, "step": 1037 }, { "epoch": 2.105476673427992, "grad_norm": 0.283203125, "learning_rate": 2.9817444219066936e-06, "loss": 1.0175, "step": 1038 }, { "epoch": 2.107505070993915, "grad_norm": 0.25, "learning_rate": 2.974983096686951e-06, "loss": 1.0352, "step": 1039 }, { "epoch": 2.109533468559838, "grad_norm": 0.306640625, "learning_rate": 2.968221771467208e-06, "loss": 1.0876, "step": 1040 }, { "epoch": 2.1115618661257605, "grad_norm": 0.359375, "learning_rate": 2.961460446247465e-06, "loss": 0.9568, "step": 1041 }, { "epoch": 2.1135902636916835, "grad_norm": 0.3828125, "learning_rate": 2.9546991210277216e-06, "loss": 0.9813, "step": 1042 }, { "epoch": 2.1156186612576064, "grad_norm": 0.263671875, "learning_rate": 2.9479377958079785e-06, "loss": 1.0693, "step": 1043 }, { "epoch": 2.1176470588235294, "grad_norm": 0.251953125, "learning_rate": 2.9411764705882355e-06, "loss": 0.9901, "step": 1044 }, { "epoch": 2.1196754563894524, "grad_norm": 0.26171875, "learning_rate": 2.934415145368493e-06, "loss": 1.0182, "step": 1045 }, { "epoch": 2.1217038539553754, "grad_norm": 0.2578125, "learning_rate": 2.9276538201487493e-06, "loss": 1.0229, "step": 1046 }, { "epoch": 2.123732251521298, "grad_norm": 0.25390625, "learning_rate": 2.920892494929006e-06, "loss": 1.06, "step": 1047 }, { "epoch": 2.125760649087221, "grad_norm": 0.2421875, "learning_rate": 2.9141311697092635e-06, "loss": 1.0253, "step": 1048 }, { "epoch": 2.127789046653144, "grad_norm": 0.306640625, "learning_rate": 2.9073698444895204e-06, "loss": 1.0695, "step": 1049 }, { "epoch": 2.129817444219067, "grad_norm": 0.296875, "learning_rate": 2.900608519269777e-06, "loss": 1.0031, "step": 1050 }, { "epoch": 2.13184584178499, "grad_norm": 0.296875, "learning_rate": 2.893847194050034e-06, "loss": 1.0045, "step": 1051 }, { "epoch": 2.133874239350913, "grad_norm": 0.2490234375, "learning_rate": 2.887085868830291e-06, "loss": 1.0059, "step": 1052 }, { "epoch": 2.135902636916836, "grad_norm": 0.248046875, "learning_rate": 2.880324543610548e-06, "loss": 1.004, "step": 1053 }, { "epoch": 2.1379310344827585, "grad_norm": 0.259765625, "learning_rate": 2.8735632183908046e-06, "loss": 1.0363, "step": 1054 }, { "epoch": 2.1399594320486814, "grad_norm": 0.279296875, "learning_rate": 2.866801893171062e-06, "loss": 1.0313, "step": 1055 }, { "epoch": 2.1419878296146044, "grad_norm": 0.25390625, "learning_rate": 2.860040567951319e-06, "loss": 1.0128, "step": 1056 }, { "epoch": 2.1440162271805274, "grad_norm": 0.28125, "learning_rate": 2.8532792427315757e-06, "loss": 1.0291, "step": 1057 }, { "epoch": 2.1460446247464504, "grad_norm": 0.25390625, "learning_rate": 2.8465179175118322e-06, "loss": 1.0473, "step": 1058 }, { "epoch": 2.1480730223123734, "grad_norm": 0.26171875, "learning_rate": 2.8397565922920896e-06, "loss": 1.018, "step": 1059 }, { "epoch": 2.150101419878296, "grad_norm": 0.29296875, "learning_rate": 2.8329952670723465e-06, "loss": 1.0078, "step": 1060 }, { "epoch": 2.152129817444219, "grad_norm": 0.25390625, "learning_rate": 2.8262339418526034e-06, "loss": 1.0386, "step": 1061 }, { "epoch": 2.154158215010142, "grad_norm": 0.283203125, "learning_rate": 2.8194726166328603e-06, "loss": 1.0723, "step": 1062 }, { "epoch": 2.156186612576065, "grad_norm": 0.2490234375, "learning_rate": 2.812711291413117e-06, "loss": 1.0031, "step": 1063 }, { "epoch": 2.158215010141988, "grad_norm": 0.251953125, "learning_rate": 2.805949966193374e-06, "loss": 1.0709, "step": 1064 }, { "epoch": 2.160243407707911, "grad_norm": 0.25, "learning_rate": 2.7991886409736314e-06, "loss": 1.0254, "step": 1065 }, { "epoch": 2.162271805273834, "grad_norm": 0.25, "learning_rate": 2.792427315753888e-06, "loss": 0.9745, "step": 1066 }, { "epoch": 2.1643002028397564, "grad_norm": 0.306640625, "learning_rate": 2.785665990534145e-06, "loss": 0.9904, "step": 1067 }, { "epoch": 2.1663286004056794, "grad_norm": 0.27734375, "learning_rate": 2.7789046653144018e-06, "loss": 1.0366, "step": 1068 }, { "epoch": 2.1683569979716024, "grad_norm": 0.24609375, "learning_rate": 2.772143340094659e-06, "loss": 0.9916, "step": 1069 }, { "epoch": 2.1703853955375254, "grad_norm": 0.27734375, "learning_rate": 2.7653820148749156e-06, "loss": 1.0308, "step": 1070 }, { "epoch": 2.1724137931034484, "grad_norm": 0.2470703125, "learning_rate": 2.7586206896551725e-06, "loss": 0.987, "step": 1071 }, { "epoch": 2.1744421906693714, "grad_norm": 0.25390625, "learning_rate": 2.75185936443543e-06, "loss": 1.036, "step": 1072 }, { "epoch": 2.176470588235294, "grad_norm": 0.255859375, "learning_rate": 2.7450980392156867e-06, "loss": 1.0158, "step": 1073 }, { "epoch": 2.178498985801217, "grad_norm": 0.275390625, "learning_rate": 2.7383367139959432e-06, "loss": 1.0426, "step": 1074 }, { "epoch": 2.18052738336714, "grad_norm": 0.267578125, "learning_rate": 2.7315753887762e-06, "loss": 1.0285, "step": 1075 }, { "epoch": 2.182555780933063, "grad_norm": 0.2578125, "learning_rate": 2.7248140635564575e-06, "loss": 0.9971, "step": 1076 }, { "epoch": 2.184584178498986, "grad_norm": 0.248046875, "learning_rate": 2.7180527383367144e-06, "loss": 1.0294, "step": 1077 }, { "epoch": 2.186612576064909, "grad_norm": 0.349609375, "learning_rate": 2.711291413116971e-06, "loss": 1.017, "step": 1078 }, { "epoch": 2.1886409736308314, "grad_norm": 0.265625, "learning_rate": 2.704530087897228e-06, "loss": 1.0512, "step": 1079 }, { "epoch": 2.1906693711967544, "grad_norm": 0.314453125, "learning_rate": 2.697768762677485e-06, "loss": 1.0469, "step": 1080 }, { "epoch": 2.1926977687626774, "grad_norm": 0.2734375, "learning_rate": 2.691007437457742e-06, "loss": 1.0847, "step": 1081 }, { "epoch": 2.1947261663286004, "grad_norm": 0.251953125, "learning_rate": 2.6842461122379985e-06, "loss": 1.0098, "step": 1082 }, { "epoch": 2.1967545638945234, "grad_norm": 0.25390625, "learning_rate": 2.677484787018256e-06, "loss": 1.0116, "step": 1083 }, { "epoch": 2.1987829614604464, "grad_norm": 0.396484375, "learning_rate": 2.6707234617985128e-06, "loss": 1.0333, "step": 1084 }, { "epoch": 2.2008113590263694, "grad_norm": 0.322265625, "learning_rate": 2.66396213657877e-06, "loss": 0.9811, "step": 1085 }, { "epoch": 2.202839756592292, "grad_norm": 0.2578125, "learning_rate": 2.6572008113590266e-06, "loss": 1.0221, "step": 1086 }, { "epoch": 2.204868154158215, "grad_norm": 0.32421875, "learning_rate": 2.6504394861392835e-06, "loss": 1.0477, "step": 1087 }, { "epoch": 2.206896551724138, "grad_norm": 0.2470703125, "learning_rate": 2.6436781609195404e-06, "loss": 1.0179, "step": 1088 }, { "epoch": 2.208924949290061, "grad_norm": 0.427734375, "learning_rate": 2.6369168356997977e-06, "loss": 0.9656, "step": 1089 }, { "epoch": 2.210953346855984, "grad_norm": 0.271484375, "learning_rate": 2.6301555104800542e-06, "loss": 0.9941, "step": 1090 }, { "epoch": 2.212981744421907, "grad_norm": 0.2470703125, "learning_rate": 2.623394185260311e-06, "loss": 0.9892, "step": 1091 }, { "epoch": 2.2150101419878294, "grad_norm": 0.25390625, "learning_rate": 2.616632860040568e-06, "loss": 1.0165, "step": 1092 }, { "epoch": 2.2170385395537524, "grad_norm": 0.2490234375, "learning_rate": 2.6098715348208254e-06, "loss": 1.0287, "step": 1093 }, { "epoch": 2.2190669371196754, "grad_norm": 0.25, "learning_rate": 2.603110209601082e-06, "loss": 1.0269, "step": 1094 }, { "epoch": 2.2210953346855984, "grad_norm": 0.25, "learning_rate": 2.596348884381339e-06, "loss": 1.0034, "step": 1095 }, { "epoch": 2.2231237322515214, "grad_norm": 0.251953125, "learning_rate": 2.589587559161596e-06, "loss": 1.024, "step": 1096 }, { "epoch": 2.2251521298174444, "grad_norm": 0.26171875, "learning_rate": 2.582826233941853e-06, "loss": 1.0846, "step": 1097 }, { "epoch": 2.227180527383367, "grad_norm": 0.279296875, "learning_rate": 2.5760649087221095e-06, "loss": 1.0558, "step": 1098 }, { "epoch": 2.22920892494929, "grad_norm": 0.251953125, "learning_rate": 2.5693035835023664e-06, "loss": 0.9899, "step": 1099 }, { "epoch": 2.231237322515213, "grad_norm": 0.26171875, "learning_rate": 2.5625422582826238e-06, "loss": 1.0611, "step": 1100 }, { "epoch": 2.233265720081136, "grad_norm": 0.263671875, "learning_rate": 2.5557809330628807e-06, "loss": 1.0497, "step": 1101 }, { "epoch": 2.235294117647059, "grad_norm": 0.28515625, "learning_rate": 2.549019607843137e-06, "loss": 1.037, "step": 1102 }, { "epoch": 2.237322515212982, "grad_norm": 0.25, "learning_rate": 2.5422582826233945e-06, "loss": 1.0295, "step": 1103 }, { "epoch": 2.239350912778905, "grad_norm": 0.2490234375, "learning_rate": 2.5354969574036514e-06, "loss": 1.0433, "step": 1104 }, { "epoch": 2.2413793103448274, "grad_norm": 0.267578125, "learning_rate": 2.5287356321839083e-06, "loss": 1.002, "step": 1105 }, { "epoch": 2.2434077079107504, "grad_norm": 0.251953125, "learning_rate": 2.521974306964165e-06, "loss": 1.0088, "step": 1106 }, { "epoch": 2.2454361054766734, "grad_norm": 0.353515625, "learning_rate": 2.515212981744422e-06, "loss": 0.9883, "step": 1107 }, { "epoch": 2.2474645030425964, "grad_norm": 0.2578125, "learning_rate": 2.508451656524679e-06, "loss": 1.0244, "step": 1108 }, { "epoch": 2.2494929006085194, "grad_norm": 0.28515625, "learning_rate": 2.5016903313049364e-06, "loss": 1.0374, "step": 1109 }, { "epoch": 2.2515212981744424, "grad_norm": 0.26953125, "learning_rate": 2.494929006085193e-06, "loss": 1.0797, "step": 1110 }, { "epoch": 2.2535496957403653, "grad_norm": 0.263671875, "learning_rate": 2.48816768086545e-06, "loss": 1.0207, "step": 1111 }, { "epoch": 2.255578093306288, "grad_norm": 0.2578125, "learning_rate": 2.4814063556457067e-06, "loss": 1.0208, "step": 1112 }, { "epoch": 2.257606490872211, "grad_norm": 0.248046875, "learning_rate": 2.4746450304259636e-06, "loss": 1.0544, "step": 1113 }, { "epoch": 2.259634888438134, "grad_norm": 0.353515625, "learning_rate": 2.4678837052062205e-06, "loss": 0.9771, "step": 1114 }, { "epoch": 2.261663286004057, "grad_norm": 0.255859375, "learning_rate": 2.4611223799864774e-06, "loss": 1.0019, "step": 1115 }, { "epoch": 2.26369168356998, "grad_norm": 0.25390625, "learning_rate": 2.4543610547667348e-06, "loss": 1.0274, "step": 1116 }, { "epoch": 2.2657200811359024, "grad_norm": 0.291015625, "learning_rate": 2.4475997295469913e-06, "loss": 1.0097, "step": 1117 }, { "epoch": 2.2677484787018254, "grad_norm": 0.279296875, "learning_rate": 2.4408384043272486e-06, "loss": 1.0111, "step": 1118 }, { "epoch": 2.2697768762677484, "grad_norm": 0.25390625, "learning_rate": 2.434077079107505e-06, "loss": 1.0339, "step": 1119 }, { "epoch": 2.2718052738336714, "grad_norm": 0.26953125, "learning_rate": 2.4273157538877624e-06, "loss": 1.0308, "step": 1120 }, { "epoch": 2.2738336713995944, "grad_norm": 0.337890625, "learning_rate": 2.420554428668019e-06, "loss": 1.0253, "step": 1121 }, { "epoch": 2.2758620689655173, "grad_norm": 0.259765625, "learning_rate": 2.4137931034482762e-06, "loss": 1.0244, "step": 1122 }, { "epoch": 2.2778904665314403, "grad_norm": 0.375, "learning_rate": 2.407031778228533e-06, "loss": 1.014, "step": 1123 }, { "epoch": 2.279918864097363, "grad_norm": 0.25390625, "learning_rate": 2.40027045300879e-06, "loss": 1.037, "step": 1124 }, { "epoch": 2.281947261663286, "grad_norm": 0.259765625, "learning_rate": 2.393509127789047e-06, "loss": 1.0582, "step": 1125 }, { "epoch": 2.283975659229209, "grad_norm": 0.369140625, "learning_rate": 2.386747802569304e-06, "loss": 1.0285, "step": 1126 }, { "epoch": 2.286004056795132, "grad_norm": 0.2490234375, "learning_rate": 2.379986477349561e-06, "loss": 1.0411, "step": 1127 }, { "epoch": 2.288032454361055, "grad_norm": 0.267578125, "learning_rate": 2.3732251521298177e-06, "loss": 1.0125, "step": 1128 }, { "epoch": 2.290060851926978, "grad_norm": 0.291015625, "learning_rate": 2.3664638269100746e-06, "loss": 1.0455, "step": 1129 }, { "epoch": 2.292089249492901, "grad_norm": 0.271484375, "learning_rate": 2.3597025016903315e-06, "loss": 1.0079, "step": 1130 }, { "epoch": 2.2941176470588234, "grad_norm": 0.2470703125, "learning_rate": 2.3529411764705885e-06, "loss": 1.0286, "step": 1131 }, { "epoch": 2.2961460446247464, "grad_norm": 0.24609375, "learning_rate": 2.3461798512508454e-06, "loss": 1.0324, "step": 1132 }, { "epoch": 2.2981744421906694, "grad_norm": 0.271484375, "learning_rate": 2.3394185260311023e-06, "loss": 1.0273, "step": 1133 }, { "epoch": 2.3002028397565923, "grad_norm": 0.25, "learning_rate": 2.332657200811359e-06, "loss": 1.0111, "step": 1134 }, { "epoch": 2.3022312373225153, "grad_norm": 0.4609375, "learning_rate": 2.325895875591616e-06, "loss": 0.9797, "step": 1135 }, { "epoch": 2.3042596348884383, "grad_norm": 0.271484375, "learning_rate": 2.319134550371873e-06, "loss": 0.9904, "step": 1136 }, { "epoch": 2.306288032454361, "grad_norm": 0.251953125, "learning_rate": 2.31237322515213e-06, "loss": 1.0559, "step": 1137 }, { "epoch": 2.308316430020284, "grad_norm": 0.25390625, "learning_rate": 2.305611899932387e-06, "loss": 1.0311, "step": 1138 }, { "epoch": 2.310344827586207, "grad_norm": 0.26953125, "learning_rate": 2.2988505747126437e-06, "loss": 1.0926, "step": 1139 }, { "epoch": 2.31237322515213, "grad_norm": 0.26171875, "learning_rate": 2.292089249492901e-06, "loss": 1.0408, "step": 1140 }, { "epoch": 2.314401622718053, "grad_norm": 0.265625, "learning_rate": 2.2853279242731576e-06, "loss": 1.0569, "step": 1141 }, { "epoch": 2.316430020283976, "grad_norm": 0.26953125, "learning_rate": 2.278566599053415e-06, "loss": 1.0491, "step": 1142 }, { "epoch": 2.3184584178498984, "grad_norm": 0.255859375, "learning_rate": 2.2718052738336714e-06, "loss": 1.0171, "step": 1143 }, { "epoch": 2.3204868154158214, "grad_norm": 0.26953125, "learning_rate": 2.2650439486139287e-06, "loss": 1.0477, "step": 1144 }, { "epoch": 2.3225152129817443, "grad_norm": 0.26953125, "learning_rate": 2.2582826233941852e-06, "loss": 1.0472, "step": 1145 }, { "epoch": 2.3245436105476673, "grad_norm": 0.267578125, "learning_rate": 2.2515212981744425e-06, "loss": 1.0426, "step": 1146 }, { "epoch": 2.3265720081135903, "grad_norm": 0.263671875, "learning_rate": 2.2447599729546995e-06, "loss": 1.0696, "step": 1147 }, { "epoch": 2.3286004056795133, "grad_norm": 0.267578125, "learning_rate": 2.2379986477349564e-06, "loss": 1.0435, "step": 1148 }, { "epoch": 2.3306288032454363, "grad_norm": 0.267578125, "learning_rate": 2.2312373225152133e-06, "loss": 1.0468, "step": 1149 }, { "epoch": 2.332657200811359, "grad_norm": 0.2490234375, "learning_rate": 2.22447599729547e-06, "loss": 1.013, "step": 1150 }, { "epoch": 2.334685598377282, "grad_norm": 0.265625, "learning_rate": 2.217714672075727e-06, "loss": 1.0125, "step": 1151 }, { "epoch": 2.336713995943205, "grad_norm": 0.26171875, "learning_rate": 2.210953346855984e-06, "loss": 1.0292, "step": 1152 }, { "epoch": 2.338742393509128, "grad_norm": 0.52734375, "learning_rate": 2.204192021636241e-06, "loss": 0.9943, "step": 1153 }, { "epoch": 2.340770791075051, "grad_norm": 0.341796875, "learning_rate": 2.197430696416498e-06, "loss": 1.0632, "step": 1154 }, { "epoch": 2.342799188640974, "grad_norm": 0.2490234375, "learning_rate": 2.1906693711967548e-06, "loss": 0.9995, "step": 1155 }, { "epoch": 2.344827586206897, "grad_norm": 0.2451171875, "learning_rate": 2.1839080459770117e-06, "loss": 0.983, "step": 1156 }, { "epoch": 2.3468559837728193, "grad_norm": 0.255859375, "learning_rate": 2.1771467207572686e-06, "loss": 1.003, "step": 1157 }, { "epoch": 2.3488843813387423, "grad_norm": 0.26171875, "learning_rate": 2.1703853955375255e-06, "loss": 1.0498, "step": 1158 }, { "epoch": 2.3509127789046653, "grad_norm": 0.28125, "learning_rate": 2.1636240703177824e-06, "loss": 1.0288, "step": 1159 }, { "epoch": 2.3529411764705883, "grad_norm": 0.2578125, "learning_rate": 2.1568627450980393e-06, "loss": 1.0166, "step": 1160 }, { "epoch": 2.3549695740365113, "grad_norm": 0.259765625, "learning_rate": 2.1501014198782962e-06, "loss": 1.0634, "step": 1161 }, { "epoch": 2.356997971602434, "grad_norm": 0.291015625, "learning_rate": 2.1433400946585536e-06, "loss": 1.0077, "step": 1162 }, { "epoch": 2.359026369168357, "grad_norm": 0.263671875, "learning_rate": 2.13657876943881e-06, "loss": 1.0331, "step": 1163 }, { "epoch": 2.36105476673428, "grad_norm": 0.259765625, "learning_rate": 2.1298174442190674e-06, "loss": 0.987, "step": 1164 }, { "epoch": 2.363083164300203, "grad_norm": 0.25, "learning_rate": 2.123056118999324e-06, "loss": 1.0156, "step": 1165 }, { "epoch": 2.365111561866126, "grad_norm": 0.25390625, "learning_rate": 2.116294793779581e-06, "loss": 0.9946, "step": 1166 }, { "epoch": 2.367139959432049, "grad_norm": 0.25390625, "learning_rate": 2.1095334685598377e-06, "loss": 0.9646, "step": 1167 }, { "epoch": 2.369168356997972, "grad_norm": 0.267578125, "learning_rate": 2.102772143340095e-06, "loss": 1.0295, "step": 1168 }, { "epoch": 2.3711967545638943, "grad_norm": 0.259765625, "learning_rate": 2.096010818120352e-06, "loss": 1.0657, "step": 1169 }, { "epoch": 2.3732251521298173, "grad_norm": 0.275390625, "learning_rate": 2.089249492900609e-06, "loss": 1.0251, "step": 1170 }, { "epoch": 2.3752535496957403, "grad_norm": 0.26171875, "learning_rate": 2.0824881676808658e-06, "loss": 1.0299, "step": 1171 }, { "epoch": 2.3772819472616633, "grad_norm": 0.265625, "learning_rate": 2.0757268424611227e-06, "loss": 1.0147, "step": 1172 }, { "epoch": 2.3793103448275863, "grad_norm": 0.2734375, "learning_rate": 2.0689655172413796e-06, "loss": 0.9979, "step": 1173 }, { "epoch": 2.3813387423935093, "grad_norm": 0.265625, "learning_rate": 2.0622041920216365e-06, "loss": 1.0701, "step": 1174 }, { "epoch": 2.3833671399594323, "grad_norm": 0.25390625, "learning_rate": 2.0554428668018934e-06, "loss": 1.0585, "step": 1175 }, { "epoch": 2.385395537525355, "grad_norm": 0.287109375, "learning_rate": 2.0486815415821503e-06, "loss": 1.0384, "step": 1176 }, { "epoch": 2.387423935091278, "grad_norm": 0.326171875, "learning_rate": 2.0419202163624072e-06, "loss": 1.0231, "step": 1177 }, { "epoch": 2.389452332657201, "grad_norm": 0.2470703125, "learning_rate": 2.035158891142664e-06, "loss": 0.9854, "step": 1178 }, { "epoch": 2.391480730223124, "grad_norm": 0.26953125, "learning_rate": 2.028397565922921e-06, "loss": 0.9973, "step": 1179 }, { "epoch": 2.393509127789047, "grad_norm": 0.37109375, "learning_rate": 2.021636240703178e-06, "loss": 1.0059, "step": 1180 }, { "epoch": 2.3955375253549693, "grad_norm": 0.259765625, "learning_rate": 2.014874915483435e-06, "loss": 1.0267, "step": 1181 }, { "epoch": 2.3975659229208923, "grad_norm": 0.275390625, "learning_rate": 2.0081135902636918e-06, "loss": 1.015, "step": 1182 }, { "epoch": 2.3995943204868153, "grad_norm": 0.267578125, "learning_rate": 2.0013522650439487e-06, "loss": 1.0269, "step": 1183 }, { "epoch": 2.4016227180527383, "grad_norm": 0.28515625, "learning_rate": 1.9945909398242056e-06, "loss": 1.0155, "step": 1184 }, { "epoch": 2.4036511156186613, "grad_norm": 0.267578125, "learning_rate": 1.9878296146044625e-06, "loss": 1.0457, "step": 1185 }, { "epoch": 2.4056795131845843, "grad_norm": 0.251953125, "learning_rate": 1.98106828938472e-06, "loss": 0.9996, "step": 1186 }, { "epoch": 2.4077079107505073, "grad_norm": 0.2578125, "learning_rate": 1.9743069641649763e-06, "loss": 1.0397, "step": 1187 }, { "epoch": 2.40973630831643, "grad_norm": 0.255859375, "learning_rate": 1.9675456389452337e-06, "loss": 1.0423, "step": 1188 }, { "epoch": 2.411764705882353, "grad_norm": 0.259765625, "learning_rate": 1.96078431372549e-06, "loss": 1.0214, "step": 1189 }, { "epoch": 2.413793103448276, "grad_norm": 0.25390625, "learning_rate": 1.9540229885057475e-06, "loss": 0.9866, "step": 1190 }, { "epoch": 2.415821501014199, "grad_norm": 0.267578125, "learning_rate": 1.947261663286004e-06, "loss": 1.045, "step": 1191 }, { "epoch": 2.417849898580122, "grad_norm": 0.2490234375, "learning_rate": 1.9405003380662613e-06, "loss": 1.0609, "step": 1192 }, { "epoch": 2.4198782961460448, "grad_norm": 0.2578125, "learning_rate": 1.9337390128465182e-06, "loss": 1.0639, "step": 1193 }, { "epoch": 2.4219066937119678, "grad_norm": 0.283203125, "learning_rate": 1.926977687626775e-06, "loss": 1.0107, "step": 1194 }, { "epoch": 2.4239350912778903, "grad_norm": 0.283203125, "learning_rate": 1.920216362407032e-06, "loss": 1.041, "step": 1195 }, { "epoch": 2.4259634888438133, "grad_norm": 0.25390625, "learning_rate": 1.913455037187289e-06, "loss": 1.0544, "step": 1196 }, { "epoch": 2.4279918864097363, "grad_norm": 0.255859375, "learning_rate": 1.9066937119675457e-06, "loss": 1.0184, "step": 1197 }, { "epoch": 2.4300202839756593, "grad_norm": 0.28515625, "learning_rate": 1.8999323867478028e-06, "loss": 1.0371, "step": 1198 }, { "epoch": 2.4320486815415823, "grad_norm": 0.265625, "learning_rate": 1.8931710615280597e-06, "loss": 1.0313, "step": 1199 }, { "epoch": 2.4340770791075053, "grad_norm": 0.27734375, "learning_rate": 1.8864097363083166e-06, "loss": 1.0275, "step": 1200 }, { "epoch": 2.436105476673428, "grad_norm": 0.255859375, "learning_rate": 1.8796484110885735e-06, "loss": 1.0283, "step": 1201 }, { "epoch": 2.438133874239351, "grad_norm": 0.25, "learning_rate": 1.8728870858688306e-06, "loss": 1.0195, "step": 1202 }, { "epoch": 2.440162271805274, "grad_norm": 0.25, "learning_rate": 1.8661257606490873e-06, "loss": 1.0568, "step": 1203 }, { "epoch": 2.4421906693711968, "grad_norm": 0.306640625, "learning_rate": 1.8593644354293445e-06, "loss": 1.0432, "step": 1204 }, { "epoch": 2.4442190669371198, "grad_norm": 0.3671875, "learning_rate": 1.8526031102096012e-06, "loss": 1.0448, "step": 1205 }, { "epoch": 2.4462474645030428, "grad_norm": 0.267578125, "learning_rate": 1.8458417849898583e-06, "loss": 1.0647, "step": 1206 }, { "epoch": 2.4482758620689653, "grad_norm": 0.263671875, "learning_rate": 1.839080459770115e-06, "loss": 1.0329, "step": 1207 }, { "epoch": 2.4503042596348883, "grad_norm": 0.2578125, "learning_rate": 1.8323191345503721e-06, "loss": 1.0383, "step": 1208 }, { "epoch": 2.4523326572008113, "grad_norm": 0.3203125, "learning_rate": 1.825557809330629e-06, "loss": 1.0725, "step": 1209 }, { "epoch": 2.4543610547667343, "grad_norm": 0.25390625, "learning_rate": 1.818796484110886e-06, "loss": 1.0147, "step": 1210 }, { "epoch": 2.4563894523326573, "grad_norm": 0.2578125, "learning_rate": 1.8120351588911429e-06, "loss": 1.0861, "step": 1211 }, { "epoch": 2.4584178498985803, "grad_norm": 0.26171875, "learning_rate": 1.8052738336713998e-06, "loss": 1.0131, "step": 1212 }, { "epoch": 2.4604462474645032, "grad_norm": 0.25390625, "learning_rate": 1.7985125084516567e-06, "loss": 1.0415, "step": 1213 }, { "epoch": 2.462474645030426, "grad_norm": 0.259765625, "learning_rate": 1.7917511832319138e-06, "loss": 0.9922, "step": 1214 }, { "epoch": 2.464503042596349, "grad_norm": 0.265625, "learning_rate": 1.7849898580121705e-06, "loss": 1.0788, "step": 1215 }, { "epoch": 2.4665314401622718, "grad_norm": 0.2578125, "learning_rate": 1.7782285327924276e-06, "loss": 1.0135, "step": 1216 }, { "epoch": 2.4685598377281948, "grad_norm": 0.25390625, "learning_rate": 1.7714672075726843e-06, "loss": 1.0231, "step": 1217 }, { "epoch": 2.4705882352941178, "grad_norm": 0.2890625, "learning_rate": 1.7647058823529414e-06, "loss": 1.0446, "step": 1218 }, { "epoch": 2.4726166328600407, "grad_norm": 0.248046875, "learning_rate": 1.7579445571331981e-06, "loss": 1.0019, "step": 1219 }, { "epoch": 2.4746450304259637, "grad_norm": 0.302734375, "learning_rate": 1.7511832319134553e-06, "loss": 0.9993, "step": 1220 }, { "epoch": 2.4766734279918863, "grad_norm": 0.2734375, "learning_rate": 1.7444219066937122e-06, "loss": 0.9909, "step": 1221 }, { "epoch": 2.4787018255578093, "grad_norm": 0.26953125, "learning_rate": 1.737660581473969e-06, "loss": 1.0211, "step": 1222 }, { "epoch": 2.4807302231237323, "grad_norm": 0.275390625, "learning_rate": 1.730899256254226e-06, "loss": 1.0205, "step": 1223 }, { "epoch": 2.4827586206896552, "grad_norm": 0.37109375, "learning_rate": 1.724137931034483e-06, "loss": 1.0238, "step": 1224 }, { "epoch": 2.4847870182555782, "grad_norm": 0.25390625, "learning_rate": 1.7173766058147398e-06, "loss": 0.9695, "step": 1225 }, { "epoch": 2.486815415821501, "grad_norm": 0.3359375, "learning_rate": 1.710615280594997e-06, "loss": 0.9944, "step": 1226 }, { "epoch": 2.4888438133874238, "grad_norm": 0.251953125, "learning_rate": 1.7038539553752536e-06, "loss": 1.0094, "step": 1227 }, { "epoch": 2.4908722109533468, "grad_norm": 0.265625, "learning_rate": 1.6970926301555108e-06, "loss": 1.0162, "step": 1228 }, { "epoch": 2.4929006085192698, "grad_norm": 0.263671875, "learning_rate": 1.6903313049357675e-06, "loss": 1.0123, "step": 1229 }, { "epoch": 2.4949290060851927, "grad_norm": 0.2490234375, "learning_rate": 1.6835699797160246e-06, "loss": 1.0224, "step": 1230 }, { "epoch": 2.4969574036511157, "grad_norm": 0.255859375, "learning_rate": 1.6768086544962813e-06, "loss": 0.9803, "step": 1231 }, { "epoch": 2.4989858012170387, "grad_norm": 0.259765625, "learning_rate": 1.6700473292765384e-06, "loss": 1.0051, "step": 1232 }, { "epoch": 2.5010141987829613, "grad_norm": 0.2578125, "learning_rate": 1.6632860040567953e-06, "loss": 1.0288, "step": 1233 }, { "epoch": 2.5030425963488843, "grad_norm": 0.25, "learning_rate": 1.6565246788370522e-06, "loss": 1.0262, "step": 1234 }, { "epoch": 2.5050709939148073, "grad_norm": 0.294921875, "learning_rate": 1.6497633536173092e-06, "loss": 1.0119, "step": 1235 }, { "epoch": 2.5070993914807302, "grad_norm": 0.30859375, "learning_rate": 1.6430020283975663e-06, "loss": 1.0361, "step": 1236 }, { "epoch": 2.5091277890466532, "grad_norm": 0.30078125, "learning_rate": 1.636240703177823e-06, "loss": 0.9858, "step": 1237 }, { "epoch": 2.5111561866125762, "grad_norm": 0.259765625, "learning_rate": 1.62947937795808e-06, "loss": 1.0318, "step": 1238 }, { "epoch": 2.513184584178499, "grad_norm": 0.244140625, "learning_rate": 1.6227180527383368e-06, "loss": 1.0007, "step": 1239 }, { "epoch": 2.5152129817444218, "grad_norm": 0.248046875, "learning_rate": 1.615956727518594e-06, "loss": 1.0325, "step": 1240 }, { "epoch": 2.5172413793103448, "grad_norm": 0.302734375, "learning_rate": 1.6091954022988506e-06, "loss": 1.0019, "step": 1241 }, { "epoch": 2.5192697768762677, "grad_norm": 0.337890625, "learning_rate": 1.6024340770791077e-06, "loss": 1.0205, "step": 1242 }, { "epoch": 2.5212981744421907, "grad_norm": 0.283203125, "learning_rate": 1.5956727518593644e-06, "loss": 1.0035, "step": 1243 }, { "epoch": 2.5233265720081137, "grad_norm": 0.314453125, "learning_rate": 1.5889114266396216e-06, "loss": 1.0094, "step": 1244 }, { "epoch": 2.5253549695740363, "grad_norm": 0.294921875, "learning_rate": 1.5821501014198785e-06, "loss": 1.0204, "step": 1245 }, { "epoch": 2.5273833671399597, "grad_norm": 0.298828125, "learning_rate": 1.5753887762001354e-06, "loss": 0.9897, "step": 1246 }, { "epoch": 2.5294117647058822, "grad_norm": 0.2578125, "learning_rate": 1.5686274509803923e-06, "loss": 1.051, "step": 1247 }, { "epoch": 2.5314401622718052, "grad_norm": 0.2734375, "learning_rate": 1.5618661257606494e-06, "loss": 0.9939, "step": 1248 }, { "epoch": 2.5334685598377282, "grad_norm": 0.2578125, "learning_rate": 1.5551048005409061e-06, "loss": 1.0292, "step": 1249 }, { "epoch": 2.535496957403651, "grad_norm": 0.388671875, "learning_rate": 1.5483434753211632e-06, "loss": 1.1074, "step": 1250 }, { "epoch": 2.537525354969574, "grad_norm": 0.412109375, "learning_rate": 1.54158215010142e-06, "loss": 0.9867, "step": 1251 }, { "epoch": 2.5395537525354968, "grad_norm": 0.255859375, "learning_rate": 1.534820824881677e-06, "loss": 1.0396, "step": 1252 }, { "epoch": 2.5415821501014197, "grad_norm": 0.28125, "learning_rate": 1.5280594996619338e-06, "loss": 0.9953, "step": 1253 }, { "epoch": 2.5436105476673427, "grad_norm": 0.279296875, "learning_rate": 1.5212981744421909e-06, "loss": 1.0594, "step": 1254 }, { "epoch": 2.5456389452332657, "grad_norm": 0.296875, "learning_rate": 1.5145368492224478e-06, "loss": 1.0398, "step": 1255 }, { "epoch": 2.5476673427991887, "grad_norm": 0.25, "learning_rate": 1.5077755240027047e-06, "loss": 1.0247, "step": 1256 }, { "epoch": 2.5496957403651117, "grad_norm": 0.263671875, "learning_rate": 1.5010141987829616e-06, "loss": 1.003, "step": 1257 }, { "epoch": 2.5517241379310347, "grad_norm": 0.28515625, "learning_rate": 1.4942528735632185e-06, "loss": 1.0286, "step": 1258 }, { "epoch": 2.5537525354969572, "grad_norm": 0.25, "learning_rate": 1.4874915483434755e-06, "loss": 1.0062, "step": 1259 }, { "epoch": 2.5557809330628802, "grad_norm": 0.2490234375, "learning_rate": 1.4807302231237326e-06, "loss": 1.0012, "step": 1260 }, { "epoch": 2.5578093306288032, "grad_norm": 0.251953125, "learning_rate": 1.4739688979039893e-06, "loss": 1.0322, "step": 1261 }, { "epoch": 2.559837728194726, "grad_norm": 0.255859375, "learning_rate": 1.4672075726842464e-06, "loss": 1.0477, "step": 1262 }, { "epoch": 2.561866125760649, "grad_norm": 0.255859375, "learning_rate": 1.460446247464503e-06, "loss": 1.0406, "step": 1263 }, { "epoch": 2.5638945233265718, "grad_norm": 0.25390625, "learning_rate": 1.4536849222447602e-06, "loss": 1.0073, "step": 1264 }, { "epoch": 2.565922920892495, "grad_norm": 0.25390625, "learning_rate": 1.446923597025017e-06, "loss": 1.0306, "step": 1265 }, { "epoch": 2.5679513184584177, "grad_norm": 0.28125, "learning_rate": 1.440162271805274e-06, "loss": 0.9963, "step": 1266 }, { "epoch": 2.5699797160243407, "grad_norm": 0.26953125, "learning_rate": 1.433400946585531e-06, "loss": 0.9997, "step": 1267 }, { "epoch": 2.5720081135902637, "grad_norm": 0.384765625, "learning_rate": 1.4266396213657879e-06, "loss": 1.0072, "step": 1268 }, { "epoch": 2.5740365111561867, "grad_norm": 0.2470703125, "learning_rate": 1.4198782961460448e-06, "loss": 1.0053, "step": 1269 }, { "epoch": 2.5760649087221097, "grad_norm": 0.25390625, "learning_rate": 1.4131169709263017e-06, "loss": 1.029, "step": 1270 }, { "epoch": 2.5780933062880322, "grad_norm": 0.275390625, "learning_rate": 1.4063556457065586e-06, "loss": 1.0295, "step": 1271 }, { "epoch": 2.5801217038539552, "grad_norm": 0.30078125, "learning_rate": 1.3995943204868157e-06, "loss": 1.0263, "step": 1272 }, { "epoch": 2.582150101419878, "grad_norm": 0.255859375, "learning_rate": 1.3928329952670724e-06, "loss": 0.997, "step": 1273 }, { "epoch": 2.584178498985801, "grad_norm": 0.2578125, "learning_rate": 1.3860716700473295e-06, "loss": 1.0262, "step": 1274 }, { "epoch": 2.586206896551724, "grad_norm": 0.25390625, "learning_rate": 1.3793103448275862e-06, "loss": 1.0284, "step": 1275 }, { "epoch": 2.588235294117647, "grad_norm": 0.263671875, "learning_rate": 1.3725490196078434e-06, "loss": 0.9981, "step": 1276 }, { "epoch": 2.59026369168357, "grad_norm": 0.259765625, "learning_rate": 1.3657876943881e-06, "loss": 1.0424, "step": 1277 }, { "epoch": 2.5922920892494927, "grad_norm": 0.28515625, "learning_rate": 1.3590263691683572e-06, "loss": 1.0289, "step": 1278 }, { "epoch": 2.5943204868154157, "grad_norm": 0.25390625, "learning_rate": 1.352265043948614e-06, "loss": 1.0007, "step": 1279 }, { "epoch": 2.5963488843813387, "grad_norm": 0.25390625, "learning_rate": 1.345503718728871e-06, "loss": 1.0017, "step": 1280 }, { "epoch": 2.5983772819472617, "grad_norm": 0.263671875, "learning_rate": 1.338742393509128e-06, "loss": 1.0152, "step": 1281 }, { "epoch": 2.6004056795131847, "grad_norm": 0.37109375, "learning_rate": 1.331981068289385e-06, "loss": 1.0119, "step": 1282 }, { "epoch": 2.6024340770791072, "grad_norm": 0.296875, "learning_rate": 1.3252197430696418e-06, "loss": 1.0538, "step": 1283 }, { "epoch": 2.6044624746450307, "grad_norm": 0.267578125, "learning_rate": 1.3184584178498989e-06, "loss": 1.0105, "step": 1284 }, { "epoch": 2.606490872210953, "grad_norm": 0.25390625, "learning_rate": 1.3116970926301556e-06, "loss": 0.9894, "step": 1285 }, { "epoch": 2.608519269776876, "grad_norm": 0.3515625, "learning_rate": 1.3049357674104127e-06, "loss": 0.9832, "step": 1286 }, { "epoch": 2.610547667342799, "grad_norm": 0.26171875, "learning_rate": 1.2981744421906694e-06, "loss": 1.0766, "step": 1287 }, { "epoch": 2.612576064908722, "grad_norm": 0.2578125, "learning_rate": 1.2914131169709265e-06, "loss": 1.0027, "step": 1288 }, { "epoch": 2.614604462474645, "grad_norm": 0.255859375, "learning_rate": 1.2846517917511832e-06, "loss": 1.0245, "step": 1289 }, { "epoch": 2.6166328600405677, "grad_norm": 0.25390625, "learning_rate": 1.2778904665314403e-06, "loss": 1.065, "step": 1290 }, { "epoch": 2.6186612576064907, "grad_norm": 0.33203125, "learning_rate": 1.2711291413116973e-06, "loss": 1.0342, "step": 1291 }, { "epoch": 2.6206896551724137, "grad_norm": 0.275390625, "learning_rate": 1.2643678160919542e-06, "loss": 1.045, "step": 1292 }, { "epoch": 2.6227180527383367, "grad_norm": 0.31640625, "learning_rate": 1.257606490872211e-06, "loss": 1.0383, "step": 1293 }, { "epoch": 2.6247464503042597, "grad_norm": 0.255859375, "learning_rate": 1.2508451656524682e-06, "loss": 0.9908, "step": 1294 }, { "epoch": 2.6267748478701827, "grad_norm": 0.255859375, "learning_rate": 1.244083840432725e-06, "loss": 1.048, "step": 1295 }, { "epoch": 2.6288032454361057, "grad_norm": 0.271484375, "learning_rate": 1.2373225152129818e-06, "loss": 1.0501, "step": 1296 }, { "epoch": 2.630831643002028, "grad_norm": 0.25, "learning_rate": 1.2305611899932387e-06, "loss": 0.9794, "step": 1297 }, { "epoch": 2.632860040567951, "grad_norm": 0.28125, "learning_rate": 1.2237998647734956e-06, "loss": 0.9768, "step": 1298 }, { "epoch": 2.634888438133874, "grad_norm": 0.248046875, "learning_rate": 1.2170385395537525e-06, "loss": 1.0291, "step": 1299 }, { "epoch": 2.636916835699797, "grad_norm": 0.248046875, "learning_rate": 1.2102772143340095e-06, "loss": 1.0042, "step": 1300 }, { "epoch": 2.63894523326572, "grad_norm": 0.267578125, "learning_rate": 1.2035158891142666e-06, "loss": 1.049, "step": 1301 }, { "epoch": 2.640973630831643, "grad_norm": 0.2451171875, "learning_rate": 1.1967545638945235e-06, "loss": 1.0256, "step": 1302 }, { "epoch": 2.643002028397566, "grad_norm": 0.318359375, "learning_rate": 1.1899932386747804e-06, "loss": 0.9939, "step": 1303 }, { "epoch": 2.6450304259634887, "grad_norm": 0.259765625, "learning_rate": 1.1832319134550373e-06, "loss": 1.0379, "step": 1304 }, { "epoch": 2.6470588235294117, "grad_norm": 0.31640625, "learning_rate": 1.1764705882352942e-06, "loss": 1.0315, "step": 1305 }, { "epoch": 2.6490872210953347, "grad_norm": 0.25390625, "learning_rate": 1.1697092630155511e-06, "loss": 1.0205, "step": 1306 }, { "epoch": 2.6511156186612577, "grad_norm": 0.251953125, "learning_rate": 1.162947937795808e-06, "loss": 1.0316, "step": 1307 }, { "epoch": 2.6531440162271807, "grad_norm": 0.255859375, "learning_rate": 1.156186612576065e-06, "loss": 1.05, "step": 1308 }, { "epoch": 2.655172413793103, "grad_norm": 0.248046875, "learning_rate": 1.1494252873563219e-06, "loss": 1.0154, "step": 1309 }, { "epoch": 2.6572008113590266, "grad_norm": 0.255859375, "learning_rate": 1.1426639621365788e-06, "loss": 1.0074, "step": 1310 }, { "epoch": 2.659229208924949, "grad_norm": 0.275390625, "learning_rate": 1.1359026369168357e-06, "loss": 1.0492, "step": 1311 }, { "epoch": 2.661257606490872, "grad_norm": 0.396484375, "learning_rate": 1.1291413116970926e-06, "loss": 1.0047, "step": 1312 }, { "epoch": 2.663286004056795, "grad_norm": 0.2734375, "learning_rate": 1.1223799864773497e-06, "loss": 1.0508, "step": 1313 }, { "epoch": 2.665314401622718, "grad_norm": 0.25390625, "learning_rate": 1.1156186612576066e-06, "loss": 1.0369, "step": 1314 }, { "epoch": 2.667342799188641, "grad_norm": 0.25390625, "learning_rate": 1.1088573360378636e-06, "loss": 1.0213, "step": 1315 }, { "epoch": 2.6693711967545637, "grad_norm": 0.2578125, "learning_rate": 1.1020960108181205e-06, "loss": 1.0402, "step": 1316 }, { "epoch": 2.6713995943204867, "grad_norm": 0.494140625, "learning_rate": 1.0953346855983774e-06, "loss": 0.9848, "step": 1317 }, { "epoch": 2.6734279918864097, "grad_norm": 0.296875, "learning_rate": 1.0885733603786343e-06, "loss": 1.0522, "step": 1318 }, { "epoch": 2.6754563894523327, "grad_norm": 0.26171875, "learning_rate": 1.0818120351588912e-06, "loss": 1.0088, "step": 1319 }, { "epoch": 2.6774847870182557, "grad_norm": 0.283203125, "learning_rate": 1.0750507099391481e-06, "loss": 1.0378, "step": 1320 }, { "epoch": 2.6795131845841786, "grad_norm": 0.341796875, "learning_rate": 1.068289384719405e-06, "loss": 1.0448, "step": 1321 }, { "epoch": 2.6815415821501016, "grad_norm": 0.291015625, "learning_rate": 1.061528059499662e-06, "loss": 1.0244, "step": 1322 }, { "epoch": 2.683569979716024, "grad_norm": 0.259765625, "learning_rate": 1.0547667342799188e-06, "loss": 1.0069, "step": 1323 }, { "epoch": 2.685598377281947, "grad_norm": 0.267578125, "learning_rate": 1.048005409060176e-06, "loss": 1.042, "step": 1324 }, { "epoch": 2.68762677484787, "grad_norm": 0.248046875, "learning_rate": 1.0412440838404329e-06, "loss": 1.0167, "step": 1325 }, { "epoch": 2.689655172413793, "grad_norm": 0.294921875, "learning_rate": 1.0344827586206898e-06, "loss": 1.0087, "step": 1326 }, { "epoch": 2.691683569979716, "grad_norm": 0.255859375, "learning_rate": 1.0277214334009467e-06, "loss": 1.0285, "step": 1327 }, { "epoch": 2.6937119675456387, "grad_norm": 0.26171875, "learning_rate": 1.0209601081812036e-06, "loss": 1.0014, "step": 1328 }, { "epoch": 2.695740365111562, "grad_norm": 0.2451171875, "learning_rate": 1.0141987829614605e-06, "loss": 1.0103, "step": 1329 }, { "epoch": 2.6977687626774847, "grad_norm": 0.25, "learning_rate": 1.0074374577417174e-06, "loss": 1.0248, "step": 1330 }, { "epoch": 2.6997971602434077, "grad_norm": 0.248046875, "learning_rate": 1.0006761325219743e-06, "loss": 1.0008, "step": 1331 }, { "epoch": 2.7018255578093306, "grad_norm": 0.2490234375, "learning_rate": 9.939148073022313e-07, "loss": 0.9954, "step": 1332 }, { "epoch": 2.7038539553752536, "grad_norm": 0.25390625, "learning_rate": 9.871534820824882e-07, "loss": 1.0192, "step": 1333 }, { "epoch": 2.7058823529411766, "grad_norm": 0.2470703125, "learning_rate": 9.80392156862745e-07, "loss": 1.0073, "step": 1334 }, { "epoch": 2.707910750507099, "grad_norm": 0.328125, "learning_rate": 9.73630831643002e-07, "loss": 1.0504, "step": 1335 }, { "epoch": 2.709939148073022, "grad_norm": 0.37890625, "learning_rate": 9.668695064232591e-07, "loss": 1.0183, "step": 1336 }, { "epoch": 2.711967545638945, "grad_norm": 0.255859375, "learning_rate": 9.60108181203516e-07, "loss": 1.0052, "step": 1337 }, { "epoch": 2.713995943204868, "grad_norm": 0.25, "learning_rate": 9.533468559837728e-07, "loss": 1.0111, "step": 1338 }, { "epoch": 2.716024340770791, "grad_norm": 0.244140625, "learning_rate": 9.465855307640299e-07, "loss": 1.0238, "step": 1339 }, { "epoch": 2.718052738336714, "grad_norm": 0.265625, "learning_rate": 9.398242055442868e-07, "loss": 1.0333, "step": 1340 }, { "epoch": 2.720081135902637, "grad_norm": 0.2490234375, "learning_rate": 9.330628803245437e-07, "loss": 1.0292, "step": 1341 }, { "epoch": 2.7221095334685597, "grad_norm": 0.244140625, "learning_rate": 9.263015551048006e-07, "loss": 0.9643, "step": 1342 }, { "epoch": 2.7241379310344827, "grad_norm": 0.251953125, "learning_rate": 9.195402298850575e-07, "loss": 1.0272, "step": 1343 }, { "epoch": 2.7261663286004056, "grad_norm": 0.263671875, "learning_rate": 9.127789046653145e-07, "loss": 1.0355, "step": 1344 }, { "epoch": 2.7281947261663286, "grad_norm": 0.255859375, "learning_rate": 9.060175794455714e-07, "loss": 1.0653, "step": 1345 }, { "epoch": 2.7302231237322516, "grad_norm": 0.26171875, "learning_rate": 8.992562542258283e-07, "loss": 1.0541, "step": 1346 }, { "epoch": 2.732251521298174, "grad_norm": 0.279296875, "learning_rate": 8.924949290060852e-07, "loss": 1.0, "step": 1347 }, { "epoch": 2.7342799188640976, "grad_norm": 0.291015625, "learning_rate": 8.857336037863422e-07, "loss": 1.0285, "step": 1348 }, { "epoch": 2.73630831643002, "grad_norm": 0.25390625, "learning_rate": 8.789722785665991e-07, "loss": 1.0156, "step": 1349 }, { "epoch": 2.738336713995943, "grad_norm": 0.291015625, "learning_rate": 8.722109533468561e-07, "loss": 1.0228, "step": 1350 }, { "epoch": 2.740365111561866, "grad_norm": 0.255859375, "learning_rate": 8.65449628127113e-07, "loss": 1.0123, "step": 1351 }, { "epoch": 2.742393509127789, "grad_norm": 0.25390625, "learning_rate": 8.586883029073699e-07, "loss": 1.0359, "step": 1352 }, { "epoch": 2.744421906693712, "grad_norm": 0.291015625, "learning_rate": 8.519269776876268e-07, "loss": 1.0283, "step": 1353 }, { "epoch": 2.7464503042596347, "grad_norm": 0.251953125, "learning_rate": 8.451656524678837e-07, "loss": 1.0165, "step": 1354 }, { "epoch": 2.7484787018255576, "grad_norm": 0.25390625, "learning_rate": 8.384043272481406e-07, "loss": 1.0471, "step": 1355 }, { "epoch": 2.7505070993914806, "grad_norm": 0.25, "learning_rate": 8.316430020283977e-07, "loss": 1.037, "step": 1356 }, { "epoch": 2.7525354969574036, "grad_norm": 0.357421875, "learning_rate": 8.248816768086546e-07, "loss": 1.0044, "step": 1357 }, { "epoch": 2.7545638945233266, "grad_norm": 0.259765625, "learning_rate": 8.181203515889115e-07, "loss": 1.0651, "step": 1358 }, { "epoch": 2.7565922920892496, "grad_norm": 0.255859375, "learning_rate": 8.113590263691684e-07, "loss": 1.0226, "step": 1359 }, { "epoch": 2.7586206896551726, "grad_norm": 0.35546875, "learning_rate": 8.045977011494253e-07, "loss": 1.0059, "step": 1360 }, { "epoch": 2.760649087221095, "grad_norm": 0.251953125, "learning_rate": 7.978363759296822e-07, "loss": 0.9958, "step": 1361 }, { "epoch": 2.762677484787018, "grad_norm": 0.251953125, "learning_rate": 7.910750507099392e-07, "loss": 1.0321, "step": 1362 }, { "epoch": 2.764705882352941, "grad_norm": 0.255859375, "learning_rate": 7.843137254901962e-07, "loss": 1.0443, "step": 1363 }, { "epoch": 2.766734279918864, "grad_norm": 0.2578125, "learning_rate": 7.775524002704531e-07, "loss": 1.0088, "step": 1364 }, { "epoch": 2.768762677484787, "grad_norm": 0.251953125, "learning_rate": 7.7079107505071e-07, "loss": 1.0268, "step": 1365 }, { "epoch": 2.77079107505071, "grad_norm": 0.26171875, "learning_rate": 7.640297498309669e-07, "loss": 1.0209, "step": 1366 }, { "epoch": 2.772819472616633, "grad_norm": 0.287109375, "learning_rate": 7.572684246112239e-07, "loss": 0.9831, "step": 1367 }, { "epoch": 2.7748478701825556, "grad_norm": 0.275390625, "learning_rate": 7.505070993914808e-07, "loss": 0.9727, "step": 1368 }, { "epoch": 2.7768762677484786, "grad_norm": 0.25390625, "learning_rate": 7.437457741717377e-07, "loss": 1.012, "step": 1369 }, { "epoch": 2.7789046653144016, "grad_norm": 0.27734375, "learning_rate": 7.369844489519946e-07, "loss": 1.0417, "step": 1370 }, { "epoch": 2.7809330628803246, "grad_norm": 0.25, "learning_rate": 7.302231237322515e-07, "loss": 0.9739, "step": 1371 }, { "epoch": 2.7829614604462476, "grad_norm": 0.2490234375, "learning_rate": 7.234617985125085e-07, "loss": 1.0208, "step": 1372 }, { "epoch": 2.78498985801217, "grad_norm": 0.26953125, "learning_rate": 7.167004732927655e-07, "loss": 1.0333, "step": 1373 }, { "epoch": 2.7870182555780936, "grad_norm": 0.2578125, "learning_rate": 7.099391480730224e-07, "loss": 0.9805, "step": 1374 }, { "epoch": 2.789046653144016, "grad_norm": 0.26171875, "learning_rate": 7.031778228532793e-07, "loss": 1.0046, "step": 1375 }, { "epoch": 2.791075050709939, "grad_norm": 0.3046875, "learning_rate": 6.964164976335362e-07, "loss": 1.0241, "step": 1376 }, { "epoch": 2.793103448275862, "grad_norm": 0.25, "learning_rate": 6.896551724137931e-07, "loss": 1.0465, "step": 1377 }, { "epoch": 2.795131845841785, "grad_norm": 0.25, "learning_rate": 6.8289384719405e-07, "loss": 0.9962, "step": 1378 }, { "epoch": 2.797160243407708, "grad_norm": 0.25, "learning_rate": 6.76132521974307e-07, "loss": 1.0428, "step": 1379 }, { "epoch": 2.7991886409736306, "grad_norm": 0.2451171875, "learning_rate": 6.69371196754564e-07, "loss": 0.9963, "step": 1380 }, { "epoch": 2.8012170385395536, "grad_norm": 0.25390625, "learning_rate": 6.626098715348209e-07, "loss": 0.9982, "step": 1381 }, { "epoch": 2.8032454361054766, "grad_norm": 0.28125, "learning_rate": 6.558485463150778e-07, "loss": 1.038, "step": 1382 }, { "epoch": 2.8052738336713996, "grad_norm": 0.255859375, "learning_rate": 6.490872210953347e-07, "loss": 1.0555, "step": 1383 }, { "epoch": 2.8073022312373226, "grad_norm": 0.330078125, "learning_rate": 6.423258958755916e-07, "loss": 1.0506, "step": 1384 }, { "epoch": 2.8093306288032456, "grad_norm": 0.2578125, "learning_rate": 6.355645706558486e-07, "loss": 0.986, "step": 1385 }, { "epoch": 2.8113590263691686, "grad_norm": 0.251953125, "learning_rate": 6.288032454361055e-07, "loss": 1.0352, "step": 1386 }, { "epoch": 2.813387423935091, "grad_norm": 0.25390625, "learning_rate": 6.220419202163624e-07, "loss": 1.0198, "step": 1387 }, { "epoch": 2.815415821501014, "grad_norm": 0.263671875, "learning_rate": 6.152805949966194e-07, "loss": 1.0464, "step": 1388 }, { "epoch": 2.817444219066937, "grad_norm": 0.2470703125, "learning_rate": 6.085192697768763e-07, "loss": 1.029, "step": 1389 }, { "epoch": 2.81947261663286, "grad_norm": 0.26171875, "learning_rate": 6.017579445571333e-07, "loss": 1.0276, "step": 1390 }, { "epoch": 2.821501014198783, "grad_norm": 0.255859375, "learning_rate": 5.949966193373902e-07, "loss": 1.0087, "step": 1391 }, { "epoch": 2.8235294117647056, "grad_norm": 0.265625, "learning_rate": 5.882352941176471e-07, "loss": 0.9787, "step": 1392 }, { "epoch": 2.825557809330629, "grad_norm": 0.283203125, "learning_rate": 5.81473968897904e-07, "loss": 0.9883, "step": 1393 }, { "epoch": 2.8275862068965516, "grad_norm": 0.3359375, "learning_rate": 5.747126436781609e-07, "loss": 1.0263, "step": 1394 }, { "epoch": 2.8296146044624746, "grad_norm": 0.30859375, "learning_rate": 5.679513184584178e-07, "loss": 1.0101, "step": 1395 }, { "epoch": 2.8316430020283976, "grad_norm": 0.255859375, "learning_rate": 5.611899932386749e-07, "loss": 1.0315, "step": 1396 }, { "epoch": 2.8336713995943206, "grad_norm": 0.2578125, "learning_rate": 5.544286680189318e-07, "loss": 1.0142, "step": 1397 }, { "epoch": 2.8356997971602436, "grad_norm": 0.251953125, "learning_rate": 5.476673427991887e-07, "loss": 1.007, "step": 1398 }, { "epoch": 2.837728194726166, "grad_norm": 0.26953125, "learning_rate": 5.409060175794456e-07, "loss": 1.0391, "step": 1399 }, { "epoch": 2.839756592292089, "grad_norm": 0.2578125, "learning_rate": 5.341446923597025e-07, "loss": 1.0453, "step": 1400 }, { "epoch": 2.841784989858012, "grad_norm": 0.255859375, "learning_rate": 5.273833671399594e-07, "loss": 1.0113, "step": 1401 }, { "epoch": 2.843813387423935, "grad_norm": 0.251953125, "learning_rate": 5.206220419202164e-07, "loss": 1.026, "step": 1402 }, { "epoch": 2.845841784989858, "grad_norm": 0.251953125, "learning_rate": 5.138607167004734e-07, "loss": 1.0115, "step": 1403 }, { "epoch": 2.847870182555781, "grad_norm": 0.27734375, "learning_rate": 5.070993914807303e-07, "loss": 1.0217, "step": 1404 }, { "epoch": 2.849898580121704, "grad_norm": 0.2578125, "learning_rate": 5.003380662609872e-07, "loss": 1.0084, "step": 1405 }, { "epoch": 2.8519269776876266, "grad_norm": 0.314453125, "learning_rate": 4.935767410412441e-07, "loss": 0.988, "step": 1406 }, { "epoch": 2.8539553752535496, "grad_norm": 0.291015625, "learning_rate": 4.86815415821501e-07, "loss": 1.0294, "step": 1407 }, { "epoch": 2.8559837728194726, "grad_norm": 0.2451171875, "learning_rate": 4.80054090601758e-07, "loss": 1.0212, "step": 1408 }, { "epoch": 2.8580121703853956, "grad_norm": 0.259765625, "learning_rate": 4.732927653820149e-07, "loss": 1.043, "step": 1409 }, { "epoch": 2.8600405679513186, "grad_norm": 0.3125, "learning_rate": 4.6653144016227184e-07, "loss": 0.9945, "step": 1410 }, { "epoch": 2.862068965517241, "grad_norm": 0.291015625, "learning_rate": 4.5977011494252875e-07, "loss": 0.9841, "step": 1411 }, { "epoch": 2.8640973630831645, "grad_norm": 0.2490234375, "learning_rate": 4.530087897227857e-07, "loss": 1.0136, "step": 1412 }, { "epoch": 2.866125760649087, "grad_norm": 0.3515625, "learning_rate": 4.462474645030426e-07, "loss": 1.0142, "step": 1413 }, { "epoch": 2.86815415821501, "grad_norm": 0.271484375, "learning_rate": 4.3948613928329954e-07, "loss": 1.022, "step": 1414 }, { "epoch": 2.870182555780933, "grad_norm": 0.25390625, "learning_rate": 4.327248140635565e-07, "loss": 1.0315, "step": 1415 }, { "epoch": 2.872210953346856, "grad_norm": 0.2451171875, "learning_rate": 4.259634888438134e-07, "loss": 1.0136, "step": 1416 }, { "epoch": 2.874239350912779, "grad_norm": 0.2451171875, "learning_rate": 4.192021636240703e-07, "loss": 1.006, "step": 1417 }, { "epoch": 2.8762677484787016, "grad_norm": 0.2470703125, "learning_rate": 4.124408384043273e-07, "loss": 0.9906, "step": 1418 }, { "epoch": 2.8782961460446246, "grad_norm": 0.2734375, "learning_rate": 4.056795131845842e-07, "loss": 1.0487, "step": 1419 }, { "epoch": 2.8803245436105476, "grad_norm": 0.26171875, "learning_rate": 3.989181879648411e-07, "loss": 1.0244, "step": 1420 }, { "epoch": 2.8823529411764706, "grad_norm": 0.296875, "learning_rate": 3.921568627450981e-07, "loss": 1.0045, "step": 1421 }, { "epoch": 2.8843813387423936, "grad_norm": 0.259765625, "learning_rate": 3.85395537525355e-07, "loss": 1.0387, "step": 1422 }, { "epoch": 2.8864097363083165, "grad_norm": 0.265625, "learning_rate": 3.7863421230561195e-07, "loss": 1.0187, "step": 1423 }, { "epoch": 2.8884381338742395, "grad_norm": 0.287109375, "learning_rate": 3.7187288708586886e-07, "loss": 0.9929, "step": 1424 }, { "epoch": 2.890466531440162, "grad_norm": 0.25390625, "learning_rate": 3.651115618661258e-07, "loss": 1.013, "step": 1425 }, { "epoch": 2.892494929006085, "grad_norm": 0.314453125, "learning_rate": 3.5835023664638274e-07, "loss": 0.9687, "step": 1426 }, { "epoch": 2.894523326572008, "grad_norm": 0.267578125, "learning_rate": 3.5158891142663965e-07, "loss": 1.0375, "step": 1427 }, { "epoch": 2.896551724137931, "grad_norm": 0.26171875, "learning_rate": 3.4482758620689656e-07, "loss": 1.0068, "step": 1428 }, { "epoch": 2.898580121703854, "grad_norm": 0.248046875, "learning_rate": 3.380662609871535e-07, "loss": 1.0095, "step": 1429 }, { "epoch": 2.900608519269777, "grad_norm": 0.25, "learning_rate": 3.3130493576741044e-07, "loss": 1.0516, "step": 1430 }, { "epoch": 2.9026369168357, "grad_norm": 0.345703125, "learning_rate": 3.2454361054766735e-07, "loss": 0.9735, "step": 1431 }, { "epoch": 2.9046653144016226, "grad_norm": 0.24609375, "learning_rate": 3.177822853279243e-07, "loss": 0.9895, "step": 1432 }, { "epoch": 2.9066937119675456, "grad_norm": 0.25390625, "learning_rate": 3.110209601081812e-07, "loss": 1.0294, "step": 1433 }, { "epoch": 2.9087221095334685, "grad_norm": 0.2470703125, "learning_rate": 3.0425963488843814e-07, "loss": 0.9815, "step": 1434 }, { "epoch": 2.9107505070993915, "grad_norm": 0.25390625, "learning_rate": 2.974983096686951e-07, "loss": 1.0464, "step": 1435 }, { "epoch": 2.9127789046653145, "grad_norm": 0.296875, "learning_rate": 2.90736984448952e-07, "loss": 0.9971, "step": 1436 }, { "epoch": 2.914807302231237, "grad_norm": 0.265625, "learning_rate": 2.839756592292089e-07, "loss": 1.0677, "step": 1437 }, { "epoch": 2.9168356997971605, "grad_norm": 0.310546875, "learning_rate": 2.772143340094659e-07, "loss": 1.0104, "step": 1438 }, { "epoch": 2.918864097363083, "grad_norm": 0.283203125, "learning_rate": 2.704530087897228e-07, "loss": 1.011, "step": 1439 }, { "epoch": 2.920892494929006, "grad_norm": 0.2890625, "learning_rate": 2.636916835699797e-07, "loss": 0.9893, "step": 1440 }, { "epoch": 2.922920892494929, "grad_norm": 0.2490234375, "learning_rate": 2.569303583502367e-07, "loss": 1.0341, "step": 1441 }, { "epoch": 2.924949290060852, "grad_norm": 0.25390625, "learning_rate": 2.501690331304936e-07, "loss": 1.0429, "step": 1442 }, { "epoch": 2.926977687626775, "grad_norm": 0.25, "learning_rate": 2.434077079107505e-07, "loss": 0.9993, "step": 1443 }, { "epoch": 2.9290060851926976, "grad_norm": 0.279296875, "learning_rate": 2.3664638269100746e-07, "loss": 1.0904, "step": 1444 }, { "epoch": 2.9310344827586206, "grad_norm": 0.2578125, "learning_rate": 2.2988505747126437e-07, "loss": 1.0504, "step": 1445 }, { "epoch": 2.9330628803245435, "grad_norm": 0.25, "learning_rate": 2.231237322515213e-07, "loss": 0.9868, "step": 1446 }, { "epoch": 2.9350912778904665, "grad_norm": 0.2470703125, "learning_rate": 2.1636240703177825e-07, "loss": 1.0201, "step": 1447 }, { "epoch": 2.9371196754563895, "grad_norm": 0.25, "learning_rate": 2.0960108181203516e-07, "loss": 1.068, "step": 1448 }, { "epoch": 2.9391480730223125, "grad_norm": 0.2734375, "learning_rate": 2.028397565922921e-07, "loss": 1.0115, "step": 1449 }, { "epoch": 2.9411764705882355, "grad_norm": 0.2451171875, "learning_rate": 1.9607843137254904e-07, "loss": 0.9892, "step": 1450 }, { "epoch": 2.943204868154158, "grad_norm": 0.68359375, "learning_rate": 1.8931710615280598e-07, "loss": 1.0336, "step": 1451 }, { "epoch": 2.945233265720081, "grad_norm": 0.26953125, "learning_rate": 1.825557809330629e-07, "loss": 1.0369, "step": 1452 }, { "epoch": 2.947261663286004, "grad_norm": 0.251953125, "learning_rate": 1.7579445571331983e-07, "loss": 0.9911, "step": 1453 }, { "epoch": 2.949290060851927, "grad_norm": 0.25390625, "learning_rate": 1.6903313049357676e-07, "loss": 1.0183, "step": 1454 }, { "epoch": 2.95131845841785, "grad_norm": 0.259765625, "learning_rate": 1.6227180527383367e-07, "loss": 1.0408, "step": 1455 }, { "epoch": 2.9533468559837726, "grad_norm": 0.369140625, "learning_rate": 1.555104800540906e-07, "loss": 1.0125, "step": 1456 }, { "epoch": 2.955375253549696, "grad_norm": 0.255859375, "learning_rate": 1.4874915483434755e-07, "loss": 1.0095, "step": 1457 }, { "epoch": 2.9574036511156185, "grad_norm": 0.255859375, "learning_rate": 1.4198782961460446e-07, "loss": 1.0189, "step": 1458 }, { "epoch": 2.9594320486815415, "grad_norm": 0.3203125, "learning_rate": 1.352265043948614e-07, "loss": 1.0923, "step": 1459 }, { "epoch": 2.9614604462474645, "grad_norm": 0.2490234375, "learning_rate": 1.2846517917511834e-07, "loss": 1.011, "step": 1460 }, { "epoch": 2.9634888438133875, "grad_norm": 0.37109375, "learning_rate": 1.2170385395537525e-07, "loss": 0.9992, "step": 1461 }, { "epoch": 2.9655172413793105, "grad_norm": 0.32421875, "learning_rate": 1.1494252873563219e-07, "loss": 1.0249, "step": 1462 }, { "epoch": 2.967545638945233, "grad_norm": 0.2578125, "learning_rate": 1.0818120351588913e-07, "loss": 1.0594, "step": 1463 }, { "epoch": 2.969574036511156, "grad_norm": 0.2734375, "learning_rate": 1.0141987829614605e-07, "loss": 1.0559, "step": 1464 }, { "epoch": 2.971602434077079, "grad_norm": 0.271484375, "learning_rate": 9.465855307640299e-08, "loss": 1.0337, "step": 1465 }, { "epoch": 2.973630831643002, "grad_norm": 0.263671875, "learning_rate": 8.789722785665991e-08, "loss": 1.0693, "step": 1466 }, { "epoch": 2.975659229208925, "grad_norm": 0.255859375, "learning_rate": 8.113590263691684e-08, "loss": 1.0364, "step": 1467 }, { "epoch": 2.977687626774848, "grad_norm": 0.251953125, "learning_rate": 7.437457741717378e-08, "loss": 1.0163, "step": 1468 }, { "epoch": 2.979716024340771, "grad_norm": 0.265625, "learning_rate": 6.76132521974307e-08, "loss": 0.9708, "step": 1469 }, { "epoch": 2.9817444219066935, "grad_norm": 0.251953125, "learning_rate": 6.085192697768762e-08, "loss": 1.016, "step": 1470 }, { "epoch": 2.9837728194726165, "grad_norm": 0.251953125, "learning_rate": 5.409060175794456e-08, "loss": 1.0521, "step": 1471 }, { "epoch": 2.9858012170385395, "grad_norm": 0.298828125, "learning_rate": 4.7329276538201494e-08, "loss": 1.0276, "step": 1472 }, { "epoch": 2.9878296146044625, "grad_norm": 0.26171875, "learning_rate": 4.056795131845842e-08, "loss": 1.0053, "step": 1473 }, { "epoch": 2.9898580121703855, "grad_norm": 0.27734375, "learning_rate": 3.380662609871535e-08, "loss": 0.9943, "step": 1474 }, { "epoch": 2.991886409736308, "grad_norm": 0.265625, "learning_rate": 2.704530087897228e-08, "loss": 1.0071, "step": 1475 }, { "epoch": 2.9939148073022315, "grad_norm": 0.255859375, "learning_rate": 2.028397565922921e-08, "loss": 1.0272, "step": 1476 }, { "epoch": 2.995943204868154, "grad_norm": 0.25390625, "learning_rate": 1.352265043948614e-08, "loss": 1.0101, "step": 1477 }, { "epoch": 2.997971602434077, "grad_norm": 0.251953125, "learning_rate": 6.76132521974307e-09, "loss": 1.0342, "step": 1478 }, { "epoch": 3.0, "grad_norm": 0.259765625, "learning_rate": 0.0, "loss": 1.0294, "step": 1479 } ], "logging_steps": 1.0, "max_steps": 1479, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.72557905413931e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }