{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 365, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0027397260273972603, "grad_norm": 2.7783915996551514, "learning_rate": 1e-05, "loss": 2.2987, "step": 1 }, { "epoch": 0.005479452054794521, "grad_norm": 2.718986988067627, "learning_rate": 9.972602739726028e-06, "loss": 2.2978, "step": 2 }, { "epoch": 0.00821917808219178, "grad_norm": 2.664280891418457, "learning_rate": 9.945205479452056e-06, "loss": 2.3454, "step": 3 }, { "epoch": 0.010958904109589041, "grad_norm": 2.576812267303467, "learning_rate": 9.917808219178083e-06, "loss": 2.34, "step": 4 }, { "epoch": 0.0136986301369863, "grad_norm": 2.425462007522583, "learning_rate": 9.89041095890411e-06, "loss": 2.2519, "step": 5 }, { "epoch": 0.01643835616438356, "grad_norm": 2.194149971008301, "learning_rate": 9.863013698630138e-06, "loss": 2.1865, "step": 6 }, { "epoch": 0.019178082191780823, "grad_norm": 2.0640907287597656, "learning_rate": 9.835616438356166e-06, "loss": 2.1853, "step": 7 }, { "epoch": 0.021917808219178082, "grad_norm": 1.944751262664795, "learning_rate": 9.808219178082193e-06, "loss": 2.1961, "step": 8 }, { "epoch": 0.024657534246575342, "grad_norm": 1.7866010665893555, "learning_rate": 9.78082191780822e-06, "loss": 2.2195, "step": 9 }, { "epoch": 0.0273972602739726, "grad_norm": 1.6839189529418945, "learning_rate": 9.753424657534248e-06, "loss": 2.1648, "step": 10 }, { "epoch": 0.030136986301369864, "grad_norm": 1.5045173168182373, "learning_rate": 9.726027397260275e-06, "loss": 2.1793, "step": 11 }, { "epoch": 0.03287671232876712, "grad_norm": 1.3616119623184204, "learning_rate": 9.698630136986303e-06, "loss": 2.0597, "step": 12 }, { "epoch": 0.03561643835616438, "grad_norm": 1.2444970607757568, "learning_rate": 9.67123287671233e-06, "loss": 1.9842, "step": 13 }, { "epoch": 0.038356164383561646, "grad_norm": 1.181518316268921, "learning_rate": 9.643835616438358e-06, "loss": 2.0456, "step": 14 }, { "epoch": 0.0410958904109589, "grad_norm": 1.077721357345581, "learning_rate": 9.616438356164385e-06, "loss": 1.9739, "step": 15 }, { "epoch": 0.043835616438356165, "grad_norm": 0.9532090425491333, "learning_rate": 9.589041095890411e-06, "loss": 1.9397, "step": 16 }, { "epoch": 0.04657534246575343, "grad_norm": 0.8984581828117371, "learning_rate": 9.561643835616438e-06, "loss": 1.9033, "step": 17 }, { "epoch": 0.049315068493150684, "grad_norm": 0.8515474796295166, "learning_rate": 9.534246575342466e-06, "loss": 1.9221, "step": 18 }, { "epoch": 0.052054794520547946, "grad_norm": 0.7932195663452148, "learning_rate": 9.506849315068493e-06, "loss": 1.9073, "step": 19 }, { "epoch": 0.0547945205479452, "grad_norm": 0.8313918113708496, "learning_rate": 9.47945205479452e-06, "loss": 1.9648, "step": 20 }, { "epoch": 0.057534246575342465, "grad_norm": 0.7174063920974731, "learning_rate": 9.452054794520548e-06, "loss": 1.8258, "step": 21 }, { "epoch": 0.06027397260273973, "grad_norm": 0.7417664527893066, "learning_rate": 9.424657534246576e-06, "loss": 1.8434, "step": 22 }, { "epoch": 0.06301369863013699, "grad_norm": 0.6711805462837219, "learning_rate": 9.397260273972603e-06, "loss": 1.8295, "step": 23 }, { "epoch": 0.06575342465753424, "grad_norm": 0.6542385816574097, "learning_rate": 9.36986301369863e-06, "loss": 1.7487, "step": 24 }, { "epoch": 0.0684931506849315, "grad_norm": 0.6540365219116211, "learning_rate": 9.342465753424658e-06, "loss": 1.8286, "step": 25 }, { "epoch": 0.07123287671232877, "grad_norm": 0.6243966817855835, "learning_rate": 9.315068493150685e-06, "loss": 1.7783, "step": 26 }, { "epoch": 0.07397260273972603, "grad_norm": 0.614590048789978, "learning_rate": 9.287671232876713e-06, "loss": 1.6739, "step": 27 }, { "epoch": 0.07671232876712329, "grad_norm": 0.6498063206672668, "learning_rate": 9.26027397260274e-06, "loss": 1.8752, "step": 28 }, { "epoch": 0.07945205479452055, "grad_norm": 0.5918038487434387, "learning_rate": 9.232876712328768e-06, "loss": 1.8233, "step": 29 }, { "epoch": 0.0821917808219178, "grad_norm": 0.6467412114143372, "learning_rate": 9.205479452054795e-06, "loss": 1.8496, "step": 30 }, { "epoch": 0.08493150684931507, "grad_norm": 0.5257925391197205, "learning_rate": 9.178082191780823e-06, "loss": 1.6984, "step": 31 }, { "epoch": 0.08767123287671233, "grad_norm": 0.5396108627319336, "learning_rate": 9.15068493150685e-06, "loss": 1.7699, "step": 32 }, { "epoch": 0.09041095890410959, "grad_norm": 0.5339013338088989, "learning_rate": 9.123287671232878e-06, "loss": 1.8231, "step": 33 }, { "epoch": 0.09315068493150686, "grad_norm": 0.5598514676094055, "learning_rate": 9.095890410958905e-06, "loss": 1.8174, "step": 34 }, { "epoch": 0.0958904109589041, "grad_norm": 0.512162983417511, "learning_rate": 9.068493150684932e-06, "loss": 1.7343, "step": 35 }, { "epoch": 0.09863013698630137, "grad_norm": 0.5187001824378967, "learning_rate": 9.04109589041096e-06, "loss": 1.787, "step": 36 }, { "epoch": 0.10136986301369863, "grad_norm": 0.46653178334236145, "learning_rate": 9.013698630136987e-06, "loss": 1.7003, "step": 37 }, { "epoch": 0.10410958904109589, "grad_norm": 0.5134342908859253, "learning_rate": 8.986301369863015e-06, "loss": 1.784, "step": 38 }, { "epoch": 0.10684931506849316, "grad_norm": 0.4487800896167755, "learning_rate": 8.958904109589042e-06, "loss": 1.6445, "step": 39 }, { "epoch": 0.1095890410958904, "grad_norm": 0.4516545534133911, "learning_rate": 8.93150684931507e-06, "loss": 1.6625, "step": 40 }, { "epoch": 0.11232876712328767, "grad_norm": 0.47532588243484497, "learning_rate": 8.904109589041097e-06, "loss": 1.6851, "step": 41 }, { "epoch": 0.11506849315068493, "grad_norm": 0.45430174469947815, "learning_rate": 8.876712328767125e-06, "loss": 1.7143, "step": 42 }, { "epoch": 0.1178082191780822, "grad_norm": 0.44149044156074524, "learning_rate": 8.849315068493152e-06, "loss": 1.6827, "step": 43 }, { "epoch": 0.12054794520547946, "grad_norm": 0.46218550205230713, "learning_rate": 8.82191780821918e-06, "loss": 1.6742, "step": 44 }, { "epoch": 0.1232876712328767, "grad_norm": 0.4688206911087036, "learning_rate": 8.794520547945207e-06, "loss": 1.7103, "step": 45 }, { "epoch": 0.12602739726027398, "grad_norm": 0.43398144841194153, "learning_rate": 8.767123287671233e-06, "loss": 1.6916, "step": 46 }, { "epoch": 0.12876712328767123, "grad_norm": 0.42101365327835083, "learning_rate": 8.73972602739726e-06, "loss": 1.6697, "step": 47 }, { "epoch": 0.13150684931506848, "grad_norm": 0.5128518342971802, "learning_rate": 8.712328767123288e-06, "loss": 1.7214, "step": 48 }, { "epoch": 0.13424657534246576, "grad_norm": 0.42609503865242004, "learning_rate": 8.684931506849315e-06, "loss": 1.6617, "step": 49 }, { "epoch": 0.136986301369863, "grad_norm": 0.41764312982559204, "learning_rate": 8.657534246575343e-06, "loss": 1.659, "step": 50 }, { "epoch": 0.13972602739726028, "grad_norm": 0.6258623003959656, "learning_rate": 8.63013698630137e-06, "loss": 1.7709, "step": 51 }, { "epoch": 0.14246575342465753, "grad_norm": 0.40542301535606384, "learning_rate": 8.602739726027397e-06, "loss": 1.6329, "step": 52 }, { "epoch": 0.14520547945205478, "grad_norm": 0.4480253756046295, "learning_rate": 8.575342465753425e-06, "loss": 1.5718, "step": 53 }, { "epoch": 0.14794520547945206, "grad_norm": 0.38973933458328247, "learning_rate": 8.547945205479454e-06, "loss": 1.6005, "step": 54 }, { "epoch": 0.1506849315068493, "grad_norm": 0.37738144397735596, "learning_rate": 8.520547945205481e-06, "loss": 1.5897, "step": 55 }, { "epoch": 0.15342465753424658, "grad_norm": 0.39026936888694763, "learning_rate": 8.493150684931507e-06, "loss": 1.5894, "step": 56 }, { "epoch": 0.15616438356164383, "grad_norm": 0.3867981433868408, "learning_rate": 8.465753424657535e-06, "loss": 1.6409, "step": 57 }, { "epoch": 0.1589041095890411, "grad_norm": 0.5395059585571289, "learning_rate": 8.438356164383562e-06, "loss": 1.6347, "step": 58 }, { "epoch": 0.16164383561643836, "grad_norm": 0.3915973901748657, "learning_rate": 8.41095890410959e-06, "loss": 1.6286, "step": 59 }, { "epoch": 0.1643835616438356, "grad_norm": 0.36517205834388733, "learning_rate": 8.383561643835617e-06, "loss": 1.5471, "step": 60 }, { "epoch": 0.16712328767123288, "grad_norm": 0.3715989589691162, "learning_rate": 8.356164383561644e-06, "loss": 1.5757, "step": 61 }, { "epoch": 0.16986301369863013, "grad_norm": 0.3596538305282593, "learning_rate": 8.328767123287672e-06, "loss": 1.5789, "step": 62 }, { "epoch": 0.1726027397260274, "grad_norm": 0.3686102330684662, "learning_rate": 8.3013698630137e-06, "loss": 1.5617, "step": 63 }, { "epoch": 0.17534246575342466, "grad_norm": 0.40277448296546936, "learning_rate": 8.273972602739727e-06, "loss": 1.6012, "step": 64 }, { "epoch": 0.1780821917808219, "grad_norm": 0.3889777362346649, "learning_rate": 8.246575342465754e-06, "loss": 1.5994, "step": 65 }, { "epoch": 0.18082191780821918, "grad_norm": 0.3453716039657593, "learning_rate": 8.219178082191782e-06, "loss": 1.5744, "step": 66 }, { "epoch": 0.18356164383561643, "grad_norm": 0.34804436564445496, "learning_rate": 8.19178082191781e-06, "loss": 1.5679, "step": 67 }, { "epoch": 0.1863013698630137, "grad_norm": 0.3630693256855011, "learning_rate": 8.164383561643837e-06, "loss": 1.5603, "step": 68 }, { "epoch": 0.18904109589041096, "grad_norm": 0.35089996457099915, "learning_rate": 8.136986301369864e-06, "loss": 1.5749, "step": 69 }, { "epoch": 0.1917808219178082, "grad_norm": 0.33551210165023804, "learning_rate": 8.109589041095892e-06, "loss": 1.5224, "step": 70 }, { "epoch": 0.19452054794520549, "grad_norm": 0.38519421219825745, "learning_rate": 8.082191780821919e-06, "loss": 1.6463, "step": 71 }, { "epoch": 0.19726027397260273, "grad_norm": 0.3503532409667969, "learning_rate": 8.054794520547946e-06, "loss": 1.5581, "step": 72 }, { "epoch": 0.2, "grad_norm": 0.3333548903465271, "learning_rate": 8.027397260273974e-06, "loss": 1.5602, "step": 73 }, { "epoch": 0.20273972602739726, "grad_norm": 0.3357717990875244, "learning_rate": 8.000000000000001e-06, "loss": 1.5468, "step": 74 }, { "epoch": 0.2054794520547945, "grad_norm": 0.33195677399635315, "learning_rate": 7.972602739726027e-06, "loss": 1.5571, "step": 75 }, { "epoch": 0.20821917808219179, "grad_norm": 0.37599629163742065, "learning_rate": 7.945205479452055e-06, "loss": 1.5548, "step": 76 }, { "epoch": 0.21095890410958903, "grad_norm": 0.33254948258399963, "learning_rate": 7.917808219178082e-06, "loss": 1.4969, "step": 77 }, { "epoch": 0.2136986301369863, "grad_norm": 0.31979507207870483, "learning_rate": 7.89041095890411e-06, "loss": 1.5152, "step": 78 }, { "epoch": 0.21643835616438356, "grad_norm": 0.3433253765106201, "learning_rate": 7.863013698630137e-06, "loss": 1.5019, "step": 79 }, { "epoch": 0.2191780821917808, "grad_norm": 0.34768474102020264, "learning_rate": 7.835616438356164e-06, "loss": 1.5514, "step": 80 }, { "epoch": 0.2219178082191781, "grad_norm": 0.32520607113838196, "learning_rate": 7.808219178082192e-06, "loss": 1.5058, "step": 81 }, { "epoch": 0.22465753424657534, "grad_norm": 0.3185594379901886, "learning_rate": 7.78082191780822e-06, "loss": 1.476, "step": 82 }, { "epoch": 0.2273972602739726, "grad_norm": 0.3094266951084137, "learning_rate": 7.753424657534248e-06, "loss": 1.4908, "step": 83 }, { "epoch": 0.23013698630136986, "grad_norm": 0.3079265356063843, "learning_rate": 7.726027397260276e-06, "loss": 1.4971, "step": 84 }, { "epoch": 0.2328767123287671, "grad_norm": 0.30687814950942993, "learning_rate": 7.698630136986302e-06, "loss": 1.4536, "step": 85 }, { "epoch": 0.2356164383561644, "grad_norm": 0.33721426129341125, "learning_rate": 7.671232876712329e-06, "loss": 1.5183, "step": 86 }, { "epoch": 0.23835616438356164, "grad_norm": 0.33759427070617676, "learning_rate": 7.643835616438356e-06, "loss": 1.5378, "step": 87 }, { "epoch": 0.2410958904109589, "grad_norm": 0.3554996848106384, "learning_rate": 7.616438356164384e-06, "loss": 1.5434, "step": 88 }, { "epoch": 0.24383561643835616, "grad_norm": 0.299603134393692, "learning_rate": 7.589041095890411e-06, "loss": 1.4825, "step": 89 }, { "epoch": 0.2465753424657534, "grad_norm": 0.3191254734992981, "learning_rate": 7.561643835616439e-06, "loss": 1.4953, "step": 90 }, { "epoch": 0.2493150684931507, "grad_norm": 0.32388779520988464, "learning_rate": 7.534246575342466e-06, "loss": 1.4557, "step": 91 }, { "epoch": 0.25205479452054796, "grad_norm": 0.30144524574279785, "learning_rate": 7.506849315068494e-06, "loss": 1.4757, "step": 92 }, { "epoch": 0.2547945205479452, "grad_norm": 0.3293607234954834, "learning_rate": 7.479452054794521e-06, "loss": 1.4837, "step": 93 }, { "epoch": 0.25753424657534246, "grad_norm": 0.31501051783561707, "learning_rate": 7.452054794520549e-06, "loss": 1.5025, "step": 94 }, { "epoch": 0.2602739726027397, "grad_norm": 0.2993467152118683, "learning_rate": 7.424657534246575e-06, "loss": 1.4807, "step": 95 }, { "epoch": 0.26301369863013696, "grad_norm": 0.35344570875167847, "learning_rate": 7.397260273972603e-06, "loss": 1.4771, "step": 96 }, { "epoch": 0.26575342465753427, "grad_norm": 0.38025563955307007, "learning_rate": 7.36986301369863e-06, "loss": 1.4644, "step": 97 }, { "epoch": 0.2684931506849315, "grad_norm": 0.3108437657356262, "learning_rate": 7.342465753424658e-06, "loss": 1.4526, "step": 98 }, { "epoch": 0.27123287671232876, "grad_norm": 0.3841199278831482, "learning_rate": 7.315068493150685e-06, "loss": 1.4994, "step": 99 }, { "epoch": 0.273972602739726, "grad_norm": 0.3451944589614868, "learning_rate": 7.287671232876713e-06, "loss": 1.4736, "step": 100 }, { "epoch": 0.27671232876712326, "grad_norm": 0.3295706808567047, "learning_rate": 7.260273972602741e-06, "loss": 1.5083, "step": 101 }, { "epoch": 0.27945205479452057, "grad_norm": 0.3512739837169647, "learning_rate": 7.232876712328768e-06, "loss": 1.4974, "step": 102 }, { "epoch": 0.2821917808219178, "grad_norm": 0.3645295202732086, "learning_rate": 7.205479452054796e-06, "loss": 1.5129, "step": 103 }, { "epoch": 0.28493150684931506, "grad_norm": 0.3223598599433899, "learning_rate": 7.178082191780823e-06, "loss": 1.4242, "step": 104 }, { "epoch": 0.2876712328767123, "grad_norm": 0.32120782136917114, "learning_rate": 7.15068493150685e-06, "loss": 1.4549, "step": 105 }, { "epoch": 0.29041095890410956, "grad_norm": 0.28700101375579834, "learning_rate": 7.123287671232877e-06, "loss": 1.3941, "step": 106 }, { "epoch": 0.29315068493150687, "grad_norm": 0.3217187523841858, "learning_rate": 7.095890410958905e-06, "loss": 1.4252, "step": 107 }, { "epoch": 0.2958904109589041, "grad_norm": 0.2990974187850952, "learning_rate": 7.068493150684932e-06, "loss": 1.4606, "step": 108 }, { "epoch": 0.29863013698630136, "grad_norm": 0.3032473921775818, "learning_rate": 7.0410958904109596e-06, "loss": 1.4928, "step": 109 }, { "epoch": 0.3013698630136986, "grad_norm": 0.3115643560886383, "learning_rate": 7.013698630136987e-06, "loss": 1.4471, "step": 110 }, { "epoch": 0.3041095890410959, "grad_norm": 0.29027995467185974, "learning_rate": 6.9863013698630145e-06, "loss": 1.3938, "step": 111 }, { "epoch": 0.30684931506849317, "grad_norm": 0.29573702812194824, "learning_rate": 6.958904109589042e-06, "loss": 1.4401, "step": 112 }, { "epoch": 0.3095890410958904, "grad_norm": 0.2828127443790436, "learning_rate": 6.931506849315069e-06, "loss": 1.4293, "step": 113 }, { "epoch": 0.31232876712328766, "grad_norm": 0.3215786814689636, "learning_rate": 6.904109589041097e-06, "loss": 1.4067, "step": 114 }, { "epoch": 0.3150684931506849, "grad_norm": 0.4936331510543823, "learning_rate": 6.876712328767123e-06, "loss": 1.42, "step": 115 }, { "epoch": 0.3178082191780822, "grad_norm": 0.30379927158355713, "learning_rate": 6.849315068493151e-06, "loss": 1.4699, "step": 116 }, { "epoch": 0.32054794520547947, "grad_norm": 0.27884089946746826, "learning_rate": 6.821917808219178e-06, "loss": 1.38, "step": 117 }, { "epoch": 0.3232876712328767, "grad_norm": 0.2906676232814789, "learning_rate": 6.794520547945206e-06, "loss": 1.4266, "step": 118 }, { "epoch": 0.32602739726027397, "grad_norm": 0.2961498200893402, "learning_rate": 6.767123287671233e-06, "loss": 1.4595, "step": 119 }, { "epoch": 0.3287671232876712, "grad_norm": 0.3257698714733124, "learning_rate": 6.739726027397261e-06, "loss": 1.4722, "step": 120 }, { "epoch": 0.3315068493150685, "grad_norm": 0.29315242171287537, "learning_rate": 6.712328767123288e-06, "loss": 1.41, "step": 121 }, { "epoch": 0.33424657534246577, "grad_norm": 0.31047356128692627, "learning_rate": 6.684931506849316e-06, "loss": 1.4136, "step": 122 }, { "epoch": 0.336986301369863, "grad_norm": 0.2866804599761963, "learning_rate": 6.657534246575343e-06, "loss": 1.4161, "step": 123 }, { "epoch": 0.33972602739726027, "grad_norm": 0.3289198875427246, "learning_rate": 6.630136986301371e-06, "loss": 1.4462, "step": 124 }, { "epoch": 0.3424657534246575, "grad_norm": 0.27823013067245483, "learning_rate": 6.602739726027397e-06, "loss": 1.3959, "step": 125 }, { "epoch": 0.3452054794520548, "grad_norm": 0.4073407053947449, "learning_rate": 6.5753424657534245e-06, "loss": 1.4028, "step": 126 }, { "epoch": 0.34794520547945207, "grad_norm": 0.2827169597148895, "learning_rate": 6.547945205479452e-06, "loss": 1.4206, "step": 127 }, { "epoch": 0.3506849315068493, "grad_norm": 0.3490234911441803, "learning_rate": 6.5205479452054794e-06, "loss": 1.4358, "step": 128 }, { "epoch": 0.35342465753424657, "grad_norm": 0.3117371201515198, "learning_rate": 6.493150684931508e-06, "loss": 1.3706, "step": 129 }, { "epoch": 0.3561643835616438, "grad_norm": 0.3357231914997101, "learning_rate": 6.465753424657535e-06, "loss": 1.4638, "step": 130 }, { "epoch": 0.3589041095890411, "grad_norm": 0.3087785542011261, "learning_rate": 6.438356164383563e-06, "loss": 1.4283, "step": 131 }, { "epoch": 0.36164383561643837, "grad_norm": 0.28009334206581116, "learning_rate": 6.41095890410959e-06, "loss": 1.4117, "step": 132 }, { "epoch": 0.3643835616438356, "grad_norm": 0.288065642118454, "learning_rate": 6.3835616438356175e-06, "loss": 1.4098, "step": 133 }, { "epoch": 0.36712328767123287, "grad_norm": 0.30682075023651123, "learning_rate": 6.356164383561645e-06, "loss": 1.4166, "step": 134 }, { "epoch": 0.3698630136986301, "grad_norm": 0.3811328113079071, "learning_rate": 6.328767123287672e-06, "loss": 1.421, "step": 135 }, { "epoch": 0.3726027397260274, "grad_norm": 0.2924054265022278, "learning_rate": 6.301369863013699e-06, "loss": 1.4008, "step": 136 }, { "epoch": 0.37534246575342467, "grad_norm": 0.3342791199684143, "learning_rate": 6.2739726027397265e-06, "loss": 1.4246, "step": 137 }, { "epoch": 0.3780821917808219, "grad_norm": 0.30969563126564026, "learning_rate": 6.246575342465754e-06, "loss": 1.4282, "step": 138 }, { "epoch": 0.38082191780821917, "grad_norm": 0.2826785445213318, "learning_rate": 6.219178082191781e-06, "loss": 1.4053, "step": 139 }, { "epoch": 0.3835616438356164, "grad_norm": 0.28810861706733704, "learning_rate": 6.191780821917809e-06, "loss": 1.3885, "step": 140 }, { "epoch": 0.3863013698630137, "grad_norm": 0.2800002992153168, "learning_rate": 6.164383561643836e-06, "loss": 1.4299, "step": 141 }, { "epoch": 0.38904109589041097, "grad_norm": 0.2789922058582306, "learning_rate": 6.136986301369864e-06, "loss": 1.3567, "step": 142 }, { "epoch": 0.3917808219178082, "grad_norm": 0.2804887592792511, "learning_rate": 6.109589041095891e-06, "loss": 1.3834, "step": 143 }, { "epoch": 0.39452054794520547, "grad_norm": 0.30507779121398926, "learning_rate": 6.082191780821919e-06, "loss": 1.4265, "step": 144 }, { "epoch": 0.3972602739726027, "grad_norm": 0.27467674016952515, "learning_rate": 6.054794520547945e-06, "loss": 1.3817, "step": 145 }, { "epoch": 0.4, "grad_norm": 0.2845015525817871, "learning_rate": 6.027397260273973e-06, "loss": 1.3997, "step": 146 }, { "epoch": 0.40273972602739727, "grad_norm": 0.27660831809043884, "learning_rate": 6e-06, "loss": 1.316, "step": 147 }, { "epoch": 0.4054794520547945, "grad_norm": 0.29394567012786865, "learning_rate": 5.972602739726028e-06, "loss": 1.3689, "step": 148 }, { "epoch": 0.40821917808219177, "grad_norm": 0.27973952889442444, "learning_rate": 5.945205479452055e-06, "loss": 1.3834, "step": 149 }, { "epoch": 0.410958904109589, "grad_norm": 0.2897646129131317, "learning_rate": 5.9178082191780825e-06, "loss": 1.4416, "step": 150 }, { "epoch": 0.4136986301369863, "grad_norm": 0.30043014883995056, "learning_rate": 5.89041095890411e-06, "loss": 1.4002, "step": 151 }, { "epoch": 0.41643835616438357, "grad_norm": 0.3167494833469391, "learning_rate": 5.863013698630137e-06, "loss": 1.3072, "step": 152 }, { "epoch": 0.4191780821917808, "grad_norm": 0.28382283449172974, "learning_rate": 5.835616438356166e-06, "loss": 1.403, "step": 153 }, { "epoch": 0.42191780821917807, "grad_norm": 0.2756654620170593, "learning_rate": 5.8082191780821915e-06, "loss": 1.3818, "step": 154 }, { "epoch": 0.4246575342465753, "grad_norm": 0.30410856008529663, "learning_rate": 5.780821917808219e-06, "loss": 1.4218, "step": 155 }, { "epoch": 0.4273972602739726, "grad_norm": 0.28195640444755554, "learning_rate": 5.753424657534246e-06, "loss": 1.3685, "step": 156 }, { "epoch": 0.4301369863013699, "grad_norm": 0.3396942615509033, "learning_rate": 5.726027397260274e-06, "loss": 1.4313, "step": 157 }, { "epoch": 0.4328767123287671, "grad_norm": 0.3063805103302002, "learning_rate": 5.698630136986302e-06, "loss": 1.3986, "step": 158 }, { "epoch": 0.43561643835616437, "grad_norm": 0.3454056680202484, "learning_rate": 5.6712328767123296e-06, "loss": 1.3891, "step": 159 }, { "epoch": 0.4383561643835616, "grad_norm": 0.2728925347328186, "learning_rate": 5.643835616438357e-06, "loss": 1.3899, "step": 160 }, { "epoch": 0.4410958904109589, "grad_norm": 0.285173624753952, "learning_rate": 5.6164383561643845e-06, "loss": 1.3999, "step": 161 }, { "epoch": 0.4438356164383562, "grad_norm": 0.28108641505241394, "learning_rate": 5.589041095890412e-06, "loss": 1.3795, "step": 162 }, { "epoch": 0.4465753424657534, "grad_norm": 0.2704264521598816, "learning_rate": 5.561643835616439e-06, "loss": 1.3111, "step": 163 }, { "epoch": 0.44931506849315067, "grad_norm": 0.2718411982059479, "learning_rate": 5.534246575342466e-06, "loss": 1.3621, "step": 164 }, { "epoch": 0.4520547945205479, "grad_norm": 0.32745271921157837, "learning_rate": 5.506849315068493e-06, "loss": 1.3626, "step": 165 }, { "epoch": 0.4547945205479452, "grad_norm": 0.27834439277648926, "learning_rate": 5.479452054794521e-06, "loss": 1.3572, "step": 166 }, { "epoch": 0.4575342465753425, "grad_norm": 0.41138356924057007, "learning_rate": 5.452054794520548e-06, "loss": 1.4013, "step": 167 }, { "epoch": 0.4602739726027397, "grad_norm": 0.2827068865299225, "learning_rate": 5.424657534246576e-06, "loss": 1.3834, "step": 168 }, { "epoch": 0.46301369863013697, "grad_norm": 0.2796699106693268, "learning_rate": 5.397260273972603e-06, "loss": 1.4098, "step": 169 }, { "epoch": 0.4657534246575342, "grad_norm": 0.2727374732494354, "learning_rate": 5.369863013698631e-06, "loss": 1.3143, "step": 170 }, { "epoch": 0.4684931506849315, "grad_norm": 0.34712469577789307, "learning_rate": 5.342465753424658e-06, "loss": 1.3511, "step": 171 }, { "epoch": 0.4712328767123288, "grad_norm": 0.33075839281082153, "learning_rate": 5.3150684931506856e-06, "loss": 1.4471, "step": 172 }, { "epoch": 0.473972602739726, "grad_norm": 0.3238897919654846, "learning_rate": 5.287671232876713e-06, "loss": 1.3537, "step": 173 }, { "epoch": 0.4767123287671233, "grad_norm": 0.3690909147262573, "learning_rate": 5.26027397260274e-06, "loss": 1.4236, "step": 174 }, { "epoch": 0.4794520547945205, "grad_norm": 0.2754713296890259, "learning_rate": 5.232876712328767e-06, "loss": 1.3982, "step": 175 }, { "epoch": 0.4821917808219178, "grad_norm": 0.3084002435207367, "learning_rate": 5.2054794520547945e-06, "loss": 1.389, "step": 176 }, { "epoch": 0.4849315068493151, "grad_norm": 0.2934390604496002, "learning_rate": 5.178082191780822e-06, "loss": 1.3787, "step": 177 }, { "epoch": 0.4876712328767123, "grad_norm": 0.2851421535015106, "learning_rate": 5.1506849315068494e-06, "loss": 1.3573, "step": 178 }, { "epoch": 0.4904109589041096, "grad_norm": 0.4750745892524719, "learning_rate": 5.123287671232877e-06, "loss": 1.293, "step": 179 }, { "epoch": 0.4931506849315068, "grad_norm": 0.3686518967151642, "learning_rate": 5.095890410958904e-06, "loss": 1.3124, "step": 180 }, { "epoch": 0.4958904109589041, "grad_norm": 0.28135809302330017, "learning_rate": 5.068493150684932e-06, "loss": 1.3263, "step": 181 }, { "epoch": 0.4986301369863014, "grad_norm": 0.29024508595466614, "learning_rate": 5.04109589041096e-06, "loss": 1.3742, "step": 182 }, { "epoch": 0.5013698630136987, "grad_norm": 0.29644104838371277, "learning_rate": 5.0136986301369875e-06, "loss": 1.2995, "step": 183 }, { "epoch": 0.5041095890410959, "grad_norm": 0.2953638434410095, "learning_rate": 4.986301369863014e-06, "loss": 1.3452, "step": 184 }, { "epoch": 0.5068493150684932, "grad_norm": 0.2855687141418457, "learning_rate": 4.958904109589042e-06, "loss": 1.3446, "step": 185 }, { "epoch": 0.5095890410958904, "grad_norm": 0.28867191076278687, "learning_rate": 4.931506849315069e-06, "loss": 1.368, "step": 186 }, { "epoch": 0.5123287671232877, "grad_norm": 0.2836305797100067, "learning_rate": 4.9041095890410965e-06, "loss": 1.3928, "step": 187 }, { "epoch": 0.5150684931506849, "grad_norm": 0.2619553208351135, "learning_rate": 4.876712328767124e-06, "loss": 1.3847, "step": 188 }, { "epoch": 0.5178082191780822, "grad_norm": 0.34195050597190857, "learning_rate": 4.849315068493151e-06, "loss": 1.3845, "step": 189 }, { "epoch": 0.5205479452054794, "grad_norm": 0.27626919746398926, "learning_rate": 4.821917808219179e-06, "loss": 1.3178, "step": 190 }, { "epoch": 0.5232876712328767, "grad_norm": 0.3042197823524475, "learning_rate": 4.7945205479452054e-06, "loss": 1.3941, "step": 191 }, { "epoch": 0.5260273972602739, "grad_norm": 0.2726993262767792, "learning_rate": 4.767123287671233e-06, "loss": 1.373, "step": 192 }, { "epoch": 0.5287671232876713, "grad_norm": 0.3469006419181824, "learning_rate": 4.73972602739726e-06, "loss": 1.3618, "step": 193 }, { "epoch": 0.5315068493150685, "grad_norm": 0.28888586163520813, "learning_rate": 4.712328767123288e-06, "loss": 1.3919, "step": 194 }, { "epoch": 0.5342465753424658, "grad_norm": 0.2778896689414978, "learning_rate": 4.684931506849315e-06, "loss": 1.327, "step": 195 }, { "epoch": 0.536986301369863, "grad_norm": 0.28161299228668213, "learning_rate": 4.657534246575343e-06, "loss": 1.3406, "step": 196 }, { "epoch": 0.5397260273972603, "grad_norm": 0.27385422587394714, "learning_rate": 4.63013698630137e-06, "loss": 1.3857, "step": 197 }, { "epoch": 0.5424657534246575, "grad_norm": 0.28164947032928467, "learning_rate": 4.602739726027398e-06, "loss": 1.3091, "step": 198 }, { "epoch": 0.5452054794520548, "grad_norm": 0.2788546085357666, "learning_rate": 4.575342465753425e-06, "loss": 1.3562, "step": 199 }, { "epoch": 0.547945205479452, "grad_norm": 0.3709013760089874, "learning_rate": 4.5479452054794525e-06, "loss": 1.3435, "step": 200 }, { "epoch": 0.5506849315068493, "grad_norm": 0.27193161845207214, "learning_rate": 4.52054794520548e-06, "loss": 1.3578, "step": 201 }, { "epoch": 0.5534246575342465, "grad_norm": 0.2905997931957245, "learning_rate": 4.493150684931507e-06, "loss": 1.356, "step": 202 }, { "epoch": 0.5561643835616439, "grad_norm": 0.2716245949268341, "learning_rate": 4.465753424657535e-06, "loss": 1.3242, "step": 203 }, { "epoch": 0.5589041095890411, "grad_norm": 0.26321840286254883, "learning_rate": 4.438356164383562e-06, "loss": 1.3605, "step": 204 }, { "epoch": 0.5616438356164384, "grad_norm": 0.2652720808982849, "learning_rate": 4.41095890410959e-06, "loss": 1.3201, "step": 205 }, { "epoch": 0.5643835616438356, "grad_norm": 0.27451732754707336, "learning_rate": 4.383561643835616e-06, "loss": 1.3452, "step": 206 }, { "epoch": 0.5671232876712329, "grad_norm": 0.4217926561832428, "learning_rate": 4.356164383561644e-06, "loss": 1.3188, "step": 207 }, { "epoch": 0.5698630136986301, "grad_norm": 0.27707886695861816, "learning_rate": 4.328767123287671e-06, "loss": 1.3295, "step": 208 }, { "epoch": 0.5726027397260274, "grad_norm": 0.4550190567970276, "learning_rate": 4.301369863013699e-06, "loss": 1.3321, "step": 209 }, { "epoch": 0.5753424657534246, "grad_norm": 0.27363699674606323, "learning_rate": 4.273972602739727e-06, "loss": 1.2994, "step": 210 }, { "epoch": 0.5780821917808219, "grad_norm": 0.2745828628540039, "learning_rate": 4.246575342465754e-06, "loss": 1.3781, "step": 211 }, { "epoch": 0.5808219178082191, "grad_norm": 0.3593873083591461, "learning_rate": 4.219178082191781e-06, "loss": 1.3414, "step": 212 }, { "epoch": 0.5835616438356165, "grad_norm": 0.34392043948173523, "learning_rate": 4.1917808219178085e-06, "loss": 1.3489, "step": 213 }, { "epoch": 0.5863013698630137, "grad_norm": 0.3030310273170471, "learning_rate": 4.164383561643836e-06, "loss": 1.3444, "step": 214 }, { "epoch": 0.589041095890411, "grad_norm": 0.28436222672462463, "learning_rate": 4.136986301369863e-06, "loss": 1.3575, "step": 215 }, { "epoch": 0.5917808219178082, "grad_norm": 0.2874223589897156, "learning_rate": 4.109589041095891e-06, "loss": 1.3235, "step": 216 }, { "epoch": 0.5945205479452055, "grad_norm": 0.31046220660209656, "learning_rate": 4.082191780821918e-06, "loss": 1.3662, "step": 217 }, { "epoch": 0.5972602739726027, "grad_norm": 0.2944674789905548, "learning_rate": 4.054794520547946e-06, "loss": 1.2767, "step": 218 }, { "epoch": 0.6, "grad_norm": 0.2816692292690277, "learning_rate": 4.027397260273973e-06, "loss": 1.2998, "step": 219 }, { "epoch": 0.6027397260273972, "grad_norm": 0.2940557599067688, "learning_rate": 4.000000000000001e-06, "loss": 1.281, "step": 220 }, { "epoch": 0.6054794520547945, "grad_norm": 0.28142455220222473, "learning_rate": 3.972602739726027e-06, "loss": 1.3325, "step": 221 }, { "epoch": 0.6082191780821918, "grad_norm": 0.4748757779598236, "learning_rate": 3.945205479452055e-06, "loss": 1.3002, "step": 222 }, { "epoch": 0.6109589041095891, "grad_norm": 0.4110530912876129, "learning_rate": 3.917808219178082e-06, "loss": 1.3101, "step": 223 }, { "epoch": 0.6136986301369863, "grad_norm": 0.2625599801540375, "learning_rate": 3.89041095890411e-06, "loss": 1.305, "step": 224 }, { "epoch": 0.6164383561643836, "grad_norm": 0.34154894948005676, "learning_rate": 3.863013698630138e-06, "loss": 1.3298, "step": 225 }, { "epoch": 0.6191780821917808, "grad_norm": 0.275426983833313, "learning_rate": 3.8356164383561645e-06, "loss": 1.3443, "step": 226 }, { "epoch": 0.6219178082191781, "grad_norm": 0.3054882287979126, "learning_rate": 3.808219178082192e-06, "loss": 1.3618, "step": 227 }, { "epoch": 0.6246575342465753, "grad_norm": 0.3048848807811737, "learning_rate": 3.7808219178082194e-06, "loss": 1.3432, "step": 228 }, { "epoch": 0.6273972602739726, "grad_norm": 0.2779141068458557, "learning_rate": 3.753424657534247e-06, "loss": 1.3257, "step": 229 }, { "epoch": 0.6301369863013698, "grad_norm": 0.284501314163208, "learning_rate": 3.7260273972602743e-06, "loss": 1.3403, "step": 230 }, { "epoch": 0.6328767123287671, "grad_norm": 0.281926691532135, "learning_rate": 3.6986301369863014e-06, "loss": 1.3668, "step": 231 }, { "epoch": 0.6356164383561644, "grad_norm": 0.275020033121109, "learning_rate": 3.671232876712329e-06, "loss": 1.3266, "step": 232 }, { "epoch": 0.6383561643835617, "grad_norm": 0.32706278562545776, "learning_rate": 3.6438356164383567e-06, "loss": 1.3712, "step": 233 }, { "epoch": 0.6410958904109589, "grad_norm": 0.2843818664550781, "learning_rate": 3.616438356164384e-06, "loss": 1.3243, "step": 234 }, { "epoch": 0.6438356164383562, "grad_norm": 0.27881118655204773, "learning_rate": 3.5890410958904116e-06, "loss": 1.3715, "step": 235 }, { "epoch": 0.6465753424657534, "grad_norm": 0.2754518985748291, "learning_rate": 3.5616438356164386e-06, "loss": 1.3043, "step": 236 }, { "epoch": 0.6493150684931507, "grad_norm": 0.27016115188598633, "learning_rate": 3.534246575342466e-06, "loss": 1.315, "step": 237 }, { "epoch": 0.6520547945205479, "grad_norm": 0.3224261403083801, "learning_rate": 3.5068493150684935e-06, "loss": 1.3013, "step": 238 }, { "epoch": 0.6547945205479452, "grad_norm": 0.2782925069332123, "learning_rate": 3.479452054794521e-06, "loss": 1.3196, "step": 239 }, { "epoch": 0.6575342465753424, "grad_norm": 0.28471842408180237, "learning_rate": 3.4520547945205484e-06, "loss": 1.3391, "step": 240 }, { "epoch": 0.6602739726027397, "grad_norm": 0.2670688331127167, "learning_rate": 3.4246575342465754e-06, "loss": 1.3379, "step": 241 }, { "epoch": 0.663013698630137, "grad_norm": 0.27165931463241577, "learning_rate": 3.397260273972603e-06, "loss": 1.2915, "step": 242 }, { "epoch": 0.6657534246575343, "grad_norm": 0.3044660687446594, "learning_rate": 3.3698630136986303e-06, "loss": 1.3531, "step": 243 }, { "epoch": 0.6684931506849315, "grad_norm": 0.26178857684135437, "learning_rate": 3.342465753424658e-06, "loss": 1.3062, "step": 244 }, { "epoch": 0.6712328767123288, "grad_norm": 0.2827412784099579, "learning_rate": 3.3150684931506857e-06, "loss": 1.2858, "step": 245 }, { "epoch": 0.673972602739726, "grad_norm": 0.4314991533756256, "learning_rate": 3.2876712328767123e-06, "loss": 1.286, "step": 246 }, { "epoch": 0.6767123287671233, "grad_norm": 0.2708996534347534, "learning_rate": 3.2602739726027397e-06, "loss": 1.3272, "step": 247 }, { "epoch": 0.6794520547945205, "grad_norm": 0.2826438248157501, "learning_rate": 3.2328767123287676e-06, "loss": 1.2997, "step": 248 }, { "epoch": 0.6821917808219178, "grad_norm": 0.30390018224716187, "learning_rate": 3.205479452054795e-06, "loss": 1.2835, "step": 249 }, { "epoch": 0.684931506849315, "grad_norm": 0.2850046753883362, "learning_rate": 3.1780821917808225e-06, "loss": 1.3371, "step": 250 }, { "epoch": 0.6876712328767123, "grad_norm": 0.27475711703300476, "learning_rate": 3.1506849315068495e-06, "loss": 1.3285, "step": 251 }, { "epoch": 0.6904109589041096, "grad_norm": 0.4499518871307373, "learning_rate": 3.123287671232877e-06, "loss": 1.3769, "step": 252 }, { "epoch": 0.6931506849315069, "grad_norm": 0.3102664649486542, "learning_rate": 3.0958904109589044e-06, "loss": 1.3242, "step": 253 }, { "epoch": 0.6958904109589041, "grad_norm": 0.49726834893226624, "learning_rate": 3.068493150684932e-06, "loss": 1.4027, "step": 254 }, { "epoch": 0.6986301369863014, "grad_norm": 0.4082922637462616, "learning_rate": 3.0410958904109593e-06, "loss": 1.3237, "step": 255 }, { "epoch": 0.7013698630136986, "grad_norm": 0.2956920564174652, "learning_rate": 3.0136986301369864e-06, "loss": 1.2286, "step": 256 }, { "epoch": 0.7041095890410959, "grad_norm": 0.3096351623535156, "learning_rate": 2.986301369863014e-06, "loss": 1.3682, "step": 257 }, { "epoch": 0.7068493150684931, "grad_norm": 0.29533058404922485, "learning_rate": 2.9589041095890413e-06, "loss": 1.2705, "step": 258 }, { "epoch": 0.7095890410958904, "grad_norm": 0.2817760705947876, "learning_rate": 2.9315068493150687e-06, "loss": 1.2882, "step": 259 }, { "epoch": 0.7123287671232876, "grad_norm": 0.27596086263656616, "learning_rate": 2.9041095890410957e-06, "loss": 1.3368, "step": 260 }, { "epoch": 0.7150684931506849, "grad_norm": 0.3040952682495117, "learning_rate": 2.876712328767123e-06, "loss": 1.3595, "step": 261 }, { "epoch": 0.7178082191780822, "grad_norm": 0.43870100378990173, "learning_rate": 2.849315068493151e-06, "loss": 1.311, "step": 262 }, { "epoch": 0.7205479452054795, "grad_norm": 0.2656543552875519, "learning_rate": 2.8219178082191785e-06, "loss": 1.3112, "step": 263 }, { "epoch": 0.7232876712328767, "grad_norm": 0.27828529477119446, "learning_rate": 2.794520547945206e-06, "loss": 1.3254, "step": 264 }, { "epoch": 0.726027397260274, "grad_norm": 0.2784496247768402, "learning_rate": 2.767123287671233e-06, "loss": 1.308, "step": 265 }, { "epoch": 0.7287671232876712, "grad_norm": 0.2659035623073578, "learning_rate": 2.7397260273972604e-06, "loss": 1.3307, "step": 266 }, { "epoch": 0.7315068493150685, "grad_norm": 0.27120259404182434, "learning_rate": 2.712328767123288e-06, "loss": 1.2912, "step": 267 }, { "epoch": 0.7342465753424657, "grad_norm": 0.2604619860649109, "learning_rate": 2.6849315068493153e-06, "loss": 1.292, "step": 268 }, { "epoch": 0.736986301369863, "grad_norm": 0.3721306324005127, "learning_rate": 2.6575342465753428e-06, "loss": 1.2895, "step": 269 }, { "epoch": 0.7397260273972602, "grad_norm": 0.2811748683452606, "learning_rate": 2.63013698630137e-06, "loss": 1.268, "step": 270 }, { "epoch": 0.7424657534246575, "grad_norm": 0.2764860689640045, "learning_rate": 2.6027397260273973e-06, "loss": 1.304, "step": 271 }, { "epoch": 0.7452054794520548, "grad_norm": 0.2695830464363098, "learning_rate": 2.5753424657534247e-06, "loss": 1.3259, "step": 272 }, { "epoch": 0.7479452054794521, "grad_norm": 0.41116493940353394, "learning_rate": 2.547945205479452e-06, "loss": 1.2366, "step": 273 }, { "epoch": 0.7506849315068493, "grad_norm": 0.27686795592308044, "learning_rate": 2.52054794520548e-06, "loss": 1.3407, "step": 274 }, { "epoch": 0.7534246575342466, "grad_norm": 0.2602274715900421, "learning_rate": 2.493150684931507e-06, "loss": 1.2867, "step": 275 }, { "epoch": 0.7561643835616438, "grad_norm": 0.40890637040138245, "learning_rate": 2.4657534246575345e-06, "loss": 1.321, "step": 276 }, { "epoch": 0.7589041095890411, "grad_norm": 0.26742494106292725, "learning_rate": 2.438356164383562e-06, "loss": 1.3096, "step": 277 }, { "epoch": 0.7616438356164383, "grad_norm": 0.2657965421676636, "learning_rate": 2.4109589041095894e-06, "loss": 1.3279, "step": 278 }, { "epoch": 0.7643835616438356, "grad_norm": 0.3065105974674225, "learning_rate": 2.3835616438356164e-06, "loss": 1.3353, "step": 279 }, { "epoch": 0.7671232876712328, "grad_norm": 0.2753995656967163, "learning_rate": 2.356164383561644e-06, "loss": 1.2974, "step": 280 }, { "epoch": 0.7698630136986301, "grad_norm": 0.27016234397888184, "learning_rate": 2.3287671232876713e-06, "loss": 1.3335, "step": 281 }, { "epoch": 0.7726027397260274, "grad_norm": 0.30244848132133484, "learning_rate": 2.301369863013699e-06, "loss": 1.3243, "step": 282 }, { "epoch": 0.7753424657534247, "grad_norm": 0.3118477463722229, "learning_rate": 2.2739726027397262e-06, "loss": 1.3193, "step": 283 }, { "epoch": 0.7780821917808219, "grad_norm": 0.2658824920654297, "learning_rate": 2.2465753424657537e-06, "loss": 1.2783, "step": 284 }, { "epoch": 0.7808219178082192, "grad_norm": 0.4244176149368286, "learning_rate": 2.219178082191781e-06, "loss": 1.328, "step": 285 }, { "epoch": 0.7835616438356164, "grad_norm": 0.3017803728580475, "learning_rate": 2.191780821917808e-06, "loss": 1.2953, "step": 286 }, { "epoch": 0.7863013698630137, "grad_norm": 0.314688503742218, "learning_rate": 2.1643835616438356e-06, "loss": 1.2982, "step": 287 }, { "epoch": 0.7890410958904109, "grad_norm": 0.28380101919174194, "learning_rate": 2.1369863013698635e-06, "loss": 1.3065, "step": 288 }, { "epoch": 0.7917808219178082, "grad_norm": 0.2954842746257782, "learning_rate": 2.1095890410958905e-06, "loss": 1.3192, "step": 289 }, { "epoch": 0.7945205479452054, "grad_norm": 0.32515281438827515, "learning_rate": 2.082191780821918e-06, "loss": 1.3145, "step": 290 }, { "epoch": 0.7972602739726027, "grad_norm": 0.265230655670166, "learning_rate": 2.0547945205479454e-06, "loss": 1.2832, "step": 291 }, { "epoch": 0.8, "grad_norm": 0.400721937417984, "learning_rate": 2.027397260273973e-06, "loss": 1.3719, "step": 292 }, { "epoch": 0.8027397260273973, "grad_norm": 0.5060768127441406, "learning_rate": 2.0000000000000003e-06, "loss": 1.35, "step": 293 }, { "epoch": 0.8054794520547945, "grad_norm": 0.3433339595794678, "learning_rate": 1.9726027397260274e-06, "loss": 1.2966, "step": 294 }, { "epoch": 0.8082191780821918, "grad_norm": 0.3028399348258972, "learning_rate": 1.945205479452055e-06, "loss": 1.2796, "step": 295 }, { "epoch": 0.810958904109589, "grad_norm": 0.2953545153141022, "learning_rate": 1.9178082191780823e-06, "loss": 1.302, "step": 296 }, { "epoch": 0.8136986301369863, "grad_norm": 0.4774445593357086, "learning_rate": 1.8904109589041097e-06, "loss": 1.3595, "step": 297 }, { "epoch": 0.8164383561643835, "grad_norm": 0.3728451728820801, "learning_rate": 1.8630136986301372e-06, "loss": 1.2984, "step": 298 }, { "epoch": 0.8191780821917808, "grad_norm": 0.3250785768032074, "learning_rate": 1.8356164383561644e-06, "loss": 1.2306, "step": 299 }, { "epoch": 0.821917808219178, "grad_norm": 0.32280248403549194, "learning_rate": 1.808219178082192e-06, "loss": 1.3355, "step": 300 }, { "epoch": 0.8246575342465754, "grad_norm": 0.2740541994571686, "learning_rate": 1.7808219178082193e-06, "loss": 1.3492, "step": 301 }, { "epoch": 0.8273972602739726, "grad_norm": 0.28502756357192993, "learning_rate": 1.7534246575342468e-06, "loss": 1.3216, "step": 302 }, { "epoch": 0.8301369863013699, "grad_norm": 0.3380693197250366, "learning_rate": 1.7260273972602742e-06, "loss": 1.3832, "step": 303 }, { "epoch": 0.8328767123287671, "grad_norm": 0.5040755867958069, "learning_rate": 1.6986301369863014e-06, "loss": 1.3588, "step": 304 }, { "epoch": 0.8356164383561644, "grad_norm": 0.27480220794677734, "learning_rate": 1.671232876712329e-06, "loss": 1.3333, "step": 305 }, { "epoch": 0.8383561643835616, "grad_norm": 0.2973027229309082, "learning_rate": 1.6438356164383561e-06, "loss": 1.2915, "step": 306 }, { "epoch": 0.8410958904109589, "grad_norm": 0.3050600588321686, "learning_rate": 1.6164383561643838e-06, "loss": 1.2481, "step": 307 }, { "epoch": 0.8438356164383561, "grad_norm": 0.25979846715927124, "learning_rate": 1.5890410958904112e-06, "loss": 1.2818, "step": 308 }, { "epoch": 0.8465753424657534, "grad_norm": 0.44944027066230774, "learning_rate": 1.5616438356164385e-06, "loss": 1.3942, "step": 309 }, { "epoch": 0.8493150684931506, "grad_norm": 0.32843637466430664, "learning_rate": 1.534246575342466e-06, "loss": 1.2982, "step": 310 }, { "epoch": 0.852054794520548, "grad_norm": 0.37917110323905945, "learning_rate": 1.5068493150684932e-06, "loss": 1.3055, "step": 311 }, { "epoch": 0.8547945205479452, "grad_norm": 0.27931544184684753, "learning_rate": 1.4794520547945206e-06, "loss": 1.2723, "step": 312 }, { "epoch": 0.8575342465753425, "grad_norm": 0.27145153284072876, "learning_rate": 1.4520547945205479e-06, "loss": 1.2889, "step": 313 }, { "epoch": 0.8602739726027397, "grad_norm": 0.4370143711566925, "learning_rate": 1.4246575342465755e-06, "loss": 1.3094, "step": 314 }, { "epoch": 0.863013698630137, "grad_norm": 0.2772330045700073, "learning_rate": 1.397260273972603e-06, "loss": 1.2905, "step": 315 }, { "epoch": 0.8657534246575342, "grad_norm": 0.25955408811569214, "learning_rate": 1.3698630136986302e-06, "loss": 1.3027, "step": 316 }, { "epoch": 0.8684931506849315, "grad_norm": 0.2769858241081238, "learning_rate": 1.3424657534246577e-06, "loss": 1.304, "step": 317 }, { "epoch": 0.8712328767123287, "grad_norm": 0.2773462235927582, "learning_rate": 1.315068493150685e-06, "loss": 1.2832, "step": 318 }, { "epoch": 0.873972602739726, "grad_norm": 0.2852456271648407, "learning_rate": 1.2876712328767124e-06, "loss": 1.3438, "step": 319 }, { "epoch": 0.8767123287671232, "grad_norm": 0.26769939064979553, "learning_rate": 1.26027397260274e-06, "loss": 1.3349, "step": 320 }, { "epoch": 0.8794520547945206, "grad_norm": 0.3140236437320709, "learning_rate": 1.2328767123287673e-06, "loss": 1.3362, "step": 321 }, { "epoch": 0.8821917808219178, "grad_norm": 0.27796709537506104, "learning_rate": 1.2054794520547947e-06, "loss": 1.3177, "step": 322 }, { "epoch": 0.8849315068493151, "grad_norm": 0.2727699279785156, "learning_rate": 1.178082191780822e-06, "loss": 1.3348, "step": 323 }, { "epoch": 0.8876712328767123, "grad_norm": 0.27097025513648987, "learning_rate": 1.1506849315068494e-06, "loss": 1.268, "step": 324 }, { "epoch": 0.8904109589041096, "grad_norm": 0.2799121141433716, "learning_rate": 1.1232876712328769e-06, "loss": 1.2653, "step": 325 }, { "epoch": 0.8931506849315068, "grad_norm": 0.3026827573776245, "learning_rate": 1.095890410958904e-06, "loss": 1.3472, "step": 326 }, { "epoch": 0.8958904109589041, "grad_norm": 0.4047977030277252, "learning_rate": 1.0684931506849318e-06, "loss": 1.2996, "step": 327 }, { "epoch": 0.8986301369863013, "grad_norm": 0.28205302357673645, "learning_rate": 1.041095890410959e-06, "loss": 1.3253, "step": 328 }, { "epoch": 0.9013698630136986, "grad_norm": 0.26282304525375366, "learning_rate": 1.0136986301369864e-06, "loss": 1.2757, "step": 329 }, { "epoch": 0.9041095890410958, "grad_norm": 0.32221752405166626, "learning_rate": 9.863013698630137e-07, "loss": 1.2896, "step": 330 }, { "epoch": 0.9068493150684932, "grad_norm": 0.40539026260375977, "learning_rate": 9.589041095890411e-07, "loss": 1.32, "step": 331 }, { "epoch": 0.9095890410958904, "grad_norm": 0.2842485010623932, "learning_rate": 9.315068493150686e-07, "loss": 1.3229, "step": 332 }, { "epoch": 0.9123287671232877, "grad_norm": 0.3267049789428711, "learning_rate": 9.04109589041096e-07, "loss": 1.3252, "step": 333 }, { "epoch": 0.915068493150685, "grad_norm": 0.2732202410697937, "learning_rate": 8.767123287671234e-07, "loss": 1.3405, "step": 334 }, { "epoch": 0.9178082191780822, "grad_norm": 0.2920434772968292, "learning_rate": 8.493150684931507e-07, "loss": 1.3398, "step": 335 }, { "epoch": 0.9205479452054794, "grad_norm": 0.28654786944389343, "learning_rate": 8.219178082191781e-07, "loss": 1.2757, "step": 336 }, { "epoch": 0.9232876712328767, "grad_norm": 0.3055959641933441, "learning_rate": 7.945205479452056e-07, "loss": 1.3166, "step": 337 }, { "epoch": 0.9260273972602739, "grad_norm": 0.3623715043067932, "learning_rate": 7.67123287671233e-07, "loss": 1.3095, "step": 338 }, { "epoch": 0.9287671232876712, "grad_norm": 0.2792491018772125, "learning_rate": 7.397260273972603e-07, "loss": 1.2843, "step": 339 }, { "epoch": 0.9315068493150684, "grad_norm": 0.27143144607543945, "learning_rate": 7.123287671232878e-07, "loss": 1.3463, "step": 340 }, { "epoch": 0.9342465753424658, "grad_norm": 0.27702170610427856, "learning_rate": 6.849315068493151e-07, "loss": 1.2918, "step": 341 }, { "epoch": 0.936986301369863, "grad_norm": 0.2738886773586273, "learning_rate": 6.575342465753425e-07, "loss": 1.3311, "step": 342 }, { "epoch": 0.9397260273972603, "grad_norm": 0.3447202742099762, "learning_rate": 6.3013698630137e-07, "loss": 1.2673, "step": 343 }, { "epoch": 0.9424657534246575, "grad_norm": 0.2835444211959839, "learning_rate": 6.027397260273974e-07, "loss": 1.3422, "step": 344 }, { "epoch": 0.9452054794520548, "grad_norm": 0.305956095457077, "learning_rate": 5.753424657534247e-07, "loss": 1.3449, "step": 345 }, { "epoch": 0.947945205479452, "grad_norm": 0.2742237448692322, "learning_rate": 5.47945205479452e-07, "loss": 1.3003, "step": 346 }, { "epoch": 0.9506849315068493, "grad_norm": 0.3234816789627075, "learning_rate": 5.205479452054795e-07, "loss": 1.3312, "step": 347 }, { "epoch": 0.9534246575342465, "grad_norm": 0.2818801999092102, "learning_rate": 4.931506849315068e-07, "loss": 1.3441, "step": 348 }, { "epoch": 0.9561643835616438, "grad_norm": 0.27905288338661194, "learning_rate": 4.657534246575343e-07, "loss": 1.3021, "step": 349 }, { "epoch": 0.958904109589041, "grad_norm": 0.2791009247303009, "learning_rate": 4.383561643835617e-07, "loss": 1.2948, "step": 350 }, { "epoch": 0.9616438356164384, "grad_norm": 0.26290085911750793, "learning_rate": 4.1095890410958903e-07, "loss": 1.3217, "step": 351 }, { "epoch": 0.9643835616438357, "grad_norm": 0.2941203713417053, "learning_rate": 3.835616438356165e-07, "loss": 1.2635, "step": 352 }, { "epoch": 0.9671232876712329, "grad_norm": 0.2669525146484375, "learning_rate": 3.561643835616439e-07, "loss": 1.2681, "step": 353 }, { "epoch": 0.9698630136986301, "grad_norm": 0.27967914938926697, "learning_rate": 3.2876712328767123e-07, "loss": 1.2566, "step": 354 }, { "epoch": 0.9726027397260274, "grad_norm": 0.29969513416290283, "learning_rate": 3.013698630136987e-07, "loss": 1.3225, "step": 355 }, { "epoch": 0.9753424657534246, "grad_norm": 0.27984827756881714, "learning_rate": 2.73972602739726e-07, "loss": 1.3285, "step": 356 }, { "epoch": 0.9780821917808219, "grad_norm": 0.30030182003974915, "learning_rate": 2.465753424657534e-07, "loss": 1.3041, "step": 357 }, { "epoch": 0.9808219178082191, "grad_norm": 0.30448976159095764, "learning_rate": 2.1917808219178084e-07, "loss": 1.3442, "step": 358 }, { "epoch": 0.9835616438356164, "grad_norm": 0.3098085820674896, "learning_rate": 1.9178082191780824e-07, "loss": 1.3569, "step": 359 }, { "epoch": 0.9863013698630136, "grad_norm": 0.30109989643096924, "learning_rate": 1.6438356164383561e-07, "loss": 1.289, "step": 360 }, { "epoch": 0.989041095890411, "grad_norm": 0.29150623083114624, "learning_rate": 1.36986301369863e-07, "loss": 1.2771, "step": 361 }, { "epoch": 0.9917808219178083, "grad_norm": 0.28063488006591797, "learning_rate": 1.0958904109589042e-07, "loss": 1.3287, "step": 362 }, { "epoch": 0.9945205479452055, "grad_norm": 0.2611445486545563, "learning_rate": 8.219178082191781e-08, "loss": 1.2621, "step": 363 }, { "epoch": 0.9972602739726028, "grad_norm": 0.30826273560523987, "learning_rate": 5.479452054794521e-08, "loss": 1.3666, "step": 364 }, { "epoch": 1.0, "grad_norm": 0.28417009115219116, "learning_rate": 2.7397260273972606e-08, "loss": 1.3091, "step": 365 } ], "logging_steps": 1.0, "max_steps": 365, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4449515620663296e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }