{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000847457627118644, "grad_norm": 4.541811466217041, "learning_rate": 4.998587570621469e-05, "loss": 4.6204, "step": 1 }, { "epoch": 0.001694915254237288, "grad_norm": 2.0906004905700684, "learning_rate": 4.997175141242938e-05, "loss": 4.1783, "step": 2 }, { "epoch": 0.002542372881355932, "grad_norm": 1.9184880256652832, "learning_rate": 4.9957627118644066e-05, "loss": 4.1249, "step": 3 }, { "epoch": 0.003389830508474576, "grad_norm": 1.4485046863555908, "learning_rate": 4.994350282485876e-05, "loss": 3.9969, "step": 4 }, { "epoch": 0.00423728813559322, "grad_norm": 1.5436034202575684, "learning_rate": 4.992937853107345e-05, "loss": 4.022, "step": 5 }, { "epoch": 0.005084745762711864, "grad_norm": 1.273412823677063, "learning_rate": 4.991525423728814e-05, "loss": 3.8472, "step": 6 }, { "epoch": 0.005932203389830509, "grad_norm": 1.2345166206359863, "learning_rate": 4.9901129943502826e-05, "loss": 3.6756, "step": 7 }, { "epoch": 0.006779661016949152, "grad_norm": 1.2115682363510132, "learning_rate": 4.9887005649717516e-05, "loss": 3.5986, "step": 8 }, { "epoch": 0.007627118644067797, "grad_norm": 1.0976130962371826, "learning_rate": 4.9872881355932206e-05, "loss": 3.6337, "step": 9 }, { "epoch": 0.00847457627118644, "grad_norm": 1.1125215291976929, "learning_rate": 4.9858757062146896e-05, "loss": 3.7499, "step": 10 }, { "epoch": 0.009322033898305085, "grad_norm": 2.036208152770996, "learning_rate": 4.984463276836158e-05, "loss": 3.803, "step": 11 }, { "epoch": 0.010169491525423728, "grad_norm": 1.2083711624145508, "learning_rate": 4.9830508474576276e-05, "loss": 3.8121, "step": 12 }, { "epoch": 0.011016949152542373, "grad_norm": 1.387211561203003, "learning_rate": 4.9816384180790966e-05, "loss": 3.7796, "step": 13 }, { "epoch": 0.011864406779661017, "grad_norm": 1.193283200263977, "learning_rate": 4.9802259887005656e-05, "loss": 3.6786, "step": 14 }, { "epoch": 0.012711864406779662, "grad_norm": 1.308258295059204, "learning_rate": 4.978813559322034e-05, "loss": 3.7025, "step": 15 }, { "epoch": 0.013559322033898305, "grad_norm": 1.2283083200454712, "learning_rate": 4.977401129943503e-05, "loss": 3.6833, "step": 16 }, { "epoch": 0.01440677966101695, "grad_norm": 1.0624451637268066, "learning_rate": 4.975988700564972e-05, "loss": 3.7537, "step": 17 }, { "epoch": 0.015254237288135594, "grad_norm": 1.1808664798736572, "learning_rate": 4.974576271186441e-05, "loss": 3.6077, "step": 18 }, { "epoch": 0.016101694915254237, "grad_norm": 1.0545834302902222, "learning_rate": 4.97316384180791e-05, "loss": 3.6229, "step": 19 }, { "epoch": 0.01694915254237288, "grad_norm": 1.1053547859191895, "learning_rate": 4.971751412429379e-05, "loss": 3.6968, "step": 20 }, { "epoch": 0.017796610169491526, "grad_norm": 1.1348943710327148, "learning_rate": 4.970338983050848e-05, "loss": 3.5653, "step": 21 }, { "epoch": 0.01864406779661017, "grad_norm": 1.1071490049362183, "learning_rate": 4.968926553672317e-05, "loss": 3.8216, "step": 22 }, { "epoch": 0.019491525423728815, "grad_norm": 1.0050177574157715, "learning_rate": 4.967514124293786e-05, "loss": 3.6065, "step": 23 }, { "epoch": 0.020338983050847456, "grad_norm": 1.2216566801071167, "learning_rate": 4.966101694915254e-05, "loss": 3.7189, "step": 24 }, { "epoch": 0.0211864406779661, "grad_norm": 1.062896966934204, "learning_rate": 4.964689265536723e-05, "loss": 3.672, "step": 25 }, { "epoch": 0.022033898305084745, "grad_norm": 1.2944012880325317, "learning_rate": 4.963276836158192e-05, "loss": 3.6715, "step": 26 }, { "epoch": 0.02288135593220339, "grad_norm": 1.076984167098999, "learning_rate": 4.961864406779661e-05, "loss": 3.7157, "step": 27 }, { "epoch": 0.023728813559322035, "grad_norm": 1.0346524715423584, "learning_rate": 4.96045197740113e-05, "loss": 3.4722, "step": 28 }, { "epoch": 0.02457627118644068, "grad_norm": 1.1048908233642578, "learning_rate": 4.959039548022599e-05, "loss": 3.6395, "step": 29 }, { "epoch": 0.025423728813559324, "grad_norm": 1.0698761940002441, "learning_rate": 4.957627118644068e-05, "loss": 3.6145, "step": 30 }, { "epoch": 0.026271186440677965, "grad_norm": 1.1007237434387207, "learning_rate": 4.956214689265537e-05, "loss": 3.5166, "step": 31 }, { "epoch": 0.02711864406779661, "grad_norm": 1.0510921478271484, "learning_rate": 4.954802259887006e-05, "loss": 3.4679, "step": 32 }, { "epoch": 0.027966101694915254, "grad_norm": 0.966667652130127, "learning_rate": 4.9533898305084745e-05, "loss": 3.4887, "step": 33 }, { "epoch": 0.0288135593220339, "grad_norm": 1.2693874835968018, "learning_rate": 4.9519774011299435e-05, "loss": 3.6128, "step": 34 }, { "epoch": 0.029661016949152543, "grad_norm": 0.9852153062820435, "learning_rate": 4.9505649717514125e-05, "loss": 3.723, "step": 35 }, { "epoch": 0.030508474576271188, "grad_norm": 1.13206946849823, "learning_rate": 4.9491525423728815e-05, "loss": 3.6117, "step": 36 }, { "epoch": 0.03135593220338983, "grad_norm": 0.8744503855705261, "learning_rate": 4.9477401129943505e-05, "loss": 3.5283, "step": 37 }, { "epoch": 0.03220338983050847, "grad_norm": 1.1233553886413574, "learning_rate": 4.9463276836158195e-05, "loss": 3.5572, "step": 38 }, { "epoch": 0.03305084745762712, "grad_norm": 1.0636460781097412, "learning_rate": 4.9449152542372885e-05, "loss": 3.6352, "step": 39 }, { "epoch": 0.03389830508474576, "grad_norm": 1.1380923986434937, "learning_rate": 4.9435028248587575e-05, "loss": 3.573, "step": 40 }, { "epoch": 0.03474576271186441, "grad_norm": 1.0083677768707275, "learning_rate": 4.942090395480226e-05, "loss": 3.4665, "step": 41 }, { "epoch": 0.03559322033898305, "grad_norm": 1.374564528465271, "learning_rate": 4.940677966101695e-05, "loss": 3.6156, "step": 42 }, { "epoch": 0.036440677966101696, "grad_norm": 1.0445719957351685, "learning_rate": 4.939265536723164e-05, "loss": 3.618, "step": 43 }, { "epoch": 0.03728813559322034, "grad_norm": 0.9282047748565674, "learning_rate": 4.9378531073446335e-05, "loss": 3.5182, "step": 44 }, { "epoch": 0.038135593220338986, "grad_norm": 1.0341825485229492, "learning_rate": 4.936440677966102e-05, "loss": 3.6193, "step": 45 }, { "epoch": 0.03898305084745763, "grad_norm": 1.0397495031356812, "learning_rate": 4.935028248587571e-05, "loss": 3.573, "step": 46 }, { "epoch": 0.03983050847457627, "grad_norm": 1.06525719165802, "learning_rate": 4.93361581920904e-05, "loss": 3.6207, "step": 47 }, { "epoch": 0.04067796610169491, "grad_norm": 1.061848759651184, "learning_rate": 4.932203389830509e-05, "loss": 3.5083, "step": 48 }, { "epoch": 0.04152542372881356, "grad_norm": 0.9978002309799194, "learning_rate": 4.930790960451978e-05, "loss": 3.4151, "step": 49 }, { "epoch": 0.0423728813559322, "grad_norm": 1.023680567741394, "learning_rate": 4.929378531073446e-05, "loss": 3.5786, "step": 50 }, { "epoch": 0.043220338983050846, "grad_norm": 1.1530768871307373, "learning_rate": 4.927966101694915e-05, "loss": 3.5118, "step": 51 }, { "epoch": 0.04406779661016949, "grad_norm": 1.0338066816329956, "learning_rate": 4.926553672316385e-05, "loss": 3.7083, "step": 52 }, { "epoch": 0.044915254237288135, "grad_norm": 1.0216397047042847, "learning_rate": 4.925141242937854e-05, "loss": 3.598, "step": 53 }, { "epoch": 0.04576271186440678, "grad_norm": 0.9304412007331848, "learning_rate": 4.923728813559322e-05, "loss": 3.5175, "step": 54 }, { "epoch": 0.046610169491525424, "grad_norm": 1.0519485473632812, "learning_rate": 4.922316384180791e-05, "loss": 3.5904, "step": 55 }, { "epoch": 0.04745762711864407, "grad_norm": 0.9387502670288086, "learning_rate": 4.92090395480226e-05, "loss": 3.6034, "step": 56 }, { "epoch": 0.048305084745762714, "grad_norm": 1.0415656566619873, "learning_rate": 4.919491525423729e-05, "loss": 3.627, "step": 57 }, { "epoch": 0.04915254237288136, "grad_norm": 1.013018012046814, "learning_rate": 4.9180790960451975e-05, "loss": 3.5533, "step": 58 }, { "epoch": 0.05, "grad_norm": 1.0008492469787598, "learning_rate": 4.9166666666666665e-05, "loss": 3.5489, "step": 59 }, { "epoch": 0.05084745762711865, "grad_norm": 1.020042896270752, "learning_rate": 4.915254237288136e-05, "loss": 3.5427, "step": 60 }, { "epoch": 0.051694915254237285, "grad_norm": 1.1085624694824219, "learning_rate": 4.913841807909605e-05, "loss": 3.4387, "step": 61 }, { "epoch": 0.05254237288135593, "grad_norm": 0.9071241021156311, "learning_rate": 4.9124293785310735e-05, "loss": 3.6151, "step": 62 }, { "epoch": 0.053389830508474574, "grad_norm": 0.971471905708313, "learning_rate": 4.9110169491525425e-05, "loss": 3.4926, "step": 63 }, { "epoch": 0.05423728813559322, "grad_norm": 1.0597212314605713, "learning_rate": 4.9096045197740115e-05, "loss": 3.6694, "step": 64 }, { "epoch": 0.05508474576271186, "grad_norm": 0.8976388573646545, "learning_rate": 4.9081920903954805e-05, "loss": 3.5233, "step": 65 }, { "epoch": 0.05593220338983051, "grad_norm": 0.9068442583084106, "learning_rate": 4.9067796610169495e-05, "loss": 3.5472, "step": 66 }, { "epoch": 0.05677966101694915, "grad_norm": 0.9367517232894897, "learning_rate": 4.905367231638418e-05, "loss": 3.5259, "step": 67 }, { "epoch": 0.0576271186440678, "grad_norm": 0.8741654753684998, "learning_rate": 4.9039548022598875e-05, "loss": 3.4881, "step": 68 }, { "epoch": 0.05847457627118644, "grad_norm": 0.8584069609642029, "learning_rate": 4.9025423728813565e-05, "loss": 3.4114, "step": 69 }, { "epoch": 0.059322033898305086, "grad_norm": 0.9734240174293518, "learning_rate": 4.9011299435028255e-05, "loss": 3.5721, "step": 70 }, { "epoch": 0.06016949152542373, "grad_norm": 0.9685958027839661, "learning_rate": 4.899717514124294e-05, "loss": 3.3634, "step": 71 }, { "epoch": 0.061016949152542375, "grad_norm": 1.123534083366394, "learning_rate": 4.898305084745763e-05, "loss": 3.4039, "step": 72 }, { "epoch": 0.06186440677966102, "grad_norm": 1.012391448020935, "learning_rate": 4.896892655367232e-05, "loss": 3.439, "step": 73 }, { "epoch": 0.06271186440677966, "grad_norm": 1.3435492515563965, "learning_rate": 4.895480225988701e-05, "loss": 3.4419, "step": 74 }, { "epoch": 0.0635593220338983, "grad_norm": 1.04447603225708, "learning_rate": 4.89406779661017e-05, "loss": 3.5461, "step": 75 }, { "epoch": 0.06440677966101695, "grad_norm": 0.9301888346672058, "learning_rate": 4.892655367231639e-05, "loss": 3.5858, "step": 76 }, { "epoch": 0.06525423728813559, "grad_norm": 0.9731817841529846, "learning_rate": 4.891242937853108e-05, "loss": 3.5402, "step": 77 }, { "epoch": 0.06610169491525424, "grad_norm": 0.8086643218994141, "learning_rate": 4.889830508474577e-05, "loss": 3.4813, "step": 78 }, { "epoch": 0.06694915254237288, "grad_norm": 1.0235638618469238, "learning_rate": 4.888418079096045e-05, "loss": 3.29, "step": 79 }, { "epoch": 0.06779661016949153, "grad_norm": 0.9478564858436584, "learning_rate": 4.887005649717514e-05, "loss": 3.5304, "step": 80 }, { "epoch": 0.06864406779661017, "grad_norm": 0.9608615040779114, "learning_rate": 4.885593220338983e-05, "loss": 3.5029, "step": 81 }, { "epoch": 0.06949152542372881, "grad_norm": 0.8642784357070923, "learning_rate": 4.884180790960452e-05, "loss": 3.4545, "step": 82 }, { "epoch": 0.07033898305084746, "grad_norm": 1.0307790040969849, "learning_rate": 4.882768361581921e-05, "loss": 3.4939, "step": 83 }, { "epoch": 0.0711864406779661, "grad_norm": 1.0859419107437134, "learning_rate": 4.88135593220339e-05, "loss": 3.4883, "step": 84 }, { "epoch": 0.07203389830508475, "grad_norm": 0.946661651134491, "learning_rate": 4.879943502824859e-05, "loss": 3.4903, "step": 85 }, { "epoch": 0.07288135593220339, "grad_norm": 1.1954424381256104, "learning_rate": 4.878531073446328e-05, "loss": 3.4469, "step": 86 }, { "epoch": 0.07372881355932204, "grad_norm": 1.3650797605514526, "learning_rate": 4.877118644067797e-05, "loss": 3.5309, "step": 87 }, { "epoch": 0.07457627118644068, "grad_norm": 0.9673838019371033, "learning_rate": 4.8757062146892654e-05, "loss": 3.455, "step": 88 }, { "epoch": 0.07542372881355933, "grad_norm": 0.9562941789627075, "learning_rate": 4.8742937853107344e-05, "loss": 3.4427, "step": 89 }, { "epoch": 0.07627118644067797, "grad_norm": 1.061118721961975, "learning_rate": 4.8728813559322034e-05, "loss": 3.5553, "step": 90 }, { "epoch": 0.07711864406779662, "grad_norm": 0.9655681848526001, "learning_rate": 4.871468926553673e-05, "loss": 3.3637, "step": 91 }, { "epoch": 0.07796610169491526, "grad_norm": 1.1086546182632446, "learning_rate": 4.8700564971751414e-05, "loss": 3.5719, "step": 92 }, { "epoch": 0.0788135593220339, "grad_norm": 1.1144481897354126, "learning_rate": 4.8686440677966104e-05, "loss": 3.4608, "step": 93 }, { "epoch": 0.07966101694915254, "grad_norm": 0.9629172682762146, "learning_rate": 4.8672316384180794e-05, "loss": 3.4206, "step": 94 }, { "epoch": 0.08050847457627118, "grad_norm": 0.9343329071998596, "learning_rate": 4.8658192090395484e-05, "loss": 3.4169, "step": 95 }, { "epoch": 0.08135593220338982, "grad_norm": 1.0698659420013428, "learning_rate": 4.8644067796610174e-05, "loss": 3.359, "step": 96 }, { "epoch": 0.08220338983050847, "grad_norm": 0.9566194415092468, "learning_rate": 4.862994350282486e-05, "loss": 3.5309, "step": 97 }, { "epoch": 0.08305084745762711, "grad_norm": 1.3343651294708252, "learning_rate": 4.861581920903955e-05, "loss": 3.4451, "step": 98 }, { "epoch": 0.08389830508474576, "grad_norm": 1.4111084938049316, "learning_rate": 4.8601694915254244e-05, "loss": 3.4957, "step": 99 }, { "epoch": 0.0847457627118644, "grad_norm": 1.1460424661636353, "learning_rate": 4.8587570621468934e-05, "loss": 3.3921, "step": 100 }, { "epoch": 0.08559322033898305, "grad_norm": 1.136348843574524, "learning_rate": 4.857344632768362e-05, "loss": 3.3772, "step": 101 }, { "epoch": 0.08644067796610169, "grad_norm": 1.1128734350204468, "learning_rate": 4.855932203389831e-05, "loss": 3.5162, "step": 102 }, { "epoch": 0.08728813559322034, "grad_norm": 1.1538329124450684, "learning_rate": 4.8545197740113e-05, "loss": 3.5819, "step": 103 }, { "epoch": 0.08813559322033898, "grad_norm": 1.1180751323699951, "learning_rate": 4.853107344632769e-05, "loss": 3.5386, "step": 104 }, { "epoch": 0.08898305084745763, "grad_norm": 0.8497837781906128, "learning_rate": 4.851694915254237e-05, "loss": 3.4281, "step": 105 }, { "epoch": 0.08983050847457627, "grad_norm": 1.0639275312423706, "learning_rate": 4.850282485875706e-05, "loss": 3.4381, "step": 106 }, { "epoch": 0.09067796610169492, "grad_norm": 0.8994787335395813, "learning_rate": 4.848870056497176e-05, "loss": 3.4649, "step": 107 }, { "epoch": 0.09152542372881356, "grad_norm": 0.9433245062828064, "learning_rate": 4.847457627118645e-05, "loss": 3.4298, "step": 108 }, { "epoch": 0.0923728813559322, "grad_norm": 1.1026298999786377, "learning_rate": 4.846045197740113e-05, "loss": 3.2809, "step": 109 }, { "epoch": 0.09322033898305085, "grad_norm": 0.9991617202758789, "learning_rate": 4.844632768361582e-05, "loss": 3.4374, "step": 110 }, { "epoch": 0.0940677966101695, "grad_norm": 1.0328609943389893, "learning_rate": 4.843220338983051e-05, "loss": 3.3927, "step": 111 }, { "epoch": 0.09491525423728814, "grad_norm": 0.962786853313446, "learning_rate": 4.84180790960452e-05, "loss": 3.4113, "step": 112 }, { "epoch": 0.09576271186440678, "grad_norm": 0.8685274720191956, "learning_rate": 4.840395480225989e-05, "loss": 3.4442, "step": 113 }, { "epoch": 0.09661016949152543, "grad_norm": 1.1098908185958862, "learning_rate": 4.8389830508474574e-05, "loss": 3.2912, "step": 114 }, { "epoch": 0.09745762711864407, "grad_norm": 1.128380298614502, "learning_rate": 4.837570621468927e-05, "loss": 3.4944, "step": 115 }, { "epoch": 0.09830508474576272, "grad_norm": 1.1417268514633179, "learning_rate": 4.836158192090396e-05, "loss": 3.4952, "step": 116 }, { "epoch": 0.09915254237288136, "grad_norm": 0.9682890772819519, "learning_rate": 4.834745762711865e-05, "loss": 3.4218, "step": 117 }, { "epoch": 0.1, "grad_norm": 1.0429868698120117, "learning_rate": 4.8333333333333334e-05, "loss": 3.4454, "step": 118 }, { "epoch": 0.10084745762711865, "grad_norm": 1.1318089962005615, "learning_rate": 4.8319209039548024e-05, "loss": 3.4676, "step": 119 }, { "epoch": 0.1016949152542373, "grad_norm": 0.9262925386428833, "learning_rate": 4.8305084745762714e-05, "loss": 3.4663, "step": 120 }, { "epoch": 0.10254237288135593, "grad_norm": 0.9783544540405273, "learning_rate": 4.8290960451977404e-05, "loss": 3.4829, "step": 121 }, { "epoch": 0.10338983050847457, "grad_norm": 0.8752434253692627, "learning_rate": 4.8276836158192094e-05, "loss": 3.4232, "step": 122 }, { "epoch": 0.10423728813559321, "grad_norm": 1.132392406463623, "learning_rate": 4.8262711864406784e-05, "loss": 3.3249, "step": 123 }, { "epoch": 0.10508474576271186, "grad_norm": 1.0966801643371582, "learning_rate": 4.8248587570621474e-05, "loss": 3.5047, "step": 124 }, { "epoch": 0.1059322033898305, "grad_norm": 0.9103270173072815, "learning_rate": 4.8234463276836164e-05, "loss": 3.5133, "step": 125 }, { "epoch": 0.10677966101694915, "grad_norm": 1.3025434017181396, "learning_rate": 4.822033898305085e-05, "loss": 3.4794, "step": 126 }, { "epoch": 0.10762711864406779, "grad_norm": 0.9362355470657349, "learning_rate": 4.820621468926554e-05, "loss": 3.4954, "step": 127 }, { "epoch": 0.10847457627118644, "grad_norm": 1.0637141466140747, "learning_rate": 4.819209039548023e-05, "loss": 3.4143, "step": 128 }, { "epoch": 0.10932203389830508, "grad_norm": 1.0837198495864868, "learning_rate": 4.817796610169492e-05, "loss": 3.3429, "step": 129 }, { "epoch": 0.11016949152542373, "grad_norm": 0.9153018593788147, "learning_rate": 4.816384180790961e-05, "loss": 3.3445, "step": 130 }, { "epoch": 0.11101694915254237, "grad_norm": 1.0642338991165161, "learning_rate": 4.81497175141243e-05, "loss": 3.4297, "step": 131 }, { "epoch": 0.11186440677966102, "grad_norm": 1.0573382377624512, "learning_rate": 4.813559322033899e-05, "loss": 3.5539, "step": 132 }, { "epoch": 0.11271186440677966, "grad_norm": 0.9512803554534912, "learning_rate": 4.812146892655368e-05, "loss": 3.398, "step": 133 }, { "epoch": 0.1135593220338983, "grad_norm": 0.9490872621536255, "learning_rate": 4.810734463276837e-05, "loss": 3.3706, "step": 134 }, { "epoch": 0.11440677966101695, "grad_norm": 1.0442562103271484, "learning_rate": 4.809322033898305e-05, "loss": 3.4986, "step": 135 }, { "epoch": 0.1152542372881356, "grad_norm": 0.7907192707061768, "learning_rate": 4.807909604519774e-05, "loss": 3.4127, "step": 136 }, { "epoch": 0.11610169491525424, "grad_norm": 1.0272952318191528, "learning_rate": 4.806497175141243e-05, "loss": 3.4172, "step": 137 }, { "epoch": 0.11694915254237288, "grad_norm": 1.170906662940979, "learning_rate": 4.805084745762712e-05, "loss": 3.4011, "step": 138 }, { "epoch": 0.11779661016949153, "grad_norm": 1.0225719213485718, "learning_rate": 4.803672316384181e-05, "loss": 3.4664, "step": 139 }, { "epoch": 0.11864406779661017, "grad_norm": 0.9054561853408813, "learning_rate": 4.80225988700565e-05, "loss": 3.3756, "step": 140 }, { "epoch": 0.11949152542372882, "grad_norm": 1.0109195709228516, "learning_rate": 4.800847457627119e-05, "loss": 3.506, "step": 141 }, { "epoch": 0.12033898305084746, "grad_norm": 0.9443098902702332, "learning_rate": 4.799435028248588e-05, "loss": 3.4363, "step": 142 }, { "epoch": 0.1211864406779661, "grad_norm": 0.9755582809448242, "learning_rate": 4.798022598870057e-05, "loss": 3.3118, "step": 143 }, { "epoch": 0.12203389830508475, "grad_norm": 0.9171461462974548, "learning_rate": 4.796610169491525e-05, "loss": 3.4458, "step": 144 }, { "epoch": 0.1228813559322034, "grad_norm": 1.032393217086792, "learning_rate": 4.795197740112994e-05, "loss": 3.4287, "step": 145 }, { "epoch": 0.12372881355932204, "grad_norm": 0.9760985970497131, "learning_rate": 4.793785310734463e-05, "loss": 3.4285, "step": 146 }, { "epoch": 0.12457627118644068, "grad_norm": 0.9190145134925842, "learning_rate": 4.792372881355933e-05, "loss": 3.2856, "step": 147 }, { "epoch": 0.12542372881355932, "grad_norm": 1.0193102359771729, "learning_rate": 4.790960451977401e-05, "loss": 3.4372, "step": 148 }, { "epoch": 0.12627118644067797, "grad_norm": 1.2727913856506348, "learning_rate": 4.78954802259887e-05, "loss": 3.3763, "step": 149 }, { "epoch": 0.1271186440677966, "grad_norm": 0.9420662522315979, "learning_rate": 4.788135593220339e-05, "loss": 3.4134, "step": 150 }, { "epoch": 0.12796610169491526, "grad_norm": 1.1431869268417358, "learning_rate": 4.786723163841808e-05, "loss": 3.343, "step": 151 }, { "epoch": 0.1288135593220339, "grad_norm": 1.1734342575073242, "learning_rate": 4.7853107344632766e-05, "loss": 3.4031, "step": 152 }, { "epoch": 0.12966101694915255, "grad_norm": 0.9540769457817078, "learning_rate": 4.7838983050847456e-05, "loss": 3.4251, "step": 153 }, { "epoch": 0.13050847457627118, "grad_norm": 0.9957358241081238, "learning_rate": 4.7824858757062146e-05, "loss": 3.2844, "step": 154 }, { "epoch": 0.13135593220338984, "grad_norm": 1.0116667747497559, "learning_rate": 4.781073446327684e-05, "loss": 3.3582, "step": 155 }, { "epoch": 0.13220338983050847, "grad_norm": 0.9623947143554688, "learning_rate": 4.7796610169491526e-05, "loss": 3.3734, "step": 156 }, { "epoch": 0.13305084745762713, "grad_norm": 1.0396393537521362, "learning_rate": 4.7782485875706216e-05, "loss": 3.3273, "step": 157 }, { "epoch": 0.13389830508474576, "grad_norm": 1.0981794595718384, "learning_rate": 4.7768361581920906e-05, "loss": 3.2942, "step": 158 }, { "epoch": 0.13474576271186442, "grad_norm": 0.9261422157287598, "learning_rate": 4.7754237288135596e-05, "loss": 3.3209, "step": 159 }, { "epoch": 0.13559322033898305, "grad_norm": 0.9253855347633362, "learning_rate": 4.7740112994350286e-05, "loss": 3.3043, "step": 160 }, { "epoch": 0.13644067796610168, "grad_norm": 1.132527232170105, "learning_rate": 4.772598870056497e-05, "loss": 3.5229, "step": 161 }, { "epoch": 0.13728813559322034, "grad_norm": 0.9555791020393372, "learning_rate": 4.7711864406779666e-05, "loss": 3.5308, "step": 162 }, { "epoch": 0.13813559322033897, "grad_norm": 0.8020421266555786, "learning_rate": 4.7697740112994356e-05, "loss": 3.4104, "step": 163 }, { "epoch": 0.13898305084745763, "grad_norm": 1.0284488201141357, "learning_rate": 4.7683615819209046e-05, "loss": 3.451, "step": 164 }, { "epoch": 0.13983050847457626, "grad_norm": 1.0072463750839233, "learning_rate": 4.766949152542373e-05, "loss": 3.3837, "step": 165 }, { "epoch": 0.14067796610169492, "grad_norm": 1.0214042663574219, "learning_rate": 4.765536723163842e-05, "loss": 3.416, "step": 166 }, { "epoch": 0.14152542372881355, "grad_norm": 0.9888172745704651, "learning_rate": 4.764124293785311e-05, "loss": 3.3122, "step": 167 }, { "epoch": 0.1423728813559322, "grad_norm": 0.8845030069351196, "learning_rate": 4.76271186440678e-05, "loss": 3.4606, "step": 168 }, { "epoch": 0.14322033898305084, "grad_norm": 1.0305500030517578, "learning_rate": 4.761299435028248e-05, "loss": 3.3604, "step": 169 }, { "epoch": 0.1440677966101695, "grad_norm": 0.9892067313194275, "learning_rate": 4.759887005649718e-05, "loss": 3.3878, "step": 170 }, { "epoch": 0.14491525423728813, "grad_norm": 1.0221586227416992, "learning_rate": 4.758474576271187e-05, "loss": 3.3996, "step": 171 }, { "epoch": 0.14576271186440679, "grad_norm": 0.8211665749549866, "learning_rate": 4.757062146892656e-05, "loss": 3.3762, "step": 172 }, { "epoch": 0.14661016949152542, "grad_norm": 1.1288832426071167, "learning_rate": 4.755649717514124e-05, "loss": 3.3915, "step": 173 }, { "epoch": 0.14745762711864407, "grad_norm": 1.1231085062026978, "learning_rate": 4.754237288135593e-05, "loss": 3.4621, "step": 174 }, { "epoch": 0.1483050847457627, "grad_norm": 0.9494627714157104, "learning_rate": 4.752824858757062e-05, "loss": 3.4505, "step": 175 }, { "epoch": 0.14915254237288136, "grad_norm": 1.1615875959396362, "learning_rate": 4.751412429378531e-05, "loss": 3.4558, "step": 176 }, { "epoch": 0.15, "grad_norm": 0.9336468577384949, "learning_rate": 4.75e-05, "loss": 3.3102, "step": 177 }, { "epoch": 0.15084745762711865, "grad_norm": 0.8708992004394531, "learning_rate": 4.748587570621469e-05, "loss": 3.4432, "step": 178 }, { "epoch": 0.15169491525423728, "grad_norm": 1.1003191471099854, "learning_rate": 4.747175141242938e-05, "loss": 3.4165, "step": 179 }, { "epoch": 0.15254237288135594, "grad_norm": 0.8817538619041443, "learning_rate": 4.745762711864407e-05, "loss": 3.4338, "step": 180 }, { "epoch": 0.15338983050847457, "grad_norm": 0.9520441889762878, "learning_rate": 4.744350282485876e-05, "loss": 3.3009, "step": 181 }, { "epoch": 0.15423728813559323, "grad_norm": 1.0844974517822266, "learning_rate": 4.7429378531073446e-05, "loss": 3.2901, "step": 182 }, { "epoch": 0.15508474576271186, "grad_norm": 1.1425660848617554, "learning_rate": 4.7415254237288136e-05, "loss": 3.2906, "step": 183 }, { "epoch": 0.15593220338983052, "grad_norm": 0.9530520439147949, "learning_rate": 4.7401129943502826e-05, "loss": 3.4059, "step": 184 }, { "epoch": 0.15677966101694915, "grad_norm": 1.011949896812439, "learning_rate": 4.7387005649717516e-05, "loss": 3.2765, "step": 185 }, { "epoch": 0.1576271186440678, "grad_norm": 1.089328646659851, "learning_rate": 4.7372881355932206e-05, "loss": 3.3774, "step": 186 }, { "epoch": 0.15847457627118644, "grad_norm": 0.9004995226860046, "learning_rate": 4.7358757062146896e-05, "loss": 3.4686, "step": 187 }, { "epoch": 0.15932203389830507, "grad_norm": 0.9860526323318481, "learning_rate": 4.7344632768361586e-05, "loss": 3.3794, "step": 188 }, { "epoch": 0.16016949152542373, "grad_norm": 1.0845123529434204, "learning_rate": 4.7330508474576276e-05, "loss": 3.4398, "step": 189 }, { "epoch": 0.16101694915254236, "grad_norm": 1.10084068775177, "learning_rate": 4.7316384180790966e-05, "loss": 3.2941, "step": 190 }, { "epoch": 0.16186440677966102, "grad_norm": 1.0290236473083496, "learning_rate": 4.730225988700565e-05, "loss": 3.4847, "step": 191 }, { "epoch": 0.16271186440677965, "grad_norm": 1.2007007598876953, "learning_rate": 4.728813559322034e-05, "loss": 3.3852, "step": 192 }, { "epoch": 0.1635593220338983, "grad_norm": 0.9030916094779968, "learning_rate": 4.727401129943503e-05, "loss": 3.4596, "step": 193 }, { "epoch": 0.16440677966101694, "grad_norm": 0.9876952171325684, "learning_rate": 4.725988700564972e-05, "loss": 3.4634, "step": 194 }, { "epoch": 0.1652542372881356, "grad_norm": 1.025385856628418, "learning_rate": 4.724576271186441e-05, "loss": 3.4471, "step": 195 }, { "epoch": 0.16610169491525423, "grad_norm": 1.0539445877075195, "learning_rate": 4.72316384180791e-05, "loss": 3.4172, "step": 196 }, { "epoch": 0.1669491525423729, "grad_norm": 0.973169207572937, "learning_rate": 4.721751412429379e-05, "loss": 3.374, "step": 197 }, { "epoch": 0.16779661016949152, "grad_norm": 1.0636146068572998, "learning_rate": 4.720338983050848e-05, "loss": 3.404, "step": 198 }, { "epoch": 0.16864406779661018, "grad_norm": 0.9954953193664551, "learning_rate": 4.718926553672316e-05, "loss": 3.501, "step": 199 }, { "epoch": 0.1694915254237288, "grad_norm": 0.8904051184654236, "learning_rate": 4.717514124293785e-05, "loss": 3.4237, "step": 200 }, { "epoch": 0.17033898305084746, "grad_norm": 0.9949321746826172, "learning_rate": 4.716101694915254e-05, "loss": 3.3651, "step": 201 }, { "epoch": 0.1711864406779661, "grad_norm": 1.3412538766860962, "learning_rate": 4.714689265536724e-05, "loss": 3.2532, "step": 202 }, { "epoch": 0.17203389830508475, "grad_norm": 1.0155951976776123, "learning_rate": 4.713276836158192e-05, "loss": 3.3058, "step": 203 }, { "epoch": 0.17288135593220338, "grad_norm": 1.0480566024780273, "learning_rate": 4.711864406779661e-05, "loss": 3.3365, "step": 204 }, { "epoch": 0.17372881355932204, "grad_norm": 1.1093454360961914, "learning_rate": 4.71045197740113e-05, "loss": 3.369, "step": 205 }, { "epoch": 0.17457627118644067, "grad_norm": 1.3034790754318237, "learning_rate": 4.709039548022599e-05, "loss": 3.3571, "step": 206 }, { "epoch": 0.17542372881355933, "grad_norm": 1.0214184522628784, "learning_rate": 4.707627118644068e-05, "loss": 3.2449, "step": 207 }, { "epoch": 0.17627118644067796, "grad_norm": 1.0114665031433105, "learning_rate": 4.7062146892655365e-05, "loss": 3.2185, "step": 208 }, { "epoch": 0.17711864406779662, "grad_norm": 0.9668043851852417, "learning_rate": 4.7048022598870055e-05, "loss": 3.3632, "step": 209 }, { "epoch": 0.17796610169491525, "grad_norm": 1.1336166858673096, "learning_rate": 4.703389830508475e-05, "loss": 3.4155, "step": 210 }, { "epoch": 0.1788135593220339, "grad_norm": 1.0119339227676392, "learning_rate": 4.701977401129944e-05, "loss": 3.3101, "step": 211 }, { "epoch": 0.17966101694915254, "grad_norm": 1.012792706489563, "learning_rate": 4.7005649717514125e-05, "loss": 3.3279, "step": 212 }, { "epoch": 0.1805084745762712, "grad_norm": 0.8829046487808228, "learning_rate": 4.6991525423728815e-05, "loss": 3.2263, "step": 213 }, { "epoch": 0.18135593220338983, "grad_norm": 0.903784453868866, "learning_rate": 4.6977401129943505e-05, "loss": 3.4108, "step": 214 }, { "epoch": 0.18220338983050846, "grad_norm": 0.9798447489738464, "learning_rate": 4.6963276836158195e-05, "loss": 3.4257, "step": 215 }, { "epoch": 0.18305084745762712, "grad_norm": 1.0526540279388428, "learning_rate": 4.694915254237288e-05, "loss": 3.3412, "step": 216 }, { "epoch": 0.18389830508474575, "grad_norm": 0.9264103174209595, "learning_rate": 4.693502824858757e-05, "loss": 3.2474, "step": 217 }, { "epoch": 0.1847457627118644, "grad_norm": 1.113366961479187, "learning_rate": 4.6920903954802265e-05, "loss": 3.4456, "step": 218 }, { "epoch": 0.18559322033898304, "grad_norm": 0.9500327706336975, "learning_rate": 4.6906779661016955e-05, "loss": 3.2991, "step": 219 }, { "epoch": 0.1864406779661017, "grad_norm": 0.9833083152770996, "learning_rate": 4.689265536723164e-05, "loss": 3.3296, "step": 220 }, { "epoch": 0.18728813559322033, "grad_norm": 1.0229041576385498, "learning_rate": 4.687853107344633e-05, "loss": 3.2996, "step": 221 }, { "epoch": 0.188135593220339, "grad_norm": 1.103664517402649, "learning_rate": 4.686440677966102e-05, "loss": 3.4222, "step": 222 }, { "epoch": 0.18898305084745762, "grad_norm": 1.1817638874053955, "learning_rate": 4.685028248587571e-05, "loss": 3.5137, "step": 223 }, { "epoch": 0.18983050847457628, "grad_norm": 1.0594677925109863, "learning_rate": 4.68361581920904e-05, "loss": 3.4079, "step": 224 }, { "epoch": 0.1906779661016949, "grad_norm": 1.0142221450805664, "learning_rate": 4.682203389830508e-05, "loss": 3.2882, "step": 225 }, { "epoch": 0.19152542372881357, "grad_norm": 0.9378176927566528, "learning_rate": 4.680790960451978e-05, "loss": 3.4805, "step": 226 }, { "epoch": 0.1923728813559322, "grad_norm": 0.9688156843185425, "learning_rate": 4.679378531073447e-05, "loss": 3.3128, "step": 227 }, { "epoch": 0.19322033898305085, "grad_norm": 1.0353584289550781, "learning_rate": 4.677966101694916e-05, "loss": 3.4662, "step": 228 }, { "epoch": 0.19406779661016949, "grad_norm": 1.0622531175613403, "learning_rate": 4.676553672316384e-05, "loss": 3.3556, "step": 229 }, { "epoch": 0.19491525423728814, "grad_norm": 1.037964940071106, "learning_rate": 4.675141242937853e-05, "loss": 3.3904, "step": 230 }, { "epoch": 0.19576271186440677, "grad_norm": 1.0185190439224243, "learning_rate": 4.673728813559322e-05, "loss": 3.3667, "step": 231 }, { "epoch": 0.19661016949152543, "grad_norm": 1.0384317636489868, "learning_rate": 4.672316384180791e-05, "loss": 3.2698, "step": 232 }, { "epoch": 0.19745762711864406, "grad_norm": 0.911548912525177, "learning_rate": 4.67090395480226e-05, "loss": 3.2601, "step": 233 }, { "epoch": 0.19830508474576272, "grad_norm": 0.9575884938240051, "learning_rate": 4.669491525423729e-05, "loss": 3.3901, "step": 234 }, { "epoch": 0.19915254237288135, "grad_norm": 0.8838641047477722, "learning_rate": 4.668079096045198e-05, "loss": 3.2924, "step": 235 }, { "epoch": 0.2, "grad_norm": 1.0221738815307617, "learning_rate": 4.666666666666667e-05, "loss": 3.5573, "step": 236 }, { "epoch": 0.20084745762711864, "grad_norm": 1.0286304950714111, "learning_rate": 4.6652542372881355e-05, "loss": 3.2207, "step": 237 }, { "epoch": 0.2016949152542373, "grad_norm": 0.8968472480773926, "learning_rate": 4.6638418079096045e-05, "loss": 3.3057, "step": 238 }, { "epoch": 0.20254237288135593, "grad_norm": 0.9834404587745667, "learning_rate": 4.6624293785310735e-05, "loss": 3.3126, "step": 239 }, { "epoch": 0.2033898305084746, "grad_norm": 1.0731356143951416, "learning_rate": 4.6610169491525425e-05, "loss": 3.4379, "step": 240 }, { "epoch": 0.20423728813559322, "grad_norm": 1.158944010734558, "learning_rate": 4.6596045197740115e-05, "loss": 3.2545, "step": 241 }, { "epoch": 0.20508474576271185, "grad_norm": 0.9405023455619812, "learning_rate": 4.6581920903954805e-05, "loss": 3.4378, "step": 242 }, { "epoch": 0.2059322033898305, "grad_norm": 1.1383235454559326, "learning_rate": 4.6567796610169495e-05, "loss": 3.3256, "step": 243 }, { "epoch": 0.20677966101694914, "grad_norm": 0.8874810934066772, "learning_rate": 4.6553672316384185e-05, "loss": 3.3622, "step": 244 }, { "epoch": 0.2076271186440678, "grad_norm": 1.118872880935669, "learning_rate": 4.6539548022598875e-05, "loss": 3.3155, "step": 245 }, { "epoch": 0.20847457627118643, "grad_norm": 1.102102518081665, "learning_rate": 4.652542372881356e-05, "loss": 3.1913, "step": 246 }, { "epoch": 0.2093220338983051, "grad_norm": 1.002294659614563, "learning_rate": 4.651129943502825e-05, "loss": 3.3824, "step": 247 }, { "epoch": 0.21016949152542372, "grad_norm": 1.0566807985305786, "learning_rate": 4.649717514124294e-05, "loss": 3.4923, "step": 248 }, { "epoch": 0.21101694915254238, "grad_norm": 0.828079879283905, "learning_rate": 4.6483050847457635e-05, "loss": 3.3245, "step": 249 }, { "epoch": 0.211864406779661, "grad_norm": 1.1394438743591309, "learning_rate": 4.646892655367232e-05, "loss": 3.4789, "step": 250 }, { "epoch": 0.21271186440677967, "grad_norm": 1.1321762800216675, "learning_rate": 4.645480225988701e-05, "loss": 3.3182, "step": 251 }, { "epoch": 0.2135593220338983, "grad_norm": 1.0497584342956543, "learning_rate": 4.64406779661017e-05, "loss": 3.3439, "step": 252 }, { "epoch": 0.21440677966101696, "grad_norm": 0.9015762209892273, "learning_rate": 4.642655367231639e-05, "loss": 3.3564, "step": 253 }, { "epoch": 0.21525423728813559, "grad_norm": 0.9954678416252136, "learning_rate": 4.641242937853108e-05, "loss": 3.2391, "step": 254 }, { "epoch": 0.21610169491525424, "grad_norm": 1.0133748054504395, "learning_rate": 4.639830508474576e-05, "loss": 3.3173, "step": 255 }, { "epoch": 0.21694915254237288, "grad_norm": 1.0119907855987549, "learning_rate": 4.638418079096045e-05, "loss": 3.5523, "step": 256 }, { "epoch": 0.21779661016949153, "grad_norm": 0.9730232954025269, "learning_rate": 4.637005649717515e-05, "loss": 3.2612, "step": 257 }, { "epoch": 0.21864406779661016, "grad_norm": 1.0322071313858032, "learning_rate": 4.635593220338984e-05, "loss": 3.3631, "step": 258 }, { "epoch": 0.21949152542372882, "grad_norm": 1.0476667881011963, "learning_rate": 4.634180790960452e-05, "loss": 3.3455, "step": 259 }, { "epoch": 0.22033898305084745, "grad_norm": 1.0775266885757446, "learning_rate": 4.632768361581921e-05, "loss": 3.3354, "step": 260 }, { "epoch": 0.2211864406779661, "grad_norm": 1.1111152172088623, "learning_rate": 4.63135593220339e-05, "loss": 3.3548, "step": 261 }, { "epoch": 0.22203389830508474, "grad_norm": 0.9489743113517761, "learning_rate": 4.629943502824859e-05, "loss": 3.3328, "step": 262 }, { "epoch": 0.2228813559322034, "grad_norm": 0.9627196192741394, "learning_rate": 4.6285310734463274e-05, "loss": 3.285, "step": 263 }, { "epoch": 0.22372881355932203, "grad_norm": 1.1259305477142334, "learning_rate": 4.6271186440677964e-05, "loss": 3.434, "step": 264 }, { "epoch": 0.2245762711864407, "grad_norm": 0.8832256197929382, "learning_rate": 4.625706214689266e-05, "loss": 3.2969, "step": 265 }, { "epoch": 0.22542372881355932, "grad_norm": 0.8672728538513184, "learning_rate": 4.624293785310735e-05, "loss": 3.3588, "step": 266 }, { "epoch": 0.22627118644067798, "grad_norm": 1.0554966926574707, "learning_rate": 4.6228813559322034e-05, "loss": 3.2845, "step": 267 }, { "epoch": 0.2271186440677966, "grad_norm": 0.9938945770263672, "learning_rate": 4.6214689265536724e-05, "loss": 3.2831, "step": 268 }, { "epoch": 0.22796610169491524, "grad_norm": 1.081074833869934, "learning_rate": 4.6200564971751414e-05, "loss": 3.2885, "step": 269 }, { "epoch": 0.2288135593220339, "grad_norm": 0.8921335935592651, "learning_rate": 4.6186440677966104e-05, "loss": 3.3689, "step": 270 }, { "epoch": 0.22966101694915253, "grad_norm": 1.042938232421875, "learning_rate": 4.6172316384180794e-05, "loss": 3.2974, "step": 271 }, { "epoch": 0.2305084745762712, "grad_norm": 1.040932059288025, "learning_rate": 4.615819209039548e-05, "loss": 3.2452, "step": 272 }, { "epoch": 0.23135593220338982, "grad_norm": 1.1497408151626587, "learning_rate": 4.6144067796610174e-05, "loss": 3.4384, "step": 273 }, { "epoch": 0.23220338983050848, "grad_norm": 0.8547231554985046, "learning_rate": 4.6129943502824864e-05, "loss": 3.4982, "step": 274 }, { "epoch": 0.2330508474576271, "grad_norm": 1.0105772018432617, "learning_rate": 4.6115819209039554e-05, "loss": 3.3598, "step": 275 }, { "epoch": 0.23389830508474577, "grad_norm": 1.1239712238311768, "learning_rate": 4.610169491525424e-05, "loss": 3.2822, "step": 276 }, { "epoch": 0.2347457627118644, "grad_norm": 1.0043582916259766, "learning_rate": 4.608757062146893e-05, "loss": 3.3077, "step": 277 }, { "epoch": 0.23559322033898306, "grad_norm": 1.0133074522018433, "learning_rate": 4.607344632768362e-05, "loss": 3.3506, "step": 278 }, { "epoch": 0.2364406779661017, "grad_norm": 0.9234474301338196, "learning_rate": 4.605932203389831e-05, "loss": 3.3618, "step": 279 }, { "epoch": 0.23728813559322035, "grad_norm": 0.92128586769104, "learning_rate": 4.6045197740113e-05, "loss": 3.3541, "step": 280 }, { "epoch": 0.23813559322033898, "grad_norm": 0.9465177059173584, "learning_rate": 4.603107344632769e-05, "loss": 3.266, "step": 281 }, { "epoch": 0.23898305084745763, "grad_norm": 0.9086260199546814, "learning_rate": 4.601694915254238e-05, "loss": 3.2646, "step": 282 }, { "epoch": 0.23983050847457626, "grad_norm": 1.1903716325759888, "learning_rate": 4.600282485875707e-05, "loss": 3.2163, "step": 283 }, { "epoch": 0.24067796610169492, "grad_norm": 1.3813388347625732, "learning_rate": 4.598870056497175e-05, "loss": 3.3081, "step": 284 }, { "epoch": 0.24152542372881355, "grad_norm": 1.0255000591278076, "learning_rate": 4.597457627118644e-05, "loss": 3.2165, "step": 285 }, { "epoch": 0.2423728813559322, "grad_norm": 0.9308796525001526, "learning_rate": 4.596045197740113e-05, "loss": 3.4846, "step": 286 }, { "epoch": 0.24322033898305084, "grad_norm": 1.1603049039840698, "learning_rate": 4.594632768361582e-05, "loss": 3.4762, "step": 287 }, { "epoch": 0.2440677966101695, "grad_norm": 0.9982358813285828, "learning_rate": 4.593220338983051e-05, "loss": 3.279, "step": 288 }, { "epoch": 0.24491525423728813, "grad_norm": 0.989406406879425, "learning_rate": 4.59180790960452e-05, "loss": 3.3495, "step": 289 }, { "epoch": 0.2457627118644068, "grad_norm": 0.9485755562782288, "learning_rate": 4.590395480225989e-05, "loss": 3.4254, "step": 290 }, { "epoch": 0.24661016949152542, "grad_norm": 1.067507028579712, "learning_rate": 4.588983050847458e-05, "loss": 3.3039, "step": 291 }, { "epoch": 0.24745762711864408, "grad_norm": 0.9350253939628601, "learning_rate": 4.587570621468927e-05, "loss": 3.3643, "step": 292 }, { "epoch": 0.2483050847457627, "grad_norm": 0.9265627264976501, "learning_rate": 4.5861581920903954e-05, "loss": 3.4288, "step": 293 }, { "epoch": 0.24915254237288137, "grad_norm": 1.0432260036468506, "learning_rate": 4.5847457627118644e-05, "loss": 3.3999, "step": 294 }, { "epoch": 0.25, "grad_norm": 0.937401294708252, "learning_rate": 4.5833333333333334e-05, "loss": 3.287, "step": 295 }, { "epoch": 0.25084745762711863, "grad_norm": 1.0307767391204834, "learning_rate": 4.5819209039548024e-05, "loss": 3.3331, "step": 296 }, { "epoch": 0.25169491525423726, "grad_norm": 1.1131483316421509, "learning_rate": 4.5805084745762714e-05, "loss": 3.3136, "step": 297 }, { "epoch": 0.25254237288135595, "grad_norm": 0.8680022954940796, "learning_rate": 4.5790960451977404e-05, "loss": 3.3641, "step": 298 }, { "epoch": 0.2533898305084746, "grad_norm": 0.8451167345046997, "learning_rate": 4.5776836158192094e-05, "loss": 3.2599, "step": 299 }, { "epoch": 0.2542372881355932, "grad_norm": 0.9475318789482117, "learning_rate": 4.5762711864406784e-05, "loss": 3.3414, "step": 300 }, { "epoch": 0.25508474576271184, "grad_norm": 1.108223557472229, "learning_rate": 4.5748587570621474e-05, "loss": 3.4209, "step": 301 }, { "epoch": 0.2559322033898305, "grad_norm": 0.922511100769043, "learning_rate": 4.573446327683616e-05, "loss": 3.3206, "step": 302 }, { "epoch": 0.25677966101694916, "grad_norm": 1.1367084980010986, "learning_rate": 4.572033898305085e-05, "loss": 3.5687, "step": 303 }, { "epoch": 0.2576271186440678, "grad_norm": 0.91807621717453, "learning_rate": 4.570621468926554e-05, "loss": 3.3019, "step": 304 }, { "epoch": 0.2584745762711864, "grad_norm": 0.9386091828346252, "learning_rate": 4.5692090395480234e-05, "loss": 3.4212, "step": 305 }, { "epoch": 0.2593220338983051, "grad_norm": 0.9776807427406311, "learning_rate": 4.567796610169492e-05, "loss": 3.419, "step": 306 }, { "epoch": 0.26016949152542374, "grad_norm": 1.0071264505386353, "learning_rate": 4.566384180790961e-05, "loss": 3.2786, "step": 307 }, { "epoch": 0.26101694915254237, "grad_norm": 0.9951607584953308, "learning_rate": 4.56497175141243e-05, "loss": 3.2884, "step": 308 }, { "epoch": 0.261864406779661, "grad_norm": 1.0642110109329224, "learning_rate": 4.563559322033899e-05, "loss": 3.3133, "step": 309 }, { "epoch": 0.2627118644067797, "grad_norm": 0.8972973227500916, "learning_rate": 4.562146892655367e-05, "loss": 3.4772, "step": 310 }, { "epoch": 0.2635593220338983, "grad_norm": 0.9095409512519836, "learning_rate": 4.560734463276836e-05, "loss": 3.2671, "step": 311 }, { "epoch": 0.26440677966101694, "grad_norm": 0.9951375126838684, "learning_rate": 4.559322033898305e-05, "loss": 3.4345, "step": 312 }, { "epoch": 0.2652542372881356, "grad_norm": 1.0811796188354492, "learning_rate": 4.557909604519775e-05, "loss": 3.4065, "step": 313 }, { "epoch": 0.26610169491525426, "grad_norm": 0.9475049376487732, "learning_rate": 4.556497175141243e-05, "loss": 3.2354, "step": 314 }, { "epoch": 0.2669491525423729, "grad_norm": 1.0567224025726318, "learning_rate": 4.555084745762712e-05, "loss": 3.3456, "step": 315 }, { "epoch": 0.2677966101694915, "grad_norm": 1.1524204015731812, "learning_rate": 4.553672316384181e-05, "loss": 3.4103, "step": 316 }, { "epoch": 0.26864406779661015, "grad_norm": 0.9683026075363159, "learning_rate": 4.55225988700565e-05, "loss": 3.4152, "step": 317 }, { "epoch": 0.26949152542372884, "grad_norm": 1.0460152626037598, "learning_rate": 4.550847457627119e-05, "loss": 3.2732, "step": 318 }, { "epoch": 0.27033898305084747, "grad_norm": 1.1625036001205444, "learning_rate": 4.549435028248587e-05, "loss": 3.2985, "step": 319 }, { "epoch": 0.2711864406779661, "grad_norm": 1.077868938446045, "learning_rate": 4.548022598870056e-05, "loss": 3.4228, "step": 320 }, { "epoch": 0.27203389830508473, "grad_norm": 0.8281847238540649, "learning_rate": 4.546610169491526e-05, "loss": 3.3651, "step": 321 }, { "epoch": 0.27288135593220336, "grad_norm": 0.8520678281784058, "learning_rate": 4.545197740112995e-05, "loss": 3.2273, "step": 322 }, { "epoch": 0.27372881355932205, "grad_norm": 1.0431276559829712, "learning_rate": 4.543785310734463e-05, "loss": 3.3444, "step": 323 }, { "epoch": 0.2745762711864407, "grad_norm": 1.1178992986679077, "learning_rate": 4.542372881355932e-05, "loss": 3.3786, "step": 324 }, { "epoch": 0.2754237288135593, "grad_norm": 1.0613808631896973, "learning_rate": 4.540960451977401e-05, "loss": 3.3488, "step": 325 }, { "epoch": 0.27627118644067794, "grad_norm": 0.9420961737632751, "learning_rate": 4.53954802259887e-05, "loss": 3.5248, "step": 326 }, { "epoch": 0.2771186440677966, "grad_norm": 1.008429765701294, "learning_rate": 4.5381355932203387e-05, "loss": 3.3873, "step": 327 }, { "epoch": 0.27796610169491526, "grad_norm": 0.9365347623825073, "learning_rate": 4.536723163841808e-05, "loss": 3.2143, "step": 328 }, { "epoch": 0.2788135593220339, "grad_norm": 1.0193514823913574, "learning_rate": 4.535310734463277e-05, "loss": 3.2791, "step": 329 }, { "epoch": 0.2796610169491525, "grad_norm": 1.1028565168380737, "learning_rate": 4.533898305084746e-05, "loss": 3.2877, "step": 330 }, { "epoch": 0.2805084745762712, "grad_norm": 1.04025137424469, "learning_rate": 4.5324858757062147e-05, "loss": 3.2707, "step": 331 }, { "epoch": 0.28135593220338984, "grad_norm": 0.877678632736206, "learning_rate": 4.5310734463276837e-05, "loss": 3.4001, "step": 332 }, { "epoch": 0.28220338983050847, "grad_norm": 0.9502778649330139, "learning_rate": 4.5296610169491527e-05, "loss": 3.284, "step": 333 }, { "epoch": 0.2830508474576271, "grad_norm": 0.9594213962554932, "learning_rate": 4.5282485875706217e-05, "loss": 3.2292, "step": 334 }, { "epoch": 0.2838983050847458, "grad_norm": 1.030950903892517, "learning_rate": 4.5268361581920906e-05, "loss": 3.3599, "step": 335 }, { "epoch": 0.2847457627118644, "grad_norm": 0.912626326084137, "learning_rate": 4.5254237288135596e-05, "loss": 3.2828, "step": 336 }, { "epoch": 0.28559322033898304, "grad_norm": 0.9484438896179199, "learning_rate": 4.5240112994350286e-05, "loss": 3.3328, "step": 337 }, { "epoch": 0.2864406779661017, "grad_norm": 1.0654056072235107, "learning_rate": 4.5225988700564976e-05, "loss": 3.3364, "step": 338 }, { "epoch": 0.28728813559322036, "grad_norm": 1.1941118240356445, "learning_rate": 4.5211864406779666e-05, "loss": 3.4278, "step": 339 }, { "epoch": 0.288135593220339, "grad_norm": 1.1450444459915161, "learning_rate": 4.519774011299435e-05, "loss": 3.3119, "step": 340 }, { "epoch": 0.2889830508474576, "grad_norm": 0.9154060482978821, "learning_rate": 4.518361581920904e-05, "loss": 3.3388, "step": 341 }, { "epoch": 0.28983050847457625, "grad_norm": 1.1089015007019043, "learning_rate": 4.516949152542373e-05, "loss": 3.3443, "step": 342 }, { "epoch": 0.29067796610169494, "grad_norm": 1.036319375038147, "learning_rate": 4.515536723163842e-05, "loss": 3.2356, "step": 343 }, { "epoch": 0.29152542372881357, "grad_norm": 0.9845830798149109, "learning_rate": 4.514124293785311e-05, "loss": 3.4297, "step": 344 }, { "epoch": 0.2923728813559322, "grad_norm": 1.0907063484191895, "learning_rate": 4.51271186440678e-05, "loss": 3.3689, "step": 345 }, { "epoch": 0.29322033898305083, "grad_norm": 1.1132543087005615, "learning_rate": 4.511299435028249e-05, "loss": 3.3098, "step": 346 }, { "epoch": 0.2940677966101695, "grad_norm": 1.1934894323349, "learning_rate": 4.509887005649718e-05, "loss": 3.2462, "step": 347 }, { "epoch": 0.29491525423728815, "grad_norm": 1.2050788402557373, "learning_rate": 4.508474576271187e-05, "loss": 3.2446, "step": 348 }, { "epoch": 0.2957627118644068, "grad_norm": 0.9746072292327881, "learning_rate": 4.507062146892655e-05, "loss": 3.4158, "step": 349 }, { "epoch": 0.2966101694915254, "grad_norm": 1.038040041923523, "learning_rate": 4.505649717514124e-05, "loss": 3.2165, "step": 350 }, { "epoch": 0.29745762711864404, "grad_norm": 0.9746460914611816, "learning_rate": 4.504237288135593e-05, "loss": 3.3609, "step": 351 }, { "epoch": 0.2983050847457627, "grad_norm": 0.9296185970306396, "learning_rate": 4.502824858757062e-05, "loss": 3.3355, "step": 352 }, { "epoch": 0.29915254237288136, "grad_norm": 0.8227651119232178, "learning_rate": 4.501412429378531e-05, "loss": 3.442, "step": 353 }, { "epoch": 0.3, "grad_norm": 0.9187232851982117, "learning_rate": 4.5e-05, "loss": 3.3707, "step": 354 }, { "epoch": 0.3008474576271186, "grad_norm": 0.9959421157836914, "learning_rate": 4.498587570621469e-05, "loss": 3.4917, "step": 355 }, { "epoch": 0.3016949152542373, "grad_norm": 0.9714428782463074, "learning_rate": 4.497175141242938e-05, "loss": 3.325, "step": 356 }, { "epoch": 0.30254237288135594, "grad_norm": 0.9690954089164734, "learning_rate": 4.4957627118644066e-05, "loss": 3.3142, "step": 357 }, { "epoch": 0.30338983050847457, "grad_norm": 1.0296752452850342, "learning_rate": 4.4943502824858756e-05, "loss": 3.2786, "step": 358 }, { "epoch": 0.3042372881355932, "grad_norm": 1.152315616607666, "learning_rate": 4.4929378531073446e-05, "loss": 3.2337, "step": 359 }, { "epoch": 0.3050847457627119, "grad_norm": 1.0662038326263428, "learning_rate": 4.491525423728814e-05, "loss": 3.2721, "step": 360 }, { "epoch": 0.3059322033898305, "grad_norm": 0.8738830089569092, "learning_rate": 4.4901129943502826e-05, "loss": 3.3993, "step": 361 }, { "epoch": 0.30677966101694915, "grad_norm": 0.9107250571250916, "learning_rate": 4.4887005649717516e-05, "loss": 3.2791, "step": 362 }, { "epoch": 0.3076271186440678, "grad_norm": 1.0070526599884033, "learning_rate": 4.4872881355932206e-05, "loss": 3.3507, "step": 363 }, { "epoch": 0.30847457627118646, "grad_norm": 0.9358483552932739, "learning_rate": 4.4858757062146896e-05, "loss": 3.3353, "step": 364 }, { "epoch": 0.3093220338983051, "grad_norm": 0.78831547498703, "learning_rate": 4.4844632768361586e-05, "loss": 3.3122, "step": 365 }, { "epoch": 0.3101694915254237, "grad_norm": 0.9441731572151184, "learning_rate": 4.483050847457627e-05, "loss": 3.3249, "step": 366 }, { "epoch": 0.31101694915254235, "grad_norm": 0.9365278482437134, "learning_rate": 4.481638418079096e-05, "loss": 3.4133, "step": 367 }, { "epoch": 0.31186440677966104, "grad_norm": 1.0537331104278564, "learning_rate": 4.4802259887005656e-05, "loss": 3.3266, "step": 368 }, { "epoch": 0.31271186440677967, "grad_norm": 1.143284559249878, "learning_rate": 4.4788135593220346e-05, "loss": 3.3562, "step": 369 }, { "epoch": 0.3135593220338983, "grad_norm": 1.0316330194473267, "learning_rate": 4.477401129943503e-05, "loss": 3.2217, "step": 370 }, { "epoch": 0.31440677966101693, "grad_norm": 1.0107542276382446, "learning_rate": 4.475988700564972e-05, "loss": 3.3568, "step": 371 }, { "epoch": 0.3152542372881356, "grad_norm": 1.0114811658859253, "learning_rate": 4.474576271186441e-05, "loss": 3.3989, "step": 372 }, { "epoch": 0.31610169491525425, "grad_norm": 0.9530246257781982, "learning_rate": 4.47316384180791e-05, "loss": 3.3506, "step": 373 }, { "epoch": 0.3169491525423729, "grad_norm": 1.0929839611053467, "learning_rate": 4.471751412429378e-05, "loss": 3.3431, "step": 374 }, { "epoch": 0.3177966101694915, "grad_norm": 0.99641352891922, "learning_rate": 4.470338983050847e-05, "loss": 3.4992, "step": 375 }, { "epoch": 0.31864406779661014, "grad_norm": 0.8388562202453613, "learning_rate": 4.468926553672317e-05, "loss": 3.2914, "step": 376 }, { "epoch": 0.31949152542372883, "grad_norm": 0.9754483699798584, "learning_rate": 4.467514124293786e-05, "loss": 3.2311, "step": 377 }, { "epoch": 0.32033898305084746, "grad_norm": 1.01131010055542, "learning_rate": 4.466101694915254e-05, "loss": 3.3946, "step": 378 }, { "epoch": 0.3211864406779661, "grad_norm": 0.9653787016868591, "learning_rate": 4.464689265536723e-05, "loss": 3.2293, "step": 379 }, { "epoch": 0.3220338983050847, "grad_norm": 0.9971035122871399, "learning_rate": 4.463276836158192e-05, "loss": 3.3691, "step": 380 }, { "epoch": 0.3228813559322034, "grad_norm": 1.0390263795852661, "learning_rate": 4.461864406779661e-05, "loss": 3.3818, "step": 381 }, { "epoch": 0.32372881355932204, "grad_norm": 1.0146158933639526, "learning_rate": 4.46045197740113e-05, "loss": 3.285, "step": 382 }, { "epoch": 0.32457627118644067, "grad_norm": 1.0889482498168945, "learning_rate": 4.4590395480225986e-05, "loss": 3.2663, "step": 383 }, { "epoch": 0.3254237288135593, "grad_norm": 1.1688122749328613, "learning_rate": 4.457627118644068e-05, "loss": 3.2271, "step": 384 }, { "epoch": 0.326271186440678, "grad_norm": 1.1634156703948975, "learning_rate": 4.456214689265537e-05, "loss": 3.1355, "step": 385 }, { "epoch": 0.3271186440677966, "grad_norm": 0.9405133128166199, "learning_rate": 4.454802259887006e-05, "loss": 3.2457, "step": 386 }, { "epoch": 0.32796610169491525, "grad_norm": 1.0555479526519775, "learning_rate": 4.4533898305084746e-05, "loss": 3.3926, "step": 387 }, { "epoch": 0.3288135593220339, "grad_norm": 1.1605401039123535, "learning_rate": 4.4519774011299436e-05, "loss": 3.2686, "step": 388 }, { "epoch": 0.32966101694915256, "grad_norm": 1.0401630401611328, "learning_rate": 4.4505649717514126e-05, "loss": 3.1957, "step": 389 }, { "epoch": 0.3305084745762712, "grad_norm": 1.1166563034057617, "learning_rate": 4.4491525423728816e-05, "loss": 3.1175, "step": 390 }, { "epoch": 0.3313559322033898, "grad_norm": 0.9442133903503418, "learning_rate": 4.4477401129943506e-05, "loss": 3.3294, "step": 391 }, { "epoch": 0.33220338983050846, "grad_norm": 1.0875482559204102, "learning_rate": 4.4463276836158196e-05, "loss": 3.1947, "step": 392 }, { "epoch": 0.33305084745762714, "grad_norm": 1.0665291547775269, "learning_rate": 4.4449152542372886e-05, "loss": 3.3995, "step": 393 }, { "epoch": 0.3338983050847458, "grad_norm": 1.101614236831665, "learning_rate": 4.4435028248587575e-05, "loss": 3.3054, "step": 394 }, { "epoch": 0.3347457627118644, "grad_norm": 1.0335354804992676, "learning_rate": 4.442090395480226e-05, "loss": 3.3855, "step": 395 }, { "epoch": 0.33559322033898303, "grad_norm": 1.0372169017791748, "learning_rate": 4.440677966101695e-05, "loss": 3.2569, "step": 396 }, { "epoch": 0.3364406779661017, "grad_norm": 0.8847305774688721, "learning_rate": 4.439265536723164e-05, "loss": 3.3062, "step": 397 }, { "epoch": 0.33728813559322035, "grad_norm": 1.005831003189087, "learning_rate": 4.437853107344633e-05, "loss": 3.3711, "step": 398 }, { "epoch": 0.338135593220339, "grad_norm": 1.0427862405776978, "learning_rate": 4.436440677966102e-05, "loss": 3.3021, "step": 399 }, { "epoch": 0.3389830508474576, "grad_norm": 1.1072407960891724, "learning_rate": 4.435028248587571e-05, "loss": 3.2586, "step": 400 }, { "epoch": 0.3398305084745763, "grad_norm": 0.8681941032409668, "learning_rate": 4.43361581920904e-05, "loss": 3.3237, "step": 401 }, { "epoch": 0.34067796610169493, "grad_norm": 0.9611847996711731, "learning_rate": 4.432203389830509e-05, "loss": 3.3863, "step": 402 }, { "epoch": 0.34152542372881356, "grad_norm": 1.0184085369110107, "learning_rate": 4.430790960451978e-05, "loss": 3.3933, "step": 403 }, { "epoch": 0.3423728813559322, "grad_norm": 1.0703850984573364, "learning_rate": 4.429378531073446e-05, "loss": 3.2485, "step": 404 }, { "epoch": 0.3432203389830508, "grad_norm": 0.9682871103286743, "learning_rate": 4.427966101694915e-05, "loss": 3.3, "step": 405 }, { "epoch": 0.3440677966101695, "grad_norm": 1.0103580951690674, "learning_rate": 4.426553672316384e-05, "loss": 3.2532, "step": 406 }, { "epoch": 0.34491525423728814, "grad_norm": 1.1308388710021973, "learning_rate": 4.425141242937854e-05, "loss": 3.362, "step": 407 }, { "epoch": 0.34576271186440677, "grad_norm": 0.9657171368598938, "learning_rate": 4.423728813559322e-05, "loss": 3.3707, "step": 408 }, { "epoch": 0.3466101694915254, "grad_norm": 0.8440027236938477, "learning_rate": 4.422316384180791e-05, "loss": 3.3622, "step": 409 }, { "epoch": 0.3474576271186441, "grad_norm": 1.0896556377410889, "learning_rate": 4.42090395480226e-05, "loss": 3.1594, "step": 410 }, { "epoch": 0.3483050847457627, "grad_norm": 0.8792701363563538, "learning_rate": 4.419491525423729e-05, "loss": 3.4112, "step": 411 }, { "epoch": 0.34915254237288135, "grad_norm": 0.9398267269134521, "learning_rate": 4.418079096045198e-05, "loss": 3.2665, "step": 412 }, { "epoch": 0.35, "grad_norm": 1.0859825611114502, "learning_rate": 4.4166666666666665e-05, "loss": 3.2293, "step": 413 }, { "epoch": 0.35084745762711866, "grad_norm": 0.935768187046051, "learning_rate": 4.4152542372881355e-05, "loss": 3.4043, "step": 414 }, { "epoch": 0.3516949152542373, "grad_norm": 0.9599078893661499, "learning_rate": 4.413841807909605e-05, "loss": 3.2861, "step": 415 }, { "epoch": 0.3525423728813559, "grad_norm": 0.9147136211395264, "learning_rate": 4.412429378531074e-05, "loss": 3.3048, "step": 416 }, { "epoch": 0.35338983050847456, "grad_norm": 1.0079538822174072, "learning_rate": 4.4110169491525425e-05, "loss": 3.4239, "step": 417 }, { "epoch": 0.35423728813559324, "grad_norm": 0.9813336730003357, "learning_rate": 4.4096045197740115e-05, "loss": 3.1808, "step": 418 }, { "epoch": 0.3550847457627119, "grad_norm": 0.937896728515625, "learning_rate": 4.4081920903954805e-05, "loss": 3.3465, "step": 419 }, { "epoch": 0.3559322033898305, "grad_norm": 0.9621577858924866, "learning_rate": 4.4067796610169495e-05, "loss": 3.2737, "step": 420 }, { "epoch": 0.35677966101694913, "grad_norm": 1.0671390295028687, "learning_rate": 4.405367231638418e-05, "loss": 3.2426, "step": 421 }, { "epoch": 0.3576271186440678, "grad_norm": 0.9483035802841187, "learning_rate": 4.403954802259887e-05, "loss": 3.358, "step": 422 }, { "epoch": 0.35847457627118645, "grad_norm": 0.9741415977478027, "learning_rate": 4.4025423728813565e-05, "loss": 3.2336, "step": 423 }, { "epoch": 0.3593220338983051, "grad_norm": 1.000356674194336, "learning_rate": 4.4011299435028255e-05, "loss": 3.1706, "step": 424 }, { "epoch": 0.3601694915254237, "grad_norm": 0.9553226828575134, "learning_rate": 4.399717514124294e-05, "loss": 3.3855, "step": 425 }, { "epoch": 0.3610169491525424, "grad_norm": 1.0244450569152832, "learning_rate": 4.398305084745763e-05, "loss": 3.4004, "step": 426 }, { "epoch": 0.36186440677966103, "grad_norm": 0.9900097250938416, "learning_rate": 4.396892655367232e-05, "loss": 3.2182, "step": 427 }, { "epoch": 0.36271186440677966, "grad_norm": 0.9080206751823425, "learning_rate": 4.395480225988701e-05, "loss": 3.3002, "step": 428 }, { "epoch": 0.3635593220338983, "grad_norm": 1.0378522872924805, "learning_rate": 4.39406779661017e-05, "loss": 3.3241, "step": 429 }, { "epoch": 0.3644067796610169, "grad_norm": 1.0043469667434692, "learning_rate": 4.392655367231638e-05, "loss": 3.1954, "step": 430 }, { "epoch": 0.3652542372881356, "grad_norm": 0.9944254159927368, "learning_rate": 4.391242937853108e-05, "loss": 3.4255, "step": 431 }, { "epoch": 0.36610169491525424, "grad_norm": 1.0727494955062866, "learning_rate": 4.389830508474577e-05, "loss": 3.3453, "step": 432 }, { "epoch": 0.36694915254237287, "grad_norm": 0.900799036026001, "learning_rate": 4.388418079096046e-05, "loss": 3.2814, "step": 433 }, { "epoch": 0.3677966101694915, "grad_norm": 1.000331163406372, "learning_rate": 4.387005649717514e-05, "loss": 3.2186, "step": 434 }, { "epoch": 0.3686440677966102, "grad_norm": 0.9366008639335632, "learning_rate": 4.385593220338983e-05, "loss": 3.4577, "step": 435 }, { "epoch": 0.3694915254237288, "grad_norm": 1.0296732187271118, "learning_rate": 4.384180790960452e-05, "loss": 3.409, "step": 436 }, { "epoch": 0.37033898305084745, "grad_norm": 0.9627364873886108, "learning_rate": 4.382768361581921e-05, "loss": 3.2705, "step": 437 }, { "epoch": 0.3711864406779661, "grad_norm": 0.8561643362045288, "learning_rate": 4.38135593220339e-05, "loss": 3.3395, "step": 438 }, { "epoch": 0.37203389830508476, "grad_norm": 1.1192781925201416, "learning_rate": 4.379943502824859e-05, "loss": 3.2083, "step": 439 }, { "epoch": 0.3728813559322034, "grad_norm": 1.0735692977905273, "learning_rate": 4.378531073446328e-05, "loss": 3.4037, "step": 440 }, { "epoch": 0.373728813559322, "grad_norm": 0.8644734025001526, "learning_rate": 4.377118644067797e-05, "loss": 3.3224, "step": 441 }, { "epoch": 0.37457627118644066, "grad_norm": 1.0343711376190186, "learning_rate": 4.3757062146892655e-05, "loss": 3.1807, "step": 442 }, { "epoch": 0.37542372881355934, "grad_norm": 1.017646312713623, "learning_rate": 4.3742937853107345e-05, "loss": 3.2531, "step": 443 }, { "epoch": 0.376271186440678, "grad_norm": 0.9342203736305237, "learning_rate": 4.3728813559322035e-05, "loss": 3.3607, "step": 444 }, { "epoch": 0.3771186440677966, "grad_norm": 0.9267883896827698, "learning_rate": 4.3714689265536725e-05, "loss": 3.3541, "step": 445 }, { "epoch": 0.37796610169491524, "grad_norm": 0.9999634623527527, "learning_rate": 4.3700564971751415e-05, "loss": 3.229, "step": 446 }, { "epoch": 0.3788135593220339, "grad_norm": 1.0142055749893188, "learning_rate": 4.3686440677966105e-05, "loss": 3.3592, "step": 447 }, { "epoch": 0.37966101694915255, "grad_norm": 1.1882275342941284, "learning_rate": 4.3672316384180795e-05, "loss": 3.3692, "step": 448 }, { "epoch": 0.3805084745762712, "grad_norm": 1.0376838445663452, "learning_rate": 4.3658192090395485e-05, "loss": 3.3467, "step": 449 }, { "epoch": 0.3813559322033898, "grad_norm": 1.064318299293518, "learning_rate": 4.3644067796610175e-05, "loss": 3.2491, "step": 450 }, { "epoch": 0.3822033898305085, "grad_norm": 0.9614359736442566, "learning_rate": 4.362994350282486e-05, "loss": 3.3057, "step": 451 }, { "epoch": 0.38305084745762713, "grad_norm": 1.059322476387024, "learning_rate": 4.361581920903955e-05, "loss": 3.1946, "step": 452 }, { "epoch": 0.38389830508474576, "grad_norm": 0.9306525588035583, "learning_rate": 4.360169491525424e-05, "loss": 3.337, "step": 453 }, { "epoch": 0.3847457627118644, "grad_norm": 0.9853308200836182, "learning_rate": 4.358757062146893e-05, "loss": 3.2687, "step": 454 }, { "epoch": 0.3855932203389831, "grad_norm": 1.0130183696746826, "learning_rate": 4.357344632768362e-05, "loss": 3.2698, "step": 455 }, { "epoch": 0.3864406779661017, "grad_norm": 0.9569457173347473, "learning_rate": 4.355932203389831e-05, "loss": 3.2626, "step": 456 }, { "epoch": 0.38728813559322034, "grad_norm": 1.0113606452941895, "learning_rate": 4.3545197740113e-05, "loss": 3.3712, "step": 457 }, { "epoch": 0.38813559322033897, "grad_norm": 0.8372867703437805, "learning_rate": 4.353107344632769e-05, "loss": 3.2705, "step": 458 }, { "epoch": 0.3889830508474576, "grad_norm": 0.9295400977134705, "learning_rate": 4.351694915254238e-05, "loss": 3.3907, "step": 459 }, { "epoch": 0.3898305084745763, "grad_norm": 0.9511747360229492, "learning_rate": 4.350282485875706e-05, "loss": 3.2118, "step": 460 }, { "epoch": 0.3906779661016949, "grad_norm": 1.056266188621521, "learning_rate": 4.348870056497175e-05, "loss": 3.2752, "step": 461 }, { "epoch": 0.39152542372881355, "grad_norm": 0.9616141319274902, "learning_rate": 4.347457627118644e-05, "loss": 3.3691, "step": 462 }, { "epoch": 0.3923728813559322, "grad_norm": 1.032304048538208, "learning_rate": 4.346045197740114e-05, "loss": 3.2426, "step": 463 }, { "epoch": 0.39322033898305087, "grad_norm": 0.9025688767433167, "learning_rate": 4.344632768361582e-05, "loss": 3.3979, "step": 464 }, { "epoch": 0.3940677966101695, "grad_norm": 1.0538108348846436, "learning_rate": 4.343220338983051e-05, "loss": 3.3375, "step": 465 }, { "epoch": 0.3949152542372881, "grad_norm": 0.8587263226509094, "learning_rate": 4.34180790960452e-05, "loss": 3.3093, "step": 466 }, { "epoch": 0.39576271186440676, "grad_norm": 1.357358455657959, "learning_rate": 4.340395480225989e-05, "loss": 3.3067, "step": 467 }, { "epoch": 0.39661016949152544, "grad_norm": 0.884049117565155, "learning_rate": 4.3389830508474574e-05, "loss": 3.2978, "step": 468 }, { "epoch": 0.3974576271186441, "grad_norm": 1.1013493537902832, "learning_rate": 4.3375706214689264e-05, "loss": 3.3654, "step": 469 }, { "epoch": 0.3983050847457627, "grad_norm": 0.9923128485679626, "learning_rate": 4.3361581920903954e-05, "loss": 3.2856, "step": 470 }, { "epoch": 0.39915254237288134, "grad_norm": 0.9877704977989197, "learning_rate": 4.334745762711865e-05, "loss": 3.2825, "step": 471 }, { "epoch": 0.4, "grad_norm": 0.8816186189651489, "learning_rate": 4.3333333333333334e-05, "loss": 3.3138, "step": 472 }, { "epoch": 0.40084745762711865, "grad_norm": 0.9489175081253052, "learning_rate": 4.3319209039548024e-05, "loss": 3.3202, "step": 473 }, { "epoch": 0.4016949152542373, "grad_norm": 0.9873949289321899, "learning_rate": 4.3305084745762714e-05, "loss": 3.2659, "step": 474 }, { "epoch": 0.4025423728813559, "grad_norm": 0.9659498333930969, "learning_rate": 4.3290960451977404e-05, "loss": 3.2711, "step": 475 }, { "epoch": 0.4033898305084746, "grad_norm": 1.061615228652954, "learning_rate": 4.3276836158192094e-05, "loss": 3.2285, "step": 476 }, { "epoch": 0.40423728813559323, "grad_norm": 1.028267502784729, "learning_rate": 4.326271186440678e-05, "loss": 3.3729, "step": 477 }, { "epoch": 0.40508474576271186, "grad_norm": 1.2434405088424683, "learning_rate": 4.324858757062147e-05, "loss": 3.2294, "step": 478 }, { "epoch": 0.4059322033898305, "grad_norm": 1.103103518486023, "learning_rate": 4.3234463276836164e-05, "loss": 3.3127, "step": 479 }, { "epoch": 0.4067796610169492, "grad_norm": 0.9126176834106445, "learning_rate": 4.3220338983050854e-05, "loss": 3.2937, "step": 480 }, { "epoch": 0.4076271186440678, "grad_norm": 0.9698008298873901, "learning_rate": 4.320621468926554e-05, "loss": 3.1964, "step": 481 }, { "epoch": 0.40847457627118644, "grad_norm": 0.9518531560897827, "learning_rate": 4.319209039548023e-05, "loss": 3.3216, "step": 482 }, { "epoch": 0.40932203389830507, "grad_norm": 0.9920042157173157, "learning_rate": 4.317796610169492e-05, "loss": 3.166, "step": 483 }, { "epoch": 0.4101694915254237, "grad_norm": 1.0381064414978027, "learning_rate": 4.316384180790961e-05, "loss": 3.2306, "step": 484 }, { "epoch": 0.4110169491525424, "grad_norm": 1.0730125904083252, "learning_rate": 4.314971751412429e-05, "loss": 3.2474, "step": 485 }, { "epoch": 0.411864406779661, "grad_norm": 1.008436918258667, "learning_rate": 4.313559322033899e-05, "loss": 3.4149, "step": 486 }, { "epoch": 0.41271186440677965, "grad_norm": 0.8438705205917358, "learning_rate": 4.312146892655368e-05, "loss": 3.3449, "step": 487 }, { "epoch": 0.4135593220338983, "grad_norm": 0.9985308647155762, "learning_rate": 4.310734463276837e-05, "loss": 3.2744, "step": 488 }, { "epoch": 0.41440677966101697, "grad_norm": 1.0134936571121216, "learning_rate": 4.309322033898305e-05, "loss": 3.2965, "step": 489 }, { "epoch": 0.4152542372881356, "grad_norm": 1.0140883922576904, "learning_rate": 4.307909604519774e-05, "loss": 3.4264, "step": 490 }, { "epoch": 0.4161016949152542, "grad_norm": 1.141499400138855, "learning_rate": 4.306497175141243e-05, "loss": 3.2636, "step": 491 }, { "epoch": 0.41694915254237286, "grad_norm": 1.0653141736984253, "learning_rate": 4.305084745762712e-05, "loss": 3.3115, "step": 492 }, { "epoch": 0.41779661016949154, "grad_norm": 1.101950764656067, "learning_rate": 4.303672316384181e-05, "loss": 3.3546, "step": 493 }, { "epoch": 0.4186440677966102, "grad_norm": 0.9042879939079285, "learning_rate": 4.30225988700565e-05, "loss": 3.1467, "step": 494 }, { "epoch": 0.4194915254237288, "grad_norm": 1.0648889541625977, "learning_rate": 4.300847457627119e-05, "loss": 3.2356, "step": 495 }, { "epoch": 0.42033898305084744, "grad_norm": 0.8660322427749634, "learning_rate": 4.299435028248588e-05, "loss": 3.3347, "step": 496 }, { "epoch": 0.4211864406779661, "grad_norm": 0.9183448553085327, "learning_rate": 4.298022598870057e-05, "loss": 3.2862, "step": 497 }, { "epoch": 0.42203389830508475, "grad_norm": 0.9873458743095398, "learning_rate": 4.2966101694915254e-05, "loss": 3.2141, "step": 498 }, { "epoch": 0.4228813559322034, "grad_norm": 1.1227437257766724, "learning_rate": 4.2951977401129944e-05, "loss": 3.3546, "step": 499 }, { "epoch": 0.423728813559322, "grad_norm": 0.9175024032592773, "learning_rate": 4.2937853107344634e-05, "loss": 3.3137, "step": 500 }, { "epoch": 0.4245762711864407, "grad_norm": 0.9533364772796631, "learning_rate": 4.2923728813559324e-05, "loss": 3.2952, "step": 501 }, { "epoch": 0.42542372881355933, "grad_norm": 0.9788798689842224, "learning_rate": 4.2909604519774014e-05, "loss": 3.3925, "step": 502 }, { "epoch": 0.42627118644067796, "grad_norm": 0.9586952328681946, "learning_rate": 4.2895480225988704e-05, "loss": 3.3547, "step": 503 }, { "epoch": 0.4271186440677966, "grad_norm": 1.0977470874786377, "learning_rate": 4.2881355932203394e-05, "loss": 3.4164, "step": 504 }, { "epoch": 0.4279661016949153, "grad_norm": 1.0293604135513306, "learning_rate": 4.2867231638418084e-05, "loss": 3.1522, "step": 505 }, { "epoch": 0.4288135593220339, "grad_norm": 0.9427899718284607, "learning_rate": 4.2853107344632774e-05, "loss": 3.3939, "step": 506 }, { "epoch": 0.42966101694915254, "grad_norm": 1.0391638278961182, "learning_rate": 4.283898305084746e-05, "loss": 3.3508, "step": 507 }, { "epoch": 0.43050847457627117, "grad_norm": 0.9802543520927429, "learning_rate": 4.282485875706215e-05, "loss": 3.2025, "step": 508 }, { "epoch": 0.43135593220338986, "grad_norm": 1.0067499876022339, "learning_rate": 4.281073446327684e-05, "loss": 3.2871, "step": 509 }, { "epoch": 0.4322033898305085, "grad_norm": 1.0879120826721191, "learning_rate": 4.279661016949153e-05, "loss": 3.2955, "step": 510 }, { "epoch": 0.4330508474576271, "grad_norm": 0.9421293139457703, "learning_rate": 4.278248587570622e-05, "loss": 3.3885, "step": 511 }, { "epoch": 0.43389830508474575, "grad_norm": 0.9865682721138, "learning_rate": 4.276836158192091e-05, "loss": 3.4184, "step": 512 }, { "epoch": 0.4347457627118644, "grad_norm": 1.0247126817703247, "learning_rate": 4.27542372881356e-05, "loss": 3.2416, "step": 513 }, { "epoch": 0.43559322033898307, "grad_norm": 1.006000280380249, "learning_rate": 4.274011299435029e-05, "loss": 3.3106, "step": 514 }, { "epoch": 0.4364406779661017, "grad_norm": 0.927374541759491, "learning_rate": 4.272598870056497e-05, "loss": 3.2072, "step": 515 }, { "epoch": 0.43728813559322033, "grad_norm": 0.9651941061019897, "learning_rate": 4.271186440677966e-05, "loss": 3.2353, "step": 516 }, { "epoch": 0.43813559322033896, "grad_norm": 1.0576109886169434, "learning_rate": 4.269774011299435e-05, "loss": 3.2792, "step": 517 }, { "epoch": 0.43898305084745765, "grad_norm": 0.9735087752342224, "learning_rate": 4.268361581920905e-05, "loss": 3.1281, "step": 518 }, { "epoch": 0.4398305084745763, "grad_norm": 0.8727237582206726, "learning_rate": 4.266949152542373e-05, "loss": 3.3176, "step": 519 }, { "epoch": 0.4406779661016949, "grad_norm": 1.028702735900879, "learning_rate": 4.265536723163842e-05, "loss": 3.2833, "step": 520 }, { "epoch": 0.44152542372881354, "grad_norm": 0.9425292015075684, "learning_rate": 4.264124293785311e-05, "loss": 3.2493, "step": 521 }, { "epoch": 0.4423728813559322, "grad_norm": 0.9147586226463318, "learning_rate": 4.26271186440678e-05, "loss": 3.299, "step": 522 }, { "epoch": 0.44322033898305085, "grad_norm": 0.9517231583595276, "learning_rate": 4.261299435028249e-05, "loss": 3.2144, "step": 523 }, { "epoch": 0.4440677966101695, "grad_norm": 0.8731418251991272, "learning_rate": 4.259887005649717e-05, "loss": 3.2729, "step": 524 }, { "epoch": 0.4449152542372881, "grad_norm": 1.0718739032745361, "learning_rate": 4.258474576271186e-05, "loss": 3.274, "step": 525 }, { "epoch": 0.4457627118644068, "grad_norm": 0.9271964430809021, "learning_rate": 4.257062146892656e-05, "loss": 3.1512, "step": 526 }, { "epoch": 0.44661016949152543, "grad_norm": 0.9362923502922058, "learning_rate": 4.255649717514125e-05, "loss": 3.2176, "step": 527 }, { "epoch": 0.44745762711864406, "grad_norm": 0.9278737902641296, "learning_rate": 4.254237288135593e-05, "loss": 3.3155, "step": 528 }, { "epoch": 0.4483050847457627, "grad_norm": 1.1741576194763184, "learning_rate": 4.252824858757062e-05, "loss": 3.1685, "step": 529 }, { "epoch": 0.4491525423728814, "grad_norm": 1.1526938676834106, "learning_rate": 4.251412429378531e-05, "loss": 3.2377, "step": 530 }, { "epoch": 0.45, "grad_norm": 0.8986443281173706, "learning_rate": 4.25e-05, "loss": 3.2809, "step": 531 }, { "epoch": 0.45084745762711864, "grad_norm": 1.035595417022705, "learning_rate": 4.2485875706214686e-05, "loss": 3.2069, "step": 532 }, { "epoch": 0.4516949152542373, "grad_norm": 0.9578921794891357, "learning_rate": 4.2471751412429376e-05, "loss": 3.1584, "step": 533 }, { "epoch": 0.45254237288135596, "grad_norm": 0.9431161284446716, "learning_rate": 4.245762711864407e-05, "loss": 3.3695, "step": 534 }, { "epoch": 0.4533898305084746, "grad_norm": 0.9524796605110168, "learning_rate": 4.244350282485876e-05, "loss": 3.3014, "step": 535 }, { "epoch": 0.4542372881355932, "grad_norm": 0.9229897260665894, "learning_rate": 4.2429378531073446e-05, "loss": 3.3035, "step": 536 }, { "epoch": 0.45508474576271185, "grad_norm": 1.1222385168075562, "learning_rate": 4.2415254237288136e-05, "loss": 3.1968, "step": 537 }, { "epoch": 0.4559322033898305, "grad_norm": 0.8892698884010315, "learning_rate": 4.2401129943502826e-05, "loss": 3.2539, "step": 538 }, { "epoch": 0.45677966101694917, "grad_norm": 1.1344577074050903, "learning_rate": 4.2387005649717516e-05, "loss": 3.297, "step": 539 }, { "epoch": 0.4576271186440678, "grad_norm": 1.0784862041473389, "learning_rate": 4.2372881355932206e-05, "loss": 3.3599, "step": 540 }, { "epoch": 0.45847457627118643, "grad_norm": 1.1405490636825562, "learning_rate": 4.235875706214689e-05, "loss": 3.2566, "step": 541 }, { "epoch": 0.45932203389830506, "grad_norm": 1.0826579332351685, "learning_rate": 4.2344632768361586e-05, "loss": 3.1976, "step": 542 }, { "epoch": 0.46016949152542375, "grad_norm": 0.9129642248153687, "learning_rate": 4.2330508474576276e-05, "loss": 3.1628, "step": 543 }, { "epoch": 0.4610169491525424, "grad_norm": 0.9974708557128906, "learning_rate": 4.2316384180790966e-05, "loss": 3.1971, "step": 544 }, { "epoch": 0.461864406779661, "grad_norm": 0.8488754630088806, "learning_rate": 4.230225988700565e-05, "loss": 3.2751, "step": 545 }, { "epoch": 0.46271186440677964, "grad_norm": 1.0173834562301636, "learning_rate": 4.228813559322034e-05, "loss": 3.3439, "step": 546 }, { "epoch": 0.4635593220338983, "grad_norm": 1.0113790035247803, "learning_rate": 4.227401129943503e-05, "loss": 3.2798, "step": 547 }, { "epoch": 0.46440677966101696, "grad_norm": 0.9715742468833923, "learning_rate": 4.225988700564972e-05, "loss": 3.2373, "step": 548 }, { "epoch": 0.4652542372881356, "grad_norm": 0.9641973376274109, "learning_rate": 4.224576271186441e-05, "loss": 3.2877, "step": 549 }, { "epoch": 0.4661016949152542, "grad_norm": 1.0681318044662476, "learning_rate": 4.22316384180791e-05, "loss": 3.3821, "step": 550 }, { "epoch": 0.4669491525423729, "grad_norm": 1.1120551824569702, "learning_rate": 4.221751412429379e-05, "loss": 3.2773, "step": 551 }, { "epoch": 0.46779661016949153, "grad_norm": 1.0814745426177979, "learning_rate": 4.220338983050848e-05, "loss": 3.2962, "step": 552 }, { "epoch": 0.46864406779661016, "grad_norm": 0.9989905953407288, "learning_rate": 4.218926553672316e-05, "loss": 3.2294, "step": 553 }, { "epoch": 0.4694915254237288, "grad_norm": 1.1746211051940918, "learning_rate": 4.217514124293785e-05, "loss": 3.2401, "step": 554 }, { "epoch": 0.4703389830508475, "grad_norm": 1.0154939889907837, "learning_rate": 4.216101694915254e-05, "loss": 3.1437, "step": 555 }, { "epoch": 0.4711864406779661, "grad_norm": 0.8692964315414429, "learning_rate": 4.214689265536723e-05, "loss": 3.1535, "step": 556 }, { "epoch": 0.47203389830508474, "grad_norm": 1.010939121246338, "learning_rate": 4.213276836158192e-05, "loss": 3.2304, "step": 557 }, { "epoch": 0.4728813559322034, "grad_norm": 1.1787054538726807, "learning_rate": 4.211864406779661e-05, "loss": 3.1956, "step": 558 }, { "epoch": 0.47372881355932206, "grad_norm": 1.1601383686065674, "learning_rate": 4.21045197740113e-05, "loss": 3.378, "step": 559 }, { "epoch": 0.4745762711864407, "grad_norm": 0.9629536271095276, "learning_rate": 4.209039548022599e-05, "loss": 3.1456, "step": 560 }, { "epoch": 0.4754237288135593, "grad_norm": 1.0416926145553589, "learning_rate": 4.207627118644068e-05, "loss": 3.3116, "step": 561 }, { "epoch": 0.47627118644067795, "grad_norm": 1.0895572900772095, "learning_rate": 4.2062146892655366e-05, "loss": 3.2127, "step": 562 }, { "epoch": 0.47711864406779664, "grad_norm": 0.8969883322715759, "learning_rate": 4.2048022598870056e-05, "loss": 3.2026, "step": 563 }, { "epoch": 0.47796610169491527, "grad_norm": 0.9508427381515503, "learning_rate": 4.2033898305084746e-05, "loss": 3.3177, "step": 564 }, { "epoch": 0.4788135593220339, "grad_norm": 1.079058051109314, "learning_rate": 4.201977401129944e-05, "loss": 3.2039, "step": 565 }, { "epoch": 0.47966101694915253, "grad_norm": 1.0017307996749878, "learning_rate": 4.2005649717514126e-05, "loss": 3.3683, "step": 566 }, { "epoch": 0.48050847457627116, "grad_norm": 0.9946544766426086, "learning_rate": 4.1991525423728816e-05, "loss": 3.2457, "step": 567 }, { "epoch": 0.48135593220338985, "grad_norm": 0.951226532459259, "learning_rate": 4.1977401129943506e-05, "loss": 3.2903, "step": 568 }, { "epoch": 0.4822033898305085, "grad_norm": 0.922289252281189, "learning_rate": 4.1963276836158196e-05, "loss": 3.1778, "step": 569 }, { "epoch": 0.4830508474576271, "grad_norm": 0.9599305391311646, "learning_rate": 4.1949152542372886e-05, "loss": 3.268, "step": 570 }, { "epoch": 0.48389830508474574, "grad_norm": 0.998159646987915, "learning_rate": 4.193502824858757e-05, "loss": 3.2417, "step": 571 }, { "epoch": 0.4847457627118644, "grad_norm": 1.0418500900268555, "learning_rate": 4.192090395480226e-05, "loss": 3.2949, "step": 572 }, { "epoch": 0.48559322033898306, "grad_norm": 0.8767018914222717, "learning_rate": 4.1906779661016956e-05, "loss": 3.1952, "step": 573 }, { "epoch": 0.4864406779661017, "grad_norm": 0.9565122723579407, "learning_rate": 4.1892655367231646e-05, "loss": 3.1557, "step": 574 }, { "epoch": 0.4872881355932203, "grad_norm": 0.9719352126121521, "learning_rate": 4.187853107344633e-05, "loss": 3.2688, "step": 575 }, { "epoch": 0.488135593220339, "grad_norm": 1.0732676982879639, "learning_rate": 4.186440677966102e-05, "loss": 3.1759, "step": 576 }, { "epoch": 0.48898305084745763, "grad_norm": 0.9919960498809814, "learning_rate": 4.185028248587571e-05, "loss": 3.4158, "step": 577 }, { "epoch": 0.48983050847457626, "grad_norm": 1.181933045387268, "learning_rate": 4.18361581920904e-05, "loss": 3.3148, "step": 578 }, { "epoch": 0.4906779661016949, "grad_norm": 0.9915511608123779, "learning_rate": 4.182203389830508e-05, "loss": 3.2224, "step": 579 }, { "epoch": 0.4915254237288136, "grad_norm": 1.0402354001998901, "learning_rate": 4.180790960451977e-05, "loss": 3.2054, "step": 580 }, { "epoch": 0.4923728813559322, "grad_norm": 1.0277245044708252, "learning_rate": 4.179378531073447e-05, "loss": 3.2746, "step": 581 }, { "epoch": 0.49322033898305084, "grad_norm": 0.9708535075187683, "learning_rate": 4.177966101694916e-05, "loss": 3.4521, "step": 582 }, { "epoch": 0.4940677966101695, "grad_norm": 0.9602912068367004, "learning_rate": 4.176553672316384e-05, "loss": 3.3488, "step": 583 }, { "epoch": 0.49491525423728816, "grad_norm": 0.9112564921379089, "learning_rate": 4.175141242937853e-05, "loss": 3.2176, "step": 584 }, { "epoch": 0.4957627118644068, "grad_norm": 0.8513944745063782, "learning_rate": 4.173728813559322e-05, "loss": 3.2106, "step": 585 }, { "epoch": 0.4966101694915254, "grad_norm": 1.0845856666564941, "learning_rate": 4.172316384180791e-05, "loss": 3.2922, "step": 586 }, { "epoch": 0.49745762711864405, "grad_norm": 1.0840615034103394, "learning_rate": 4.17090395480226e-05, "loss": 3.1675, "step": 587 }, { "epoch": 0.49830508474576274, "grad_norm": 1.1470903158187866, "learning_rate": 4.1694915254237285e-05, "loss": 3.289, "step": 588 }, { "epoch": 0.49915254237288137, "grad_norm": 0.9398099780082703, "learning_rate": 4.168079096045198e-05, "loss": 3.1895, "step": 589 }, { "epoch": 0.5, "grad_norm": 0.9933624863624573, "learning_rate": 4.166666666666667e-05, "loss": 3.2252, "step": 590 }, { "epoch": 0.5008474576271187, "grad_norm": 0.8540239334106445, "learning_rate": 4.165254237288136e-05, "loss": 3.2701, "step": 591 }, { "epoch": 0.5016949152542373, "grad_norm": 0.9668716192245483, "learning_rate": 4.1638418079096045e-05, "loss": 3.3911, "step": 592 }, { "epoch": 0.502542372881356, "grad_norm": 0.9678142070770264, "learning_rate": 4.1624293785310735e-05, "loss": 3.3385, "step": 593 }, { "epoch": 0.5033898305084745, "grad_norm": 1.1182767152786255, "learning_rate": 4.1610169491525425e-05, "loss": 3.2207, "step": 594 }, { "epoch": 0.5042372881355932, "grad_norm": 1.115086317062378, "learning_rate": 4.1596045197740115e-05, "loss": 3.1893, "step": 595 }, { "epoch": 0.5050847457627119, "grad_norm": 0.9647793173789978, "learning_rate": 4.1581920903954805e-05, "loss": 3.1498, "step": 596 }, { "epoch": 0.5059322033898305, "grad_norm": 1.0119106769561768, "learning_rate": 4.1567796610169495e-05, "loss": 3.1322, "step": 597 }, { "epoch": 0.5067796610169492, "grad_norm": 1.070473551750183, "learning_rate": 4.1553672316384185e-05, "loss": 3.1809, "step": 598 }, { "epoch": 0.5076271186440678, "grad_norm": 0.979231595993042, "learning_rate": 4.1539548022598875e-05, "loss": 3.294, "step": 599 }, { "epoch": 0.5084745762711864, "grad_norm": 1.0784424543380737, "learning_rate": 4.152542372881356e-05, "loss": 3.2778, "step": 600 }, { "epoch": 0.5093220338983051, "grad_norm": 0.9782651662826538, "learning_rate": 4.151129943502825e-05, "loss": 3.1707, "step": 601 }, { "epoch": 0.5101694915254237, "grad_norm": 0.9835995435714722, "learning_rate": 4.149717514124294e-05, "loss": 3.2508, "step": 602 }, { "epoch": 0.5110169491525424, "grad_norm": 1.139223337173462, "learning_rate": 4.148305084745763e-05, "loss": 3.1667, "step": 603 }, { "epoch": 0.511864406779661, "grad_norm": 0.8076462149620056, "learning_rate": 4.146892655367232e-05, "loss": 3.3504, "step": 604 }, { "epoch": 0.5127118644067796, "grad_norm": 1.087288737297058, "learning_rate": 4.145480225988701e-05, "loss": 3.2342, "step": 605 }, { "epoch": 0.5135593220338983, "grad_norm": 1.18161141872406, "learning_rate": 4.14406779661017e-05, "loss": 3.3338, "step": 606 }, { "epoch": 0.514406779661017, "grad_norm": 1.0188791751861572, "learning_rate": 4.142655367231639e-05, "loss": 3.2616, "step": 607 }, { "epoch": 0.5152542372881356, "grad_norm": 1.0371781587600708, "learning_rate": 4.141242937853108e-05, "loss": 3.3656, "step": 608 }, { "epoch": 0.5161016949152543, "grad_norm": 0.9674088954925537, "learning_rate": 4.139830508474576e-05, "loss": 3.2331, "step": 609 }, { "epoch": 0.5169491525423728, "grad_norm": 1.136169672012329, "learning_rate": 4.138418079096045e-05, "loss": 3.2559, "step": 610 }, { "epoch": 0.5177966101694915, "grad_norm": 1.034875512123108, "learning_rate": 4.137005649717514e-05, "loss": 3.2006, "step": 611 }, { "epoch": 0.5186440677966102, "grad_norm": 1.0347411632537842, "learning_rate": 4.135593220338983e-05, "loss": 3.246, "step": 612 }, { "epoch": 0.5194915254237288, "grad_norm": 1.0293282270431519, "learning_rate": 4.134180790960452e-05, "loss": 3.2248, "step": 613 }, { "epoch": 0.5203389830508475, "grad_norm": 1.0923638343811035, "learning_rate": 4.132768361581921e-05, "loss": 3.208, "step": 614 }, { "epoch": 0.5211864406779662, "grad_norm": 1.2328356504440308, "learning_rate": 4.13135593220339e-05, "loss": 3.3027, "step": 615 }, { "epoch": 0.5220338983050847, "grad_norm": 0.9866951704025269, "learning_rate": 4.129943502824859e-05, "loss": 3.1971, "step": 616 }, { "epoch": 0.5228813559322034, "grad_norm": 1.0059102773666382, "learning_rate": 4.128531073446328e-05, "loss": 3.2596, "step": 617 }, { "epoch": 0.523728813559322, "grad_norm": 1.1470650434494019, "learning_rate": 4.1271186440677965e-05, "loss": 3.3411, "step": 618 }, { "epoch": 0.5245762711864407, "grad_norm": 0.9078112244606018, "learning_rate": 4.1257062146892655e-05, "loss": 3.3085, "step": 619 }, { "epoch": 0.5254237288135594, "grad_norm": 0.8482179045677185, "learning_rate": 4.1242937853107345e-05, "loss": 3.1034, "step": 620 }, { "epoch": 0.5262711864406779, "grad_norm": 1.1044117212295532, "learning_rate": 4.1228813559322035e-05, "loss": 3.309, "step": 621 }, { "epoch": 0.5271186440677966, "grad_norm": 1.0806647539138794, "learning_rate": 4.1214689265536725e-05, "loss": 3.3362, "step": 622 }, { "epoch": 0.5279661016949152, "grad_norm": 1.0033314228057861, "learning_rate": 4.1200564971751415e-05, "loss": 3.2762, "step": 623 }, { "epoch": 0.5288135593220339, "grad_norm": 1.0104668140411377, "learning_rate": 4.1186440677966105e-05, "loss": 3.1519, "step": 624 }, { "epoch": 0.5296610169491526, "grad_norm": 0.9980913996696472, "learning_rate": 4.1172316384180795e-05, "loss": 3.2228, "step": 625 }, { "epoch": 0.5305084745762711, "grad_norm": 1.044937014579773, "learning_rate": 4.115819209039548e-05, "loss": 3.3799, "step": 626 }, { "epoch": 0.5313559322033898, "grad_norm": 0.9099481105804443, "learning_rate": 4.114406779661017e-05, "loss": 3.224, "step": 627 }, { "epoch": 0.5322033898305085, "grad_norm": 0.968677282333374, "learning_rate": 4.112994350282486e-05, "loss": 3.2132, "step": 628 }, { "epoch": 0.5330508474576271, "grad_norm": 1.0089943408966064, "learning_rate": 4.1115819209039555e-05, "loss": 3.289, "step": 629 }, { "epoch": 0.5338983050847458, "grad_norm": 1.0764185190200806, "learning_rate": 4.110169491525424e-05, "loss": 3.1789, "step": 630 }, { "epoch": 0.5347457627118644, "grad_norm": 0.9846460223197937, "learning_rate": 4.108757062146893e-05, "loss": 3.3573, "step": 631 }, { "epoch": 0.535593220338983, "grad_norm": 1.070899486541748, "learning_rate": 4.107344632768362e-05, "loss": 3.1844, "step": 632 }, { "epoch": 0.5364406779661017, "grad_norm": 0.881467878818512, "learning_rate": 4.105932203389831e-05, "loss": 3.2178, "step": 633 }, { "epoch": 0.5372881355932203, "grad_norm": 1.0086942911148071, "learning_rate": 4.1045197740113e-05, "loss": 3.2735, "step": 634 }, { "epoch": 0.538135593220339, "grad_norm": 0.9170603156089783, "learning_rate": 4.103107344632768e-05, "loss": 3.2565, "step": 635 }, { "epoch": 0.5389830508474577, "grad_norm": 0.9414057731628418, "learning_rate": 4.101694915254237e-05, "loss": 3.1911, "step": 636 }, { "epoch": 0.5398305084745763, "grad_norm": 1.075670599937439, "learning_rate": 4.100282485875707e-05, "loss": 3.3587, "step": 637 }, { "epoch": 0.5406779661016949, "grad_norm": 0.8787429928779602, "learning_rate": 4.098870056497176e-05, "loss": 3.1534, "step": 638 }, { "epoch": 0.5415254237288135, "grad_norm": 1.183467984199524, "learning_rate": 4.097457627118644e-05, "loss": 3.1506, "step": 639 }, { "epoch": 0.5423728813559322, "grad_norm": 1.0731979608535767, "learning_rate": 4.096045197740113e-05, "loss": 3.2767, "step": 640 }, { "epoch": 0.5432203389830509, "grad_norm": 0.9497907757759094, "learning_rate": 4.094632768361582e-05, "loss": 3.3184, "step": 641 }, { "epoch": 0.5440677966101695, "grad_norm": 1.0031533241271973, "learning_rate": 4.093220338983051e-05, "loss": 3.1646, "step": 642 }, { "epoch": 0.5449152542372881, "grad_norm": 0.9335317611694336, "learning_rate": 4.0918079096045194e-05, "loss": 3.3837, "step": 643 }, { "epoch": 0.5457627118644067, "grad_norm": 0.9828890562057495, "learning_rate": 4.090395480225989e-05, "loss": 3.2533, "step": 644 }, { "epoch": 0.5466101694915254, "grad_norm": 0.7858693599700928, "learning_rate": 4.088983050847458e-05, "loss": 3.2746, "step": 645 }, { "epoch": 0.5474576271186441, "grad_norm": 0.878791332244873, "learning_rate": 4.087570621468927e-05, "loss": 3.1824, "step": 646 }, { "epoch": 0.5483050847457627, "grad_norm": 0.9138060212135315, "learning_rate": 4.0861581920903954e-05, "loss": 3.2223, "step": 647 }, { "epoch": 0.5491525423728814, "grad_norm": 1.0286030769348145, "learning_rate": 4.0847457627118644e-05, "loss": 3.2213, "step": 648 }, { "epoch": 0.55, "grad_norm": 0.8754992485046387, "learning_rate": 4.0833333333333334e-05, "loss": 3.333, "step": 649 }, { "epoch": 0.5508474576271186, "grad_norm": 0.7981268167495728, "learning_rate": 4.0819209039548024e-05, "loss": 3.2108, "step": 650 }, { "epoch": 0.5516949152542373, "grad_norm": 0.8459306955337524, "learning_rate": 4.0805084745762714e-05, "loss": 3.2302, "step": 651 }, { "epoch": 0.5525423728813559, "grad_norm": 0.9788030385971069, "learning_rate": 4.0790960451977404e-05, "loss": 3.3078, "step": 652 }, { "epoch": 0.5533898305084746, "grad_norm": 0.974795937538147, "learning_rate": 4.0776836158192094e-05, "loss": 3.1427, "step": 653 }, { "epoch": 0.5542372881355933, "grad_norm": 1.1200718879699707, "learning_rate": 4.0762711864406784e-05, "loss": 3.1824, "step": 654 }, { "epoch": 0.5550847457627118, "grad_norm": 0.8478380441665649, "learning_rate": 4.0748587570621474e-05, "loss": 3.2875, "step": 655 }, { "epoch": 0.5559322033898305, "grad_norm": 0.8825022578239441, "learning_rate": 4.073446327683616e-05, "loss": 3.4266, "step": 656 }, { "epoch": 0.5567796610169492, "grad_norm": 0.8521758317947388, "learning_rate": 4.072033898305085e-05, "loss": 3.1245, "step": 657 }, { "epoch": 0.5576271186440678, "grad_norm": 1.152536392211914, "learning_rate": 4.070621468926554e-05, "loss": 3.2448, "step": 658 }, { "epoch": 0.5584745762711865, "grad_norm": 0.9221817851066589, "learning_rate": 4.069209039548023e-05, "loss": 3.3025, "step": 659 }, { "epoch": 0.559322033898305, "grad_norm": 1.1099809408187866, "learning_rate": 4.067796610169492e-05, "loss": 3.0932, "step": 660 }, { "epoch": 0.5601694915254237, "grad_norm": 0.9236830472946167, "learning_rate": 4.066384180790961e-05, "loss": 3.1852, "step": 661 }, { "epoch": 0.5610169491525424, "grad_norm": 0.9270802736282349, "learning_rate": 4.06497175141243e-05, "loss": 3.2191, "step": 662 }, { "epoch": 0.561864406779661, "grad_norm": 1.103100061416626, "learning_rate": 4.063559322033899e-05, "loss": 3.2248, "step": 663 }, { "epoch": 0.5627118644067797, "grad_norm": 0.904633641242981, "learning_rate": 4.062146892655368e-05, "loss": 3.1827, "step": 664 }, { "epoch": 0.5635593220338984, "grad_norm": 0.9881935119628906, "learning_rate": 4.060734463276836e-05, "loss": 3.359, "step": 665 }, { "epoch": 0.5644067796610169, "grad_norm": 0.8849520683288574, "learning_rate": 4.059322033898305e-05, "loss": 3.2448, "step": 666 }, { "epoch": 0.5652542372881356, "grad_norm": 1.035140037536621, "learning_rate": 4.057909604519774e-05, "loss": 3.1842, "step": 667 }, { "epoch": 0.5661016949152542, "grad_norm": 1.0417625904083252, "learning_rate": 4.056497175141243e-05, "loss": 3.2512, "step": 668 }, { "epoch": 0.5669491525423729, "grad_norm": 1.0602601766586304, "learning_rate": 4.055084745762712e-05, "loss": 3.257, "step": 669 }, { "epoch": 0.5677966101694916, "grad_norm": 1.0768911838531494, "learning_rate": 4.053672316384181e-05, "loss": 3.2235, "step": 670 }, { "epoch": 0.5686440677966101, "grad_norm": 1.012581467628479, "learning_rate": 4.05225988700565e-05, "loss": 3.1992, "step": 671 }, { "epoch": 0.5694915254237288, "grad_norm": 1.0086499452590942, "learning_rate": 4.050847457627119e-05, "loss": 3.1611, "step": 672 }, { "epoch": 0.5703389830508474, "grad_norm": 1.0682578086853027, "learning_rate": 4.0494350282485874e-05, "loss": 3.1736, "step": 673 }, { "epoch": 0.5711864406779661, "grad_norm": 0.9920092821121216, "learning_rate": 4.0480225988700564e-05, "loss": 3.3064, "step": 674 }, { "epoch": 0.5720338983050848, "grad_norm": 1.1007444858551025, "learning_rate": 4.0466101694915254e-05, "loss": 3.3154, "step": 675 }, { "epoch": 0.5728813559322034, "grad_norm": 0.971848726272583, "learning_rate": 4.045197740112995e-05, "loss": 3.2014, "step": 676 }, { "epoch": 0.573728813559322, "grad_norm": 1.0632452964782715, "learning_rate": 4.0437853107344634e-05, "loss": 3.2349, "step": 677 }, { "epoch": 0.5745762711864407, "grad_norm": 0.9419097304344177, "learning_rate": 4.0423728813559324e-05, "loss": 3.1495, "step": 678 }, { "epoch": 0.5754237288135593, "grad_norm": 0.7477332353591919, "learning_rate": 4.0409604519774014e-05, "loss": 3.2508, "step": 679 }, { "epoch": 0.576271186440678, "grad_norm": 1.1096488237380981, "learning_rate": 4.0395480225988704e-05, "loss": 3.1411, "step": 680 }, { "epoch": 0.5771186440677966, "grad_norm": 0.9235982894897461, "learning_rate": 4.0381355932203394e-05, "loss": 3.1801, "step": 681 }, { "epoch": 0.5779661016949152, "grad_norm": 1.0497263669967651, "learning_rate": 4.036723163841808e-05, "loss": 3.2461, "step": 682 }, { "epoch": 0.5788135593220339, "grad_norm": 0.974819540977478, "learning_rate": 4.035310734463277e-05, "loss": 3.1853, "step": 683 }, { "epoch": 0.5796610169491525, "grad_norm": 1.018683910369873, "learning_rate": 4.0338983050847464e-05, "loss": 3.2662, "step": 684 }, { "epoch": 0.5805084745762712, "grad_norm": 0.819183886051178, "learning_rate": 4.0324858757062154e-05, "loss": 3.2036, "step": 685 }, { "epoch": 0.5813559322033899, "grad_norm": 1.2149080038070679, "learning_rate": 4.031073446327684e-05, "loss": 3.1061, "step": 686 }, { "epoch": 0.5822033898305085, "grad_norm": 0.9971520304679871, "learning_rate": 4.029661016949153e-05, "loss": 3.2792, "step": 687 }, { "epoch": 0.5830508474576271, "grad_norm": 1.1348650455474854, "learning_rate": 4.028248587570622e-05, "loss": 3.0649, "step": 688 }, { "epoch": 0.5838983050847457, "grad_norm": 0.966973066329956, "learning_rate": 4.026836158192091e-05, "loss": 3.2352, "step": 689 }, { "epoch": 0.5847457627118644, "grad_norm": 1.069762945175171, "learning_rate": 4.025423728813559e-05, "loss": 3.2476, "step": 690 }, { "epoch": 0.5855932203389831, "grad_norm": 1.018039345741272, "learning_rate": 4.024011299435028e-05, "loss": 3.3637, "step": 691 }, { "epoch": 0.5864406779661017, "grad_norm": 1.0465275049209595, "learning_rate": 4.022598870056498e-05, "loss": 3.2668, "step": 692 }, { "epoch": 0.5872881355932204, "grad_norm": 0.8877243995666504, "learning_rate": 4.021186440677967e-05, "loss": 3.1696, "step": 693 }, { "epoch": 0.588135593220339, "grad_norm": 0.8796149492263794, "learning_rate": 4.019774011299435e-05, "loss": 3.28, "step": 694 }, { "epoch": 0.5889830508474576, "grad_norm": 1.0082428455352783, "learning_rate": 4.018361581920904e-05, "loss": 3.2442, "step": 695 }, { "epoch": 0.5898305084745763, "grad_norm": 0.9520376324653625, "learning_rate": 4.016949152542373e-05, "loss": 3.2093, "step": 696 }, { "epoch": 0.5906779661016949, "grad_norm": 1.0042046308517456, "learning_rate": 4.015536723163842e-05, "loss": 3.196, "step": 697 }, { "epoch": 0.5915254237288136, "grad_norm": 0.9781447649002075, "learning_rate": 4.014124293785311e-05, "loss": 3.2572, "step": 698 }, { "epoch": 0.5923728813559322, "grad_norm": 1.0865386724472046, "learning_rate": 4.012711864406779e-05, "loss": 3.3572, "step": 699 }, { "epoch": 0.5932203389830508, "grad_norm": 0.9630225896835327, "learning_rate": 4.011299435028249e-05, "loss": 3.2831, "step": 700 }, { "epoch": 0.5940677966101695, "grad_norm": 1.0484930276870728, "learning_rate": 4.009887005649718e-05, "loss": 3.2598, "step": 701 }, { "epoch": 0.5949152542372881, "grad_norm": 0.8625562191009521, "learning_rate": 4.008474576271187e-05, "loss": 3.1784, "step": 702 }, { "epoch": 0.5957627118644068, "grad_norm": 1.0610886812210083, "learning_rate": 4.007062146892655e-05, "loss": 3.115, "step": 703 }, { "epoch": 0.5966101694915255, "grad_norm": 1.0813184976577759, "learning_rate": 4.005649717514124e-05, "loss": 3.2275, "step": 704 }, { "epoch": 0.597457627118644, "grad_norm": 0.9595462083816528, "learning_rate": 4.004237288135593e-05, "loss": 3.2497, "step": 705 }, { "epoch": 0.5983050847457627, "grad_norm": 0.8793260455131531, "learning_rate": 4.002824858757062e-05, "loss": 3.1899, "step": 706 }, { "epoch": 0.5991525423728814, "grad_norm": 0.9518945813179016, "learning_rate": 4.001412429378531e-05, "loss": 3.2939, "step": 707 }, { "epoch": 0.6, "grad_norm": 0.9193763136863708, "learning_rate": 4e-05, "loss": 3.2363, "step": 708 }, { "epoch": 0.6008474576271187, "grad_norm": 0.970078706741333, "learning_rate": 3.998587570621469e-05, "loss": 3.166, "step": 709 }, { "epoch": 0.6016949152542372, "grad_norm": 1.0736448764801025, "learning_rate": 3.997175141242938e-05, "loss": 3.2552, "step": 710 }, { "epoch": 0.6025423728813559, "grad_norm": 1.0440733432769775, "learning_rate": 3.9957627118644066e-05, "loss": 3.0961, "step": 711 }, { "epoch": 0.6033898305084746, "grad_norm": 0.8885401487350464, "learning_rate": 3.9943502824858756e-05, "loss": 3.1327, "step": 712 }, { "epoch": 0.6042372881355932, "grad_norm": 1.0647501945495605, "learning_rate": 3.9929378531073446e-05, "loss": 3.1719, "step": 713 }, { "epoch": 0.6050847457627119, "grad_norm": 0.8892009854316711, "learning_rate": 3.9915254237288136e-05, "loss": 3.2557, "step": 714 }, { "epoch": 0.6059322033898306, "grad_norm": 1.0955719947814941, "learning_rate": 3.9901129943502826e-05, "loss": 3.2905, "step": 715 }, { "epoch": 0.6067796610169491, "grad_norm": 0.9575005769729614, "learning_rate": 3.9887005649717516e-05, "loss": 3.1437, "step": 716 }, { "epoch": 0.6076271186440678, "grad_norm": 0.8964182138442993, "learning_rate": 3.9872881355932206e-05, "loss": 3.1041, "step": 717 }, { "epoch": 0.6084745762711864, "grad_norm": 1.0153088569641113, "learning_rate": 3.9858757062146896e-05, "loss": 3.2048, "step": 718 }, { "epoch": 0.6093220338983051, "grad_norm": 1.0228848457336426, "learning_rate": 3.9844632768361586e-05, "loss": 3.2921, "step": 719 }, { "epoch": 0.6101694915254238, "grad_norm": 0.8641826510429382, "learning_rate": 3.983050847457627e-05, "loss": 3.238, "step": 720 }, { "epoch": 0.6110169491525423, "grad_norm": 1.1096068620681763, "learning_rate": 3.981638418079096e-05, "loss": 3.2023, "step": 721 }, { "epoch": 0.611864406779661, "grad_norm": 1.0792356729507446, "learning_rate": 3.980225988700565e-05, "loss": 3.1953, "step": 722 }, { "epoch": 0.6127118644067797, "grad_norm": 1.0577806234359741, "learning_rate": 3.978813559322034e-05, "loss": 3.2092, "step": 723 }, { "epoch": 0.6135593220338983, "grad_norm": 0.9147716760635376, "learning_rate": 3.977401129943503e-05, "loss": 3.2009, "step": 724 }, { "epoch": 0.614406779661017, "grad_norm": 0.8805369734764099, "learning_rate": 3.975988700564972e-05, "loss": 3.4287, "step": 725 }, { "epoch": 0.6152542372881356, "grad_norm": 1.0842047929763794, "learning_rate": 3.974576271186441e-05, "loss": 3.2973, "step": 726 }, { "epoch": 0.6161016949152542, "grad_norm": 1.020552158355713, "learning_rate": 3.97316384180791e-05, "loss": 3.2854, "step": 727 }, { "epoch": 0.6169491525423729, "grad_norm": 0.9600501656532288, "learning_rate": 3.971751412429379e-05, "loss": 3.2243, "step": 728 }, { "epoch": 0.6177966101694915, "grad_norm": 1.0542972087860107, "learning_rate": 3.970338983050847e-05, "loss": 3.2449, "step": 729 }, { "epoch": 0.6186440677966102, "grad_norm": 0.8552963137626648, "learning_rate": 3.968926553672316e-05, "loss": 3.2027, "step": 730 }, { "epoch": 0.6194915254237288, "grad_norm": 0.9919419288635254, "learning_rate": 3.967514124293786e-05, "loss": 3.211, "step": 731 }, { "epoch": 0.6203389830508474, "grad_norm": 0.928841233253479, "learning_rate": 3.966101694915255e-05, "loss": 3.4355, "step": 732 }, { "epoch": 0.6211864406779661, "grad_norm": 0.8842092156410217, "learning_rate": 3.964689265536723e-05, "loss": 3.1617, "step": 733 }, { "epoch": 0.6220338983050847, "grad_norm": 0.9310187697410583, "learning_rate": 3.963276836158192e-05, "loss": 3.2295, "step": 734 }, { "epoch": 0.6228813559322034, "grad_norm": 0.9824132919311523, "learning_rate": 3.961864406779661e-05, "loss": 3.3125, "step": 735 }, { "epoch": 0.6237288135593221, "grad_norm": 1.0622237920761108, "learning_rate": 3.96045197740113e-05, "loss": 3.2571, "step": 736 }, { "epoch": 0.6245762711864407, "grad_norm": 0.8373996615409851, "learning_rate": 3.9590395480225986e-05, "loss": 3.1563, "step": 737 }, { "epoch": 0.6254237288135593, "grad_norm": 0.9568261504173279, "learning_rate": 3.9576271186440676e-05, "loss": 3.1749, "step": 738 }, { "epoch": 0.6262711864406779, "grad_norm": 1.0485754013061523, "learning_rate": 3.956214689265537e-05, "loss": 3.248, "step": 739 }, { "epoch": 0.6271186440677966, "grad_norm": 0.9002594947814941, "learning_rate": 3.954802259887006e-05, "loss": 3.1241, "step": 740 }, { "epoch": 0.6279661016949153, "grad_norm": 1.010948896408081, "learning_rate": 3.9533898305084746e-05, "loss": 3.2764, "step": 741 }, { "epoch": 0.6288135593220339, "grad_norm": 1.0623823404312134, "learning_rate": 3.9519774011299436e-05, "loss": 3.1771, "step": 742 }, { "epoch": 0.6296610169491526, "grad_norm": 0.951085090637207, "learning_rate": 3.9505649717514126e-05, "loss": 3.245, "step": 743 }, { "epoch": 0.6305084745762712, "grad_norm": 0.8962848782539368, "learning_rate": 3.9491525423728816e-05, "loss": 3.1905, "step": 744 }, { "epoch": 0.6313559322033898, "grad_norm": 1.0807331800460815, "learning_rate": 3.9477401129943506e-05, "loss": 3.1325, "step": 745 }, { "epoch": 0.6322033898305085, "grad_norm": 0.9225153923034668, "learning_rate": 3.946327683615819e-05, "loss": 3.1945, "step": 746 }, { "epoch": 0.6330508474576271, "grad_norm": 0.8945325613021851, "learning_rate": 3.9449152542372886e-05, "loss": 3.2607, "step": 747 }, { "epoch": 0.6338983050847458, "grad_norm": 1.0851283073425293, "learning_rate": 3.9435028248587576e-05, "loss": 3.3143, "step": 748 }, { "epoch": 0.6347457627118644, "grad_norm": 1.0192757844924927, "learning_rate": 3.9420903954802266e-05, "loss": 3.119, "step": 749 }, { "epoch": 0.635593220338983, "grad_norm": 0.9469757080078125, "learning_rate": 3.940677966101695e-05, "loss": 3.2111, "step": 750 }, { "epoch": 0.6364406779661017, "grad_norm": 0.9430133700370789, "learning_rate": 3.939265536723164e-05, "loss": 3.1526, "step": 751 }, { "epoch": 0.6372881355932203, "grad_norm": 1.0730900764465332, "learning_rate": 3.937853107344633e-05, "loss": 3.0441, "step": 752 }, { "epoch": 0.638135593220339, "grad_norm": 1.1505954265594482, "learning_rate": 3.936440677966102e-05, "loss": 3.2502, "step": 753 }, { "epoch": 0.6389830508474577, "grad_norm": 1.0467617511749268, "learning_rate": 3.935028248587571e-05, "loss": 3.191, "step": 754 }, { "epoch": 0.6398305084745762, "grad_norm": 0.9028640389442444, "learning_rate": 3.93361581920904e-05, "loss": 3.2017, "step": 755 }, { "epoch": 0.6406779661016949, "grad_norm": 1.0674667358398438, "learning_rate": 3.932203389830509e-05, "loss": 3.1055, "step": 756 }, { "epoch": 0.6415254237288136, "grad_norm": 1.0378832817077637, "learning_rate": 3.930790960451978e-05, "loss": 3.3165, "step": 757 }, { "epoch": 0.6423728813559322, "grad_norm": 1.0089573860168457, "learning_rate": 3.929378531073446e-05, "loss": 3.2966, "step": 758 }, { "epoch": 0.6432203389830509, "grad_norm": 1.0097712278366089, "learning_rate": 3.927966101694915e-05, "loss": 3.2123, "step": 759 }, { "epoch": 0.6440677966101694, "grad_norm": 0.9445637464523315, "learning_rate": 3.926553672316384e-05, "loss": 3.0963, "step": 760 }, { "epoch": 0.6449152542372881, "grad_norm": 0.9790430068969727, "learning_rate": 3.925141242937853e-05, "loss": 3.2946, "step": 761 }, { "epoch": 0.6457627118644068, "grad_norm": 0.962350070476532, "learning_rate": 3.923728813559322e-05, "loss": 3.1931, "step": 762 }, { "epoch": 0.6466101694915254, "grad_norm": 0.918471097946167, "learning_rate": 3.922316384180791e-05, "loss": 3.2357, "step": 763 }, { "epoch": 0.6474576271186441, "grad_norm": 0.9634029269218445, "learning_rate": 3.92090395480226e-05, "loss": 3.2918, "step": 764 }, { "epoch": 0.6483050847457628, "grad_norm": 1.0664550065994263, "learning_rate": 3.919491525423729e-05, "loss": 3.3134, "step": 765 }, { "epoch": 0.6491525423728813, "grad_norm": 1.2073235511779785, "learning_rate": 3.918079096045198e-05, "loss": 3.1881, "step": 766 }, { "epoch": 0.65, "grad_norm": 0.9471457600593567, "learning_rate": 3.9166666666666665e-05, "loss": 3.229, "step": 767 }, { "epoch": 0.6508474576271186, "grad_norm": 1.0511412620544434, "learning_rate": 3.9152542372881355e-05, "loss": 3.188, "step": 768 }, { "epoch": 0.6516949152542373, "grad_norm": 0.982347846031189, "learning_rate": 3.9138418079096045e-05, "loss": 3.3035, "step": 769 }, { "epoch": 0.652542372881356, "grad_norm": 1.1548480987548828, "learning_rate": 3.9124293785310735e-05, "loss": 3.1049, "step": 770 }, { "epoch": 0.6533898305084745, "grad_norm": 1.029438853263855, "learning_rate": 3.9110169491525425e-05, "loss": 3.2432, "step": 771 }, { "epoch": 0.6542372881355932, "grad_norm": 1.0703421831130981, "learning_rate": 3.9096045197740115e-05, "loss": 3.151, "step": 772 }, { "epoch": 0.6550847457627119, "grad_norm": 0.9013370275497437, "learning_rate": 3.9081920903954805e-05, "loss": 3.235, "step": 773 }, { "epoch": 0.6559322033898305, "grad_norm": 1.1812381744384766, "learning_rate": 3.9067796610169495e-05, "loss": 3.0823, "step": 774 }, { "epoch": 0.6567796610169492, "grad_norm": 0.9711376428604126, "learning_rate": 3.9053672316384185e-05, "loss": 3.1236, "step": 775 }, { "epoch": 0.6576271186440678, "grad_norm": 1.2097504138946533, "learning_rate": 3.903954802259887e-05, "loss": 3.1695, "step": 776 }, { "epoch": 0.6584745762711864, "grad_norm": 1.0563030242919922, "learning_rate": 3.902542372881356e-05, "loss": 3.2828, "step": 777 }, { "epoch": 0.6593220338983051, "grad_norm": 0.8016531467437744, "learning_rate": 3.901129943502825e-05, "loss": 3.1842, "step": 778 }, { "epoch": 0.6601694915254237, "grad_norm": 0.9715229868888855, "learning_rate": 3.899717514124294e-05, "loss": 3.1828, "step": 779 }, { "epoch": 0.6610169491525424, "grad_norm": 0.970073938369751, "learning_rate": 3.898305084745763e-05, "loss": 3.1113, "step": 780 }, { "epoch": 0.661864406779661, "grad_norm": 1.078091025352478, "learning_rate": 3.896892655367232e-05, "loss": 3.2263, "step": 781 }, { "epoch": 0.6627118644067796, "grad_norm": 1.0375956296920776, "learning_rate": 3.895480225988701e-05, "loss": 3.2353, "step": 782 }, { "epoch": 0.6635593220338983, "grad_norm": 1.000220775604248, "learning_rate": 3.89406779661017e-05, "loss": 3.1787, "step": 783 }, { "epoch": 0.6644067796610169, "grad_norm": 0.9153347015380859, "learning_rate": 3.892655367231638e-05, "loss": 3.3877, "step": 784 }, { "epoch": 0.6652542372881356, "grad_norm": 0.882677435874939, "learning_rate": 3.891242937853107e-05, "loss": 3.3769, "step": 785 }, { "epoch": 0.6661016949152543, "grad_norm": 0.8701127171516418, "learning_rate": 3.889830508474576e-05, "loss": 3.1109, "step": 786 }, { "epoch": 0.6669491525423729, "grad_norm": 0.9817793369293213, "learning_rate": 3.888418079096046e-05, "loss": 3.204, "step": 787 }, { "epoch": 0.6677966101694915, "grad_norm": 1.0561681985855103, "learning_rate": 3.887005649717514e-05, "loss": 3.1629, "step": 788 }, { "epoch": 0.6686440677966101, "grad_norm": 0.9255909323692322, "learning_rate": 3.885593220338983e-05, "loss": 3.2657, "step": 789 }, { "epoch": 0.6694915254237288, "grad_norm": 0.9756750464439392, "learning_rate": 3.884180790960452e-05, "loss": 3.2273, "step": 790 }, { "epoch": 0.6703389830508475, "grad_norm": 0.8761763572692871, "learning_rate": 3.882768361581921e-05, "loss": 3.2865, "step": 791 }, { "epoch": 0.6711864406779661, "grad_norm": 0.9409726858139038, "learning_rate": 3.88135593220339e-05, "loss": 3.14, "step": 792 }, { "epoch": 0.6720338983050848, "grad_norm": 1.0637263059616089, "learning_rate": 3.8799435028248585e-05, "loss": 3.303, "step": 793 }, { "epoch": 0.6728813559322034, "grad_norm": 1.0434190034866333, "learning_rate": 3.8785310734463275e-05, "loss": 3.1735, "step": 794 }, { "epoch": 0.673728813559322, "grad_norm": 1.0193620920181274, "learning_rate": 3.877118644067797e-05, "loss": 3.2918, "step": 795 }, { "epoch": 0.6745762711864407, "grad_norm": 0.8655635714530945, "learning_rate": 3.875706214689266e-05, "loss": 3.1329, "step": 796 }, { "epoch": 0.6754237288135593, "grad_norm": 1.1758955717086792, "learning_rate": 3.8742937853107345e-05, "loss": 3.1198, "step": 797 }, { "epoch": 0.676271186440678, "grad_norm": 0.9966703057289124, "learning_rate": 3.8728813559322035e-05, "loss": 3.2267, "step": 798 }, { "epoch": 0.6771186440677966, "grad_norm": 1.0191624164581299, "learning_rate": 3.8714689265536725e-05, "loss": 3.2686, "step": 799 }, { "epoch": 0.6779661016949152, "grad_norm": 0.9812909960746765, "learning_rate": 3.8700564971751415e-05, "loss": 3.1289, "step": 800 }, { "epoch": 0.6788135593220339, "grad_norm": 1.0124516487121582, "learning_rate": 3.86864406779661e-05, "loss": 3.354, "step": 801 }, { "epoch": 0.6796610169491526, "grad_norm": 0.9470294117927551, "learning_rate": 3.867231638418079e-05, "loss": 3.3026, "step": 802 }, { "epoch": 0.6805084745762712, "grad_norm": 1.2277476787567139, "learning_rate": 3.8658192090395485e-05, "loss": 3.2692, "step": 803 }, { "epoch": 0.6813559322033899, "grad_norm": 1.003859281539917, "learning_rate": 3.8644067796610175e-05, "loss": 3.2283, "step": 804 }, { "epoch": 0.6822033898305084, "grad_norm": 0.9394976496696472, "learning_rate": 3.862994350282486e-05, "loss": 3.1652, "step": 805 }, { "epoch": 0.6830508474576271, "grad_norm": 0.9242647290229797, "learning_rate": 3.861581920903955e-05, "loss": 3.2833, "step": 806 }, { "epoch": 0.6838983050847458, "grad_norm": 1.0409823656082153, "learning_rate": 3.860169491525424e-05, "loss": 3.276, "step": 807 }, { "epoch": 0.6847457627118644, "grad_norm": 1.079538106918335, "learning_rate": 3.858757062146893e-05, "loss": 3.13, "step": 808 }, { "epoch": 0.6855932203389831, "grad_norm": 1.1553713083267212, "learning_rate": 3.857344632768362e-05, "loss": 3.1682, "step": 809 }, { "epoch": 0.6864406779661016, "grad_norm": 1.3005847930908203, "learning_rate": 3.855932203389831e-05, "loss": 3.2126, "step": 810 }, { "epoch": 0.6872881355932203, "grad_norm": 0.918170690536499, "learning_rate": 3.8545197740113e-05, "loss": 3.2297, "step": 811 }, { "epoch": 0.688135593220339, "grad_norm": 0.9381653666496277, "learning_rate": 3.853107344632769e-05, "loss": 3.1483, "step": 812 }, { "epoch": 0.6889830508474576, "grad_norm": 0.9094195365905762, "learning_rate": 3.851694915254238e-05, "loss": 3.116, "step": 813 }, { "epoch": 0.6898305084745763, "grad_norm": 0.9763381481170654, "learning_rate": 3.850282485875706e-05, "loss": 3.2376, "step": 814 }, { "epoch": 0.690677966101695, "grad_norm": 1.1527831554412842, "learning_rate": 3.848870056497175e-05, "loss": 3.1901, "step": 815 }, { "epoch": 0.6915254237288135, "grad_norm": 1.1849908828735352, "learning_rate": 3.847457627118644e-05, "loss": 3.3913, "step": 816 }, { "epoch": 0.6923728813559322, "grad_norm": 1.0601989030838013, "learning_rate": 3.846045197740113e-05, "loss": 3.2095, "step": 817 }, { "epoch": 0.6932203389830508, "grad_norm": 1.2081618309020996, "learning_rate": 3.844632768361582e-05, "loss": 3.2408, "step": 818 }, { "epoch": 0.6940677966101695, "grad_norm": 1.1345655918121338, "learning_rate": 3.843220338983051e-05, "loss": 3.2974, "step": 819 }, { "epoch": 0.6949152542372882, "grad_norm": 1.0576690435409546, "learning_rate": 3.84180790960452e-05, "loss": 3.3008, "step": 820 }, { "epoch": 0.6957627118644067, "grad_norm": 0.9421098232269287, "learning_rate": 3.840395480225989e-05, "loss": 3.2526, "step": 821 }, { "epoch": 0.6966101694915254, "grad_norm": 0.8413639068603516, "learning_rate": 3.838983050847458e-05, "loss": 3.216, "step": 822 }, { "epoch": 0.6974576271186441, "grad_norm": 1.0307563543319702, "learning_rate": 3.8375706214689265e-05, "loss": 3.1639, "step": 823 }, { "epoch": 0.6983050847457627, "grad_norm": 1.0490492582321167, "learning_rate": 3.8361581920903955e-05, "loss": 3.0989, "step": 824 }, { "epoch": 0.6991525423728814, "grad_norm": 1.03816819190979, "learning_rate": 3.8347457627118644e-05, "loss": 3.1146, "step": 825 }, { "epoch": 0.7, "grad_norm": 0.902752697467804, "learning_rate": 3.8333333333333334e-05, "loss": 3.3416, "step": 826 }, { "epoch": 0.7008474576271186, "grad_norm": 1.0437109470367432, "learning_rate": 3.8319209039548024e-05, "loss": 3.1637, "step": 827 }, { "epoch": 0.7016949152542373, "grad_norm": 1.011850357055664, "learning_rate": 3.8305084745762714e-05, "loss": 3.2652, "step": 828 }, { "epoch": 0.7025423728813559, "grad_norm": 1.0259604454040527, "learning_rate": 3.8290960451977404e-05, "loss": 3.183, "step": 829 }, { "epoch": 0.7033898305084746, "grad_norm": 0.9570820927619934, "learning_rate": 3.8276836158192094e-05, "loss": 3.2709, "step": 830 }, { "epoch": 0.7042372881355933, "grad_norm": 1.1066950559616089, "learning_rate": 3.826271186440678e-05, "loss": 3.1859, "step": 831 }, { "epoch": 0.7050847457627119, "grad_norm": 0.9420603513717651, "learning_rate": 3.824858757062147e-05, "loss": 3.2022, "step": 832 }, { "epoch": 0.7059322033898305, "grad_norm": 1.1082286834716797, "learning_rate": 3.823446327683616e-05, "loss": 3.2539, "step": 833 }, { "epoch": 0.7067796610169491, "grad_norm": 0.8663553595542908, "learning_rate": 3.8220338983050854e-05, "loss": 3.1253, "step": 834 }, { "epoch": 0.7076271186440678, "grad_norm": 0.9714896082878113, "learning_rate": 3.820621468926554e-05, "loss": 3.4205, "step": 835 }, { "epoch": 0.7084745762711865, "grad_norm": 1.0602551698684692, "learning_rate": 3.819209039548023e-05, "loss": 3.1955, "step": 836 }, { "epoch": 0.7093220338983051, "grad_norm": 1.0083402395248413, "learning_rate": 3.817796610169492e-05, "loss": 3.1873, "step": 837 }, { "epoch": 0.7101694915254237, "grad_norm": 1.052625298500061, "learning_rate": 3.816384180790961e-05, "loss": 3.2745, "step": 838 }, { "epoch": 0.7110169491525423, "grad_norm": 0.9955534338951111, "learning_rate": 3.81497175141243e-05, "loss": 3.1355, "step": 839 }, { "epoch": 0.711864406779661, "grad_norm": 1.1159393787384033, "learning_rate": 3.813559322033898e-05, "loss": 3.0875, "step": 840 }, { "epoch": 0.7127118644067797, "grad_norm": 0.9464603662490845, "learning_rate": 3.812146892655367e-05, "loss": 3.3046, "step": 841 }, { "epoch": 0.7135593220338983, "grad_norm": 0.8679315447807312, "learning_rate": 3.810734463276837e-05, "loss": 3.2797, "step": 842 }, { "epoch": 0.714406779661017, "grad_norm": 1.0267614126205444, "learning_rate": 3.809322033898306e-05, "loss": 3.1855, "step": 843 }, { "epoch": 0.7152542372881356, "grad_norm": 1.043196201324463, "learning_rate": 3.807909604519774e-05, "loss": 3.1491, "step": 844 }, { "epoch": 0.7161016949152542, "grad_norm": 0.9654357433319092, "learning_rate": 3.806497175141243e-05, "loss": 3.1582, "step": 845 }, { "epoch": 0.7169491525423729, "grad_norm": 1.2324398756027222, "learning_rate": 3.805084745762712e-05, "loss": 3.1493, "step": 846 }, { "epoch": 0.7177966101694915, "grad_norm": 1.07389497756958, "learning_rate": 3.803672316384181e-05, "loss": 3.22, "step": 847 }, { "epoch": 0.7186440677966102, "grad_norm": 1.0601229667663574, "learning_rate": 3.8022598870056494e-05, "loss": 3.3015, "step": 848 }, { "epoch": 0.7194915254237289, "grad_norm": 0.9054369330406189, "learning_rate": 3.8008474576271184e-05, "loss": 3.1114, "step": 849 }, { "epoch": 0.7203389830508474, "grad_norm": 0.9484003186225891, "learning_rate": 3.799435028248588e-05, "loss": 3.2363, "step": 850 }, { "epoch": 0.7211864406779661, "grad_norm": 0.9740283489227295, "learning_rate": 3.798022598870057e-05, "loss": 3.1581, "step": 851 }, { "epoch": 0.7220338983050848, "grad_norm": 1.0103195905685425, "learning_rate": 3.7966101694915254e-05, "loss": 3.2511, "step": 852 }, { "epoch": 0.7228813559322034, "grad_norm": 0.8766213655471802, "learning_rate": 3.7951977401129944e-05, "loss": 3.2887, "step": 853 }, { "epoch": 0.7237288135593221, "grad_norm": 0.9263383746147156, "learning_rate": 3.7937853107344634e-05, "loss": 3.144, "step": 854 }, { "epoch": 0.7245762711864406, "grad_norm": 1.0999993085861206, "learning_rate": 3.7923728813559324e-05, "loss": 3.2315, "step": 855 }, { "epoch": 0.7254237288135593, "grad_norm": 1.10708487033844, "learning_rate": 3.7909604519774014e-05, "loss": 3.2733, "step": 856 }, { "epoch": 0.726271186440678, "grad_norm": 0.9970806837081909, "learning_rate": 3.78954802259887e-05, "loss": 3.1575, "step": 857 }, { "epoch": 0.7271186440677966, "grad_norm": 0.8489723205566406, "learning_rate": 3.7881355932203394e-05, "loss": 3.1487, "step": 858 }, { "epoch": 0.7279661016949153, "grad_norm": 0.9401115775108337, "learning_rate": 3.7867231638418084e-05, "loss": 3.2257, "step": 859 }, { "epoch": 0.7288135593220338, "grad_norm": 0.8342302441596985, "learning_rate": 3.7853107344632774e-05, "loss": 3.2797, "step": 860 }, { "epoch": 0.7296610169491525, "grad_norm": 1.0335946083068848, "learning_rate": 3.783898305084746e-05, "loss": 3.1556, "step": 861 }, { "epoch": 0.7305084745762712, "grad_norm": 1.0974116325378418, "learning_rate": 3.782485875706215e-05, "loss": 3.1745, "step": 862 }, { "epoch": 0.7313559322033898, "grad_norm": 1.2478196620941162, "learning_rate": 3.781073446327684e-05, "loss": 3.1794, "step": 863 }, { "epoch": 0.7322033898305085, "grad_norm": 1.2136197090148926, "learning_rate": 3.779661016949153e-05, "loss": 3.2302, "step": 864 }, { "epoch": 0.7330508474576272, "grad_norm": 0.876383900642395, "learning_rate": 3.778248587570622e-05, "loss": 3.1867, "step": 865 }, { "epoch": 0.7338983050847457, "grad_norm": 1.1100138425827026, "learning_rate": 3.776836158192091e-05, "loss": 3.1793, "step": 866 }, { "epoch": 0.7347457627118644, "grad_norm": 1.0313066244125366, "learning_rate": 3.77542372881356e-05, "loss": 3.1828, "step": 867 }, { "epoch": 0.735593220338983, "grad_norm": 1.0279622077941895, "learning_rate": 3.774011299435029e-05, "loss": 3.1729, "step": 868 }, { "epoch": 0.7364406779661017, "grad_norm": 1.026025652885437, "learning_rate": 3.772598870056497e-05, "loss": 3.1684, "step": 869 }, { "epoch": 0.7372881355932204, "grad_norm": 0.9214180707931519, "learning_rate": 3.771186440677966e-05, "loss": 3.2182, "step": 870 }, { "epoch": 0.738135593220339, "grad_norm": 1.0707991123199463, "learning_rate": 3.769774011299435e-05, "loss": 3.2536, "step": 871 }, { "epoch": 0.7389830508474576, "grad_norm": 0.9317803978919983, "learning_rate": 3.768361581920904e-05, "loss": 3.1933, "step": 872 }, { "epoch": 0.7398305084745763, "grad_norm": 1.0014886856079102, "learning_rate": 3.766949152542373e-05, "loss": 3.2591, "step": 873 }, { "epoch": 0.7406779661016949, "grad_norm": 1.0117686986923218, "learning_rate": 3.765536723163842e-05, "loss": 3.1762, "step": 874 }, { "epoch": 0.7415254237288136, "grad_norm": 1.034946084022522, "learning_rate": 3.764124293785311e-05, "loss": 3.3375, "step": 875 }, { "epoch": 0.7423728813559322, "grad_norm": 0.8465149402618408, "learning_rate": 3.76271186440678e-05, "loss": 3.157, "step": 876 }, { "epoch": 0.7432203389830508, "grad_norm": 1.0686304569244385, "learning_rate": 3.761299435028249e-05, "loss": 3.132, "step": 877 }, { "epoch": 0.7440677966101695, "grad_norm": 1.0762747526168823, "learning_rate": 3.7598870056497174e-05, "loss": 3.11, "step": 878 }, { "epoch": 0.7449152542372881, "grad_norm": 0.9002189636230469, "learning_rate": 3.7584745762711864e-05, "loss": 3.2446, "step": 879 }, { "epoch": 0.7457627118644068, "grad_norm": 0.9305406808853149, "learning_rate": 3.7570621468926554e-05, "loss": 3.202, "step": 880 }, { "epoch": 0.7466101694915255, "grad_norm": 1.0967098474502563, "learning_rate": 3.7556497175141244e-05, "loss": 3.1081, "step": 881 }, { "epoch": 0.747457627118644, "grad_norm": 1.1288423538208008, "learning_rate": 3.7542372881355934e-05, "loss": 3.1961, "step": 882 }, { "epoch": 0.7483050847457627, "grad_norm": 1.2286369800567627, "learning_rate": 3.7528248587570624e-05, "loss": 3.2565, "step": 883 }, { "epoch": 0.7491525423728813, "grad_norm": 1.069288730621338, "learning_rate": 3.7514124293785313e-05, "loss": 3.1073, "step": 884 }, { "epoch": 0.75, "grad_norm": 1.1600650548934937, "learning_rate": 3.7500000000000003e-05, "loss": 3.1716, "step": 885 }, { "epoch": 0.7508474576271187, "grad_norm": 0.9592218399047852, "learning_rate": 3.7485875706214693e-05, "loss": 3.1922, "step": 886 }, { "epoch": 0.7516949152542373, "grad_norm": 1.0506439208984375, "learning_rate": 3.747175141242938e-05, "loss": 3.0984, "step": 887 }, { "epoch": 0.752542372881356, "grad_norm": 0.9353660941123962, "learning_rate": 3.745762711864407e-05, "loss": 3.0849, "step": 888 }, { "epoch": 0.7533898305084745, "grad_norm": 1.062563180923462, "learning_rate": 3.7443502824858763e-05, "loss": 3.0448, "step": 889 }, { "epoch": 0.7542372881355932, "grad_norm": 0.9308637380599976, "learning_rate": 3.7429378531073453e-05, "loss": 3.1779, "step": 890 }, { "epoch": 0.7550847457627119, "grad_norm": 0.8553966283798218, "learning_rate": 3.741525423728814e-05, "loss": 3.2715, "step": 891 }, { "epoch": 0.7559322033898305, "grad_norm": 0.9313588738441467, "learning_rate": 3.740112994350283e-05, "loss": 3.2929, "step": 892 }, { "epoch": 0.7567796610169492, "grad_norm": 0.8892399072647095, "learning_rate": 3.738700564971752e-05, "loss": 3.1983, "step": 893 }, { "epoch": 0.7576271186440678, "grad_norm": 0.9385632872581482, "learning_rate": 3.737288135593221e-05, "loss": 3.1614, "step": 894 }, { "epoch": 0.7584745762711864, "grad_norm": 1.4118468761444092, "learning_rate": 3.735875706214689e-05, "loss": 3.1691, "step": 895 }, { "epoch": 0.7593220338983051, "grad_norm": 1.0137449502944946, "learning_rate": 3.734463276836158e-05, "loss": 3.2526, "step": 896 }, { "epoch": 0.7601694915254237, "grad_norm": 1.022806167602539, "learning_rate": 3.733050847457628e-05, "loss": 3.0853, "step": 897 }, { "epoch": 0.7610169491525424, "grad_norm": 0.96624755859375, "learning_rate": 3.731638418079097e-05, "loss": 3.2513, "step": 898 }, { "epoch": 0.761864406779661, "grad_norm": 1.2545897960662842, "learning_rate": 3.730225988700565e-05, "loss": 3.0895, "step": 899 }, { "epoch": 0.7627118644067796, "grad_norm": 1.1567143201828003, "learning_rate": 3.728813559322034e-05, "loss": 3.128, "step": 900 }, { "epoch": 0.7635593220338983, "grad_norm": 0.9578268527984619, "learning_rate": 3.727401129943503e-05, "loss": 3.2373, "step": 901 }, { "epoch": 0.764406779661017, "grad_norm": 0.8743712902069092, "learning_rate": 3.725988700564972e-05, "loss": 3.0874, "step": 902 }, { "epoch": 0.7652542372881356, "grad_norm": 1.1642415523529053, "learning_rate": 3.724576271186441e-05, "loss": 3.1992, "step": 903 }, { "epoch": 0.7661016949152543, "grad_norm": 1.0963908433914185, "learning_rate": 3.723163841807909e-05, "loss": 3.2434, "step": 904 }, { "epoch": 0.7669491525423728, "grad_norm": 0.9827107787132263, "learning_rate": 3.721751412429379e-05, "loss": 3.0884, "step": 905 }, { "epoch": 0.7677966101694915, "grad_norm": 0.9908869862556458, "learning_rate": 3.720338983050848e-05, "loss": 3.0993, "step": 906 }, { "epoch": 0.7686440677966102, "grad_norm": 0.939150333404541, "learning_rate": 3.718926553672317e-05, "loss": 3.2156, "step": 907 }, { "epoch": 0.7694915254237288, "grad_norm": 1.029697060585022, "learning_rate": 3.717514124293785e-05, "loss": 3.1514, "step": 908 }, { "epoch": 0.7703389830508475, "grad_norm": 0.9674649238586426, "learning_rate": 3.716101694915254e-05, "loss": 3.1725, "step": 909 }, { "epoch": 0.7711864406779662, "grad_norm": 1.0553406476974487, "learning_rate": 3.714689265536723e-05, "loss": 3.1553, "step": 910 }, { "epoch": 0.7720338983050847, "grad_norm": 0.8645443916320801, "learning_rate": 3.713276836158192e-05, "loss": 3.0972, "step": 911 }, { "epoch": 0.7728813559322034, "grad_norm": 0.909220814704895, "learning_rate": 3.711864406779661e-05, "loss": 3.1069, "step": 912 }, { "epoch": 0.773728813559322, "grad_norm": 1.0346428155899048, "learning_rate": 3.71045197740113e-05, "loss": 3.1356, "step": 913 }, { "epoch": 0.7745762711864407, "grad_norm": 1.0702482461929321, "learning_rate": 3.709039548022599e-05, "loss": 3.2995, "step": 914 }, { "epoch": 0.7754237288135594, "grad_norm": 0.93990159034729, "learning_rate": 3.707627118644068e-05, "loss": 3.1958, "step": 915 }, { "epoch": 0.7762711864406779, "grad_norm": 1.1232110261917114, "learning_rate": 3.7062146892655366e-05, "loss": 3.1094, "step": 916 }, { "epoch": 0.7771186440677966, "grad_norm": 1.079178810119629, "learning_rate": 3.7048022598870056e-05, "loss": 3.3298, "step": 917 }, { "epoch": 0.7779661016949152, "grad_norm": 1.0218323469161987, "learning_rate": 3.7033898305084746e-05, "loss": 3.1555, "step": 918 }, { "epoch": 0.7788135593220339, "grad_norm": 1.0812876224517822, "learning_rate": 3.7019774011299436e-05, "loss": 3.1834, "step": 919 }, { "epoch": 0.7796610169491526, "grad_norm": 0.9746326804161072, "learning_rate": 3.7005649717514126e-05, "loss": 3.1584, "step": 920 }, { "epoch": 0.7805084745762711, "grad_norm": 1.3261560201644897, "learning_rate": 3.6991525423728816e-05, "loss": 3.1947, "step": 921 }, { "epoch": 0.7813559322033898, "grad_norm": 1.0622180700302124, "learning_rate": 3.6977401129943506e-05, "loss": 3.1363, "step": 922 }, { "epoch": 0.7822033898305085, "grad_norm": 0.9775424003601074, "learning_rate": 3.6963276836158196e-05, "loss": 3.1012, "step": 923 }, { "epoch": 0.7830508474576271, "grad_norm": 1.0168304443359375, "learning_rate": 3.6949152542372886e-05, "loss": 3.1112, "step": 924 }, { "epoch": 0.7838983050847458, "grad_norm": 1.0288655757904053, "learning_rate": 3.693502824858757e-05, "loss": 3.2457, "step": 925 }, { "epoch": 0.7847457627118644, "grad_norm": 0.9620063304901123, "learning_rate": 3.692090395480226e-05, "loss": 3.2217, "step": 926 }, { "epoch": 0.785593220338983, "grad_norm": 1.0269447565078735, "learning_rate": 3.690677966101695e-05, "loss": 3.1024, "step": 927 }, { "epoch": 0.7864406779661017, "grad_norm": 0.8785094022750854, "learning_rate": 3.689265536723164e-05, "loss": 3.1708, "step": 928 }, { "epoch": 0.7872881355932203, "grad_norm": 0.9808406829833984, "learning_rate": 3.687853107344633e-05, "loss": 3.202, "step": 929 }, { "epoch": 0.788135593220339, "grad_norm": 1.1924328804016113, "learning_rate": 3.686440677966102e-05, "loss": 3.2412, "step": 930 }, { "epoch": 0.7889830508474577, "grad_norm": 1.0383801460266113, "learning_rate": 3.685028248587571e-05, "loss": 3.1894, "step": 931 }, { "epoch": 0.7898305084745763, "grad_norm": 1.1522718667984009, "learning_rate": 3.68361581920904e-05, "loss": 3.0404, "step": 932 }, { "epoch": 0.7906779661016949, "grad_norm": 0.9969879388809204, "learning_rate": 3.682203389830509e-05, "loss": 3.1535, "step": 933 }, { "epoch": 0.7915254237288135, "grad_norm": 1.0482536554336548, "learning_rate": 3.680790960451977e-05, "loss": 3.194, "step": 934 }, { "epoch": 0.7923728813559322, "grad_norm": 0.9472736120223999, "learning_rate": 3.679378531073446e-05, "loss": 3.242, "step": 935 }, { "epoch": 0.7932203389830509, "grad_norm": 1.1946145296096802, "learning_rate": 3.677966101694915e-05, "loss": 3.0908, "step": 936 }, { "epoch": 0.7940677966101695, "grad_norm": 0.8134632706642151, "learning_rate": 3.676553672316384e-05, "loss": 3.0917, "step": 937 }, { "epoch": 0.7949152542372881, "grad_norm": 1.1249165534973145, "learning_rate": 3.675141242937853e-05, "loss": 3.189, "step": 938 }, { "epoch": 0.7957627118644067, "grad_norm": 0.9513387084007263, "learning_rate": 3.673728813559322e-05, "loss": 3.1798, "step": 939 }, { "epoch": 0.7966101694915254, "grad_norm": 0.9367225766181946, "learning_rate": 3.672316384180791e-05, "loss": 3.2219, "step": 940 }, { "epoch": 0.7974576271186441, "grad_norm": 0.8651477694511414, "learning_rate": 3.67090395480226e-05, "loss": 3.0195, "step": 941 }, { "epoch": 0.7983050847457627, "grad_norm": 1.0951869487762451, "learning_rate": 3.6694915254237286e-05, "loss": 3.1604, "step": 942 }, { "epoch": 0.7991525423728814, "grad_norm": 0.9521233439445496, "learning_rate": 3.6680790960451976e-05, "loss": 3.3187, "step": 943 }, { "epoch": 0.8, "grad_norm": 1.0337250232696533, "learning_rate": 3.6666666666666666e-05, "loss": 3.2058, "step": 944 }, { "epoch": 0.8008474576271186, "grad_norm": 0.9719616174697876, "learning_rate": 3.665254237288136e-05, "loss": 3.179, "step": 945 }, { "epoch": 0.8016949152542373, "grad_norm": 0.9687196016311646, "learning_rate": 3.6638418079096046e-05, "loss": 3.2478, "step": 946 }, { "epoch": 0.8025423728813559, "grad_norm": 0.9144794344902039, "learning_rate": 3.6624293785310736e-05, "loss": 3.2144, "step": 947 }, { "epoch": 0.8033898305084746, "grad_norm": 1.1403071880340576, "learning_rate": 3.6610169491525426e-05, "loss": 3.1879, "step": 948 }, { "epoch": 0.8042372881355933, "grad_norm": 0.9317966103553772, "learning_rate": 3.6596045197740116e-05, "loss": 3.189, "step": 949 }, { "epoch": 0.8050847457627118, "grad_norm": 0.9150249361991882, "learning_rate": 3.6581920903954806e-05, "loss": 3.2399, "step": 950 }, { "epoch": 0.8059322033898305, "grad_norm": 1.1650607585906982, "learning_rate": 3.656779661016949e-05, "loss": 3.1, "step": 951 }, { "epoch": 0.8067796610169492, "grad_norm": 1.0316554307937622, "learning_rate": 3.655367231638418e-05, "loss": 3.1944, "step": 952 }, { "epoch": 0.8076271186440678, "grad_norm": 1.0725008249282837, "learning_rate": 3.6539548022598876e-05, "loss": 3.1793, "step": 953 }, { "epoch": 0.8084745762711865, "grad_norm": 0.9841150641441345, "learning_rate": 3.6525423728813566e-05, "loss": 3.2736, "step": 954 }, { "epoch": 0.809322033898305, "grad_norm": 0.9306738972663879, "learning_rate": 3.651129943502825e-05, "loss": 3.1939, "step": 955 }, { "epoch": 0.8101694915254237, "grad_norm": 0.7785322070121765, "learning_rate": 3.649717514124294e-05, "loss": 3.2341, "step": 956 }, { "epoch": 0.8110169491525424, "grad_norm": 0.9500228762626648, "learning_rate": 3.648305084745763e-05, "loss": 3.1834, "step": 957 }, { "epoch": 0.811864406779661, "grad_norm": 0.9458290338516235, "learning_rate": 3.646892655367232e-05, "loss": 3.2147, "step": 958 }, { "epoch": 0.8127118644067797, "grad_norm": 1.0137789249420166, "learning_rate": 3.6454802259887e-05, "loss": 3.0907, "step": 959 }, { "epoch": 0.8135593220338984, "grad_norm": 1.0603535175323486, "learning_rate": 3.644067796610169e-05, "loss": 3.288, "step": 960 }, { "epoch": 0.8144067796610169, "grad_norm": 0.9757066369056702, "learning_rate": 3.642655367231639e-05, "loss": 3.1827, "step": 961 }, { "epoch": 0.8152542372881356, "grad_norm": 0.9474855661392212, "learning_rate": 3.641242937853108e-05, "loss": 3.1803, "step": 962 }, { "epoch": 0.8161016949152542, "grad_norm": 0.9666436314582825, "learning_rate": 3.639830508474576e-05, "loss": 3.202, "step": 963 }, { "epoch": 0.8169491525423729, "grad_norm": 0.9885278940200806, "learning_rate": 3.638418079096045e-05, "loss": 3.3161, "step": 964 }, { "epoch": 0.8177966101694916, "grad_norm": 1.1008918285369873, "learning_rate": 3.637005649717514e-05, "loss": 3.2544, "step": 965 }, { "epoch": 0.8186440677966101, "grad_norm": 1.0333728790283203, "learning_rate": 3.635593220338983e-05, "loss": 3.1205, "step": 966 }, { "epoch": 0.8194915254237288, "grad_norm": 1.096467137336731, "learning_rate": 3.634180790960452e-05, "loss": 3.0647, "step": 967 }, { "epoch": 0.8203389830508474, "grad_norm": 0.9635850787162781, "learning_rate": 3.632768361581921e-05, "loss": 3.1874, "step": 968 }, { "epoch": 0.8211864406779661, "grad_norm": 1.0629239082336426, "learning_rate": 3.63135593220339e-05, "loss": 3.1279, "step": 969 }, { "epoch": 0.8220338983050848, "grad_norm": 0.9065790772438049, "learning_rate": 3.629943502824859e-05, "loss": 3.2068, "step": 970 }, { "epoch": 0.8228813559322034, "grad_norm": 1.0202919244766235, "learning_rate": 3.628531073446328e-05, "loss": 2.9332, "step": 971 }, { "epoch": 0.823728813559322, "grad_norm": 1.0279127359390259, "learning_rate": 3.6271186440677965e-05, "loss": 3.1212, "step": 972 }, { "epoch": 0.8245762711864407, "grad_norm": 1.0611159801483154, "learning_rate": 3.6257062146892655e-05, "loss": 3.2993, "step": 973 }, { "epoch": 0.8254237288135593, "grad_norm": 0.8197402954101562, "learning_rate": 3.6242937853107345e-05, "loss": 3.1063, "step": 974 }, { "epoch": 0.826271186440678, "grad_norm": 0.9545575976371765, "learning_rate": 3.6228813559322035e-05, "loss": 3.0439, "step": 975 }, { "epoch": 0.8271186440677966, "grad_norm": 0.8873161673545837, "learning_rate": 3.6214689265536725e-05, "loss": 3.1075, "step": 976 }, { "epoch": 0.8279661016949152, "grad_norm": 0.981919527053833, "learning_rate": 3.6200564971751415e-05, "loss": 3.151, "step": 977 }, { "epoch": 0.8288135593220339, "grad_norm": 1.072557806968689, "learning_rate": 3.6186440677966105e-05, "loss": 3.0936, "step": 978 }, { "epoch": 0.8296610169491525, "grad_norm": 1.063246726989746, "learning_rate": 3.6172316384180795e-05, "loss": 3.2732, "step": 979 }, { "epoch": 0.8305084745762712, "grad_norm": 1.1005663871765137, "learning_rate": 3.6158192090395485e-05, "loss": 3.1267, "step": 980 }, { "epoch": 0.8313559322033899, "grad_norm": 0.9517815113067627, "learning_rate": 3.614406779661017e-05, "loss": 3.1137, "step": 981 }, { "epoch": 0.8322033898305085, "grad_norm": 1.1257226467132568, "learning_rate": 3.612994350282486e-05, "loss": 3.1086, "step": 982 }, { "epoch": 0.8330508474576271, "grad_norm": 1.0828503370285034, "learning_rate": 3.611581920903955e-05, "loss": 3.1462, "step": 983 }, { "epoch": 0.8338983050847457, "grad_norm": 0.8862940669059753, "learning_rate": 3.610169491525424e-05, "loss": 3.1509, "step": 984 }, { "epoch": 0.8347457627118644, "grad_norm": 0.9474267363548279, "learning_rate": 3.608757062146893e-05, "loss": 3.0484, "step": 985 }, { "epoch": 0.8355932203389831, "grad_norm": 1.0636128187179565, "learning_rate": 3.607344632768362e-05, "loss": 3.0442, "step": 986 }, { "epoch": 0.8364406779661017, "grad_norm": 1.1512815952301025, "learning_rate": 3.605932203389831e-05, "loss": 3.1838, "step": 987 }, { "epoch": 0.8372881355932204, "grad_norm": 0.9204326868057251, "learning_rate": 3.6045197740113e-05, "loss": 3.1685, "step": 988 }, { "epoch": 0.838135593220339, "grad_norm": 0.978344738483429, "learning_rate": 3.603107344632768e-05, "loss": 3.0575, "step": 989 }, { "epoch": 0.8389830508474576, "grad_norm": 0.8899116516113281, "learning_rate": 3.601694915254237e-05, "loss": 3.2671, "step": 990 }, { "epoch": 0.8398305084745763, "grad_norm": 1.0270024538040161, "learning_rate": 3.600282485875706e-05, "loss": 3.0934, "step": 991 }, { "epoch": 0.8406779661016949, "grad_norm": 1.0981197357177734, "learning_rate": 3.598870056497176e-05, "loss": 3.2038, "step": 992 }, { "epoch": 0.8415254237288136, "grad_norm": 1.0259603261947632, "learning_rate": 3.597457627118644e-05, "loss": 3.1694, "step": 993 }, { "epoch": 0.8423728813559322, "grad_norm": 1.0101300477981567, "learning_rate": 3.596045197740113e-05, "loss": 3.2253, "step": 994 }, { "epoch": 0.8432203389830508, "grad_norm": 1.0292799472808838, "learning_rate": 3.594632768361582e-05, "loss": 3.27, "step": 995 }, { "epoch": 0.8440677966101695, "grad_norm": 0.9315595030784607, "learning_rate": 3.593220338983051e-05, "loss": 3.1596, "step": 996 }, { "epoch": 0.8449152542372881, "grad_norm": 1.0345228910446167, "learning_rate": 3.59180790960452e-05, "loss": 3.0738, "step": 997 }, { "epoch": 0.8457627118644068, "grad_norm": 0.887259304523468, "learning_rate": 3.5903954802259885e-05, "loss": 3.152, "step": 998 }, { "epoch": 0.8466101694915255, "grad_norm": 0.9285693168640137, "learning_rate": 3.5889830508474575e-05, "loss": 3.0995, "step": 999 }, { "epoch": 0.847457627118644, "grad_norm": 1.0636188983917236, "learning_rate": 3.587570621468927e-05, "loss": 3.0885, "step": 1000 }, { "epoch": 0.8483050847457627, "grad_norm": 0.9358274936676025, "learning_rate": 3.586158192090396e-05, "loss": 3.1655, "step": 1001 }, { "epoch": 0.8491525423728814, "grad_norm": 1.0933887958526611, "learning_rate": 3.5847457627118645e-05, "loss": 3.123, "step": 1002 }, { "epoch": 0.85, "grad_norm": 0.8956212401390076, "learning_rate": 3.5833333333333335e-05, "loss": 3.158, "step": 1003 }, { "epoch": 0.8508474576271187, "grad_norm": 1.0863122940063477, "learning_rate": 3.5819209039548025e-05, "loss": 3.2314, "step": 1004 }, { "epoch": 0.8516949152542372, "grad_norm": 0.9890239834785461, "learning_rate": 3.5805084745762715e-05, "loss": 3.0972, "step": 1005 }, { "epoch": 0.8525423728813559, "grad_norm": 1.0596765279769897, "learning_rate": 3.57909604519774e-05, "loss": 3.074, "step": 1006 }, { "epoch": 0.8533898305084746, "grad_norm": 0.9362082481384277, "learning_rate": 3.577683615819209e-05, "loss": 3.0835, "step": 1007 }, { "epoch": 0.8542372881355932, "grad_norm": 0.9448023438453674, "learning_rate": 3.5762711864406785e-05, "loss": 3.2233, "step": 1008 }, { "epoch": 0.8550847457627119, "grad_norm": 0.9858917593955994, "learning_rate": 3.5748587570621475e-05, "loss": 3.0609, "step": 1009 }, { "epoch": 0.8559322033898306, "grad_norm": 0.9291073679924011, "learning_rate": 3.573446327683616e-05, "loss": 3.1418, "step": 1010 }, { "epoch": 0.8567796610169491, "grad_norm": 1.019467830657959, "learning_rate": 3.572033898305085e-05, "loss": 3.1534, "step": 1011 }, { "epoch": 0.8576271186440678, "grad_norm": 0.8311463594436646, "learning_rate": 3.570621468926554e-05, "loss": 3.18, "step": 1012 }, { "epoch": 0.8584745762711864, "grad_norm": 1.0131714344024658, "learning_rate": 3.569209039548023e-05, "loss": 2.9819, "step": 1013 }, { "epoch": 0.8593220338983051, "grad_norm": 1.0068503618240356, "learning_rate": 3.567796610169492e-05, "loss": 3.1073, "step": 1014 }, { "epoch": 0.8601694915254238, "grad_norm": 0.8969554901123047, "learning_rate": 3.56638418079096e-05, "loss": 3.1638, "step": 1015 }, { "epoch": 0.8610169491525423, "grad_norm": 0.8899940252304077, "learning_rate": 3.56497175141243e-05, "loss": 3.2716, "step": 1016 }, { "epoch": 0.861864406779661, "grad_norm": 0.9648430943489075, "learning_rate": 3.563559322033899e-05, "loss": 3.2465, "step": 1017 }, { "epoch": 0.8627118644067797, "grad_norm": 1.0650931596755981, "learning_rate": 3.562146892655368e-05, "loss": 3.2254, "step": 1018 }, { "epoch": 0.8635593220338983, "grad_norm": 1.2342017889022827, "learning_rate": 3.560734463276836e-05, "loss": 2.9903, "step": 1019 }, { "epoch": 0.864406779661017, "grad_norm": 1.0534242391586304, "learning_rate": 3.559322033898305e-05, "loss": 3.0908, "step": 1020 }, { "epoch": 0.8652542372881356, "grad_norm": 1.1241739988327026, "learning_rate": 3.557909604519774e-05, "loss": 3.1064, "step": 1021 }, { "epoch": 0.8661016949152542, "grad_norm": 0.9351906776428223, "learning_rate": 3.556497175141243e-05, "loss": 3.2339, "step": 1022 }, { "epoch": 0.8669491525423729, "grad_norm": 1.0255619287490845, "learning_rate": 3.555084745762712e-05, "loss": 3.1819, "step": 1023 }, { "epoch": 0.8677966101694915, "grad_norm": 1.1390897035598755, "learning_rate": 3.553672316384181e-05, "loss": 2.9748, "step": 1024 }, { "epoch": 0.8686440677966102, "grad_norm": 0.9607123732566833, "learning_rate": 3.55225988700565e-05, "loss": 3.229, "step": 1025 }, { "epoch": 0.8694915254237288, "grad_norm": 0.9447827935218811, "learning_rate": 3.550847457627119e-05, "loss": 3.1474, "step": 1026 }, { "epoch": 0.8703389830508474, "grad_norm": 0.8953350782394409, "learning_rate": 3.5494350282485874e-05, "loss": 3.1226, "step": 1027 }, { "epoch": 0.8711864406779661, "grad_norm": 0.8721854090690613, "learning_rate": 3.5480225988700564e-05, "loss": 3.1238, "step": 1028 }, { "epoch": 0.8720338983050847, "grad_norm": 0.9612311124801636, "learning_rate": 3.5466101694915254e-05, "loss": 3.1679, "step": 1029 }, { "epoch": 0.8728813559322034, "grad_norm": 0.9968669414520264, "learning_rate": 3.5451977401129944e-05, "loss": 3.092, "step": 1030 }, { "epoch": 0.8737288135593221, "grad_norm": 1.1011210680007935, "learning_rate": 3.5437853107344634e-05, "loss": 3.1723, "step": 1031 }, { "epoch": 0.8745762711864407, "grad_norm": 1.0265681743621826, "learning_rate": 3.5423728813559324e-05, "loss": 3.2485, "step": 1032 }, { "epoch": 0.8754237288135593, "grad_norm": 1.0001736879348755, "learning_rate": 3.5409604519774014e-05, "loss": 3.1407, "step": 1033 }, { "epoch": 0.8762711864406779, "grad_norm": 0.862217903137207, "learning_rate": 3.5395480225988704e-05, "loss": 3.1386, "step": 1034 }, { "epoch": 0.8771186440677966, "grad_norm": 1.0363800525665283, "learning_rate": 3.5381355932203394e-05, "loss": 3.1702, "step": 1035 }, { "epoch": 0.8779661016949153, "grad_norm": 1.1347532272338867, "learning_rate": 3.536723163841808e-05, "loss": 3.0559, "step": 1036 }, { "epoch": 0.8788135593220339, "grad_norm": 1.0662977695465088, "learning_rate": 3.535310734463277e-05, "loss": 3.0345, "step": 1037 }, { "epoch": 0.8796610169491526, "grad_norm": 0.9417952299118042, "learning_rate": 3.533898305084746e-05, "loss": 3.1034, "step": 1038 }, { "epoch": 0.8805084745762712, "grad_norm": 1.018358588218689, "learning_rate": 3.532485875706215e-05, "loss": 3.1722, "step": 1039 }, { "epoch": 0.8813559322033898, "grad_norm": 0.9088732004165649, "learning_rate": 3.531073446327684e-05, "loss": 3.2741, "step": 1040 }, { "epoch": 0.8822033898305085, "grad_norm": 1.2275269031524658, "learning_rate": 3.529661016949153e-05, "loss": 3.1578, "step": 1041 }, { "epoch": 0.8830508474576271, "grad_norm": 0.9241889715194702, "learning_rate": 3.528248587570622e-05, "loss": 3.1585, "step": 1042 }, { "epoch": 0.8838983050847458, "grad_norm": 1.006468415260315, "learning_rate": 3.526836158192091e-05, "loss": 3.1019, "step": 1043 }, { "epoch": 0.8847457627118644, "grad_norm": 1.0921082496643066, "learning_rate": 3.52542372881356e-05, "loss": 3.1618, "step": 1044 }, { "epoch": 0.885593220338983, "grad_norm": 0.9923046231269836, "learning_rate": 3.524011299435028e-05, "loss": 3.2301, "step": 1045 }, { "epoch": 0.8864406779661017, "grad_norm": 1.018099308013916, "learning_rate": 3.522598870056497e-05, "loss": 3.042, "step": 1046 }, { "epoch": 0.8872881355932203, "grad_norm": 0.9938775300979614, "learning_rate": 3.521186440677967e-05, "loss": 3.0783, "step": 1047 }, { "epoch": 0.888135593220339, "grad_norm": 1.0113307237625122, "learning_rate": 3.519774011299436e-05, "loss": 3.1987, "step": 1048 }, { "epoch": 0.8889830508474577, "grad_norm": 0.9744778871536255, "learning_rate": 3.518361581920904e-05, "loss": 3.2, "step": 1049 }, { "epoch": 0.8898305084745762, "grad_norm": 1.1762267351150513, "learning_rate": 3.516949152542373e-05, "loss": 3.0518, "step": 1050 }, { "epoch": 0.8906779661016949, "grad_norm": 0.9027320146560669, "learning_rate": 3.515536723163842e-05, "loss": 3.1483, "step": 1051 }, { "epoch": 0.8915254237288136, "grad_norm": 1.0681418180465698, "learning_rate": 3.514124293785311e-05, "loss": 3.2282, "step": 1052 }, { "epoch": 0.8923728813559322, "grad_norm": 1.1005299091339111, "learning_rate": 3.5127118644067794e-05, "loss": 3.152, "step": 1053 }, { "epoch": 0.8932203389830509, "grad_norm": 0.8355681300163269, "learning_rate": 3.5112994350282484e-05, "loss": 3.0815, "step": 1054 }, { "epoch": 0.8940677966101694, "grad_norm": 0.8652706146240234, "learning_rate": 3.509887005649718e-05, "loss": 3.1695, "step": 1055 }, { "epoch": 0.8949152542372881, "grad_norm": 1.1734929084777832, "learning_rate": 3.508474576271187e-05, "loss": 3.1214, "step": 1056 }, { "epoch": 0.8957627118644068, "grad_norm": 0.866345226764679, "learning_rate": 3.5070621468926554e-05, "loss": 3.098, "step": 1057 }, { "epoch": 0.8966101694915254, "grad_norm": 1.004052758216858, "learning_rate": 3.5056497175141244e-05, "loss": 3.2785, "step": 1058 }, { "epoch": 0.8974576271186441, "grad_norm": 1.025038719177246, "learning_rate": 3.5042372881355934e-05, "loss": 3.2701, "step": 1059 }, { "epoch": 0.8983050847457628, "grad_norm": 0.9300771355628967, "learning_rate": 3.5028248587570624e-05, "loss": 3.0623, "step": 1060 }, { "epoch": 0.8991525423728813, "grad_norm": 0.8885724544525146, "learning_rate": 3.5014124293785314e-05, "loss": 3.1737, "step": 1061 }, { "epoch": 0.9, "grad_norm": 0.9810854196548462, "learning_rate": 3.5e-05, "loss": 3.2598, "step": 1062 }, { "epoch": 0.9008474576271186, "grad_norm": 1.1284035444259644, "learning_rate": 3.4985875706214694e-05, "loss": 3.1814, "step": 1063 }, { "epoch": 0.9016949152542373, "grad_norm": 0.9558525085449219, "learning_rate": 3.4971751412429384e-05, "loss": 3.1885, "step": 1064 }, { "epoch": 0.902542372881356, "grad_norm": 0.8473979234695435, "learning_rate": 3.4957627118644074e-05, "loss": 3.137, "step": 1065 }, { "epoch": 0.9033898305084745, "grad_norm": 0.8437260389328003, "learning_rate": 3.494350282485876e-05, "loss": 3.389, "step": 1066 }, { "epoch": 0.9042372881355932, "grad_norm": 1.0226668119430542, "learning_rate": 3.492937853107345e-05, "loss": 3.0994, "step": 1067 }, { "epoch": 0.9050847457627119, "grad_norm": 0.822215735912323, "learning_rate": 3.491525423728814e-05, "loss": 3.2503, "step": 1068 }, { "epoch": 0.9059322033898305, "grad_norm": 1.1045360565185547, "learning_rate": 3.490112994350283e-05, "loss": 3.2006, "step": 1069 }, { "epoch": 0.9067796610169492, "grad_norm": 0.9761415123939514, "learning_rate": 3.488700564971752e-05, "loss": 2.9618, "step": 1070 }, { "epoch": 0.9076271186440678, "grad_norm": 0.955146849155426, "learning_rate": 3.487288135593221e-05, "loss": 3.2325, "step": 1071 }, { "epoch": 0.9084745762711864, "grad_norm": 1.0589054822921753, "learning_rate": 3.48587570621469e-05, "loss": 3.1792, "step": 1072 }, { "epoch": 0.9093220338983051, "grad_norm": 0.9603967070579529, "learning_rate": 3.484463276836159e-05, "loss": 3.0278, "step": 1073 }, { "epoch": 0.9101694915254237, "grad_norm": 1.0430519580841064, "learning_rate": 3.483050847457627e-05, "loss": 3.2564, "step": 1074 }, { "epoch": 0.9110169491525424, "grad_norm": 0.9699019193649292, "learning_rate": 3.481638418079096e-05, "loss": 3.1516, "step": 1075 }, { "epoch": 0.911864406779661, "grad_norm": 0.9822274446487427, "learning_rate": 3.480225988700565e-05, "loss": 3.1477, "step": 1076 }, { "epoch": 0.9127118644067796, "grad_norm": 0.8602921962738037, "learning_rate": 3.478813559322034e-05, "loss": 3.2671, "step": 1077 }, { "epoch": 0.9135593220338983, "grad_norm": 1.0001736879348755, "learning_rate": 3.477401129943503e-05, "loss": 3.056, "step": 1078 }, { "epoch": 0.9144067796610169, "grad_norm": 0.9457423686981201, "learning_rate": 3.475988700564972e-05, "loss": 3.2139, "step": 1079 }, { "epoch": 0.9152542372881356, "grad_norm": 1.0191426277160645, "learning_rate": 3.474576271186441e-05, "loss": 3.231, "step": 1080 }, { "epoch": 0.9161016949152543, "grad_norm": 0.9214416146278381, "learning_rate": 3.47316384180791e-05, "loss": 3.1936, "step": 1081 }, { "epoch": 0.9169491525423729, "grad_norm": 1.0589238405227661, "learning_rate": 3.471751412429379e-05, "loss": 3.0493, "step": 1082 }, { "epoch": 0.9177966101694915, "grad_norm": 1.1473020315170288, "learning_rate": 3.470338983050847e-05, "loss": 3.1314, "step": 1083 }, { "epoch": 0.9186440677966101, "grad_norm": 1.0457464456558228, "learning_rate": 3.468926553672316e-05, "loss": 3.1452, "step": 1084 }, { "epoch": 0.9194915254237288, "grad_norm": 0.9404765963554382, "learning_rate": 3.467514124293785e-05, "loss": 3.1878, "step": 1085 }, { "epoch": 0.9203389830508475, "grad_norm": 1.1287540197372437, "learning_rate": 3.466101694915254e-05, "loss": 3.1953, "step": 1086 }, { "epoch": 0.9211864406779661, "grad_norm": 1.0317624807357788, "learning_rate": 3.464689265536723e-05, "loss": 3.0901, "step": 1087 }, { "epoch": 0.9220338983050848, "grad_norm": 1.1479357481002808, "learning_rate": 3.463276836158192e-05, "loss": 3.2511, "step": 1088 }, { "epoch": 0.9228813559322034, "grad_norm": 1.1169977188110352, "learning_rate": 3.461864406779661e-05, "loss": 3.176, "step": 1089 }, { "epoch": 0.923728813559322, "grad_norm": 0.8637784719467163, "learning_rate": 3.46045197740113e-05, "loss": 3.2857, "step": 1090 }, { "epoch": 0.9245762711864407, "grad_norm": 0.860206127166748, "learning_rate": 3.459039548022599e-05, "loss": 3.3376, "step": 1091 }, { "epoch": 0.9254237288135593, "grad_norm": 1.0485258102416992, "learning_rate": 3.4576271186440676e-05, "loss": 3.155, "step": 1092 }, { "epoch": 0.926271186440678, "grad_norm": 0.9788015484809875, "learning_rate": 3.4562146892655366e-05, "loss": 3.3039, "step": 1093 }, { "epoch": 0.9271186440677966, "grad_norm": 0.9749478101730347, "learning_rate": 3.4548022598870056e-05, "loss": 3.1296, "step": 1094 }, { "epoch": 0.9279661016949152, "grad_norm": 1.0621051788330078, "learning_rate": 3.4533898305084746e-05, "loss": 3.2054, "step": 1095 }, { "epoch": 0.9288135593220339, "grad_norm": 0.7976939082145691, "learning_rate": 3.4519774011299436e-05, "loss": 3.1385, "step": 1096 }, { "epoch": 0.9296610169491526, "grad_norm": 1.0727747678756714, "learning_rate": 3.4505649717514126e-05, "loss": 3.1828, "step": 1097 }, { "epoch": 0.9305084745762712, "grad_norm": 0.9779229164123535, "learning_rate": 3.4491525423728816e-05, "loss": 3.0821, "step": 1098 }, { "epoch": 0.9313559322033899, "grad_norm": 0.9867045879364014, "learning_rate": 3.4477401129943506e-05, "loss": 3.1804, "step": 1099 }, { "epoch": 0.9322033898305084, "grad_norm": 0.9286884665489197, "learning_rate": 3.446327683615819e-05, "loss": 3.2151, "step": 1100 }, { "epoch": 0.9330508474576271, "grad_norm": 0.9230116009712219, "learning_rate": 3.444915254237288e-05, "loss": 3.2612, "step": 1101 }, { "epoch": 0.9338983050847458, "grad_norm": 0.9774742126464844, "learning_rate": 3.443502824858757e-05, "loss": 3.086, "step": 1102 }, { "epoch": 0.9347457627118644, "grad_norm": 1.0071637630462646, "learning_rate": 3.4420903954802266e-05, "loss": 3.1048, "step": 1103 }, { "epoch": 0.9355932203389831, "grad_norm": 0.956024706363678, "learning_rate": 3.440677966101695e-05, "loss": 3.0709, "step": 1104 }, { "epoch": 0.9364406779661016, "grad_norm": 0.9846750497817993, "learning_rate": 3.439265536723164e-05, "loss": 3.2082, "step": 1105 }, { "epoch": 0.9372881355932203, "grad_norm": 1.0164185762405396, "learning_rate": 3.437853107344633e-05, "loss": 3.0972, "step": 1106 }, { "epoch": 0.938135593220339, "grad_norm": 1.0777028799057007, "learning_rate": 3.436440677966102e-05, "loss": 2.9985, "step": 1107 }, { "epoch": 0.9389830508474576, "grad_norm": 0.9713720679283142, "learning_rate": 3.435028248587571e-05, "loss": 3.17, "step": 1108 }, { "epoch": 0.9398305084745763, "grad_norm": 0.8038375377655029, "learning_rate": 3.433615819209039e-05, "loss": 3.2833, "step": 1109 }, { "epoch": 0.940677966101695, "grad_norm": 1.069003939628601, "learning_rate": 3.432203389830508e-05, "loss": 3.1023, "step": 1110 }, { "epoch": 0.9415254237288135, "grad_norm": 2.074564218521118, "learning_rate": 3.430790960451978e-05, "loss": 3.1168, "step": 1111 }, { "epoch": 0.9423728813559322, "grad_norm": 1.077620506286621, "learning_rate": 3.429378531073447e-05, "loss": 3.0685, "step": 1112 }, { "epoch": 0.9432203389830508, "grad_norm": 0.9343460202217102, "learning_rate": 3.427966101694915e-05, "loss": 3.0626, "step": 1113 }, { "epoch": 0.9440677966101695, "grad_norm": 1.0174064636230469, "learning_rate": 3.426553672316384e-05, "loss": 3.1679, "step": 1114 }, { "epoch": 0.9449152542372882, "grad_norm": 0.9161520600318909, "learning_rate": 3.425141242937853e-05, "loss": 3.0145, "step": 1115 }, { "epoch": 0.9457627118644067, "grad_norm": 0.9859434962272644, "learning_rate": 3.423728813559322e-05, "loss": 3.2604, "step": 1116 }, { "epoch": 0.9466101694915254, "grad_norm": 1.0903204679489136, "learning_rate": 3.4223163841807906e-05, "loss": 3.2061, "step": 1117 }, { "epoch": 0.9474576271186441, "grad_norm": 0.9489259719848633, "learning_rate": 3.4209039548022596e-05, "loss": 3.0373, "step": 1118 }, { "epoch": 0.9483050847457627, "grad_norm": 1.042888879776001, "learning_rate": 3.419491525423729e-05, "loss": 3.0331, "step": 1119 }, { "epoch": 0.9491525423728814, "grad_norm": 0.9367261528968811, "learning_rate": 3.418079096045198e-05, "loss": 3.0994, "step": 1120 }, { "epoch": 0.95, "grad_norm": 0.9779276847839355, "learning_rate": 3.4166666666666666e-05, "loss": 3.0605, "step": 1121 }, { "epoch": 0.9508474576271186, "grad_norm": 0.9787055850028992, "learning_rate": 3.4152542372881356e-05, "loss": 3.1164, "step": 1122 }, { "epoch": 0.9516949152542373, "grad_norm": 0.926442563533783, "learning_rate": 3.4138418079096046e-05, "loss": 3.1982, "step": 1123 }, { "epoch": 0.9525423728813559, "grad_norm": 1.0305582284927368, "learning_rate": 3.4124293785310736e-05, "loss": 3.1462, "step": 1124 }, { "epoch": 0.9533898305084746, "grad_norm": 0.895269513130188, "learning_rate": 3.4110169491525426e-05, "loss": 3.2023, "step": 1125 }, { "epoch": 0.9542372881355933, "grad_norm": 0.8479098081588745, "learning_rate": 3.409604519774011e-05, "loss": 3.0351, "step": 1126 }, { "epoch": 0.9550847457627119, "grad_norm": 0.9858337044715881, "learning_rate": 3.4081920903954806e-05, "loss": 3.2064, "step": 1127 }, { "epoch": 0.9559322033898305, "grad_norm": 0.9453044533729553, "learning_rate": 3.4067796610169496e-05, "loss": 3.0836, "step": 1128 }, { "epoch": 0.9567796610169491, "grad_norm": 0.9639103412628174, "learning_rate": 3.4053672316384186e-05, "loss": 3.1083, "step": 1129 }, { "epoch": 0.9576271186440678, "grad_norm": 0.955803632736206, "learning_rate": 3.403954802259887e-05, "loss": 3.1033, "step": 1130 }, { "epoch": 0.9584745762711865, "grad_norm": 1.1019872426986694, "learning_rate": 3.402542372881356e-05, "loss": 3.1655, "step": 1131 }, { "epoch": 0.9593220338983051, "grad_norm": 0.8813576698303223, "learning_rate": 3.401129943502825e-05, "loss": 3.1289, "step": 1132 }, { "epoch": 0.9601694915254237, "grad_norm": 0.8251473307609558, "learning_rate": 3.399717514124294e-05, "loss": 3.1034, "step": 1133 }, { "epoch": 0.9610169491525423, "grad_norm": 0.9033800363540649, "learning_rate": 3.398305084745763e-05, "loss": 3.1821, "step": 1134 }, { "epoch": 0.961864406779661, "grad_norm": 1.3056793212890625, "learning_rate": 3.396892655367232e-05, "loss": 3.163, "step": 1135 }, { "epoch": 0.9627118644067797, "grad_norm": 0.9215332865715027, "learning_rate": 3.395480225988701e-05, "loss": 3.1409, "step": 1136 }, { "epoch": 0.9635593220338983, "grad_norm": 0.9346233606338501, "learning_rate": 3.39406779661017e-05, "loss": 3.1261, "step": 1137 }, { "epoch": 0.964406779661017, "grad_norm": 0.980396568775177, "learning_rate": 3.392655367231639e-05, "loss": 3.0834, "step": 1138 }, { "epoch": 0.9652542372881356, "grad_norm": 0.9811868071556091, "learning_rate": 3.391242937853107e-05, "loss": 3.2059, "step": 1139 }, { "epoch": 0.9661016949152542, "grad_norm": 0.9104767441749573, "learning_rate": 3.389830508474576e-05, "loss": 3.0948, "step": 1140 }, { "epoch": 0.9669491525423729, "grad_norm": 0.9422304630279541, "learning_rate": 3.388418079096045e-05, "loss": 3.3007, "step": 1141 }, { "epoch": 0.9677966101694915, "grad_norm": 1.0825037956237793, "learning_rate": 3.387005649717514e-05, "loss": 3.2037, "step": 1142 }, { "epoch": 0.9686440677966102, "grad_norm": 0.8510132431983948, "learning_rate": 3.385593220338983e-05, "loss": 3.1312, "step": 1143 }, { "epoch": 0.9694915254237289, "grad_norm": 0.8678553700447083, "learning_rate": 3.384180790960452e-05, "loss": 3.0714, "step": 1144 }, { "epoch": 0.9703389830508474, "grad_norm": 0.9524672627449036, "learning_rate": 3.382768361581921e-05, "loss": 3.1557, "step": 1145 }, { "epoch": 0.9711864406779661, "grad_norm": 0.8765469193458557, "learning_rate": 3.38135593220339e-05, "loss": 3.2833, "step": 1146 }, { "epoch": 0.9720338983050848, "grad_norm": 1.1473183631896973, "learning_rate": 3.3799435028248585e-05, "loss": 3.1759, "step": 1147 }, { "epoch": 0.9728813559322034, "grad_norm": 1.138814926147461, "learning_rate": 3.3785310734463275e-05, "loss": 3.1647, "step": 1148 }, { "epoch": 0.9737288135593221, "grad_norm": 1.0071004629135132, "learning_rate": 3.3771186440677965e-05, "loss": 3.1534, "step": 1149 }, { "epoch": 0.9745762711864406, "grad_norm": 1.166270136833191, "learning_rate": 3.375706214689266e-05, "loss": 3.1155, "step": 1150 }, { "epoch": 0.9754237288135593, "grad_norm": 0.9369944930076599, "learning_rate": 3.3742937853107345e-05, "loss": 3.0024, "step": 1151 }, { "epoch": 0.976271186440678, "grad_norm": 0.9275339245796204, "learning_rate": 3.3728813559322035e-05, "loss": 3.0509, "step": 1152 }, { "epoch": 0.9771186440677966, "grad_norm": 0.8760161399841309, "learning_rate": 3.3714689265536725e-05, "loss": 3.2487, "step": 1153 }, { "epoch": 0.9779661016949153, "grad_norm": 1.0563088655471802, "learning_rate": 3.3700564971751415e-05, "loss": 3.1959, "step": 1154 }, { "epoch": 0.9788135593220338, "grad_norm": 0.9117673635482788, "learning_rate": 3.3686440677966105e-05, "loss": 3.1583, "step": 1155 }, { "epoch": 0.9796610169491525, "grad_norm": 1.1466143131256104, "learning_rate": 3.367231638418079e-05, "loss": 3.1284, "step": 1156 }, { "epoch": 0.9805084745762712, "grad_norm": 0.8947937488555908, "learning_rate": 3.365819209039548e-05, "loss": 3.0446, "step": 1157 }, { "epoch": 0.9813559322033898, "grad_norm": 0.9606156945228577, "learning_rate": 3.3644067796610175e-05, "loss": 3.1659, "step": 1158 }, { "epoch": 0.9822033898305085, "grad_norm": 1.0145713090896606, "learning_rate": 3.3629943502824865e-05, "loss": 3.2073, "step": 1159 }, { "epoch": 0.9830508474576272, "grad_norm": 0.8798396587371826, "learning_rate": 3.361581920903955e-05, "loss": 3.06, "step": 1160 }, { "epoch": 0.9838983050847457, "grad_norm": 0.9526357054710388, "learning_rate": 3.360169491525424e-05, "loss": 3.1306, "step": 1161 }, { "epoch": 0.9847457627118644, "grad_norm": 0.967297375202179, "learning_rate": 3.358757062146893e-05, "loss": 3.0771, "step": 1162 }, { "epoch": 0.985593220338983, "grad_norm": 1.0536571741104126, "learning_rate": 3.357344632768362e-05, "loss": 3.1413, "step": 1163 }, { "epoch": 0.9864406779661017, "grad_norm": 0.9622642397880554, "learning_rate": 3.35593220338983e-05, "loss": 3.0232, "step": 1164 }, { "epoch": 0.9872881355932204, "grad_norm": 1.0252420902252197, "learning_rate": 3.354519774011299e-05, "loss": 3.1725, "step": 1165 }, { "epoch": 0.988135593220339, "grad_norm": 1.169731616973877, "learning_rate": 3.353107344632769e-05, "loss": 3.1866, "step": 1166 }, { "epoch": 0.9889830508474576, "grad_norm": 1.0889506340026855, "learning_rate": 3.351694915254238e-05, "loss": 3.0281, "step": 1167 }, { "epoch": 0.9898305084745763, "grad_norm": 1.040248990058899, "learning_rate": 3.350282485875706e-05, "loss": 3.1935, "step": 1168 }, { "epoch": 0.9906779661016949, "grad_norm": 0.9672927260398865, "learning_rate": 3.348870056497175e-05, "loss": 3.2079, "step": 1169 }, { "epoch": 0.9915254237288136, "grad_norm": 0.9489609599113464, "learning_rate": 3.347457627118644e-05, "loss": 3.1211, "step": 1170 }, { "epoch": 0.9923728813559322, "grad_norm": 1.0373070240020752, "learning_rate": 3.346045197740113e-05, "loss": 3.1851, "step": 1171 }, { "epoch": 0.9932203389830508, "grad_norm": 0.9559980630874634, "learning_rate": 3.344632768361582e-05, "loss": 3.1407, "step": 1172 }, { "epoch": 0.9940677966101695, "grad_norm": 1.0616077184677124, "learning_rate": 3.3432203389830505e-05, "loss": 3.0577, "step": 1173 }, { "epoch": 0.9949152542372881, "grad_norm": 1.0753858089447021, "learning_rate": 3.34180790960452e-05, "loss": 3.2391, "step": 1174 }, { "epoch": 0.9957627118644068, "grad_norm": 0.9057319164276123, "learning_rate": 3.340395480225989e-05, "loss": 3.1968, "step": 1175 }, { "epoch": 0.9966101694915255, "grad_norm": 0.9730505347251892, "learning_rate": 3.338983050847458e-05, "loss": 3.1794, "step": 1176 }, { "epoch": 0.997457627118644, "grad_norm": 1.2745962142944336, "learning_rate": 3.3375706214689265e-05, "loss": 3.0877, "step": 1177 }, { "epoch": 0.9983050847457627, "grad_norm": 1.0337282419204712, "learning_rate": 3.3361581920903955e-05, "loss": 3.1717, "step": 1178 }, { "epoch": 0.9991525423728813, "grad_norm": 0.8962303996086121, "learning_rate": 3.3347457627118645e-05, "loss": 3.089, "step": 1179 }, { "epoch": 1.0, "grad_norm": 0.8768669962882996, "learning_rate": 3.3333333333333335e-05, "loss": 3.1633, "step": 1180 }, { "epoch": 1.0008474576271187, "grad_norm": 1.1547901630401611, "learning_rate": 3.3319209039548025e-05, "loss": 2.9119, "step": 1181 }, { "epoch": 1.0016949152542374, "grad_norm": 1.2839479446411133, "learning_rate": 3.3305084745762715e-05, "loss": 2.7222, "step": 1182 }, { "epoch": 1.0025423728813558, "grad_norm": 1.108107566833496, "learning_rate": 3.3290960451977405e-05, "loss": 2.8815, "step": 1183 }, { "epoch": 1.0033898305084745, "grad_norm": 1.4743744134902954, "learning_rate": 3.3276836158192095e-05, "loss": 2.7498, "step": 1184 }, { "epoch": 1.0042372881355932, "grad_norm": 1.104684829711914, "learning_rate": 3.326271186440678e-05, "loss": 2.6521, "step": 1185 }, { "epoch": 1.005084745762712, "grad_norm": 1.1616811752319336, "learning_rate": 3.324858757062147e-05, "loss": 2.7501, "step": 1186 }, { "epoch": 1.0059322033898306, "grad_norm": 0.9652979373931885, "learning_rate": 3.323446327683616e-05, "loss": 2.8519, "step": 1187 }, { "epoch": 1.006779661016949, "grad_norm": 1.0420432090759277, "learning_rate": 3.322033898305085e-05, "loss": 2.6377, "step": 1188 }, { "epoch": 1.0076271186440677, "grad_norm": 1.0684734582901, "learning_rate": 3.320621468926554e-05, "loss": 2.7969, "step": 1189 }, { "epoch": 1.0084745762711864, "grad_norm": 1.0408910512924194, "learning_rate": 3.319209039548023e-05, "loss": 2.7661, "step": 1190 }, { "epoch": 1.009322033898305, "grad_norm": 0.9833167195320129, "learning_rate": 3.317796610169492e-05, "loss": 2.8351, "step": 1191 }, { "epoch": 1.0101694915254238, "grad_norm": 1.2136975526809692, "learning_rate": 3.316384180790961e-05, "loss": 2.7654, "step": 1192 }, { "epoch": 1.0110169491525425, "grad_norm": 1.0686248540878296, "learning_rate": 3.31497175141243e-05, "loss": 2.7673, "step": 1193 }, { "epoch": 1.011864406779661, "grad_norm": 1.084439992904663, "learning_rate": 3.313559322033898e-05, "loss": 2.8005, "step": 1194 }, { "epoch": 1.0127118644067796, "grad_norm": 1.1534408330917358, "learning_rate": 3.312146892655367e-05, "loss": 2.8348, "step": 1195 }, { "epoch": 1.0135593220338983, "grad_norm": 1.188952922821045, "learning_rate": 3.310734463276836e-05, "loss": 2.7189, "step": 1196 }, { "epoch": 1.014406779661017, "grad_norm": 1.1142390966415405, "learning_rate": 3.309322033898305e-05, "loss": 2.842, "step": 1197 }, { "epoch": 1.0152542372881357, "grad_norm": 1.0218876600265503, "learning_rate": 3.307909604519774e-05, "loss": 2.985, "step": 1198 }, { "epoch": 1.0161016949152541, "grad_norm": 1.0653412342071533, "learning_rate": 3.306497175141243e-05, "loss": 2.7049, "step": 1199 }, { "epoch": 1.0169491525423728, "grad_norm": 1.1319935321807861, "learning_rate": 3.305084745762712e-05, "loss": 2.594, "step": 1200 }, { "epoch": 1.0177966101694915, "grad_norm": 1.013126015663147, "learning_rate": 3.303672316384181e-05, "loss": 2.8042, "step": 1201 }, { "epoch": 1.0186440677966102, "grad_norm": 0.9670597314834595, "learning_rate": 3.30225988700565e-05, "loss": 2.7315, "step": 1202 }, { "epoch": 1.019491525423729, "grad_norm": 0.9469462633132935, "learning_rate": 3.3008474576271184e-05, "loss": 2.8303, "step": 1203 }, { "epoch": 1.0203389830508474, "grad_norm": 1.1374952793121338, "learning_rate": 3.2994350282485874e-05, "loss": 2.8758, "step": 1204 }, { "epoch": 1.021186440677966, "grad_norm": 1.1895190477371216, "learning_rate": 3.2980225988700564e-05, "loss": 2.7217, "step": 1205 }, { "epoch": 1.0220338983050847, "grad_norm": 1.0638797283172607, "learning_rate": 3.296610169491526e-05, "loss": 2.7883, "step": 1206 }, { "epoch": 1.0228813559322034, "grad_norm": 1.0590007305145264, "learning_rate": 3.2951977401129944e-05, "loss": 2.7737, "step": 1207 }, { "epoch": 1.023728813559322, "grad_norm": 1.0253334045410156, "learning_rate": 3.2937853107344634e-05, "loss": 2.7251, "step": 1208 }, { "epoch": 1.0245762711864406, "grad_norm": 1.1091796159744263, "learning_rate": 3.2923728813559324e-05, "loss": 2.723, "step": 1209 }, { "epoch": 1.0254237288135593, "grad_norm": 1.2076777219772339, "learning_rate": 3.2909604519774014e-05, "loss": 2.628, "step": 1210 }, { "epoch": 1.026271186440678, "grad_norm": 1.097025990486145, "learning_rate": 3.28954802259887e-05, "loss": 2.7423, "step": 1211 }, { "epoch": 1.0271186440677966, "grad_norm": 0.9871625304222107, "learning_rate": 3.288135593220339e-05, "loss": 2.9542, "step": 1212 }, { "epoch": 1.0279661016949153, "grad_norm": 1.4033690690994263, "learning_rate": 3.2867231638418084e-05, "loss": 2.408, "step": 1213 }, { "epoch": 1.028813559322034, "grad_norm": 1.1936814785003662, "learning_rate": 3.2853107344632774e-05, "loss": 2.6728, "step": 1214 }, { "epoch": 1.0296610169491525, "grad_norm": 1.0251595973968506, "learning_rate": 3.283898305084746e-05, "loss": 2.8915, "step": 1215 }, { "epoch": 1.0305084745762711, "grad_norm": 1.038507342338562, "learning_rate": 3.282485875706215e-05, "loss": 2.6211, "step": 1216 }, { "epoch": 1.0313559322033898, "grad_norm": 1.195397138595581, "learning_rate": 3.281073446327684e-05, "loss": 2.691, "step": 1217 }, { "epoch": 1.0322033898305085, "grad_norm": 1.0375072956085205, "learning_rate": 3.279661016949153e-05, "loss": 2.7848, "step": 1218 }, { "epoch": 1.0330508474576272, "grad_norm": 1.1907418966293335, "learning_rate": 3.278248587570622e-05, "loss": 2.6764, "step": 1219 }, { "epoch": 1.0338983050847457, "grad_norm": 1.1549733877182007, "learning_rate": 3.27683615819209e-05, "loss": 2.7625, "step": 1220 }, { "epoch": 1.0347457627118644, "grad_norm": 1.0455302000045776, "learning_rate": 3.27542372881356e-05, "loss": 2.7691, "step": 1221 }, { "epoch": 1.035593220338983, "grad_norm": 1.2212104797363281, "learning_rate": 3.274011299435029e-05, "loss": 2.7455, "step": 1222 }, { "epoch": 1.0364406779661017, "grad_norm": 1.0414098501205444, "learning_rate": 3.272598870056498e-05, "loss": 2.7957, "step": 1223 }, { "epoch": 1.0372881355932204, "grad_norm": 0.9203545451164246, "learning_rate": 3.271186440677966e-05, "loss": 2.8824, "step": 1224 }, { "epoch": 1.0381355932203389, "grad_norm": 1.130113124847412, "learning_rate": 3.269774011299435e-05, "loss": 2.7725, "step": 1225 }, { "epoch": 1.0389830508474576, "grad_norm": 0.9704325199127197, "learning_rate": 3.268361581920904e-05, "loss": 2.7973, "step": 1226 }, { "epoch": 1.0398305084745763, "grad_norm": 0.9468993544578552, "learning_rate": 3.266949152542373e-05, "loss": 2.8226, "step": 1227 }, { "epoch": 1.040677966101695, "grad_norm": 1.0267146825790405, "learning_rate": 3.2655367231638414e-05, "loss": 2.7649, "step": 1228 }, { "epoch": 1.0415254237288136, "grad_norm": 1.0473774671554565, "learning_rate": 3.264124293785311e-05, "loss": 2.708, "step": 1229 }, { "epoch": 1.042372881355932, "grad_norm": 0.9830741286277771, "learning_rate": 3.26271186440678e-05, "loss": 2.7738, "step": 1230 }, { "epoch": 1.0432203389830508, "grad_norm": 1.0539050102233887, "learning_rate": 3.261299435028249e-05, "loss": 2.7942, "step": 1231 }, { "epoch": 1.0440677966101695, "grad_norm": 1.2233527898788452, "learning_rate": 3.2598870056497174e-05, "loss": 2.6881, "step": 1232 }, { "epoch": 1.0449152542372881, "grad_norm": 1.1111575365066528, "learning_rate": 3.2584745762711864e-05, "loss": 2.7387, "step": 1233 }, { "epoch": 1.0457627118644068, "grad_norm": 1.2477937936782837, "learning_rate": 3.2570621468926554e-05, "loss": 2.6672, "step": 1234 }, { "epoch": 1.0466101694915255, "grad_norm": 1.075348138809204, "learning_rate": 3.2556497175141244e-05, "loss": 2.81, "step": 1235 }, { "epoch": 1.047457627118644, "grad_norm": 1.1108568906784058, "learning_rate": 3.2542372881355934e-05, "loss": 2.635, "step": 1236 }, { "epoch": 1.0483050847457627, "grad_norm": 1.04779851436615, "learning_rate": 3.2528248587570624e-05, "loss": 2.9208, "step": 1237 }, { "epoch": 1.0491525423728814, "grad_norm": 1.3098849058151245, "learning_rate": 3.2514124293785314e-05, "loss": 2.6156, "step": 1238 }, { "epoch": 1.05, "grad_norm": 1.1287405490875244, "learning_rate": 3.2500000000000004e-05, "loss": 2.6735, "step": 1239 }, { "epoch": 1.0508474576271187, "grad_norm": 1.1208323240280151, "learning_rate": 3.2485875706214694e-05, "loss": 2.7031, "step": 1240 }, { "epoch": 1.0516949152542372, "grad_norm": 1.1564394235610962, "learning_rate": 3.247175141242938e-05, "loss": 2.7012, "step": 1241 }, { "epoch": 1.0525423728813559, "grad_norm": 1.0639265775680542, "learning_rate": 3.245762711864407e-05, "loss": 2.7558, "step": 1242 }, { "epoch": 1.0533898305084746, "grad_norm": 1.3013960123062134, "learning_rate": 3.244350282485876e-05, "loss": 2.7392, "step": 1243 }, { "epoch": 1.0542372881355933, "grad_norm": 1.2004514932632446, "learning_rate": 3.242937853107345e-05, "loss": 2.6263, "step": 1244 }, { "epoch": 1.055084745762712, "grad_norm": 1.1411895751953125, "learning_rate": 3.241525423728814e-05, "loss": 2.7157, "step": 1245 }, { "epoch": 1.0559322033898304, "grad_norm": 0.9765958189964294, "learning_rate": 3.240112994350283e-05, "loss": 2.7072, "step": 1246 }, { "epoch": 1.056779661016949, "grad_norm": 1.0957763195037842, "learning_rate": 3.238700564971752e-05, "loss": 2.8281, "step": 1247 }, { "epoch": 1.0576271186440678, "grad_norm": 0.9992038607597351, "learning_rate": 3.237288135593221e-05, "loss": 2.8777, "step": 1248 }, { "epoch": 1.0584745762711865, "grad_norm": 1.152011513710022, "learning_rate": 3.23587570621469e-05, "loss": 2.6419, "step": 1249 }, { "epoch": 1.0593220338983051, "grad_norm": 1.0962355136871338, "learning_rate": 3.234463276836158e-05, "loss": 2.7468, "step": 1250 }, { "epoch": 1.0601694915254238, "grad_norm": 1.2691516876220703, "learning_rate": 3.233050847457627e-05, "loss": 2.7081, "step": 1251 }, { "epoch": 1.0610169491525423, "grad_norm": 0.8663666844367981, "learning_rate": 3.231638418079096e-05, "loss": 2.8186, "step": 1252 }, { "epoch": 1.061864406779661, "grad_norm": 1.0920077562332153, "learning_rate": 3.230225988700565e-05, "loss": 2.9159, "step": 1253 }, { "epoch": 1.0627118644067797, "grad_norm": 1.0791915655136108, "learning_rate": 3.228813559322034e-05, "loss": 2.8422, "step": 1254 }, { "epoch": 1.0635593220338984, "grad_norm": 1.2146872282028198, "learning_rate": 3.227401129943503e-05, "loss": 2.5906, "step": 1255 }, { "epoch": 1.064406779661017, "grad_norm": 1.0364774465560913, "learning_rate": 3.225988700564972e-05, "loss": 2.6943, "step": 1256 }, { "epoch": 1.0652542372881355, "grad_norm": 1.1046572923660278, "learning_rate": 3.224576271186441e-05, "loss": 2.7248, "step": 1257 }, { "epoch": 1.0661016949152542, "grad_norm": 1.0412293672561646, "learning_rate": 3.2231638418079093e-05, "loss": 2.8076, "step": 1258 }, { "epoch": 1.0669491525423729, "grad_norm": 1.0001367330551147, "learning_rate": 3.2217514124293783e-05, "loss": 2.8023, "step": 1259 }, { "epoch": 1.0677966101694916, "grad_norm": 1.0104426145553589, "learning_rate": 3.2203389830508473e-05, "loss": 2.8011, "step": 1260 }, { "epoch": 1.0686440677966103, "grad_norm": 1.2797577381134033, "learning_rate": 3.218926553672317e-05, "loss": 2.6239, "step": 1261 }, { "epoch": 1.0694915254237287, "grad_norm": 0.9057015776634216, "learning_rate": 3.2175141242937853e-05, "loss": 2.9027, "step": 1262 }, { "epoch": 1.0703389830508474, "grad_norm": 1.0806961059570312, "learning_rate": 3.2161016949152543e-05, "loss": 2.7245, "step": 1263 }, { "epoch": 1.071186440677966, "grad_norm": 0.9371135830879211, "learning_rate": 3.2146892655367233e-05, "loss": 2.8056, "step": 1264 }, { "epoch": 1.0720338983050848, "grad_norm": 1.0706758499145508, "learning_rate": 3.2132768361581923e-05, "loss": 2.7724, "step": 1265 }, { "epoch": 1.0728813559322035, "grad_norm": 1.069041132926941, "learning_rate": 3.2118644067796613e-05, "loss": 2.7142, "step": 1266 }, { "epoch": 1.0737288135593221, "grad_norm": 1.1043695211410522, "learning_rate": 3.21045197740113e-05, "loss": 2.6181, "step": 1267 }, { "epoch": 1.0745762711864406, "grad_norm": 0.9835469722747803, "learning_rate": 3.209039548022599e-05, "loss": 2.781, "step": 1268 }, { "epoch": 1.0754237288135593, "grad_norm": 1.2907835245132446, "learning_rate": 3.2076271186440683e-05, "loss": 2.501, "step": 1269 }, { "epoch": 1.076271186440678, "grad_norm": 1.1371930837631226, "learning_rate": 3.2062146892655373e-05, "loss": 2.6317, "step": 1270 }, { "epoch": 1.0771186440677967, "grad_norm": 0.9557111263275146, "learning_rate": 3.204802259887006e-05, "loss": 2.9196, "step": 1271 }, { "epoch": 1.0779661016949154, "grad_norm": 1.3277744054794312, "learning_rate": 3.203389830508475e-05, "loss": 2.6195, "step": 1272 }, { "epoch": 1.0788135593220338, "grad_norm": 1.0682483911514282, "learning_rate": 3.201977401129944e-05, "loss": 2.6922, "step": 1273 }, { "epoch": 1.0796610169491525, "grad_norm": 1.1835170984268188, "learning_rate": 3.200564971751413e-05, "loss": 2.6221, "step": 1274 }, { "epoch": 1.0805084745762712, "grad_norm": 1.1254576444625854, "learning_rate": 3.199152542372881e-05, "loss": 2.7208, "step": 1275 }, { "epoch": 1.0813559322033899, "grad_norm": 1.0872687101364136, "learning_rate": 3.19774011299435e-05, "loss": 2.7985, "step": 1276 }, { "epoch": 1.0822033898305086, "grad_norm": 1.0942344665527344, "learning_rate": 3.1963276836158197e-05, "loss": 2.6473, "step": 1277 }, { "epoch": 1.083050847457627, "grad_norm": 1.2729380130767822, "learning_rate": 3.1949152542372887e-05, "loss": 2.6103, "step": 1278 }, { "epoch": 1.0838983050847457, "grad_norm": 1.2173162698745728, "learning_rate": 3.193502824858757e-05, "loss": 2.573, "step": 1279 }, { "epoch": 1.0847457627118644, "grad_norm": 1.0390675067901611, "learning_rate": 3.192090395480226e-05, "loss": 2.7393, "step": 1280 }, { "epoch": 1.085593220338983, "grad_norm": 0.9719063639640808, "learning_rate": 3.190677966101695e-05, "loss": 2.793, "step": 1281 }, { "epoch": 1.0864406779661018, "grad_norm": 1.2492557764053345, "learning_rate": 3.189265536723164e-05, "loss": 2.5754, "step": 1282 }, { "epoch": 1.0872881355932202, "grad_norm": 1.089656949043274, "learning_rate": 3.187853107344633e-05, "loss": 2.8917, "step": 1283 }, { "epoch": 1.088135593220339, "grad_norm": 1.071498155593872, "learning_rate": 3.186440677966101e-05, "loss": 2.8634, "step": 1284 }, { "epoch": 1.0889830508474576, "grad_norm": 1.124239444732666, "learning_rate": 3.185028248587571e-05, "loss": 2.7158, "step": 1285 }, { "epoch": 1.0898305084745763, "grad_norm": 1.1056195497512817, "learning_rate": 3.18361581920904e-05, "loss": 2.7763, "step": 1286 }, { "epoch": 1.090677966101695, "grad_norm": 0.9646786451339722, "learning_rate": 3.182203389830509e-05, "loss": 2.9494, "step": 1287 }, { "epoch": 1.0915254237288137, "grad_norm": 1.1487090587615967, "learning_rate": 3.180790960451977e-05, "loss": 2.82, "step": 1288 }, { "epoch": 1.0923728813559321, "grad_norm": 1.1627110242843628, "learning_rate": 3.179378531073446e-05, "loss": 2.6772, "step": 1289 }, { "epoch": 1.0932203389830508, "grad_norm": 0.9802050590515137, "learning_rate": 3.177966101694915e-05, "loss": 2.7999, "step": 1290 }, { "epoch": 1.0940677966101695, "grad_norm": 1.009687066078186, "learning_rate": 3.176553672316384e-05, "loss": 2.9195, "step": 1291 }, { "epoch": 1.0949152542372882, "grad_norm": 1.2086440324783325, "learning_rate": 3.175141242937853e-05, "loss": 2.6826, "step": 1292 }, { "epoch": 1.0957627118644069, "grad_norm": 0.9848881363868713, "learning_rate": 3.173728813559322e-05, "loss": 2.8132, "step": 1293 }, { "epoch": 1.0966101694915253, "grad_norm": 1.3762964010238647, "learning_rate": 3.172316384180791e-05, "loss": 2.532, "step": 1294 }, { "epoch": 1.097457627118644, "grad_norm": 1.1501647233963013, "learning_rate": 3.17090395480226e-05, "loss": 2.6281, "step": 1295 }, { "epoch": 1.0983050847457627, "grad_norm": 1.080271601676941, "learning_rate": 3.169491525423729e-05, "loss": 2.8009, "step": 1296 }, { "epoch": 1.0991525423728814, "grad_norm": 1.1762102842330933, "learning_rate": 3.1680790960451976e-05, "loss": 2.6938, "step": 1297 }, { "epoch": 1.1, "grad_norm": 1.2100443840026855, "learning_rate": 3.1666666666666666e-05, "loss": 2.5648, "step": 1298 }, { "epoch": 1.1008474576271186, "grad_norm": 0.9676806330680847, "learning_rate": 3.1652542372881356e-05, "loss": 2.8492, "step": 1299 }, { "epoch": 1.1016949152542372, "grad_norm": 1.1340441703796387, "learning_rate": 3.1638418079096046e-05, "loss": 2.6525, "step": 1300 }, { "epoch": 1.102542372881356, "grad_norm": 1.207497239112854, "learning_rate": 3.1624293785310736e-05, "loss": 2.6985, "step": 1301 }, { "epoch": 1.1033898305084746, "grad_norm": 1.3901035785675049, "learning_rate": 3.1610169491525426e-05, "loss": 2.5859, "step": 1302 }, { "epoch": 1.1042372881355933, "grad_norm": 1.3611477613449097, "learning_rate": 3.1596045197740116e-05, "loss": 2.7371, "step": 1303 }, { "epoch": 1.1050847457627118, "grad_norm": 1.0604912042617798, "learning_rate": 3.1581920903954806e-05, "loss": 2.7643, "step": 1304 }, { "epoch": 1.1059322033898304, "grad_norm": 1.2928720712661743, "learning_rate": 3.156779661016949e-05, "loss": 2.5703, "step": 1305 }, { "epoch": 1.1067796610169491, "grad_norm": 0.9556746482849121, "learning_rate": 3.155367231638418e-05, "loss": 2.7694, "step": 1306 }, { "epoch": 1.1076271186440678, "grad_norm": 1.0268397331237793, "learning_rate": 3.153954802259887e-05, "loss": 2.7441, "step": 1307 }, { "epoch": 1.1084745762711865, "grad_norm": 1.4118244647979736, "learning_rate": 3.1525423728813566e-05, "loss": 2.5304, "step": 1308 }, { "epoch": 1.1093220338983052, "grad_norm": 1.2121537923812866, "learning_rate": 3.151129943502825e-05, "loss": 2.5853, "step": 1309 }, { "epoch": 1.1101694915254237, "grad_norm": 1.0660877227783203, "learning_rate": 3.149717514124294e-05, "loss": 2.8251, "step": 1310 }, { "epoch": 1.1110169491525423, "grad_norm": 1.2280398607254028, "learning_rate": 3.148305084745763e-05, "loss": 2.7137, "step": 1311 }, { "epoch": 1.111864406779661, "grad_norm": 1.3553054332733154, "learning_rate": 3.146892655367232e-05, "loss": 2.5988, "step": 1312 }, { "epoch": 1.1127118644067797, "grad_norm": 1.094398021697998, "learning_rate": 3.145480225988701e-05, "loss": 2.7889, "step": 1313 }, { "epoch": 1.1135593220338984, "grad_norm": 1.17271888256073, "learning_rate": 3.144067796610169e-05, "loss": 2.753, "step": 1314 }, { "epoch": 1.1144067796610169, "grad_norm": 1.3248339891433716, "learning_rate": 3.142655367231638e-05, "loss": 2.5908, "step": 1315 }, { "epoch": 1.1152542372881356, "grad_norm": 1.353985071182251, "learning_rate": 3.141242937853108e-05, "loss": 2.8259, "step": 1316 }, { "epoch": 1.1161016949152542, "grad_norm": 1.0256413221359253, "learning_rate": 3.139830508474577e-05, "loss": 2.9938, "step": 1317 }, { "epoch": 1.116949152542373, "grad_norm": 1.1994404792785645, "learning_rate": 3.138418079096045e-05, "loss": 2.7358, "step": 1318 }, { "epoch": 1.1177966101694916, "grad_norm": 1.0351284742355347, "learning_rate": 3.137005649717514e-05, "loss": 2.7658, "step": 1319 }, { "epoch": 1.11864406779661, "grad_norm": 0.8627428412437439, "learning_rate": 3.135593220338983e-05, "loss": 3.0224, "step": 1320 }, { "epoch": 1.1194915254237288, "grad_norm": 0.9812117218971252, "learning_rate": 3.134180790960452e-05, "loss": 2.879, "step": 1321 }, { "epoch": 1.1203389830508474, "grad_norm": 1.0016770362854004, "learning_rate": 3.1327683615819206e-05, "loss": 2.7623, "step": 1322 }, { "epoch": 1.1211864406779661, "grad_norm": 1.1582958698272705, "learning_rate": 3.1313559322033896e-05, "loss": 2.7582, "step": 1323 }, { "epoch": 1.1220338983050848, "grad_norm": 1.0659432411193848, "learning_rate": 3.129943502824859e-05, "loss": 2.7713, "step": 1324 }, { "epoch": 1.1228813559322033, "grad_norm": 1.0960432291030884, "learning_rate": 3.128531073446328e-05, "loss": 2.6518, "step": 1325 }, { "epoch": 1.123728813559322, "grad_norm": 1.0338033437728882, "learning_rate": 3.1271186440677966e-05, "loss": 2.7533, "step": 1326 }, { "epoch": 1.1245762711864407, "grad_norm": 1.039915919303894, "learning_rate": 3.1257062146892656e-05, "loss": 2.7186, "step": 1327 }, { "epoch": 1.1254237288135593, "grad_norm": 1.2192764282226562, "learning_rate": 3.1242937853107346e-05, "loss": 2.6632, "step": 1328 }, { "epoch": 1.126271186440678, "grad_norm": 1.0014623403549194, "learning_rate": 3.1228813559322036e-05, "loss": 2.7623, "step": 1329 }, { "epoch": 1.1271186440677967, "grad_norm": 1.0116573572158813, "learning_rate": 3.1214689265536726e-05, "loss": 2.894, "step": 1330 }, { "epoch": 1.1279661016949152, "grad_norm": 1.1956560611724854, "learning_rate": 3.120056497175141e-05, "loss": 2.7316, "step": 1331 }, { "epoch": 1.1288135593220339, "grad_norm": 1.0047454833984375, "learning_rate": 3.1186440677966106e-05, "loss": 2.7688, "step": 1332 }, { "epoch": 1.1296610169491526, "grad_norm": 0.9235908389091492, "learning_rate": 3.1172316384180796e-05, "loss": 2.8439, "step": 1333 }, { "epoch": 1.1305084745762712, "grad_norm": 1.4579118490219116, "learning_rate": 3.1158192090395486e-05, "loss": 2.5944, "step": 1334 }, { "epoch": 1.13135593220339, "grad_norm": 1.1531609296798706, "learning_rate": 3.114406779661017e-05, "loss": 2.7467, "step": 1335 }, { "epoch": 1.1322033898305084, "grad_norm": 0.9288375973701477, "learning_rate": 3.112994350282486e-05, "loss": 2.8698, "step": 1336 }, { "epoch": 1.133050847457627, "grad_norm": 1.1345102787017822, "learning_rate": 3.111581920903955e-05, "loss": 2.8097, "step": 1337 }, { "epoch": 1.1338983050847458, "grad_norm": 1.0098881721496582, "learning_rate": 3.110169491525424e-05, "loss": 2.8774, "step": 1338 }, { "epoch": 1.1347457627118644, "grad_norm": 1.323324203491211, "learning_rate": 3.108757062146893e-05, "loss": 2.5576, "step": 1339 }, { "epoch": 1.1355932203389831, "grad_norm": 0.9318826198577881, "learning_rate": 3.107344632768362e-05, "loss": 2.9073, "step": 1340 }, { "epoch": 1.1364406779661016, "grad_norm": 1.0271952152252197, "learning_rate": 3.105932203389831e-05, "loss": 2.8181, "step": 1341 }, { "epoch": 1.1372881355932203, "grad_norm": 0.9295307993888855, "learning_rate": 3.1045197740113e-05, "loss": 2.7162, "step": 1342 }, { "epoch": 1.138135593220339, "grad_norm": 1.0659383535385132, "learning_rate": 3.103107344632768e-05, "loss": 2.694, "step": 1343 }, { "epoch": 1.1389830508474577, "grad_norm": 1.2376900911331177, "learning_rate": 3.101694915254237e-05, "loss": 2.6704, "step": 1344 }, { "epoch": 1.1398305084745763, "grad_norm": 1.0314176082611084, "learning_rate": 3.100282485875706e-05, "loss": 2.9519, "step": 1345 }, { "epoch": 1.1406779661016948, "grad_norm": 1.1144027709960938, "learning_rate": 3.098870056497175e-05, "loss": 2.8029, "step": 1346 }, { "epoch": 1.1415254237288135, "grad_norm": 0.9675883650779724, "learning_rate": 3.097457627118644e-05, "loss": 2.7469, "step": 1347 }, { "epoch": 1.1423728813559322, "grad_norm": 1.004947304725647, "learning_rate": 3.096045197740113e-05, "loss": 2.7452, "step": 1348 }, { "epoch": 1.1432203389830509, "grad_norm": 1.0772300958633423, "learning_rate": 3.094632768361582e-05, "loss": 2.7596, "step": 1349 }, { "epoch": 1.1440677966101696, "grad_norm": 1.249557375907898, "learning_rate": 3.093220338983051e-05, "loss": 2.7725, "step": 1350 }, { "epoch": 1.1449152542372882, "grad_norm": 1.1400595903396606, "learning_rate": 3.09180790960452e-05, "loss": 2.7257, "step": 1351 }, { "epoch": 1.1457627118644067, "grad_norm": 1.0549755096435547, "learning_rate": 3.0903954802259885e-05, "loss": 2.7247, "step": 1352 }, { "epoch": 1.1466101694915254, "grad_norm": 1.2909915447235107, "learning_rate": 3.0889830508474575e-05, "loss": 2.5551, "step": 1353 }, { "epoch": 1.147457627118644, "grad_norm": 1.232528567314148, "learning_rate": 3.0875706214689265e-05, "loss": 2.8768, "step": 1354 }, { "epoch": 1.1483050847457628, "grad_norm": 1.3124966621398926, "learning_rate": 3.0861581920903955e-05, "loss": 2.8048, "step": 1355 }, { "epoch": 1.1491525423728814, "grad_norm": 1.0502287149429321, "learning_rate": 3.0847457627118645e-05, "loss": 2.7559, "step": 1356 }, { "epoch": 1.15, "grad_norm": 1.252584457397461, "learning_rate": 3.0833333333333335e-05, "loss": 2.7825, "step": 1357 }, { "epoch": 1.1508474576271186, "grad_norm": 1.24968683719635, "learning_rate": 3.0819209039548025e-05, "loss": 2.6221, "step": 1358 }, { "epoch": 1.1516949152542373, "grad_norm": 1.22371244430542, "learning_rate": 3.0805084745762715e-05, "loss": 2.7645, "step": 1359 }, { "epoch": 1.152542372881356, "grad_norm": 1.1917370557785034, "learning_rate": 3.0790960451977405e-05, "loss": 2.6992, "step": 1360 }, { "epoch": 1.1533898305084747, "grad_norm": 1.5677720308303833, "learning_rate": 3.077683615819209e-05, "loss": 2.4938, "step": 1361 }, { "epoch": 1.1542372881355933, "grad_norm": 0.9529935717582703, "learning_rate": 3.076271186440678e-05, "loss": 2.8425, "step": 1362 }, { "epoch": 1.1550847457627118, "grad_norm": 1.0626323223114014, "learning_rate": 3.074858757062147e-05, "loss": 2.7787, "step": 1363 }, { "epoch": 1.1559322033898305, "grad_norm": 0.9936074018478394, "learning_rate": 3.0734463276836165e-05, "loss": 2.7484, "step": 1364 }, { "epoch": 1.1567796610169492, "grad_norm": 1.3875172138214111, "learning_rate": 3.072033898305085e-05, "loss": 2.6042, "step": 1365 }, { "epoch": 1.1576271186440679, "grad_norm": 1.0043777227401733, "learning_rate": 3.070621468926554e-05, "loss": 2.7155, "step": 1366 }, { "epoch": 1.1584745762711863, "grad_norm": 1.1337268352508545, "learning_rate": 3.069209039548023e-05, "loss": 2.8497, "step": 1367 }, { "epoch": 1.159322033898305, "grad_norm": 1.079054832458496, "learning_rate": 3.067796610169492e-05, "loss": 2.7388, "step": 1368 }, { "epoch": 1.1601694915254237, "grad_norm": 1.1742485761642456, "learning_rate": 3.06638418079096e-05, "loss": 2.7066, "step": 1369 }, { "epoch": 1.1610169491525424, "grad_norm": 1.1586884260177612, "learning_rate": 3.064971751412429e-05, "loss": 2.7078, "step": 1370 }, { "epoch": 1.161864406779661, "grad_norm": 1.1820478439331055, "learning_rate": 3.063559322033899e-05, "loss": 2.7279, "step": 1371 }, { "epoch": 1.1627118644067798, "grad_norm": 1.2828441858291626, "learning_rate": 3.062146892655368e-05, "loss": 2.6271, "step": 1372 }, { "epoch": 1.1635593220338982, "grad_norm": 1.115065336227417, "learning_rate": 3.060734463276836e-05, "loss": 2.877, "step": 1373 }, { "epoch": 1.164406779661017, "grad_norm": 1.353116750717163, "learning_rate": 3.059322033898305e-05, "loss": 2.6163, "step": 1374 }, { "epoch": 1.1652542372881356, "grad_norm": 1.2320899963378906, "learning_rate": 3.057909604519774e-05, "loss": 2.7729, "step": 1375 }, { "epoch": 1.1661016949152543, "grad_norm": 1.1651616096496582, "learning_rate": 3.056497175141243e-05, "loss": 2.8106, "step": 1376 }, { "epoch": 1.166949152542373, "grad_norm": 1.0272785425186157, "learning_rate": 3.055084745762712e-05, "loss": 2.7614, "step": 1377 }, { "epoch": 1.1677966101694914, "grad_norm": 1.002989649772644, "learning_rate": 3.0536723163841805e-05, "loss": 2.7554, "step": 1378 }, { "epoch": 1.1686440677966101, "grad_norm": 1.0328543186187744, "learning_rate": 3.05225988700565e-05, "loss": 2.7662, "step": 1379 }, { "epoch": 1.1694915254237288, "grad_norm": 1.0183919668197632, "learning_rate": 3.050847457627119e-05, "loss": 2.8341, "step": 1380 }, { "epoch": 1.1703389830508475, "grad_norm": 1.1252543926239014, "learning_rate": 3.0494350282485878e-05, "loss": 2.5955, "step": 1381 }, { "epoch": 1.1711864406779662, "grad_norm": 0.9931685924530029, "learning_rate": 3.0480225988700568e-05, "loss": 2.65, "step": 1382 }, { "epoch": 1.1720338983050849, "grad_norm": 1.1650798320770264, "learning_rate": 3.0466101694915255e-05, "loss": 2.7463, "step": 1383 }, { "epoch": 1.1728813559322033, "grad_norm": 0.9627938270568848, "learning_rate": 3.0451977401129945e-05, "loss": 2.9753, "step": 1384 }, { "epoch": 1.173728813559322, "grad_norm": 1.136277437210083, "learning_rate": 3.043785310734463e-05, "loss": 2.7037, "step": 1385 }, { "epoch": 1.1745762711864407, "grad_norm": 1.0859447717666626, "learning_rate": 3.042372881355932e-05, "loss": 2.9777, "step": 1386 }, { "epoch": 1.1754237288135594, "grad_norm": 1.218129277229309, "learning_rate": 3.0409604519774015e-05, "loss": 2.7412, "step": 1387 }, { "epoch": 1.1762711864406779, "grad_norm": 0.9532760977745056, "learning_rate": 3.0395480225988705e-05, "loss": 2.8729, "step": 1388 }, { "epoch": 1.1771186440677965, "grad_norm": 1.3282712697982788, "learning_rate": 3.038135593220339e-05, "loss": 2.6393, "step": 1389 }, { "epoch": 1.1779661016949152, "grad_norm": 1.4244226217269897, "learning_rate": 3.036723163841808e-05, "loss": 2.7061, "step": 1390 }, { "epoch": 1.178813559322034, "grad_norm": 1.3524574041366577, "learning_rate": 3.035310734463277e-05, "loss": 2.7684, "step": 1391 }, { "epoch": 1.1796610169491526, "grad_norm": 1.028135895729065, "learning_rate": 3.0338983050847458e-05, "loss": 2.9048, "step": 1392 }, { "epoch": 1.1805084745762713, "grad_norm": 1.0645534992218018, "learning_rate": 3.0324858757062148e-05, "loss": 2.704, "step": 1393 }, { "epoch": 1.1813559322033897, "grad_norm": 1.1310391426086426, "learning_rate": 3.0310734463276834e-05, "loss": 2.7615, "step": 1394 }, { "epoch": 1.1822033898305084, "grad_norm": 0.9573028087615967, "learning_rate": 3.0296610169491528e-05, "loss": 2.8263, "step": 1395 }, { "epoch": 1.1830508474576271, "grad_norm": 1.1354314088821411, "learning_rate": 3.0282485875706218e-05, "loss": 2.6444, "step": 1396 }, { "epoch": 1.1838983050847458, "grad_norm": 0.9704809188842773, "learning_rate": 3.0268361581920908e-05, "loss": 2.9929, "step": 1397 }, { "epoch": 1.1847457627118645, "grad_norm": 1.1086372137069702, "learning_rate": 3.0254237288135594e-05, "loss": 2.6978, "step": 1398 }, { "epoch": 1.185593220338983, "grad_norm": 1.0527740716934204, "learning_rate": 3.0240112994350284e-05, "loss": 2.7222, "step": 1399 }, { "epoch": 1.1864406779661016, "grad_norm": 0.9511035084724426, "learning_rate": 3.022598870056497e-05, "loss": 2.8779, "step": 1400 }, { "epoch": 1.1872881355932203, "grad_norm": 0.953075110912323, "learning_rate": 3.021186440677966e-05, "loss": 2.8414, "step": 1401 }, { "epoch": 1.188135593220339, "grad_norm": 1.228217363357544, "learning_rate": 3.019774011299435e-05, "loss": 2.7516, "step": 1402 }, { "epoch": 1.1889830508474577, "grad_norm": 1.2182135581970215, "learning_rate": 3.0183615819209044e-05, "loss": 2.6564, "step": 1403 }, { "epoch": 1.1898305084745764, "grad_norm": 0.9400214552879333, "learning_rate": 3.016949152542373e-05, "loss": 2.7884, "step": 1404 }, { "epoch": 1.1906779661016949, "grad_norm": 1.2233799695968628, "learning_rate": 3.015536723163842e-05, "loss": 2.6467, "step": 1405 }, { "epoch": 1.1915254237288135, "grad_norm": 0.9913449883460999, "learning_rate": 3.0141242937853108e-05, "loss": 2.8703, "step": 1406 }, { "epoch": 1.1923728813559322, "grad_norm": 1.144369125366211, "learning_rate": 3.0127118644067798e-05, "loss": 2.6984, "step": 1407 }, { "epoch": 1.193220338983051, "grad_norm": 1.1918572187423706, "learning_rate": 3.0112994350282488e-05, "loss": 2.7068, "step": 1408 }, { "epoch": 1.1940677966101694, "grad_norm": 1.0861234664916992, "learning_rate": 3.0098870056497174e-05, "loss": 2.7732, "step": 1409 }, { "epoch": 1.194915254237288, "grad_norm": 0.9997703433036804, "learning_rate": 3.0084745762711864e-05, "loss": 2.8182, "step": 1410 }, { "epoch": 1.1957627118644067, "grad_norm": 1.2805825471878052, "learning_rate": 3.0070621468926558e-05, "loss": 2.6062, "step": 1411 }, { "epoch": 1.1966101694915254, "grad_norm": 1.077366590499878, "learning_rate": 3.0056497175141248e-05, "loss": 2.9351, "step": 1412 }, { "epoch": 1.1974576271186441, "grad_norm": 1.4923194646835327, "learning_rate": 3.0042372881355934e-05, "loss": 2.661, "step": 1413 }, { "epoch": 1.1983050847457628, "grad_norm": 1.1752442121505737, "learning_rate": 3.0028248587570624e-05, "loss": 2.6618, "step": 1414 }, { "epoch": 1.1991525423728813, "grad_norm": 1.1268277168273926, "learning_rate": 3.001412429378531e-05, "loss": 2.6549, "step": 1415 }, { "epoch": 1.2, "grad_norm": 1.1235272884368896, "learning_rate": 3e-05, "loss": 2.8397, "step": 1416 }, { "epoch": 1.2008474576271186, "grad_norm": 1.0348951816558838, "learning_rate": 2.9985875706214687e-05, "loss": 2.8437, "step": 1417 }, { "epoch": 1.2016949152542373, "grad_norm": 1.1336616277694702, "learning_rate": 2.9971751412429377e-05, "loss": 2.8073, "step": 1418 }, { "epoch": 1.202542372881356, "grad_norm": 1.147831916809082, "learning_rate": 2.995762711864407e-05, "loss": 2.6553, "step": 1419 }, { "epoch": 1.2033898305084745, "grad_norm": 1.1440467834472656, "learning_rate": 2.994350282485876e-05, "loss": 2.6407, "step": 1420 }, { "epoch": 1.2042372881355932, "grad_norm": 1.1262454986572266, "learning_rate": 2.9929378531073447e-05, "loss": 2.7065, "step": 1421 }, { "epoch": 1.2050847457627119, "grad_norm": 1.222533106803894, "learning_rate": 2.9915254237288137e-05, "loss": 2.8257, "step": 1422 }, { "epoch": 1.2059322033898305, "grad_norm": 1.0046762228012085, "learning_rate": 2.9901129943502827e-05, "loss": 2.7413, "step": 1423 }, { "epoch": 1.2067796610169492, "grad_norm": 0.9929599761962891, "learning_rate": 2.9887005649717514e-05, "loss": 2.9109, "step": 1424 }, { "epoch": 1.207627118644068, "grad_norm": 1.1053084135055542, "learning_rate": 2.9872881355932204e-05, "loss": 2.6662, "step": 1425 }, { "epoch": 1.2084745762711864, "grad_norm": 1.1630088090896606, "learning_rate": 2.985875706214689e-05, "loss": 2.7099, "step": 1426 }, { "epoch": 1.209322033898305, "grad_norm": 1.0490261316299438, "learning_rate": 2.9844632768361587e-05, "loss": 2.6698, "step": 1427 }, { "epoch": 1.2101694915254237, "grad_norm": 1.0740617513656616, "learning_rate": 2.9830508474576274e-05, "loss": 2.6712, "step": 1428 }, { "epoch": 1.2110169491525424, "grad_norm": 1.0406426191329956, "learning_rate": 2.9816384180790964e-05, "loss": 2.8546, "step": 1429 }, { "epoch": 1.211864406779661, "grad_norm": 1.1298630237579346, "learning_rate": 2.980225988700565e-05, "loss": 2.7401, "step": 1430 }, { "epoch": 1.2127118644067796, "grad_norm": 1.1068373918533325, "learning_rate": 2.978813559322034e-05, "loss": 2.8314, "step": 1431 }, { "epoch": 1.2135593220338983, "grad_norm": 1.4264651536941528, "learning_rate": 2.9774011299435027e-05, "loss": 2.575, "step": 1432 }, { "epoch": 1.214406779661017, "grad_norm": 1.017459750175476, "learning_rate": 2.9759887005649717e-05, "loss": 2.9322, "step": 1433 }, { "epoch": 1.2152542372881356, "grad_norm": 1.1577345132827759, "learning_rate": 2.9745762711864407e-05, "loss": 2.6906, "step": 1434 }, { "epoch": 1.2161016949152543, "grad_norm": 1.0599602460861206, "learning_rate": 2.97316384180791e-05, "loss": 2.9082, "step": 1435 }, { "epoch": 1.2169491525423728, "grad_norm": 1.1662206649780273, "learning_rate": 2.9717514124293787e-05, "loss": 2.5966, "step": 1436 }, { "epoch": 1.2177966101694915, "grad_norm": 1.1501646041870117, "learning_rate": 2.9703389830508477e-05, "loss": 2.7142, "step": 1437 }, { "epoch": 1.2186440677966102, "grad_norm": 1.2464590072631836, "learning_rate": 2.9689265536723164e-05, "loss": 2.7361, "step": 1438 }, { "epoch": 1.2194915254237289, "grad_norm": 1.0664721727371216, "learning_rate": 2.9675141242937854e-05, "loss": 2.8738, "step": 1439 }, { "epoch": 1.2203389830508475, "grad_norm": 1.086744785308838, "learning_rate": 2.9661016949152544e-05, "loss": 2.7444, "step": 1440 }, { "epoch": 1.221186440677966, "grad_norm": 1.0933836698532104, "learning_rate": 2.964689265536723e-05, "loss": 2.6999, "step": 1441 }, { "epoch": 1.2220338983050847, "grad_norm": 0.9780147075653076, "learning_rate": 2.963276836158192e-05, "loss": 2.8808, "step": 1442 }, { "epoch": 1.2228813559322034, "grad_norm": 1.0730159282684326, "learning_rate": 2.9618644067796614e-05, "loss": 2.9024, "step": 1443 }, { "epoch": 1.223728813559322, "grad_norm": 0.9952771663665771, "learning_rate": 2.9604519774011304e-05, "loss": 2.8256, "step": 1444 }, { "epoch": 1.2245762711864407, "grad_norm": 1.0768945217132568, "learning_rate": 2.959039548022599e-05, "loss": 2.7651, "step": 1445 }, { "epoch": 1.2254237288135594, "grad_norm": 0.9231528639793396, "learning_rate": 2.957627118644068e-05, "loss": 2.9463, "step": 1446 }, { "epoch": 1.226271186440678, "grad_norm": 1.3907428979873657, "learning_rate": 2.9562146892655367e-05, "loss": 2.6094, "step": 1447 }, { "epoch": 1.2271186440677966, "grad_norm": 1.030638575553894, "learning_rate": 2.9548022598870057e-05, "loss": 3.032, "step": 1448 }, { "epoch": 1.2279661016949153, "grad_norm": 1.017385721206665, "learning_rate": 2.9533898305084743e-05, "loss": 2.8845, "step": 1449 }, { "epoch": 1.228813559322034, "grad_norm": 1.0024408102035522, "learning_rate": 2.951977401129944e-05, "loss": 3.0054, "step": 1450 }, { "epoch": 1.2296610169491524, "grad_norm": 1.0980383157730103, "learning_rate": 2.9505649717514127e-05, "loss": 2.8692, "step": 1451 }, { "epoch": 1.230508474576271, "grad_norm": 1.1808464527130127, "learning_rate": 2.9491525423728817e-05, "loss": 2.8447, "step": 1452 }, { "epoch": 1.2313559322033898, "grad_norm": 0.9598093628883362, "learning_rate": 2.9477401129943503e-05, "loss": 2.7909, "step": 1453 }, { "epoch": 1.2322033898305085, "grad_norm": 1.0515165328979492, "learning_rate": 2.9463276836158193e-05, "loss": 2.8163, "step": 1454 }, { "epoch": 1.2330508474576272, "grad_norm": 1.0463999509811401, "learning_rate": 2.9449152542372883e-05, "loss": 2.8476, "step": 1455 }, { "epoch": 1.2338983050847459, "grad_norm": 0.9163907170295715, "learning_rate": 2.943502824858757e-05, "loss": 2.7353, "step": 1456 }, { "epoch": 1.2347457627118643, "grad_norm": 0.9019412398338318, "learning_rate": 2.942090395480226e-05, "loss": 2.7821, "step": 1457 }, { "epoch": 1.235593220338983, "grad_norm": 1.192559003829956, "learning_rate": 2.9406779661016953e-05, "loss": 2.6194, "step": 1458 }, { "epoch": 1.2364406779661017, "grad_norm": 1.111675500869751, "learning_rate": 2.9392655367231643e-05, "loss": 2.7556, "step": 1459 }, { "epoch": 1.2372881355932204, "grad_norm": 0.9991422891616821, "learning_rate": 2.937853107344633e-05, "loss": 2.8622, "step": 1460 }, { "epoch": 1.238135593220339, "grad_norm": 0.9809384346008301, "learning_rate": 2.936440677966102e-05, "loss": 2.742, "step": 1461 }, { "epoch": 1.2389830508474575, "grad_norm": 1.0809677839279175, "learning_rate": 2.9350282485875707e-05, "loss": 2.7372, "step": 1462 }, { "epoch": 1.2398305084745762, "grad_norm": 1.1370301246643066, "learning_rate": 2.9336158192090397e-05, "loss": 2.8335, "step": 1463 }, { "epoch": 1.240677966101695, "grad_norm": 1.2629517316818237, "learning_rate": 2.9322033898305083e-05, "loss": 2.6227, "step": 1464 }, { "epoch": 1.2415254237288136, "grad_norm": 1.0647246837615967, "learning_rate": 2.9307909604519773e-05, "loss": 2.7986, "step": 1465 }, { "epoch": 1.2423728813559323, "grad_norm": 1.108402132987976, "learning_rate": 2.9293785310734467e-05, "loss": 2.7282, "step": 1466 }, { "epoch": 1.243220338983051, "grad_norm": 1.4929739236831665, "learning_rate": 2.9279661016949157e-05, "loss": 2.5842, "step": 1467 }, { "epoch": 1.2440677966101694, "grad_norm": 0.9982267618179321, "learning_rate": 2.9265536723163843e-05, "loss": 2.7141, "step": 1468 }, { "epoch": 1.244915254237288, "grad_norm": 1.1600935459136963, "learning_rate": 2.9251412429378533e-05, "loss": 2.6322, "step": 1469 }, { "epoch": 1.2457627118644068, "grad_norm": 1.0451264381408691, "learning_rate": 2.9237288135593223e-05, "loss": 2.823, "step": 1470 }, { "epoch": 1.2466101694915255, "grad_norm": 1.1497570276260376, "learning_rate": 2.922316384180791e-05, "loss": 2.7282, "step": 1471 }, { "epoch": 1.2474576271186442, "grad_norm": 1.070095181465149, "learning_rate": 2.92090395480226e-05, "loss": 2.8049, "step": 1472 }, { "epoch": 1.2483050847457626, "grad_norm": 0.9942064881324768, "learning_rate": 2.9194915254237286e-05, "loss": 2.8992, "step": 1473 }, { "epoch": 1.2491525423728813, "grad_norm": 1.1197259426116943, "learning_rate": 2.918079096045198e-05, "loss": 2.7402, "step": 1474 }, { "epoch": 1.25, "grad_norm": 1.0887670516967773, "learning_rate": 2.916666666666667e-05, "loss": 2.7317, "step": 1475 }, { "epoch": 1.2508474576271187, "grad_norm": 1.061383605003357, "learning_rate": 2.915254237288136e-05, "loss": 2.7887, "step": 1476 }, { "epoch": 1.2516949152542374, "grad_norm": 1.5140694379806519, "learning_rate": 2.9138418079096046e-05, "loss": 2.5884, "step": 1477 }, { "epoch": 1.252542372881356, "grad_norm": 1.155662178993225, "learning_rate": 2.9124293785310736e-05, "loss": 2.6443, "step": 1478 }, { "epoch": 1.2533898305084745, "grad_norm": 1.0662916898727417, "learning_rate": 2.9110169491525423e-05, "loss": 2.8724, "step": 1479 }, { "epoch": 1.2542372881355932, "grad_norm": 1.0730879306793213, "learning_rate": 2.9096045197740113e-05, "loss": 2.7966, "step": 1480 }, { "epoch": 1.255084745762712, "grad_norm": 1.324304223060608, "learning_rate": 2.9081920903954803e-05, "loss": 2.5592, "step": 1481 }, { "epoch": 1.2559322033898306, "grad_norm": 1.211549162864685, "learning_rate": 2.9067796610169496e-05, "loss": 2.6806, "step": 1482 }, { "epoch": 1.256779661016949, "grad_norm": 1.0320687294006348, "learning_rate": 2.9053672316384183e-05, "loss": 2.7471, "step": 1483 }, { "epoch": 1.2576271186440677, "grad_norm": 1.028415322303772, "learning_rate": 2.9039548022598873e-05, "loss": 2.7035, "step": 1484 }, { "epoch": 1.2584745762711864, "grad_norm": 1.4593757390975952, "learning_rate": 2.902542372881356e-05, "loss": 2.7799, "step": 1485 }, { "epoch": 1.259322033898305, "grad_norm": 0.9076970219612122, "learning_rate": 2.901129943502825e-05, "loss": 2.8766, "step": 1486 }, { "epoch": 1.2601694915254238, "grad_norm": 1.0136754512786865, "learning_rate": 2.899717514124294e-05, "loss": 2.9435, "step": 1487 }, { "epoch": 1.2610169491525425, "grad_norm": 1.1704306602478027, "learning_rate": 2.8983050847457626e-05, "loss": 2.7245, "step": 1488 }, { "epoch": 1.261864406779661, "grad_norm": 1.155551552772522, "learning_rate": 2.8968926553672316e-05, "loss": 2.5759, "step": 1489 }, { "epoch": 1.2627118644067796, "grad_norm": 1.2033894062042236, "learning_rate": 2.895480225988701e-05, "loss": 2.6947, "step": 1490 }, { "epoch": 1.2635593220338983, "grad_norm": 1.052081823348999, "learning_rate": 2.89406779661017e-05, "loss": 2.6866, "step": 1491 }, { "epoch": 1.264406779661017, "grad_norm": 1.2990190982818604, "learning_rate": 2.8926553672316386e-05, "loss": 2.6354, "step": 1492 }, { "epoch": 1.2652542372881355, "grad_norm": 0.9295713901519775, "learning_rate": 2.8912429378531076e-05, "loss": 2.7963, "step": 1493 }, { "epoch": 1.2661016949152541, "grad_norm": 1.1894383430480957, "learning_rate": 2.8898305084745763e-05, "loss": 2.5809, "step": 1494 }, { "epoch": 1.2669491525423728, "grad_norm": 1.1730421781539917, "learning_rate": 2.8884180790960453e-05, "loss": 2.5862, "step": 1495 }, { "epoch": 1.2677966101694915, "grad_norm": 1.0560240745544434, "learning_rate": 2.887005649717514e-05, "loss": 2.8433, "step": 1496 }, { "epoch": 1.2686440677966102, "grad_norm": 1.52442467212677, "learning_rate": 2.885593220338983e-05, "loss": 2.4808, "step": 1497 }, { "epoch": 1.269491525423729, "grad_norm": 1.186542272567749, "learning_rate": 2.8841807909604523e-05, "loss": 2.7571, "step": 1498 }, { "epoch": 1.2703389830508476, "grad_norm": 1.104839563369751, "learning_rate": 2.8827683615819213e-05, "loss": 2.879, "step": 1499 }, { "epoch": 1.271186440677966, "grad_norm": 1.1865556240081787, "learning_rate": 2.88135593220339e-05, "loss": 2.806, "step": 1500 }, { "epoch": 1.2720338983050847, "grad_norm": 1.2355942726135254, "learning_rate": 2.879943502824859e-05, "loss": 2.7078, "step": 1501 }, { "epoch": 1.2728813559322034, "grad_norm": 1.0432177782058716, "learning_rate": 2.878531073446328e-05, "loss": 2.7354, "step": 1502 }, { "epoch": 1.273728813559322, "grad_norm": 0.9389800429344177, "learning_rate": 2.8771186440677966e-05, "loss": 2.9527, "step": 1503 }, { "epoch": 1.2745762711864406, "grad_norm": 1.133212685585022, "learning_rate": 2.8757062146892656e-05, "loss": 2.935, "step": 1504 }, { "epoch": 1.2754237288135593, "grad_norm": 1.0598336458206177, "learning_rate": 2.8742937853107342e-05, "loss": 2.7303, "step": 1505 }, { "epoch": 1.276271186440678, "grad_norm": 1.124864935874939, "learning_rate": 2.8728813559322036e-05, "loss": 2.849, "step": 1506 }, { "epoch": 1.2771186440677966, "grad_norm": 0.9836654663085938, "learning_rate": 2.8714689265536726e-05, "loss": 2.9168, "step": 1507 }, { "epoch": 1.2779661016949153, "grad_norm": 1.2238267660140991, "learning_rate": 2.8700564971751416e-05, "loss": 2.7461, "step": 1508 }, { "epoch": 1.278813559322034, "grad_norm": 1.1149492263793945, "learning_rate": 2.8686440677966102e-05, "loss": 2.7516, "step": 1509 }, { "epoch": 1.2796610169491525, "grad_norm": 1.0936237573623657, "learning_rate": 2.8672316384180792e-05, "loss": 2.7713, "step": 1510 }, { "epoch": 1.2805084745762711, "grad_norm": 1.239094614982605, "learning_rate": 2.865819209039548e-05, "loss": 2.6343, "step": 1511 }, { "epoch": 1.2813559322033898, "grad_norm": 1.0122066736221313, "learning_rate": 2.864406779661017e-05, "loss": 2.7883, "step": 1512 }, { "epoch": 1.2822033898305085, "grad_norm": 1.1487418413162231, "learning_rate": 2.862994350282486e-05, "loss": 2.5364, "step": 1513 }, { "epoch": 1.283050847457627, "grad_norm": 1.1750707626342773, "learning_rate": 2.8615819209039552e-05, "loss": 2.8201, "step": 1514 }, { "epoch": 1.2838983050847457, "grad_norm": 0.9801908731460571, "learning_rate": 2.860169491525424e-05, "loss": 2.6655, "step": 1515 }, { "epoch": 1.2847457627118644, "grad_norm": 1.0138574838638306, "learning_rate": 2.858757062146893e-05, "loss": 2.8752, "step": 1516 }, { "epoch": 1.285593220338983, "grad_norm": 1.3303834199905396, "learning_rate": 2.8573446327683616e-05, "loss": 2.8182, "step": 1517 }, { "epoch": 1.2864406779661017, "grad_norm": 1.0959980487823486, "learning_rate": 2.8559322033898306e-05, "loss": 2.7536, "step": 1518 }, { "epoch": 1.2872881355932204, "grad_norm": 1.2038679122924805, "learning_rate": 2.8545197740112996e-05, "loss": 2.7121, "step": 1519 }, { "epoch": 1.288135593220339, "grad_norm": 0.9292485117912292, "learning_rate": 2.8531073446327682e-05, "loss": 2.9412, "step": 1520 }, { "epoch": 1.2889830508474576, "grad_norm": 1.2763807773590088, "learning_rate": 2.8516949152542372e-05, "loss": 2.8264, "step": 1521 }, { "epoch": 1.2898305084745763, "grad_norm": 1.2033941745758057, "learning_rate": 2.8502824858757066e-05, "loss": 2.5891, "step": 1522 }, { "epoch": 1.290677966101695, "grad_norm": 0.9282636046409607, "learning_rate": 2.8488700564971756e-05, "loss": 2.8097, "step": 1523 }, { "epoch": 1.2915254237288136, "grad_norm": 1.1813933849334717, "learning_rate": 2.8474576271186442e-05, "loss": 2.7252, "step": 1524 }, { "epoch": 1.292372881355932, "grad_norm": 1.190480351448059, "learning_rate": 2.8460451977401132e-05, "loss": 2.5932, "step": 1525 }, { "epoch": 1.2932203389830508, "grad_norm": 1.1157951354980469, "learning_rate": 2.844632768361582e-05, "loss": 2.679, "step": 1526 }, { "epoch": 1.2940677966101695, "grad_norm": 1.2422996759414673, "learning_rate": 2.843220338983051e-05, "loss": 2.7436, "step": 1527 }, { "epoch": 1.2949152542372881, "grad_norm": 1.383557677268982, "learning_rate": 2.8418079096045195e-05, "loss": 2.4242, "step": 1528 }, { "epoch": 1.2957627118644068, "grad_norm": 1.2432738542556763, "learning_rate": 2.8403954802259892e-05, "loss": 2.6603, "step": 1529 }, { "epoch": 1.2966101694915255, "grad_norm": 1.155629277229309, "learning_rate": 2.838983050847458e-05, "loss": 2.8567, "step": 1530 }, { "epoch": 1.297457627118644, "grad_norm": 1.200217604637146, "learning_rate": 2.837570621468927e-05, "loss": 2.6833, "step": 1531 }, { "epoch": 1.2983050847457627, "grad_norm": 1.031434178352356, "learning_rate": 2.8361581920903955e-05, "loss": 2.7, "step": 1532 }, { "epoch": 1.2991525423728814, "grad_norm": 1.168958306312561, "learning_rate": 2.8347457627118645e-05, "loss": 2.6607, "step": 1533 }, { "epoch": 1.3, "grad_norm": 0.9971181750297546, "learning_rate": 2.8333333333333335e-05, "loss": 2.7787, "step": 1534 }, { "epoch": 1.3008474576271185, "grad_norm": 1.00819993019104, "learning_rate": 2.8319209039548022e-05, "loss": 2.8582, "step": 1535 }, { "epoch": 1.3016949152542372, "grad_norm": 1.047484040260315, "learning_rate": 2.8305084745762712e-05, "loss": 2.6924, "step": 1536 }, { "epoch": 1.3025423728813559, "grad_norm": 1.1539669036865234, "learning_rate": 2.8290960451977405e-05, "loss": 2.8083, "step": 1537 }, { "epoch": 1.3033898305084746, "grad_norm": 1.2319265604019165, "learning_rate": 2.8276836158192095e-05, "loss": 2.8651, "step": 1538 }, { "epoch": 1.3042372881355933, "grad_norm": 0.9315810203552246, "learning_rate": 2.8262711864406782e-05, "loss": 2.8533, "step": 1539 }, { "epoch": 1.305084745762712, "grad_norm": 1.0935174226760864, "learning_rate": 2.8248587570621472e-05, "loss": 2.6608, "step": 1540 }, { "epoch": 1.3059322033898306, "grad_norm": 1.512353539466858, "learning_rate": 2.823446327683616e-05, "loss": 2.4215, "step": 1541 }, { "epoch": 1.306779661016949, "grad_norm": 1.205745816230774, "learning_rate": 2.822033898305085e-05, "loss": 2.7931, "step": 1542 }, { "epoch": 1.3076271186440678, "grad_norm": 1.1302180290222168, "learning_rate": 2.8206214689265535e-05, "loss": 2.7272, "step": 1543 }, { "epoch": 1.3084745762711865, "grad_norm": 1.0707296133041382, "learning_rate": 2.8192090395480225e-05, "loss": 2.8232, "step": 1544 }, { "epoch": 1.3093220338983051, "grad_norm": 1.346828818321228, "learning_rate": 2.817796610169492e-05, "loss": 2.5666, "step": 1545 }, { "epoch": 1.3101694915254236, "grad_norm": 1.23905611038208, "learning_rate": 2.816384180790961e-05, "loss": 2.6975, "step": 1546 }, { "epoch": 1.3110169491525423, "grad_norm": 0.95879065990448, "learning_rate": 2.8149717514124295e-05, "loss": 2.9588, "step": 1547 }, { "epoch": 1.311864406779661, "grad_norm": 1.173724889755249, "learning_rate": 2.8135593220338985e-05, "loss": 2.7457, "step": 1548 }, { "epoch": 1.3127118644067797, "grad_norm": 1.2177459001541138, "learning_rate": 2.8121468926553675e-05, "loss": 2.5301, "step": 1549 }, { "epoch": 1.3135593220338984, "grad_norm": 1.3209285736083984, "learning_rate": 2.8107344632768362e-05, "loss": 2.6206, "step": 1550 }, { "epoch": 1.314406779661017, "grad_norm": 1.108282446861267, "learning_rate": 2.8093220338983052e-05, "loss": 2.6941, "step": 1551 }, { "epoch": 1.3152542372881357, "grad_norm": 1.2630671262741089, "learning_rate": 2.807909604519774e-05, "loss": 2.6299, "step": 1552 }, { "epoch": 1.3161016949152542, "grad_norm": 1.1710407733917236, "learning_rate": 2.8064971751412432e-05, "loss": 2.6817, "step": 1553 }, { "epoch": 1.3169491525423729, "grad_norm": 1.0206531286239624, "learning_rate": 2.8050847457627122e-05, "loss": 2.7454, "step": 1554 }, { "epoch": 1.3177966101694916, "grad_norm": 1.2598350048065186, "learning_rate": 2.803672316384181e-05, "loss": 2.7229, "step": 1555 }, { "epoch": 1.31864406779661, "grad_norm": 1.4363096952438354, "learning_rate": 2.8022598870056498e-05, "loss": 2.5569, "step": 1556 }, { "epoch": 1.3194915254237287, "grad_norm": 1.116281509399414, "learning_rate": 2.8008474576271188e-05, "loss": 2.6604, "step": 1557 }, { "epoch": 1.3203389830508474, "grad_norm": 1.04258131980896, "learning_rate": 2.7994350282485875e-05, "loss": 2.7203, "step": 1558 }, { "epoch": 1.321186440677966, "grad_norm": 1.1648573875427246, "learning_rate": 2.7980225988700565e-05, "loss": 2.5303, "step": 1559 }, { "epoch": 1.3220338983050848, "grad_norm": 1.2743134498596191, "learning_rate": 2.7966101694915255e-05, "loss": 2.5215, "step": 1560 }, { "epoch": 1.3228813559322035, "grad_norm": 0.9022263884544373, "learning_rate": 2.7951977401129948e-05, "loss": 2.8265, "step": 1561 }, { "epoch": 1.3237288135593221, "grad_norm": 0.9590513110160828, "learning_rate": 2.7937853107344635e-05, "loss": 2.7795, "step": 1562 }, { "epoch": 1.3245762711864406, "grad_norm": 1.5466468334197998, "learning_rate": 2.7923728813559325e-05, "loss": 2.6883, "step": 1563 }, { "epoch": 1.3254237288135593, "grad_norm": 0.9928246140480042, "learning_rate": 2.790960451977401e-05, "loss": 2.9273, "step": 1564 }, { "epoch": 1.326271186440678, "grad_norm": 1.2357574701309204, "learning_rate": 2.78954802259887e-05, "loss": 2.893, "step": 1565 }, { "epoch": 1.3271186440677967, "grad_norm": 0.9751428365707397, "learning_rate": 2.788135593220339e-05, "loss": 2.885, "step": 1566 }, { "epoch": 1.3279661016949151, "grad_norm": 1.1124868392944336, "learning_rate": 2.7867231638418078e-05, "loss": 2.8267, "step": 1567 }, { "epoch": 1.3288135593220338, "grad_norm": 1.1400896310806274, "learning_rate": 2.7853107344632768e-05, "loss": 2.7506, "step": 1568 }, { "epoch": 1.3296610169491525, "grad_norm": 1.4547759294509888, "learning_rate": 2.783898305084746e-05, "loss": 2.5548, "step": 1569 }, { "epoch": 1.3305084745762712, "grad_norm": 1.1902233362197876, "learning_rate": 2.782485875706215e-05, "loss": 2.5925, "step": 1570 }, { "epoch": 1.3313559322033899, "grad_norm": 1.2671072483062744, "learning_rate": 2.7810734463276838e-05, "loss": 2.6583, "step": 1571 }, { "epoch": 1.3322033898305086, "grad_norm": 1.1155524253845215, "learning_rate": 2.7796610169491528e-05, "loss": 2.7445, "step": 1572 }, { "epoch": 1.3330508474576273, "grad_norm": 1.1112772226333618, "learning_rate": 2.7782485875706215e-05, "loss": 2.7136, "step": 1573 }, { "epoch": 1.3338983050847457, "grad_norm": 1.033073902130127, "learning_rate": 2.7768361581920905e-05, "loss": 2.7376, "step": 1574 }, { "epoch": 1.3347457627118644, "grad_norm": 1.1341297626495361, "learning_rate": 2.775423728813559e-05, "loss": 2.6522, "step": 1575 }, { "epoch": 1.335593220338983, "grad_norm": 1.0817641019821167, "learning_rate": 2.774011299435028e-05, "loss": 2.7918, "step": 1576 }, { "epoch": 1.3364406779661018, "grad_norm": 1.2469141483306885, "learning_rate": 2.7725988700564975e-05, "loss": 2.7048, "step": 1577 }, { "epoch": 1.3372881355932202, "grad_norm": 1.068687915802002, "learning_rate": 2.7711864406779665e-05, "loss": 2.54, "step": 1578 }, { "epoch": 1.338135593220339, "grad_norm": 1.1469836235046387, "learning_rate": 2.769774011299435e-05, "loss": 2.5623, "step": 1579 }, { "epoch": 1.3389830508474576, "grad_norm": 0.9954297542572021, "learning_rate": 2.768361581920904e-05, "loss": 2.8458, "step": 1580 }, { "epoch": 1.3398305084745763, "grad_norm": 1.0572340488433838, "learning_rate": 2.766949152542373e-05, "loss": 2.7111, "step": 1581 }, { "epoch": 1.340677966101695, "grad_norm": 1.21023690700531, "learning_rate": 2.7655367231638418e-05, "loss": 2.62, "step": 1582 }, { "epoch": 1.3415254237288137, "grad_norm": 1.0304431915283203, "learning_rate": 2.7641242937853108e-05, "loss": 2.8045, "step": 1583 }, { "epoch": 1.3423728813559321, "grad_norm": 1.6247771978378296, "learning_rate": 2.7627118644067794e-05, "loss": 2.6552, "step": 1584 }, { "epoch": 1.3432203389830508, "grad_norm": 1.2078450918197632, "learning_rate": 2.7612994350282488e-05, "loss": 2.8752, "step": 1585 }, { "epoch": 1.3440677966101695, "grad_norm": 1.1666221618652344, "learning_rate": 2.7598870056497178e-05, "loss": 2.7792, "step": 1586 }, { "epoch": 1.3449152542372882, "grad_norm": 1.1695728302001953, "learning_rate": 2.7584745762711868e-05, "loss": 2.8091, "step": 1587 }, { "epoch": 1.3457627118644067, "grad_norm": 1.0968023538589478, "learning_rate": 2.7570621468926554e-05, "loss": 2.8666, "step": 1588 }, { "epoch": 1.3466101694915253, "grad_norm": 1.167147159576416, "learning_rate": 2.7556497175141244e-05, "loss": 2.7643, "step": 1589 }, { "epoch": 1.347457627118644, "grad_norm": 1.1259609460830688, "learning_rate": 2.754237288135593e-05, "loss": 2.8463, "step": 1590 }, { "epoch": 1.3483050847457627, "grad_norm": 1.1308674812316895, "learning_rate": 2.752824858757062e-05, "loss": 2.788, "step": 1591 }, { "epoch": 1.3491525423728814, "grad_norm": 1.2447667121887207, "learning_rate": 2.751412429378531e-05, "loss": 2.6729, "step": 1592 }, { "epoch": 1.35, "grad_norm": 0.9429620504379272, "learning_rate": 2.7500000000000004e-05, "loss": 2.8734, "step": 1593 }, { "epoch": 1.3508474576271188, "grad_norm": 1.1183481216430664, "learning_rate": 2.748587570621469e-05, "loss": 2.7005, "step": 1594 }, { "epoch": 1.3516949152542372, "grad_norm": 1.1970775127410889, "learning_rate": 2.747175141242938e-05, "loss": 2.7509, "step": 1595 }, { "epoch": 1.352542372881356, "grad_norm": 1.1498631238937378, "learning_rate": 2.7457627118644068e-05, "loss": 2.7412, "step": 1596 }, { "epoch": 1.3533898305084746, "grad_norm": 1.047776222229004, "learning_rate": 2.7443502824858758e-05, "loss": 2.8804, "step": 1597 }, { "epoch": 1.3542372881355933, "grad_norm": 1.227165699005127, "learning_rate": 2.7429378531073448e-05, "loss": 2.7615, "step": 1598 }, { "epoch": 1.3550847457627118, "grad_norm": 1.1898428201675415, "learning_rate": 2.7415254237288134e-05, "loss": 2.8409, "step": 1599 }, { "epoch": 1.3559322033898304, "grad_norm": 1.0390897989273071, "learning_rate": 2.7401129943502824e-05, "loss": 2.687, "step": 1600 }, { "epoch": 1.3567796610169491, "grad_norm": 1.0258177518844604, "learning_rate": 2.7387005649717518e-05, "loss": 2.7485, "step": 1601 }, { "epoch": 1.3576271186440678, "grad_norm": 1.0979821681976318, "learning_rate": 2.7372881355932208e-05, "loss": 2.901, "step": 1602 }, { "epoch": 1.3584745762711865, "grad_norm": 1.1121689081192017, "learning_rate": 2.7358757062146894e-05, "loss": 2.8018, "step": 1603 }, { "epoch": 1.3593220338983052, "grad_norm": 1.0678071975708008, "learning_rate": 2.7344632768361584e-05, "loss": 2.8211, "step": 1604 }, { "epoch": 1.3601694915254237, "grad_norm": 0.9185408353805542, "learning_rate": 2.733050847457627e-05, "loss": 2.9341, "step": 1605 }, { "epoch": 1.3610169491525423, "grad_norm": 1.3406561613082886, "learning_rate": 2.731638418079096e-05, "loss": 2.6606, "step": 1606 }, { "epoch": 1.361864406779661, "grad_norm": 1.0097061395645142, "learning_rate": 2.7302259887005647e-05, "loss": 2.8991, "step": 1607 }, { "epoch": 1.3627118644067797, "grad_norm": 1.1009202003479004, "learning_rate": 2.7288135593220337e-05, "loss": 2.6984, "step": 1608 }, { "epoch": 1.3635593220338982, "grad_norm": 1.0728604793548584, "learning_rate": 2.727401129943503e-05, "loss": 2.8817, "step": 1609 }, { "epoch": 1.3644067796610169, "grad_norm": 1.459185004234314, "learning_rate": 2.725988700564972e-05, "loss": 2.5627, "step": 1610 }, { "epoch": 1.3652542372881356, "grad_norm": 1.506783366203308, "learning_rate": 2.7245762711864407e-05, "loss": 2.4316, "step": 1611 }, { "epoch": 1.3661016949152542, "grad_norm": 1.1802157163619995, "learning_rate": 2.7231638418079097e-05, "loss": 2.6748, "step": 1612 }, { "epoch": 1.366949152542373, "grad_norm": 0.9531373381614685, "learning_rate": 2.7217514124293787e-05, "loss": 2.8042, "step": 1613 }, { "epoch": 1.3677966101694916, "grad_norm": 1.0940016508102417, "learning_rate": 2.7203389830508474e-05, "loss": 2.7417, "step": 1614 }, { "epoch": 1.3686440677966103, "grad_norm": 1.1369091272354126, "learning_rate": 2.7189265536723164e-05, "loss": 2.742, "step": 1615 }, { "epoch": 1.3694915254237288, "grad_norm": 1.4163222312927246, "learning_rate": 2.7175141242937857e-05, "loss": 2.6235, "step": 1616 }, { "epoch": 1.3703389830508474, "grad_norm": 1.1588315963745117, "learning_rate": 2.7161016949152547e-05, "loss": 2.6688, "step": 1617 }, { "epoch": 1.3711864406779661, "grad_norm": 1.1556146144866943, "learning_rate": 2.7146892655367234e-05, "loss": 2.594, "step": 1618 }, { "epoch": 1.3720338983050848, "grad_norm": 1.055390477180481, "learning_rate": 2.7132768361581924e-05, "loss": 2.7697, "step": 1619 }, { "epoch": 1.3728813559322033, "grad_norm": 0.9598971009254456, "learning_rate": 2.711864406779661e-05, "loss": 2.7616, "step": 1620 }, { "epoch": 1.373728813559322, "grad_norm": 1.347302794456482, "learning_rate": 2.71045197740113e-05, "loss": 2.5287, "step": 1621 }, { "epoch": 1.3745762711864407, "grad_norm": 1.150273084640503, "learning_rate": 2.7090395480225987e-05, "loss": 2.6823, "step": 1622 }, { "epoch": 1.3754237288135593, "grad_norm": 1.1114012002944946, "learning_rate": 2.7076271186440677e-05, "loss": 2.708, "step": 1623 }, { "epoch": 1.376271186440678, "grad_norm": 1.2295079231262207, "learning_rate": 2.706214689265537e-05, "loss": 2.7309, "step": 1624 }, { "epoch": 1.3771186440677967, "grad_norm": 1.1405549049377441, "learning_rate": 2.704802259887006e-05, "loss": 2.7787, "step": 1625 }, { "epoch": 1.3779661016949152, "grad_norm": 0.976355254650116, "learning_rate": 2.7033898305084747e-05, "loss": 2.7419, "step": 1626 }, { "epoch": 1.3788135593220339, "grad_norm": 1.122742772102356, "learning_rate": 2.7019774011299437e-05, "loss": 2.7403, "step": 1627 }, { "epoch": 1.3796610169491526, "grad_norm": 1.099894404411316, "learning_rate": 2.7005649717514127e-05, "loss": 2.7016, "step": 1628 }, { "epoch": 1.3805084745762712, "grad_norm": 1.1797363758087158, "learning_rate": 2.6991525423728814e-05, "loss": 2.6439, "step": 1629 }, { "epoch": 1.3813559322033897, "grad_norm": 1.2248269319534302, "learning_rate": 2.6977401129943504e-05, "loss": 2.8427, "step": 1630 }, { "epoch": 1.3822033898305084, "grad_norm": 1.2509466409683228, "learning_rate": 2.696327683615819e-05, "loss": 2.6564, "step": 1631 }, { "epoch": 1.383050847457627, "grad_norm": 1.3079769611358643, "learning_rate": 2.6949152542372884e-05, "loss": 2.6483, "step": 1632 }, { "epoch": 1.3838983050847458, "grad_norm": 1.0489200353622437, "learning_rate": 2.6935028248587574e-05, "loss": 2.7253, "step": 1633 }, { "epoch": 1.3847457627118644, "grad_norm": 1.2310895919799805, "learning_rate": 2.6920903954802264e-05, "loss": 2.793, "step": 1634 }, { "epoch": 1.3855932203389831, "grad_norm": 1.0037175416946411, "learning_rate": 2.690677966101695e-05, "loss": 3.0587, "step": 1635 }, { "epoch": 1.3864406779661018, "grad_norm": 1.1418050527572632, "learning_rate": 2.689265536723164e-05, "loss": 2.7489, "step": 1636 }, { "epoch": 1.3872881355932203, "grad_norm": 0.8957945108413696, "learning_rate": 2.6878531073446327e-05, "loss": 3.0793, "step": 1637 }, { "epoch": 1.388135593220339, "grad_norm": 1.1641825437545776, "learning_rate": 2.6864406779661017e-05, "loss": 2.8238, "step": 1638 }, { "epoch": 1.3889830508474577, "grad_norm": 1.217188835144043, "learning_rate": 2.6850282485875707e-05, "loss": 2.6698, "step": 1639 }, { "epoch": 1.3898305084745763, "grad_norm": 1.331202507019043, "learning_rate": 2.68361581920904e-05, "loss": 2.7459, "step": 1640 }, { "epoch": 1.3906779661016948, "grad_norm": 0.9791933298110962, "learning_rate": 2.6822033898305087e-05, "loss": 2.7336, "step": 1641 }, { "epoch": 1.3915254237288135, "grad_norm": 1.183793306350708, "learning_rate": 2.6807909604519777e-05, "loss": 2.7254, "step": 1642 }, { "epoch": 1.3923728813559322, "grad_norm": 1.0322171449661255, "learning_rate": 2.6793785310734463e-05, "loss": 2.8907, "step": 1643 }, { "epoch": 1.3932203389830509, "grad_norm": 1.5175987482070923, "learning_rate": 2.6779661016949153e-05, "loss": 2.6094, "step": 1644 }, { "epoch": 1.3940677966101696, "grad_norm": 1.1093040704727173, "learning_rate": 2.6765536723163843e-05, "loss": 2.7547, "step": 1645 }, { "epoch": 1.3949152542372882, "grad_norm": 1.0322200059890747, "learning_rate": 2.675141242937853e-05, "loss": 2.8324, "step": 1646 }, { "epoch": 1.3957627118644067, "grad_norm": 1.0330455303192139, "learning_rate": 2.673728813559322e-05, "loss": 2.7325, "step": 1647 }, { "epoch": 1.3966101694915254, "grad_norm": 1.1793253421783447, "learning_rate": 2.6723163841807913e-05, "loss": 2.6088, "step": 1648 }, { "epoch": 1.397457627118644, "grad_norm": 1.4393837451934814, "learning_rate": 2.6709039548022603e-05, "loss": 2.5526, "step": 1649 }, { "epoch": 1.3983050847457628, "grad_norm": 1.3199998140335083, "learning_rate": 2.669491525423729e-05, "loss": 2.7828, "step": 1650 }, { "epoch": 1.3991525423728812, "grad_norm": 1.1920833587646484, "learning_rate": 2.668079096045198e-05, "loss": 2.7317, "step": 1651 }, { "epoch": 1.4, "grad_norm": 1.0866401195526123, "learning_rate": 2.6666666666666667e-05, "loss": 2.7917, "step": 1652 }, { "epoch": 1.4008474576271186, "grad_norm": 1.4187414646148682, "learning_rate": 2.6652542372881357e-05, "loss": 2.5495, "step": 1653 }, { "epoch": 1.4016949152542373, "grad_norm": 1.0413742065429688, "learning_rate": 2.6638418079096043e-05, "loss": 2.6181, "step": 1654 }, { "epoch": 1.402542372881356, "grad_norm": 1.2924762964248657, "learning_rate": 2.6624293785310733e-05, "loss": 2.6635, "step": 1655 }, { "epoch": 1.4033898305084747, "grad_norm": 1.1318178176879883, "learning_rate": 2.6610169491525427e-05, "loss": 2.6875, "step": 1656 }, { "epoch": 1.4042372881355933, "grad_norm": 1.1205334663391113, "learning_rate": 2.6596045197740117e-05, "loss": 2.6214, "step": 1657 }, { "epoch": 1.4050847457627118, "grad_norm": 0.9691821336746216, "learning_rate": 2.6581920903954803e-05, "loss": 2.7269, "step": 1658 }, { "epoch": 1.4059322033898305, "grad_norm": 1.0846081972122192, "learning_rate": 2.6567796610169493e-05, "loss": 2.8449, "step": 1659 }, { "epoch": 1.4067796610169492, "grad_norm": 1.0578593015670776, "learning_rate": 2.6553672316384183e-05, "loss": 2.7101, "step": 1660 }, { "epoch": 1.4076271186440679, "grad_norm": 1.1737051010131836, "learning_rate": 2.653954802259887e-05, "loss": 2.6119, "step": 1661 }, { "epoch": 1.4084745762711863, "grad_norm": 1.1304816007614136, "learning_rate": 2.652542372881356e-05, "loss": 2.9181, "step": 1662 }, { "epoch": 1.409322033898305, "grad_norm": 1.335292100906372, "learning_rate": 2.6511299435028246e-05, "loss": 2.603, "step": 1663 }, { "epoch": 1.4101694915254237, "grad_norm": 1.1524286270141602, "learning_rate": 2.649717514124294e-05, "loss": 2.6836, "step": 1664 }, { "epoch": 1.4110169491525424, "grad_norm": 1.1399136781692505, "learning_rate": 2.648305084745763e-05, "loss": 2.693, "step": 1665 }, { "epoch": 1.411864406779661, "grad_norm": 1.0053728818893433, "learning_rate": 2.646892655367232e-05, "loss": 2.8955, "step": 1666 }, { "epoch": 1.4127118644067798, "grad_norm": 0.9324097037315369, "learning_rate": 2.6454802259887006e-05, "loss": 2.7561, "step": 1667 }, { "epoch": 1.4135593220338982, "grad_norm": 1.0216845273971558, "learning_rate": 2.6440677966101696e-05, "loss": 2.8979, "step": 1668 }, { "epoch": 1.414406779661017, "grad_norm": 1.094473958015442, "learning_rate": 2.6426553672316383e-05, "loss": 2.6899, "step": 1669 }, { "epoch": 1.4152542372881356, "grad_norm": 1.1630312204360962, "learning_rate": 2.6412429378531073e-05, "loss": 2.7004, "step": 1670 }, { "epoch": 1.4161016949152543, "grad_norm": 1.5137243270874023, "learning_rate": 2.6398305084745763e-05, "loss": 2.6018, "step": 1671 }, { "epoch": 1.4169491525423727, "grad_norm": 1.1621006727218628, "learning_rate": 2.6384180790960456e-05, "loss": 2.8431, "step": 1672 }, { "epoch": 1.4177966101694914, "grad_norm": 1.2741423845291138, "learning_rate": 2.6370056497175143e-05, "loss": 2.601, "step": 1673 }, { "epoch": 1.4186440677966101, "grad_norm": 1.2445948123931885, "learning_rate": 2.6355932203389833e-05, "loss": 2.6987, "step": 1674 }, { "epoch": 1.4194915254237288, "grad_norm": 1.028989553451538, "learning_rate": 2.634180790960452e-05, "loss": 2.7849, "step": 1675 }, { "epoch": 1.4203389830508475, "grad_norm": 0.9762065410614014, "learning_rate": 2.632768361581921e-05, "loss": 2.8959, "step": 1676 }, { "epoch": 1.4211864406779662, "grad_norm": 1.280434489250183, "learning_rate": 2.63135593220339e-05, "loss": 2.6658, "step": 1677 }, { "epoch": 1.4220338983050849, "grad_norm": 1.094059944152832, "learning_rate": 2.6299435028248586e-05, "loss": 2.757, "step": 1678 }, { "epoch": 1.4228813559322033, "grad_norm": 1.388394832611084, "learning_rate": 2.6285310734463276e-05, "loss": 2.7182, "step": 1679 }, { "epoch": 1.423728813559322, "grad_norm": 1.4729382991790771, "learning_rate": 2.627118644067797e-05, "loss": 2.5601, "step": 1680 }, { "epoch": 1.4245762711864407, "grad_norm": 1.4540657997131348, "learning_rate": 2.625706214689266e-05, "loss": 2.6347, "step": 1681 }, { "epoch": 1.4254237288135594, "grad_norm": 0.8756323456764221, "learning_rate": 2.6242937853107346e-05, "loss": 2.964, "step": 1682 }, { "epoch": 1.4262711864406779, "grad_norm": 1.0743720531463623, "learning_rate": 2.6228813559322036e-05, "loss": 2.8669, "step": 1683 }, { "epoch": 1.4271186440677965, "grad_norm": 1.1583024263381958, "learning_rate": 2.6214689265536723e-05, "loss": 2.7134, "step": 1684 }, { "epoch": 1.4279661016949152, "grad_norm": 1.476781964302063, "learning_rate": 2.6200564971751413e-05, "loss": 2.6654, "step": 1685 }, { "epoch": 1.428813559322034, "grad_norm": 1.3158514499664307, "learning_rate": 2.61864406779661e-05, "loss": 2.727, "step": 1686 }, { "epoch": 1.4296610169491526, "grad_norm": 1.1468459367752075, "learning_rate": 2.617231638418079e-05, "loss": 2.7465, "step": 1687 }, { "epoch": 1.4305084745762713, "grad_norm": 1.385338306427002, "learning_rate": 2.6158192090395483e-05, "loss": 2.4701, "step": 1688 }, { "epoch": 1.43135593220339, "grad_norm": 1.102578043937683, "learning_rate": 2.6144067796610173e-05, "loss": 2.9113, "step": 1689 }, { "epoch": 1.4322033898305084, "grad_norm": 1.049554705619812, "learning_rate": 2.612994350282486e-05, "loss": 2.6923, "step": 1690 }, { "epoch": 1.4330508474576271, "grad_norm": 1.1550787687301636, "learning_rate": 2.611581920903955e-05, "loss": 2.8105, "step": 1691 }, { "epoch": 1.4338983050847458, "grad_norm": 1.4350584745407104, "learning_rate": 2.610169491525424e-05, "loss": 2.5275, "step": 1692 }, { "epoch": 1.4347457627118643, "grad_norm": 0.9529697895050049, "learning_rate": 2.6087570621468926e-05, "loss": 2.7931, "step": 1693 }, { "epoch": 1.435593220338983, "grad_norm": 1.3449734449386597, "learning_rate": 2.6073446327683616e-05, "loss": 2.6613, "step": 1694 }, { "epoch": 1.4364406779661016, "grad_norm": 0.968408465385437, "learning_rate": 2.605932203389831e-05, "loss": 2.8287, "step": 1695 }, { "epoch": 1.4372881355932203, "grad_norm": 1.1594336032867432, "learning_rate": 2.6045197740113e-05, "loss": 2.8094, "step": 1696 }, { "epoch": 1.438135593220339, "grad_norm": 1.3010283708572388, "learning_rate": 2.6031073446327686e-05, "loss": 2.635, "step": 1697 }, { "epoch": 1.4389830508474577, "grad_norm": 1.1900665760040283, "learning_rate": 2.6016949152542376e-05, "loss": 2.6159, "step": 1698 }, { "epoch": 1.4398305084745764, "grad_norm": 1.013306975364685, "learning_rate": 2.6002824858757062e-05, "loss": 2.8351, "step": 1699 }, { "epoch": 1.4406779661016949, "grad_norm": 1.3891487121582031, "learning_rate": 2.5988700564971752e-05, "loss": 2.4998, "step": 1700 }, { "epoch": 1.4415254237288135, "grad_norm": 1.34544038772583, "learning_rate": 2.597457627118644e-05, "loss": 2.7262, "step": 1701 }, { "epoch": 1.4423728813559322, "grad_norm": 1.1992326974868774, "learning_rate": 2.596045197740113e-05, "loss": 2.6216, "step": 1702 }, { "epoch": 1.443220338983051, "grad_norm": 1.0708142518997192, "learning_rate": 2.5946327683615822e-05, "loss": 2.8212, "step": 1703 }, { "epoch": 1.4440677966101694, "grad_norm": 1.231167197227478, "learning_rate": 2.5932203389830512e-05, "loss": 2.6428, "step": 1704 }, { "epoch": 1.444915254237288, "grad_norm": 0.9052745699882507, "learning_rate": 2.59180790960452e-05, "loss": 2.9066, "step": 1705 }, { "epoch": 1.4457627118644067, "grad_norm": 1.2641510963439941, "learning_rate": 2.590395480225989e-05, "loss": 2.6419, "step": 1706 }, { "epoch": 1.4466101694915254, "grad_norm": 1.1792833805084229, "learning_rate": 2.588983050847458e-05, "loss": 2.7039, "step": 1707 }, { "epoch": 1.4474576271186441, "grad_norm": 1.0563932657241821, "learning_rate": 2.5875706214689266e-05, "loss": 2.7077, "step": 1708 }, { "epoch": 1.4483050847457628, "grad_norm": 1.3587576150894165, "learning_rate": 2.5861581920903956e-05, "loss": 2.8453, "step": 1709 }, { "epoch": 1.4491525423728815, "grad_norm": 1.1781967878341675, "learning_rate": 2.5847457627118642e-05, "loss": 2.7997, "step": 1710 }, { "epoch": 1.45, "grad_norm": 1.46746027469635, "learning_rate": 2.5833333333333336e-05, "loss": 2.6531, "step": 1711 }, { "epoch": 1.4508474576271186, "grad_norm": 0.918735682964325, "learning_rate": 2.5819209039548026e-05, "loss": 2.9465, "step": 1712 }, { "epoch": 1.4516949152542373, "grad_norm": 1.0160126686096191, "learning_rate": 2.5805084745762716e-05, "loss": 2.9582, "step": 1713 }, { "epoch": 1.452542372881356, "grad_norm": 1.0358372926712036, "learning_rate": 2.5790960451977402e-05, "loss": 2.6894, "step": 1714 }, { "epoch": 1.4533898305084745, "grad_norm": 1.0059192180633545, "learning_rate": 2.5776836158192092e-05, "loss": 2.8616, "step": 1715 }, { "epoch": 1.4542372881355932, "grad_norm": 1.1379948854446411, "learning_rate": 2.576271186440678e-05, "loss": 2.6607, "step": 1716 }, { "epoch": 1.4550847457627119, "grad_norm": 1.1622596979141235, "learning_rate": 2.574858757062147e-05, "loss": 2.8035, "step": 1717 }, { "epoch": 1.4559322033898305, "grad_norm": 1.3360217809677124, "learning_rate": 2.573446327683616e-05, "loss": 2.6771, "step": 1718 }, { "epoch": 1.4567796610169492, "grad_norm": 1.2945500612258911, "learning_rate": 2.5720338983050852e-05, "loss": 2.5487, "step": 1719 }, { "epoch": 1.457627118644068, "grad_norm": 1.3775050640106201, "learning_rate": 2.570621468926554e-05, "loss": 2.5943, "step": 1720 }, { "epoch": 1.4584745762711864, "grad_norm": 1.2074862718582153, "learning_rate": 2.569209039548023e-05, "loss": 2.7334, "step": 1721 }, { "epoch": 1.459322033898305, "grad_norm": 0.8728470802307129, "learning_rate": 2.5677966101694915e-05, "loss": 2.9732, "step": 1722 }, { "epoch": 1.4601694915254237, "grad_norm": 1.1561737060546875, "learning_rate": 2.5663841807909605e-05, "loss": 2.7425, "step": 1723 }, { "epoch": 1.4610169491525424, "grad_norm": 1.5228084325790405, "learning_rate": 2.5649717514124295e-05, "loss": 2.4591, "step": 1724 }, { "epoch": 1.461864406779661, "grad_norm": 1.0325641632080078, "learning_rate": 2.5635593220338982e-05, "loss": 2.7793, "step": 1725 }, { "epoch": 1.4627118644067796, "grad_norm": 1.1751588582992554, "learning_rate": 2.5621468926553672e-05, "loss": 2.8034, "step": 1726 }, { "epoch": 1.4635593220338983, "grad_norm": 1.2335675954818726, "learning_rate": 2.5607344632768365e-05, "loss": 2.6746, "step": 1727 }, { "epoch": 1.464406779661017, "grad_norm": 1.31641685962677, "learning_rate": 2.5593220338983055e-05, "loss": 2.678, "step": 1728 }, { "epoch": 1.4652542372881356, "grad_norm": 1.3009672164916992, "learning_rate": 2.5579096045197742e-05, "loss": 2.8398, "step": 1729 }, { "epoch": 1.4661016949152543, "grad_norm": 1.1604074239730835, "learning_rate": 2.5564971751412432e-05, "loss": 2.8324, "step": 1730 }, { "epoch": 1.466949152542373, "grad_norm": 1.3685365915298462, "learning_rate": 2.555084745762712e-05, "loss": 2.5721, "step": 1731 }, { "epoch": 1.4677966101694915, "grad_norm": 1.187212347984314, "learning_rate": 2.553672316384181e-05, "loss": 2.7589, "step": 1732 }, { "epoch": 1.4686440677966102, "grad_norm": 1.1539112329483032, "learning_rate": 2.5522598870056495e-05, "loss": 2.6395, "step": 1733 }, { "epoch": 1.4694915254237289, "grad_norm": 1.0945439338684082, "learning_rate": 2.5508474576271185e-05, "loss": 2.8949, "step": 1734 }, { "epoch": 1.4703389830508475, "grad_norm": 1.189335584640503, "learning_rate": 2.549435028248588e-05, "loss": 2.7979, "step": 1735 }, { "epoch": 1.471186440677966, "grad_norm": 1.138032078742981, "learning_rate": 2.548022598870057e-05, "loss": 2.7786, "step": 1736 }, { "epoch": 1.4720338983050847, "grad_norm": 1.6101855039596558, "learning_rate": 2.5466101694915255e-05, "loss": 2.3606, "step": 1737 }, { "epoch": 1.4728813559322034, "grad_norm": 0.9621807336807251, "learning_rate": 2.5451977401129945e-05, "loss": 2.8372, "step": 1738 }, { "epoch": 1.473728813559322, "grad_norm": 1.0987484455108643, "learning_rate": 2.5437853107344635e-05, "loss": 2.7639, "step": 1739 }, { "epoch": 1.4745762711864407, "grad_norm": 1.103590488433838, "learning_rate": 2.5423728813559322e-05, "loss": 2.6284, "step": 1740 }, { "epoch": 1.4754237288135594, "grad_norm": 0.938614547252655, "learning_rate": 2.5409604519774012e-05, "loss": 2.7863, "step": 1741 }, { "epoch": 1.476271186440678, "grad_norm": 1.0708339214324951, "learning_rate": 2.5395480225988698e-05, "loss": 2.7349, "step": 1742 }, { "epoch": 1.4771186440677966, "grad_norm": 0.8444576859474182, "learning_rate": 2.538135593220339e-05, "loss": 2.8595, "step": 1743 }, { "epoch": 1.4779661016949153, "grad_norm": 1.3227760791778564, "learning_rate": 2.536723163841808e-05, "loss": 2.7629, "step": 1744 }, { "epoch": 1.478813559322034, "grad_norm": 1.4015511274337769, "learning_rate": 2.535310734463277e-05, "loss": 2.6918, "step": 1745 }, { "epoch": 1.4796610169491524, "grad_norm": 0.9624196887016296, "learning_rate": 2.5338983050847458e-05, "loss": 2.8514, "step": 1746 }, { "epoch": 1.480508474576271, "grad_norm": 1.2324362993240356, "learning_rate": 2.5324858757062148e-05, "loss": 2.6816, "step": 1747 }, { "epoch": 1.4813559322033898, "grad_norm": 1.64078688621521, "learning_rate": 2.5310734463276835e-05, "loss": 2.4035, "step": 1748 }, { "epoch": 1.4822033898305085, "grad_norm": 1.0446299314498901, "learning_rate": 2.5296610169491525e-05, "loss": 2.824, "step": 1749 }, { "epoch": 1.4830508474576272, "grad_norm": 0.9898091554641724, "learning_rate": 2.5282485875706215e-05, "loss": 2.8308, "step": 1750 }, { "epoch": 1.4838983050847459, "grad_norm": 1.2462795972824097, "learning_rate": 2.5268361581920908e-05, "loss": 2.6622, "step": 1751 }, { "epoch": 1.4847457627118645, "grad_norm": 1.0705769062042236, "learning_rate": 2.5254237288135595e-05, "loss": 2.7527, "step": 1752 }, { "epoch": 1.485593220338983, "grad_norm": 1.0593948364257812, "learning_rate": 2.5240112994350285e-05, "loss": 2.8198, "step": 1753 }, { "epoch": 1.4864406779661017, "grad_norm": 1.1521574258804321, "learning_rate": 2.522598870056497e-05, "loss": 2.8464, "step": 1754 }, { "epoch": 1.4872881355932204, "grad_norm": 1.0825778245925903, "learning_rate": 2.521186440677966e-05, "loss": 2.6956, "step": 1755 }, { "epoch": 1.488135593220339, "grad_norm": 1.1135597229003906, "learning_rate": 2.519774011299435e-05, "loss": 2.7401, "step": 1756 }, { "epoch": 1.4889830508474575, "grad_norm": 1.4296479225158691, "learning_rate": 2.5183615819209038e-05, "loss": 2.5709, "step": 1757 }, { "epoch": 1.4898305084745762, "grad_norm": 1.0948504209518433, "learning_rate": 2.5169491525423728e-05, "loss": 2.5154, "step": 1758 }, { "epoch": 1.490677966101695, "grad_norm": 1.2337549924850464, "learning_rate": 2.515536723163842e-05, "loss": 2.5797, "step": 1759 }, { "epoch": 1.4915254237288136, "grad_norm": 1.142493724822998, "learning_rate": 2.514124293785311e-05, "loss": 2.7528, "step": 1760 }, { "epoch": 1.4923728813559323, "grad_norm": 1.2989643812179565, "learning_rate": 2.5127118644067798e-05, "loss": 2.4817, "step": 1761 }, { "epoch": 1.493220338983051, "grad_norm": 1.1945594549179077, "learning_rate": 2.5112994350282488e-05, "loss": 2.8188, "step": 1762 }, { "epoch": 1.4940677966101694, "grad_norm": 1.234764575958252, "learning_rate": 2.5098870056497175e-05, "loss": 2.703, "step": 1763 }, { "epoch": 1.494915254237288, "grad_norm": 1.1107516288757324, "learning_rate": 2.5084745762711865e-05, "loss": 2.649, "step": 1764 }, { "epoch": 1.4957627118644068, "grad_norm": 1.1896271705627441, "learning_rate": 2.507062146892655e-05, "loss": 2.7029, "step": 1765 }, { "epoch": 1.4966101694915255, "grad_norm": 0.9180924296379089, "learning_rate": 2.505649717514124e-05, "loss": 2.8742, "step": 1766 }, { "epoch": 1.497457627118644, "grad_norm": 0.9900988340377808, "learning_rate": 2.5042372881355935e-05, "loss": 2.7208, "step": 1767 }, { "epoch": 1.4983050847457626, "grad_norm": 1.1403770446777344, "learning_rate": 2.5028248587570625e-05, "loss": 2.6409, "step": 1768 }, { "epoch": 1.4991525423728813, "grad_norm": 1.050744652748108, "learning_rate": 2.501412429378531e-05, "loss": 2.6425, "step": 1769 }, { "epoch": 1.5, "grad_norm": 1.0845805406570435, "learning_rate": 2.5e-05, "loss": 2.7535, "step": 1770 }, { "epoch": 1.5008474576271187, "grad_norm": 1.1701406240463257, "learning_rate": 2.498587570621469e-05, "loss": 2.7587, "step": 1771 }, { "epoch": 1.5016949152542374, "grad_norm": 1.1284900903701782, "learning_rate": 2.497175141242938e-05, "loss": 2.6616, "step": 1772 }, { "epoch": 1.502542372881356, "grad_norm": 1.1275113821029663, "learning_rate": 2.495762711864407e-05, "loss": 2.7371, "step": 1773 }, { "epoch": 1.5033898305084745, "grad_norm": 1.18697190284729, "learning_rate": 2.4943502824858758e-05, "loss": 2.7345, "step": 1774 }, { "epoch": 1.5042372881355932, "grad_norm": 1.1543711423873901, "learning_rate": 2.4929378531073448e-05, "loss": 2.7861, "step": 1775 }, { "epoch": 1.505084745762712, "grad_norm": 0.9525389671325684, "learning_rate": 2.4915254237288138e-05, "loss": 2.7745, "step": 1776 }, { "epoch": 1.5059322033898304, "grad_norm": 1.243618369102478, "learning_rate": 2.4901129943502828e-05, "loss": 2.5957, "step": 1777 }, { "epoch": 1.506779661016949, "grad_norm": 0.9921051263809204, "learning_rate": 2.4887005649717514e-05, "loss": 2.6621, "step": 1778 }, { "epoch": 1.5076271186440677, "grad_norm": 0.9193978309631348, "learning_rate": 2.4872881355932204e-05, "loss": 2.8639, "step": 1779 }, { "epoch": 1.5084745762711864, "grad_norm": 1.0668909549713135, "learning_rate": 2.4858757062146894e-05, "loss": 2.8757, "step": 1780 }, { "epoch": 1.509322033898305, "grad_norm": 1.579140543937683, "learning_rate": 2.4844632768361584e-05, "loss": 2.4371, "step": 1781 }, { "epoch": 1.5101694915254238, "grad_norm": 1.1261061429977417, "learning_rate": 2.483050847457627e-05, "loss": 2.6986, "step": 1782 }, { "epoch": 1.5110169491525425, "grad_norm": 1.0194988250732422, "learning_rate": 2.481638418079096e-05, "loss": 2.749, "step": 1783 }, { "epoch": 1.5118644067796612, "grad_norm": 1.1523332595825195, "learning_rate": 2.480225988700565e-05, "loss": 2.7165, "step": 1784 }, { "epoch": 1.5127118644067796, "grad_norm": 1.1937497854232788, "learning_rate": 2.478813559322034e-05, "loss": 2.7766, "step": 1785 }, { "epoch": 1.5135593220338983, "grad_norm": 1.128448486328125, "learning_rate": 2.477401129943503e-05, "loss": 2.7316, "step": 1786 }, { "epoch": 1.514406779661017, "grad_norm": 1.3518177270889282, "learning_rate": 2.4759887005649718e-05, "loss": 2.6125, "step": 1787 }, { "epoch": 1.5152542372881355, "grad_norm": 1.0886684656143188, "learning_rate": 2.4745762711864408e-05, "loss": 2.8088, "step": 1788 }, { "epoch": 1.5161016949152541, "grad_norm": 1.0553864240646362, "learning_rate": 2.4731638418079098e-05, "loss": 2.7269, "step": 1789 }, { "epoch": 1.5169491525423728, "grad_norm": 0.9939340949058533, "learning_rate": 2.4717514124293788e-05, "loss": 2.8575, "step": 1790 }, { "epoch": 1.5177966101694915, "grad_norm": 1.4051233530044556, "learning_rate": 2.4703389830508474e-05, "loss": 2.6995, "step": 1791 }, { "epoch": 1.5186440677966102, "grad_norm": 1.1315674781799316, "learning_rate": 2.4689265536723168e-05, "loss": 2.8004, "step": 1792 }, { "epoch": 1.519491525423729, "grad_norm": 1.2620928287506104, "learning_rate": 2.4675141242937854e-05, "loss": 2.6974, "step": 1793 }, { "epoch": 1.5203389830508476, "grad_norm": 1.2280446290969849, "learning_rate": 2.4661016949152544e-05, "loss": 2.5931, "step": 1794 }, { "epoch": 1.5211864406779663, "grad_norm": 1.1947920322418213, "learning_rate": 2.464689265536723e-05, "loss": 2.6941, "step": 1795 }, { "epoch": 1.5220338983050847, "grad_norm": 1.2421876192092896, "learning_rate": 2.4632768361581924e-05, "loss": 2.7424, "step": 1796 }, { "epoch": 1.5228813559322034, "grad_norm": 1.032297134399414, "learning_rate": 2.461864406779661e-05, "loss": 2.8071, "step": 1797 }, { "epoch": 1.5237288135593219, "grad_norm": 1.0700403451919556, "learning_rate": 2.46045197740113e-05, "loss": 2.9141, "step": 1798 }, { "epoch": 1.5245762711864406, "grad_norm": 1.2481400966644287, "learning_rate": 2.4590395480225987e-05, "loss": 2.7076, "step": 1799 }, { "epoch": 1.5254237288135593, "grad_norm": 1.1966779232025146, "learning_rate": 2.457627118644068e-05, "loss": 2.6864, "step": 1800 }, { "epoch": 1.526271186440678, "grad_norm": 1.036336898803711, "learning_rate": 2.4562146892655367e-05, "loss": 2.7467, "step": 1801 }, { "epoch": 1.5271186440677966, "grad_norm": 0.9020054936408997, "learning_rate": 2.4548022598870057e-05, "loss": 3.0466, "step": 1802 }, { "epoch": 1.5279661016949153, "grad_norm": 1.2337982654571533, "learning_rate": 2.4533898305084747e-05, "loss": 2.6878, "step": 1803 }, { "epoch": 1.528813559322034, "grad_norm": 1.003456473350525, "learning_rate": 2.4519774011299437e-05, "loss": 2.9421, "step": 1804 }, { "epoch": 1.5296610169491527, "grad_norm": 0.945570170879364, "learning_rate": 2.4505649717514127e-05, "loss": 2.7682, "step": 1805 }, { "epoch": 1.5305084745762711, "grad_norm": 1.0575337409973145, "learning_rate": 2.4491525423728814e-05, "loss": 2.7781, "step": 1806 }, { "epoch": 1.5313559322033898, "grad_norm": 1.2243188619613647, "learning_rate": 2.4477401129943504e-05, "loss": 2.8299, "step": 1807 }, { "epoch": 1.5322033898305085, "grad_norm": 1.0221068859100342, "learning_rate": 2.4463276836158194e-05, "loss": 2.863, "step": 1808 }, { "epoch": 1.533050847457627, "grad_norm": 0.961390495300293, "learning_rate": 2.4449152542372884e-05, "loss": 2.7633, "step": 1809 }, { "epoch": 1.5338983050847457, "grad_norm": 1.3998870849609375, "learning_rate": 2.443502824858757e-05, "loss": 2.5782, "step": 1810 }, { "epoch": 1.5347457627118644, "grad_norm": 1.0910758972167969, "learning_rate": 2.442090395480226e-05, "loss": 2.8715, "step": 1811 }, { "epoch": 1.535593220338983, "grad_norm": 1.138889193534851, "learning_rate": 2.440677966101695e-05, "loss": 2.7039, "step": 1812 }, { "epoch": 1.5364406779661017, "grad_norm": 1.3160468339920044, "learning_rate": 2.439265536723164e-05, "loss": 2.6346, "step": 1813 }, { "epoch": 1.5372881355932204, "grad_norm": 1.3582723140716553, "learning_rate": 2.4378531073446327e-05, "loss": 2.5785, "step": 1814 }, { "epoch": 1.538135593220339, "grad_norm": 1.2774215936660767, "learning_rate": 2.4364406779661017e-05, "loss": 2.7209, "step": 1815 }, { "epoch": 1.5389830508474578, "grad_norm": 1.0066934823989868, "learning_rate": 2.4350282485875707e-05, "loss": 2.8582, "step": 1816 }, { "epoch": 1.5398305084745763, "grad_norm": 1.0705580711364746, "learning_rate": 2.4336158192090397e-05, "loss": 2.7616, "step": 1817 }, { "epoch": 1.540677966101695, "grad_norm": 1.2924797534942627, "learning_rate": 2.4322033898305087e-05, "loss": 2.7503, "step": 1818 }, { "epoch": 1.5415254237288134, "grad_norm": 1.266080617904663, "learning_rate": 2.4307909604519774e-05, "loss": 2.6332, "step": 1819 }, { "epoch": 1.542372881355932, "grad_norm": 0.9904401302337646, "learning_rate": 2.4293785310734467e-05, "loss": 2.7892, "step": 1820 }, { "epoch": 1.5432203389830508, "grad_norm": 1.0994033813476562, "learning_rate": 2.4279661016949154e-05, "loss": 2.8478, "step": 1821 }, { "epoch": 1.5440677966101695, "grad_norm": 1.2663304805755615, "learning_rate": 2.4265536723163844e-05, "loss": 2.888, "step": 1822 }, { "epoch": 1.5449152542372881, "grad_norm": 1.1716861724853516, "learning_rate": 2.425141242937853e-05, "loss": 2.6604, "step": 1823 }, { "epoch": 1.5457627118644068, "grad_norm": 1.1331392526626587, "learning_rate": 2.4237288135593224e-05, "loss": 2.7479, "step": 1824 }, { "epoch": 1.5466101694915255, "grad_norm": 1.0596132278442383, "learning_rate": 2.422316384180791e-05, "loss": 2.6982, "step": 1825 }, { "epoch": 1.5474576271186442, "grad_norm": 1.2000031471252441, "learning_rate": 2.42090395480226e-05, "loss": 2.745, "step": 1826 }, { "epoch": 1.5483050847457627, "grad_norm": 1.1302003860473633, "learning_rate": 2.4194915254237287e-05, "loss": 2.7681, "step": 1827 }, { "epoch": 1.5491525423728814, "grad_norm": 1.099095106124878, "learning_rate": 2.418079096045198e-05, "loss": 2.5677, "step": 1828 }, { "epoch": 1.55, "grad_norm": 1.074489951133728, "learning_rate": 2.4166666666666667e-05, "loss": 2.7839, "step": 1829 }, { "epoch": 1.5508474576271185, "grad_norm": 1.2375173568725586, "learning_rate": 2.4152542372881357e-05, "loss": 2.7313, "step": 1830 }, { "epoch": 1.5516949152542372, "grad_norm": 1.0323246717453003, "learning_rate": 2.4138418079096047e-05, "loss": 2.8609, "step": 1831 }, { "epoch": 1.5525423728813559, "grad_norm": 1.2047739028930664, "learning_rate": 2.4124293785310737e-05, "loss": 2.6048, "step": 1832 }, { "epoch": 1.5533898305084746, "grad_norm": 1.1812658309936523, "learning_rate": 2.4110169491525423e-05, "loss": 2.7495, "step": 1833 }, { "epoch": 1.5542372881355933, "grad_norm": 1.3200953006744385, "learning_rate": 2.4096045197740113e-05, "loss": 2.671, "step": 1834 }, { "epoch": 1.555084745762712, "grad_norm": 1.1331515312194824, "learning_rate": 2.4081920903954803e-05, "loss": 2.7263, "step": 1835 }, { "epoch": 1.5559322033898306, "grad_norm": 0.9608669281005859, "learning_rate": 2.4067796610169493e-05, "loss": 2.8821, "step": 1836 }, { "epoch": 1.5567796610169493, "grad_norm": 0.9678069949150085, "learning_rate": 2.4053672316384183e-05, "loss": 2.857, "step": 1837 }, { "epoch": 1.5576271186440678, "grad_norm": 1.0414766073226929, "learning_rate": 2.403954802259887e-05, "loss": 2.7889, "step": 1838 }, { "epoch": 1.5584745762711865, "grad_norm": 1.003260612487793, "learning_rate": 2.402542372881356e-05, "loss": 2.7885, "step": 1839 }, { "epoch": 1.559322033898305, "grad_norm": 1.2616666555404663, "learning_rate": 2.401129943502825e-05, "loss": 2.8512, "step": 1840 }, { "epoch": 1.5601694915254236, "grad_norm": 1.6481845378875732, "learning_rate": 2.399717514124294e-05, "loss": 2.5663, "step": 1841 }, { "epoch": 1.5610169491525423, "grad_norm": 1.0453968048095703, "learning_rate": 2.3983050847457627e-05, "loss": 2.7644, "step": 1842 }, { "epoch": 1.561864406779661, "grad_norm": 0.992255687713623, "learning_rate": 2.3968926553672317e-05, "loss": 2.8203, "step": 1843 }, { "epoch": 1.5627118644067797, "grad_norm": 1.561972737312317, "learning_rate": 2.3954802259887007e-05, "loss": 2.581, "step": 1844 }, { "epoch": 1.5635593220338984, "grad_norm": 1.0985513925552368, "learning_rate": 2.3940677966101697e-05, "loss": 2.7722, "step": 1845 }, { "epoch": 1.564406779661017, "grad_norm": 1.2070634365081787, "learning_rate": 2.3926553672316383e-05, "loss": 2.6985, "step": 1846 }, { "epoch": 1.5652542372881357, "grad_norm": 1.043664574623108, "learning_rate": 2.3912429378531073e-05, "loss": 2.715, "step": 1847 }, { "epoch": 1.5661016949152542, "grad_norm": 1.4287388324737549, "learning_rate": 2.3898305084745763e-05, "loss": 2.6028, "step": 1848 }, { "epoch": 1.5669491525423729, "grad_norm": 1.2643784284591675, "learning_rate": 2.3884180790960453e-05, "loss": 2.6177, "step": 1849 }, { "epoch": 1.5677966101694916, "grad_norm": 1.0906355381011963, "learning_rate": 2.3870056497175143e-05, "loss": 2.8016, "step": 1850 }, { "epoch": 1.56864406779661, "grad_norm": 1.302488088607788, "learning_rate": 2.3855932203389833e-05, "loss": 2.683, "step": 1851 }, { "epoch": 1.5694915254237287, "grad_norm": 1.0560115575790405, "learning_rate": 2.3841807909604523e-05, "loss": 2.8361, "step": 1852 }, { "epoch": 1.5703389830508474, "grad_norm": 1.3614486455917358, "learning_rate": 2.382768361581921e-05, "loss": 2.5801, "step": 1853 }, { "epoch": 1.571186440677966, "grad_norm": 1.3357983827590942, "learning_rate": 2.38135593220339e-05, "loss": 2.6701, "step": 1854 }, { "epoch": 1.5720338983050848, "grad_norm": 1.0064375400543213, "learning_rate": 2.379943502824859e-05, "loss": 2.8113, "step": 1855 }, { "epoch": 1.5728813559322035, "grad_norm": 1.1035053730010986, "learning_rate": 2.378531073446328e-05, "loss": 2.822, "step": 1856 }, { "epoch": 1.5737288135593221, "grad_norm": 0.9912019371986389, "learning_rate": 2.3771186440677966e-05, "loss": 2.8875, "step": 1857 }, { "epoch": 1.5745762711864408, "grad_norm": 1.3179409503936768, "learning_rate": 2.3757062146892656e-05, "loss": 2.6668, "step": 1858 }, { "epoch": 1.5754237288135593, "grad_norm": 1.052337884902954, "learning_rate": 2.3742937853107346e-05, "loss": 2.7392, "step": 1859 }, { "epoch": 1.576271186440678, "grad_norm": 1.140894889831543, "learning_rate": 2.3728813559322036e-05, "loss": 2.8434, "step": 1860 }, { "epoch": 1.5771186440677964, "grad_norm": 1.0413471460342407, "learning_rate": 2.3714689265536723e-05, "loss": 2.7602, "step": 1861 }, { "epoch": 1.5779661016949151, "grad_norm": 1.1304707527160645, "learning_rate": 2.3700564971751413e-05, "loss": 2.5664, "step": 1862 }, { "epoch": 1.5788135593220338, "grad_norm": 0.958276629447937, "learning_rate": 2.3686440677966103e-05, "loss": 2.684, "step": 1863 }, { "epoch": 1.5796610169491525, "grad_norm": 1.195338249206543, "learning_rate": 2.3672316384180793e-05, "loss": 2.7652, "step": 1864 }, { "epoch": 1.5805084745762712, "grad_norm": 1.096615195274353, "learning_rate": 2.3658192090395483e-05, "loss": 2.6835, "step": 1865 }, { "epoch": 1.5813559322033899, "grad_norm": 1.0772604942321777, "learning_rate": 2.364406779661017e-05, "loss": 2.7362, "step": 1866 }, { "epoch": 1.5822033898305086, "grad_norm": 1.1231951713562012, "learning_rate": 2.362994350282486e-05, "loss": 2.7992, "step": 1867 }, { "epoch": 1.5830508474576273, "grad_norm": 1.3755862712860107, "learning_rate": 2.361581920903955e-05, "loss": 2.6187, "step": 1868 }, { "epoch": 1.5838983050847457, "grad_norm": 0.9679206013679504, "learning_rate": 2.360169491525424e-05, "loss": 2.7601, "step": 1869 }, { "epoch": 1.5847457627118644, "grad_norm": 1.0926215648651123, "learning_rate": 2.3587570621468926e-05, "loss": 2.6911, "step": 1870 }, { "epoch": 1.585593220338983, "grad_norm": 1.1125919818878174, "learning_rate": 2.357344632768362e-05, "loss": 2.7169, "step": 1871 }, { "epoch": 1.5864406779661016, "grad_norm": 1.0439825057983398, "learning_rate": 2.3559322033898306e-05, "loss": 2.8413, "step": 1872 }, { "epoch": 1.5872881355932202, "grad_norm": 1.411345362663269, "learning_rate": 2.3545197740112996e-05, "loss": 2.7078, "step": 1873 }, { "epoch": 1.588135593220339, "grad_norm": 1.0051039457321167, "learning_rate": 2.3531073446327683e-05, "loss": 2.8511, "step": 1874 }, { "epoch": 1.5889830508474576, "grad_norm": 1.382293939590454, "learning_rate": 2.3516949152542376e-05, "loss": 2.5916, "step": 1875 }, { "epoch": 1.5898305084745763, "grad_norm": 1.1161446571350098, "learning_rate": 2.3502824858757063e-05, "loss": 2.6336, "step": 1876 }, { "epoch": 1.590677966101695, "grad_norm": 1.3635808229446411, "learning_rate": 2.3488700564971753e-05, "loss": 2.5219, "step": 1877 }, { "epoch": 1.5915254237288137, "grad_norm": 1.280559778213501, "learning_rate": 2.347457627118644e-05, "loss": 2.6639, "step": 1878 }, { "epoch": 1.5923728813559324, "grad_norm": 1.0561965703964233, "learning_rate": 2.3460451977401133e-05, "loss": 2.7664, "step": 1879 }, { "epoch": 1.5932203389830508, "grad_norm": 0.8967818021774292, "learning_rate": 2.344632768361582e-05, "loss": 2.9752, "step": 1880 }, { "epoch": 1.5940677966101695, "grad_norm": 1.0559184551239014, "learning_rate": 2.343220338983051e-05, "loss": 2.9621, "step": 1881 }, { "epoch": 1.594915254237288, "grad_norm": 1.0838136672973633, "learning_rate": 2.34180790960452e-05, "loss": 2.8132, "step": 1882 }, { "epoch": 1.5957627118644067, "grad_norm": 1.212471604347229, "learning_rate": 2.340395480225989e-05, "loss": 2.8631, "step": 1883 }, { "epoch": 1.5966101694915253, "grad_norm": 0.9554041028022766, "learning_rate": 2.338983050847458e-05, "loss": 2.7652, "step": 1884 }, { "epoch": 1.597457627118644, "grad_norm": 0.9795320630073547, "learning_rate": 2.3375706214689266e-05, "loss": 2.8476, "step": 1885 }, { "epoch": 1.5983050847457627, "grad_norm": 0.8778656125068665, "learning_rate": 2.3361581920903956e-05, "loss": 2.8676, "step": 1886 }, { "epoch": 1.5991525423728814, "grad_norm": 0.999576985836029, "learning_rate": 2.3347457627118646e-05, "loss": 2.6703, "step": 1887 }, { "epoch": 1.6, "grad_norm": 0.9874566197395325, "learning_rate": 2.3333333333333336e-05, "loss": 2.8561, "step": 1888 }, { "epoch": 1.6008474576271188, "grad_norm": 1.0946693420410156, "learning_rate": 2.3319209039548022e-05, "loss": 2.5874, "step": 1889 }, { "epoch": 1.6016949152542372, "grad_norm": 1.1346005201339722, "learning_rate": 2.3305084745762712e-05, "loss": 2.7149, "step": 1890 }, { "epoch": 1.602542372881356, "grad_norm": 1.1960369348526, "learning_rate": 2.3290960451977402e-05, "loss": 2.5591, "step": 1891 }, { "epoch": 1.6033898305084746, "grad_norm": 1.0531506538391113, "learning_rate": 2.3276836158192092e-05, "loss": 2.7084, "step": 1892 }, { "epoch": 1.604237288135593, "grad_norm": 1.2548158168792725, "learning_rate": 2.326271186440678e-05, "loss": 2.6188, "step": 1893 }, { "epoch": 1.6050847457627118, "grad_norm": 0.9160987734794617, "learning_rate": 2.324858757062147e-05, "loss": 2.9216, "step": 1894 }, { "epoch": 1.6059322033898304, "grad_norm": 1.2223246097564697, "learning_rate": 2.323446327683616e-05, "loss": 2.5757, "step": 1895 }, { "epoch": 1.6067796610169491, "grad_norm": 1.0778521299362183, "learning_rate": 2.322033898305085e-05, "loss": 2.6405, "step": 1896 }, { "epoch": 1.6076271186440678, "grad_norm": 0.9888900518417358, "learning_rate": 2.320621468926554e-05, "loss": 2.8904, "step": 1897 }, { "epoch": 1.6084745762711865, "grad_norm": 1.130634069442749, "learning_rate": 2.3192090395480226e-05, "loss": 2.8235, "step": 1898 }, { "epoch": 1.6093220338983052, "grad_norm": 1.1311981678009033, "learning_rate": 2.317796610169492e-05, "loss": 2.7509, "step": 1899 }, { "epoch": 1.6101694915254239, "grad_norm": 1.1725093126296997, "learning_rate": 2.3163841807909606e-05, "loss": 2.6908, "step": 1900 }, { "epoch": 1.6110169491525423, "grad_norm": 1.0086802244186401, "learning_rate": 2.3149717514124296e-05, "loss": 2.8708, "step": 1901 }, { "epoch": 1.611864406779661, "grad_norm": 1.1591293811798096, "learning_rate": 2.3135593220338982e-05, "loss": 2.8952, "step": 1902 }, { "epoch": 1.6127118644067797, "grad_norm": 0.9742602705955505, "learning_rate": 2.3121468926553676e-05, "loss": 2.725, "step": 1903 }, { "epoch": 1.6135593220338982, "grad_norm": 1.1805888414382935, "learning_rate": 2.3107344632768362e-05, "loss": 2.953, "step": 1904 }, { "epoch": 1.6144067796610169, "grad_norm": 0.9819029569625854, "learning_rate": 2.3093220338983052e-05, "loss": 2.7734, "step": 1905 }, { "epoch": 1.6152542372881356, "grad_norm": 1.1077427864074707, "learning_rate": 2.307909604519774e-05, "loss": 2.76, "step": 1906 }, { "epoch": 1.6161016949152542, "grad_norm": 1.243615984916687, "learning_rate": 2.3064971751412432e-05, "loss": 2.6716, "step": 1907 }, { "epoch": 1.616949152542373, "grad_norm": 1.054997205734253, "learning_rate": 2.305084745762712e-05, "loss": 2.8281, "step": 1908 }, { "epoch": 1.6177966101694916, "grad_norm": 1.0687192678451538, "learning_rate": 2.303672316384181e-05, "loss": 2.8526, "step": 1909 }, { "epoch": 1.6186440677966103, "grad_norm": 1.1847962141036987, "learning_rate": 2.30225988700565e-05, "loss": 2.6144, "step": 1910 }, { "epoch": 1.6194915254237288, "grad_norm": 0.9783480763435364, "learning_rate": 2.300847457627119e-05, "loss": 2.7837, "step": 1911 }, { "epoch": 1.6203389830508474, "grad_norm": 1.0385940074920654, "learning_rate": 2.2994350282485875e-05, "loss": 2.781, "step": 1912 }, { "epoch": 1.6211864406779661, "grad_norm": 1.1184855699539185, "learning_rate": 2.2980225988700565e-05, "loss": 2.6853, "step": 1913 }, { "epoch": 1.6220338983050846, "grad_norm": 0.9464384913444519, "learning_rate": 2.2966101694915255e-05, "loss": 2.8047, "step": 1914 }, { "epoch": 1.6228813559322033, "grad_norm": 1.2419618368148804, "learning_rate": 2.2951977401129945e-05, "loss": 2.7639, "step": 1915 }, { "epoch": 1.623728813559322, "grad_norm": 1.2323758602142334, "learning_rate": 2.2937853107344635e-05, "loss": 2.776, "step": 1916 }, { "epoch": 1.6245762711864407, "grad_norm": 1.1942468881607056, "learning_rate": 2.2923728813559322e-05, "loss": 2.7054, "step": 1917 }, { "epoch": 1.6254237288135593, "grad_norm": 1.1644326448440552, "learning_rate": 2.2909604519774012e-05, "loss": 2.6464, "step": 1918 }, { "epoch": 1.626271186440678, "grad_norm": 1.204388976097107, "learning_rate": 2.2895480225988702e-05, "loss": 2.6597, "step": 1919 }, { "epoch": 1.6271186440677967, "grad_norm": 1.2430897951126099, "learning_rate": 2.2881355932203392e-05, "loss": 2.7475, "step": 1920 }, { "epoch": 1.6279661016949154, "grad_norm": 1.0028111934661865, "learning_rate": 2.286723163841808e-05, "loss": 2.9939, "step": 1921 }, { "epoch": 1.6288135593220339, "grad_norm": 1.1967658996582031, "learning_rate": 2.285310734463277e-05, "loss": 2.7748, "step": 1922 }, { "epoch": 1.6296610169491526, "grad_norm": 1.1778464317321777, "learning_rate": 2.283898305084746e-05, "loss": 2.7629, "step": 1923 }, { "epoch": 1.6305084745762712, "grad_norm": 1.6117842197418213, "learning_rate": 2.282485875706215e-05, "loss": 2.4678, "step": 1924 }, { "epoch": 1.6313559322033897, "grad_norm": 0.9721297025680542, "learning_rate": 2.2810734463276835e-05, "loss": 2.8654, "step": 1925 }, { "epoch": 1.6322033898305084, "grad_norm": 1.3772358894348145, "learning_rate": 2.2796610169491525e-05, "loss": 2.7953, "step": 1926 }, { "epoch": 1.633050847457627, "grad_norm": 1.2268047332763672, "learning_rate": 2.2782485875706215e-05, "loss": 2.5029, "step": 1927 }, { "epoch": 1.6338983050847458, "grad_norm": 1.0063567161560059, "learning_rate": 2.2768361581920905e-05, "loss": 2.7561, "step": 1928 }, { "epoch": 1.6347457627118644, "grad_norm": 1.155717372894287, "learning_rate": 2.2754237288135595e-05, "loss": 2.7237, "step": 1929 }, { "epoch": 1.6355932203389831, "grad_norm": 1.1854345798492432, "learning_rate": 2.274011299435028e-05, "loss": 2.6825, "step": 1930 }, { "epoch": 1.6364406779661018, "grad_norm": 1.2681466341018677, "learning_rate": 2.2725988700564975e-05, "loss": 2.5796, "step": 1931 }, { "epoch": 1.6372881355932203, "grad_norm": 1.0469571352005005, "learning_rate": 2.271186440677966e-05, "loss": 2.7957, "step": 1932 }, { "epoch": 1.638135593220339, "grad_norm": 1.0084184408187866, "learning_rate": 2.269774011299435e-05, "loss": 2.8405, "step": 1933 }, { "epoch": 1.6389830508474577, "grad_norm": 1.0356253385543823, "learning_rate": 2.268361581920904e-05, "loss": 2.694, "step": 1934 }, { "epoch": 1.6398305084745761, "grad_norm": 1.150457739830017, "learning_rate": 2.266949152542373e-05, "loss": 2.7612, "step": 1935 }, { "epoch": 1.6406779661016948, "grad_norm": 1.0183244943618774, "learning_rate": 2.2655367231638418e-05, "loss": 2.745, "step": 1936 }, { "epoch": 1.6415254237288135, "grad_norm": 1.1340312957763672, "learning_rate": 2.2641242937853108e-05, "loss": 2.6348, "step": 1937 }, { "epoch": 1.6423728813559322, "grad_norm": 1.0995330810546875, "learning_rate": 2.2627118644067798e-05, "loss": 2.8379, "step": 1938 }, { "epoch": 1.6432203389830509, "grad_norm": 0.9740652441978455, "learning_rate": 2.2612994350282488e-05, "loss": 2.7789, "step": 1939 }, { "epoch": 1.6440677966101696, "grad_norm": 1.1527608633041382, "learning_rate": 2.2598870056497175e-05, "loss": 2.8493, "step": 1940 }, { "epoch": 1.6449152542372882, "grad_norm": 1.5391933917999268, "learning_rate": 2.2584745762711865e-05, "loss": 2.3918, "step": 1941 }, { "epoch": 1.645762711864407, "grad_norm": 1.2131741046905518, "learning_rate": 2.2570621468926555e-05, "loss": 2.6372, "step": 1942 }, { "epoch": 1.6466101694915254, "grad_norm": 0.8697400689125061, "learning_rate": 2.2556497175141245e-05, "loss": 2.9724, "step": 1943 }, { "epoch": 1.647457627118644, "grad_norm": 1.125433325767517, "learning_rate": 2.2542372881355935e-05, "loss": 2.7292, "step": 1944 }, { "epoch": 1.6483050847457628, "grad_norm": 1.0400315523147583, "learning_rate": 2.252824858757062e-05, "loss": 2.8846, "step": 1945 }, { "epoch": 1.6491525423728812, "grad_norm": 1.170027732849121, "learning_rate": 2.251412429378531e-05, "loss": 2.6606, "step": 1946 }, { "epoch": 1.65, "grad_norm": 1.337228536605835, "learning_rate": 2.25e-05, "loss": 2.6804, "step": 1947 }, { "epoch": 1.6508474576271186, "grad_norm": 1.1318726539611816, "learning_rate": 2.248587570621469e-05, "loss": 2.7268, "step": 1948 }, { "epoch": 1.6516949152542373, "grad_norm": 0.984640896320343, "learning_rate": 2.2471751412429378e-05, "loss": 2.6941, "step": 1949 }, { "epoch": 1.652542372881356, "grad_norm": 0.9486579895019531, "learning_rate": 2.245762711864407e-05, "loss": 2.7897, "step": 1950 }, { "epoch": 1.6533898305084747, "grad_norm": 1.0196901559829712, "learning_rate": 2.2443502824858758e-05, "loss": 2.7002, "step": 1951 }, { "epoch": 1.6542372881355933, "grad_norm": 1.0792779922485352, "learning_rate": 2.2429378531073448e-05, "loss": 2.647, "step": 1952 }, { "epoch": 1.655084745762712, "grad_norm": 1.0463929176330566, "learning_rate": 2.2415254237288135e-05, "loss": 2.7542, "step": 1953 }, { "epoch": 1.6559322033898305, "grad_norm": 1.1446179151535034, "learning_rate": 2.2401129943502828e-05, "loss": 2.812, "step": 1954 }, { "epoch": 1.6567796610169492, "grad_norm": 1.073575735092163, "learning_rate": 2.2387005649717515e-05, "loss": 2.7027, "step": 1955 }, { "epoch": 1.6576271186440676, "grad_norm": 1.2444988489151, "learning_rate": 2.2372881355932205e-05, "loss": 2.8578, "step": 1956 }, { "epoch": 1.6584745762711863, "grad_norm": 1.077351450920105, "learning_rate": 2.235875706214689e-05, "loss": 2.7909, "step": 1957 }, { "epoch": 1.659322033898305, "grad_norm": 1.1523077487945557, "learning_rate": 2.2344632768361585e-05, "loss": 2.7268, "step": 1958 }, { "epoch": 1.6601694915254237, "grad_norm": 0.9423948526382446, "learning_rate": 2.233050847457627e-05, "loss": 2.6836, "step": 1959 }, { "epoch": 1.6610169491525424, "grad_norm": 1.1524324417114258, "learning_rate": 2.231638418079096e-05, "loss": 2.6782, "step": 1960 }, { "epoch": 1.661864406779661, "grad_norm": 1.0088781118392944, "learning_rate": 2.230225988700565e-05, "loss": 2.7087, "step": 1961 }, { "epoch": 1.6627118644067798, "grad_norm": 1.4018473625183105, "learning_rate": 2.228813559322034e-05, "loss": 2.567, "step": 1962 }, { "epoch": 1.6635593220338984, "grad_norm": 0.9550151228904724, "learning_rate": 2.227401129943503e-05, "loss": 2.8346, "step": 1963 }, { "epoch": 1.664406779661017, "grad_norm": 1.198732614517212, "learning_rate": 2.2259887005649718e-05, "loss": 2.6654, "step": 1964 }, { "epoch": 1.6652542372881356, "grad_norm": 1.1893459558486938, "learning_rate": 2.2245762711864408e-05, "loss": 2.7053, "step": 1965 }, { "epoch": 1.6661016949152543, "grad_norm": 1.078377604484558, "learning_rate": 2.2231638418079098e-05, "loss": 2.7485, "step": 1966 }, { "epoch": 1.6669491525423727, "grad_norm": 1.2355225086212158, "learning_rate": 2.2217514124293788e-05, "loss": 2.6898, "step": 1967 }, { "epoch": 1.6677966101694914, "grad_norm": 1.1316275596618652, "learning_rate": 2.2203389830508474e-05, "loss": 2.737, "step": 1968 }, { "epoch": 1.6686440677966101, "grad_norm": 1.065366506576538, "learning_rate": 2.2189265536723164e-05, "loss": 2.7938, "step": 1969 }, { "epoch": 1.6694915254237288, "grad_norm": 1.3895978927612305, "learning_rate": 2.2175141242937854e-05, "loss": 2.5969, "step": 1970 }, { "epoch": 1.6703389830508475, "grad_norm": 1.1037791967391968, "learning_rate": 2.2161016949152544e-05, "loss": 2.8264, "step": 1971 }, { "epoch": 1.6711864406779662, "grad_norm": 1.2196025848388672, "learning_rate": 2.214689265536723e-05, "loss": 2.8404, "step": 1972 }, { "epoch": 1.6720338983050849, "grad_norm": 1.1980524063110352, "learning_rate": 2.213276836158192e-05, "loss": 2.7032, "step": 1973 }, { "epoch": 1.6728813559322036, "grad_norm": 1.2860914468765259, "learning_rate": 2.211864406779661e-05, "loss": 2.7491, "step": 1974 }, { "epoch": 1.673728813559322, "grad_norm": 1.0300694704055786, "learning_rate": 2.21045197740113e-05, "loss": 2.679, "step": 1975 }, { "epoch": 1.6745762711864407, "grad_norm": 1.0046870708465576, "learning_rate": 2.209039548022599e-05, "loss": 2.6776, "step": 1976 }, { "epoch": 1.6754237288135592, "grad_norm": 1.3938623666763306, "learning_rate": 2.2076271186440678e-05, "loss": 2.6404, "step": 1977 }, { "epoch": 1.6762711864406779, "grad_norm": 1.0191386938095093, "learning_rate": 2.206214689265537e-05, "loss": 2.8735, "step": 1978 }, { "epoch": 1.6771186440677965, "grad_norm": 1.0004791021347046, "learning_rate": 2.2048022598870058e-05, "loss": 2.7898, "step": 1979 }, { "epoch": 1.6779661016949152, "grad_norm": 1.1002367734909058, "learning_rate": 2.2033898305084748e-05, "loss": 2.6758, "step": 1980 }, { "epoch": 1.678813559322034, "grad_norm": 1.1558040380477905, "learning_rate": 2.2019774011299434e-05, "loss": 2.689, "step": 1981 }, { "epoch": 1.6796610169491526, "grad_norm": 1.2505487203598022, "learning_rate": 2.2005649717514127e-05, "loss": 2.6016, "step": 1982 }, { "epoch": 1.6805084745762713, "grad_norm": 1.2562954425811768, "learning_rate": 2.1991525423728814e-05, "loss": 2.6221, "step": 1983 }, { "epoch": 1.68135593220339, "grad_norm": 1.0955479145050049, "learning_rate": 2.1977401129943504e-05, "loss": 2.7393, "step": 1984 }, { "epoch": 1.6822033898305084, "grad_norm": 1.326519250869751, "learning_rate": 2.196327683615819e-05, "loss": 2.6772, "step": 1985 }, { "epoch": 1.6830508474576271, "grad_norm": 1.0585700273513794, "learning_rate": 2.1949152542372884e-05, "loss": 2.7725, "step": 1986 }, { "epoch": 1.6838983050847458, "grad_norm": 0.9212808609008789, "learning_rate": 2.193502824858757e-05, "loss": 2.7984, "step": 1987 }, { "epoch": 1.6847457627118643, "grad_norm": 1.2387858629226685, "learning_rate": 2.192090395480226e-05, "loss": 2.7233, "step": 1988 }, { "epoch": 1.685593220338983, "grad_norm": 1.1236997842788696, "learning_rate": 2.190677966101695e-05, "loss": 2.5904, "step": 1989 }, { "epoch": 1.6864406779661016, "grad_norm": 1.1906774044036865, "learning_rate": 2.189265536723164e-05, "loss": 2.7381, "step": 1990 }, { "epoch": 1.6872881355932203, "grad_norm": 1.161232352256775, "learning_rate": 2.1878531073446327e-05, "loss": 2.5432, "step": 1991 }, { "epoch": 1.688135593220339, "grad_norm": 0.8911912441253662, "learning_rate": 2.1864406779661017e-05, "loss": 2.8894, "step": 1992 }, { "epoch": 1.6889830508474577, "grad_norm": 0.9659197330474854, "learning_rate": 2.1850282485875707e-05, "loss": 2.7261, "step": 1993 }, { "epoch": 1.6898305084745764, "grad_norm": 1.0060009956359863, "learning_rate": 2.1836158192090397e-05, "loss": 2.7793, "step": 1994 }, { "epoch": 1.690677966101695, "grad_norm": 1.2250174283981323, "learning_rate": 2.1822033898305087e-05, "loss": 2.6318, "step": 1995 }, { "epoch": 1.6915254237288135, "grad_norm": 1.129069209098816, "learning_rate": 2.1807909604519774e-05, "loss": 2.7596, "step": 1996 }, { "epoch": 1.6923728813559322, "grad_norm": 1.1996171474456787, "learning_rate": 2.1793785310734464e-05, "loss": 2.6167, "step": 1997 }, { "epoch": 1.6932203389830507, "grad_norm": 1.1588618755340576, "learning_rate": 2.1779661016949154e-05, "loss": 2.7102, "step": 1998 }, { "epoch": 1.6940677966101694, "grad_norm": 1.0075221061706543, "learning_rate": 2.1765536723163844e-05, "loss": 2.7758, "step": 1999 }, { "epoch": 1.694915254237288, "grad_norm": 1.0586878061294556, "learning_rate": 2.175141242937853e-05, "loss": 2.8316, "step": 2000 }, { "epoch": 1.6957627118644067, "grad_norm": 1.0339900255203247, "learning_rate": 2.173728813559322e-05, "loss": 2.6571, "step": 2001 }, { "epoch": 1.6966101694915254, "grad_norm": 0.967481791973114, "learning_rate": 2.172316384180791e-05, "loss": 2.869, "step": 2002 }, { "epoch": 1.6974576271186441, "grad_norm": 0.8604980707168579, "learning_rate": 2.17090395480226e-05, "loss": 2.8952, "step": 2003 }, { "epoch": 1.6983050847457628, "grad_norm": 1.0698426961898804, "learning_rate": 2.1694915254237287e-05, "loss": 2.6932, "step": 2004 }, { "epoch": 1.6991525423728815, "grad_norm": 1.254738211631775, "learning_rate": 2.1680790960451977e-05, "loss": 2.8027, "step": 2005 }, { "epoch": 1.7, "grad_norm": 1.1405632495880127, "learning_rate": 2.1666666666666667e-05, "loss": 2.6379, "step": 2006 }, { "epoch": 1.7008474576271186, "grad_norm": 1.1888066530227661, "learning_rate": 2.1652542372881357e-05, "loss": 2.5186, "step": 2007 }, { "epoch": 1.7016949152542373, "grad_norm": 1.0663483142852783, "learning_rate": 2.1638418079096047e-05, "loss": 2.6905, "step": 2008 }, { "epoch": 1.7025423728813558, "grad_norm": 1.0556362867355347, "learning_rate": 2.1624293785310734e-05, "loss": 2.8582, "step": 2009 }, { "epoch": 1.7033898305084745, "grad_norm": 0.9792917966842651, "learning_rate": 2.1610169491525427e-05, "loss": 2.7174, "step": 2010 }, { "epoch": 1.7042372881355932, "grad_norm": 0.9167950749397278, "learning_rate": 2.1596045197740114e-05, "loss": 2.7645, "step": 2011 }, { "epoch": 1.7050847457627119, "grad_norm": 1.2853854894638062, "learning_rate": 2.1581920903954804e-05, "loss": 2.6393, "step": 2012 }, { "epoch": 1.7059322033898305, "grad_norm": 1.1802425384521484, "learning_rate": 2.1567796610169494e-05, "loss": 2.8005, "step": 2013 }, { "epoch": 1.7067796610169492, "grad_norm": 1.1032609939575195, "learning_rate": 2.1553672316384184e-05, "loss": 2.7769, "step": 2014 }, { "epoch": 1.707627118644068, "grad_norm": 1.0899887084960938, "learning_rate": 2.153954802259887e-05, "loss": 2.7425, "step": 2015 }, { "epoch": 1.7084745762711866, "grad_norm": 1.0336703062057495, "learning_rate": 2.152542372881356e-05, "loss": 2.9216, "step": 2016 }, { "epoch": 1.709322033898305, "grad_norm": 1.4915704727172852, "learning_rate": 2.151129943502825e-05, "loss": 2.5933, "step": 2017 }, { "epoch": 1.7101694915254237, "grad_norm": 1.213881015777588, "learning_rate": 2.149717514124294e-05, "loss": 2.6736, "step": 2018 }, { "epoch": 1.7110169491525422, "grad_norm": 1.058539628982544, "learning_rate": 2.1483050847457627e-05, "loss": 2.9701, "step": 2019 }, { "epoch": 1.711864406779661, "grad_norm": 0.9809156060218811, "learning_rate": 2.1468926553672317e-05, "loss": 2.7857, "step": 2020 }, { "epoch": 1.7127118644067796, "grad_norm": 0.9568430781364441, "learning_rate": 2.1454802259887007e-05, "loss": 2.7869, "step": 2021 }, { "epoch": 1.7135593220338983, "grad_norm": 1.3682050704956055, "learning_rate": 2.1440677966101697e-05, "loss": 2.6752, "step": 2022 }, { "epoch": 1.714406779661017, "grad_norm": 0.9975570440292358, "learning_rate": 2.1426553672316387e-05, "loss": 2.8756, "step": 2023 }, { "epoch": 1.7152542372881356, "grad_norm": 1.389093279838562, "learning_rate": 2.1412429378531073e-05, "loss": 2.7175, "step": 2024 }, { "epoch": 1.7161016949152543, "grad_norm": 1.0701448917388916, "learning_rate": 2.1398305084745763e-05, "loss": 2.7378, "step": 2025 }, { "epoch": 1.716949152542373, "grad_norm": 0.9431941509246826, "learning_rate": 2.1384180790960453e-05, "loss": 2.8699, "step": 2026 }, { "epoch": 1.7177966101694915, "grad_norm": 1.179017186164856, "learning_rate": 2.1370056497175143e-05, "loss": 2.6515, "step": 2027 }, { "epoch": 1.7186440677966102, "grad_norm": 1.029389500617981, "learning_rate": 2.135593220338983e-05, "loss": 2.6199, "step": 2028 }, { "epoch": 1.7194915254237289, "grad_norm": 1.1984156370162964, "learning_rate": 2.1341807909604523e-05, "loss": 2.7407, "step": 2029 }, { "epoch": 1.7203389830508473, "grad_norm": 1.0471163988113403, "learning_rate": 2.132768361581921e-05, "loss": 2.9038, "step": 2030 }, { "epoch": 1.721186440677966, "grad_norm": 1.1695588827133179, "learning_rate": 2.13135593220339e-05, "loss": 2.6905, "step": 2031 }, { "epoch": 1.7220338983050847, "grad_norm": 1.046454668045044, "learning_rate": 2.1299435028248587e-05, "loss": 2.7893, "step": 2032 }, { "epoch": 1.7228813559322034, "grad_norm": 1.100285530090332, "learning_rate": 2.128531073446328e-05, "loss": 2.7836, "step": 2033 }, { "epoch": 1.723728813559322, "grad_norm": 1.6512371301651, "learning_rate": 2.1271186440677967e-05, "loss": 2.5651, "step": 2034 }, { "epoch": 1.7245762711864407, "grad_norm": 1.2220083475112915, "learning_rate": 2.1257062146892657e-05, "loss": 2.7178, "step": 2035 }, { "epoch": 1.7254237288135594, "grad_norm": 0.9805908799171448, "learning_rate": 2.1242937853107343e-05, "loss": 2.7406, "step": 2036 }, { "epoch": 1.7262711864406781, "grad_norm": 0.9179597496986389, "learning_rate": 2.1228813559322037e-05, "loss": 2.7452, "step": 2037 }, { "epoch": 1.7271186440677966, "grad_norm": 0.8249093890190125, "learning_rate": 2.1214689265536723e-05, "loss": 2.9393, "step": 2038 }, { "epoch": 1.7279661016949153, "grad_norm": 1.2761414051055908, "learning_rate": 2.1200564971751413e-05, "loss": 2.6703, "step": 2039 }, { "epoch": 1.7288135593220337, "grad_norm": 1.178770899772644, "learning_rate": 2.1186440677966103e-05, "loss": 2.6835, "step": 2040 }, { "epoch": 1.7296610169491524, "grad_norm": 1.1519194841384888, "learning_rate": 2.1172316384180793e-05, "loss": 2.8016, "step": 2041 }, { "epoch": 1.730508474576271, "grad_norm": 1.166398525238037, "learning_rate": 2.1158192090395483e-05, "loss": 2.7879, "step": 2042 }, { "epoch": 1.7313559322033898, "grad_norm": 0.9436035752296448, "learning_rate": 2.114406779661017e-05, "loss": 2.7919, "step": 2043 }, { "epoch": 1.7322033898305085, "grad_norm": 1.119163155555725, "learning_rate": 2.112994350282486e-05, "loss": 2.6872, "step": 2044 }, { "epoch": 1.7330508474576272, "grad_norm": 1.446716070175171, "learning_rate": 2.111581920903955e-05, "loss": 2.5478, "step": 2045 }, { "epoch": 1.7338983050847459, "grad_norm": 1.1846516132354736, "learning_rate": 2.110169491525424e-05, "loss": 2.768, "step": 2046 }, { "epoch": 1.7347457627118645, "grad_norm": 0.8903865218162537, "learning_rate": 2.1087570621468926e-05, "loss": 2.803, "step": 2047 }, { "epoch": 1.735593220338983, "grad_norm": 1.2745734453201294, "learning_rate": 2.1073446327683616e-05, "loss": 2.6938, "step": 2048 }, { "epoch": 1.7364406779661017, "grad_norm": 1.1986397504806519, "learning_rate": 2.1059322033898306e-05, "loss": 2.7291, "step": 2049 }, { "epoch": 1.7372881355932204, "grad_norm": 1.3172969818115234, "learning_rate": 2.1045197740112996e-05, "loss": 2.5729, "step": 2050 }, { "epoch": 1.7381355932203388, "grad_norm": 1.0908544063568115, "learning_rate": 2.1031073446327683e-05, "loss": 2.8598, "step": 2051 }, { "epoch": 1.7389830508474575, "grad_norm": 1.2555841207504272, "learning_rate": 2.1016949152542373e-05, "loss": 2.6042, "step": 2052 }, { "epoch": 1.7398305084745762, "grad_norm": 1.2310426235198975, "learning_rate": 2.1002824858757063e-05, "loss": 2.5916, "step": 2053 }, { "epoch": 1.740677966101695, "grad_norm": 0.9163360595703125, "learning_rate": 2.0988700564971753e-05, "loss": 2.8336, "step": 2054 }, { "epoch": 1.7415254237288136, "grad_norm": 1.045538306236267, "learning_rate": 2.0974576271186443e-05, "loss": 2.8838, "step": 2055 }, { "epoch": 1.7423728813559323, "grad_norm": 1.1291589736938477, "learning_rate": 2.096045197740113e-05, "loss": 2.7363, "step": 2056 }, { "epoch": 1.743220338983051, "grad_norm": 1.0940430164337158, "learning_rate": 2.0946327683615823e-05, "loss": 2.7447, "step": 2057 }, { "epoch": 1.7440677966101696, "grad_norm": 1.12651789188385, "learning_rate": 2.093220338983051e-05, "loss": 2.7276, "step": 2058 }, { "epoch": 1.744915254237288, "grad_norm": 1.0112786293029785, "learning_rate": 2.09180790960452e-05, "loss": 2.8141, "step": 2059 }, { "epoch": 1.7457627118644068, "grad_norm": 1.320303201675415, "learning_rate": 2.0903954802259886e-05, "loss": 2.5977, "step": 2060 }, { "epoch": 1.7466101694915255, "grad_norm": 1.361250638961792, "learning_rate": 2.088983050847458e-05, "loss": 2.6541, "step": 2061 }, { "epoch": 1.747457627118644, "grad_norm": 1.0819406509399414, "learning_rate": 2.0875706214689266e-05, "loss": 2.83, "step": 2062 }, { "epoch": 1.7483050847457626, "grad_norm": 1.2149043083190918, "learning_rate": 2.0861581920903956e-05, "loss": 2.6023, "step": 2063 }, { "epoch": 1.7491525423728813, "grad_norm": 0.9299839735031128, "learning_rate": 2.0847457627118643e-05, "loss": 2.8601, "step": 2064 }, { "epoch": 1.75, "grad_norm": 0.8682264685630798, "learning_rate": 2.0833333333333336e-05, "loss": 2.8056, "step": 2065 }, { "epoch": 1.7508474576271187, "grad_norm": 1.0110892057418823, "learning_rate": 2.0819209039548023e-05, "loss": 2.7593, "step": 2066 }, { "epoch": 1.7516949152542374, "grad_norm": 1.448298454284668, "learning_rate": 2.0805084745762713e-05, "loss": 2.341, "step": 2067 }, { "epoch": 1.752542372881356, "grad_norm": 0.9779382944107056, "learning_rate": 2.0790960451977403e-05, "loss": 2.7048, "step": 2068 }, { "epoch": 1.7533898305084745, "grad_norm": 1.1361010074615479, "learning_rate": 2.0776836158192093e-05, "loss": 2.6464, "step": 2069 }, { "epoch": 1.7542372881355932, "grad_norm": 1.1303348541259766, "learning_rate": 2.076271186440678e-05, "loss": 2.7549, "step": 2070 }, { "epoch": 1.755084745762712, "grad_norm": 1.1424024105072021, "learning_rate": 2.074858757062147e-05, "loss": 2.7068, "step": 2071 }, { "epoch": 1.7559322033898304, "grad_norm": 0.9543996453285217, "learning_rate": 2.073446327683616e-05, "loss": 2.7092, "step": 2072 }, { "epoch": 1.756779661016949, "grad_norm": 1.3549798727035522, "learning_rate": 2.072033898305085e-05, "loss": 2.6977, "step": 2073 }, { "epoch": 1.7576271186440677, "grad_norm": 0.9426900744438171, "learning_rate": 2.070621468926554e-05, "loss": 2.8266, "step": 2074 }, { "epoch": 1.7584745762711864, "grad_norm": 0.9650610089302063, "learning_rate": 2.0692090395480226e-05, "loss": 2.7062, "step": 2075 }, { "epoch": 1.759322033898305, "grad_norm": 1.1414823532104492, "learning_rate": 2.0677966101694916e-05, "loss": 2.692, "step": 2076 }, { "epoch": 1.7601694915254238, "grad_norm": 1.1584335565567017, "learning_rate": 2.0663841807909606e-05, "loss": 2.7301, "step": 2077 }, { "epoch": 1.7610169491525425, "grad_norm": 1.273834466934204, "learning_rate": 2.0649717514124296e-05, "loss": 2.6977, "step": 2078 }, { "epoch": 1.7618644067796612, "grad_norm": 1.0295066833496094, "learning_rate": 2.0635593220338982e-05, "loss": 2.7839, "step": 2079 }, { "epoch": 1.7627118644067796, "grad_norm": 1.2004121541976929, "learning_rate": 2.0621468926553672e-05, "loss": 2.7101, "step": 2080 }, { "epoch": 1.7635593220338983, "grad_norm": 1.094724416732788, "learning_rate": 2.0607344632768362e-05, "loss": 2.823, "step": 2081 }, { "epoch": 1.764406779661017, "grad_norm": 1.1883249282836914, "learning_rate": 2.0593220338983052e-05, "loss": 2.7372, "step": 2082 }, { "epoch": 1.7652542372881355, "grad_norm": 1.0656414031982422, "learning_rate": 2.057909604519774e-05, "loss": 2.9454, "step": 2083 }, { "epoch": 1.7661016949152541, "grad_norm": 0.9597123265266418, "learning_rate": 2.056497175141243e-05, "loss": 2.8202, "step": 2084 }, { "epoch": 1.7669491525423728, "grad_norm": 1.1121854782104492, "learning_rate": 2.055084745762712e-05, "loss": 2.8834, "step": 2085 }, { "epoch": 1.7677966101694915, "grad_norm": 1.2816334962844849, "learning_rate": 2.053672316384181e-05, "loss": 2.6536, "step": 2086 }, { "epoch": 1.7686440677966102, "grad_norm": 0.9627737402915955, "learning_rate": 2.05225988700565e-05, "loss": 2.8593, "step": 2087 }, { "epoch": 1.769491525423729, "grad_norm": 1.2878440618515015, "learning_rate": 2.0508474576271186e-05, "loss": 2.6157, "step": 2088 }, { "epoch": 1.7703389830508476, "grad_norm": 0.9968247413635254, "learning_rate": 2.049435028248588e-05, "loss": 3.0156, "step": 2089 }, { "epoch": 1.7711864406779663, "grad_norm": 1.215289831161499, "learning_rate": 2.0480225988700566e-05, "loss": 2.833, "step": 2090 }, { "epoch": 1.7720338983050847, "grad_norm": 1.0854917764663696, "learning_rate": 2.0466101694915256e-05, "loss": 2.8132, "step": 2091 }, { "epoch": 1.7728813559322034, "grad_norm": 0.9769462943077087, "learning_rate": 2.0451977401129946e-05, "loss": 2.7863, "step": 2092 }, { "epoch": 1.7737288135593219, "grad_norm": 1.1860283613204956, "learning_rate": 2.0437853107344636e-05, "loss": 2.6353, "step": 2093 }, { "epoch": 1.7745762711864406, "grad_norm": 1.1719470024108887, "learning_rate": 2.0423728813559322e-05, "loss": 2.6009, "step": 2094 }, { "epoch": 1.7754237288135593, "grad_norm": 1.058423638343811, "learning_rate": 2.0409604519774012e-05, "loss": 2.6743, "step": 2095 }, { "epoch": 1.776271186440678, "grad_norm": 0.9297478795051575, "learning_rate": 2.0395480225988702e-05, "loss": 2.7433, "step": 2096 }, { "epoch": 1.7771186440677966, "grad_norm": 0.9432790875434875, "learning_rate": 2.0381355932203392e-05, "loss": 2.8175, "step": 2097 }, { "epoch": 1.7779661016949153, "grad_norm": 1.2309918403625488, "learning_rate": 2.036723163841808e-05, "loss": 2.5988, "step": 2098 }, { "epoch": 1.778813559322034, "grad_norm": 1.0979061126708984, "learning_rate": 2.035310734463277e-05, "loss": 2.7199, "step": 2099 }, { "epoch": 1.7796610169491527, "grad_norm": 1.209404706954956, "learning_rate": 2.033898305084746e-05, "loss": 2.774, "step": 2100 }, { "epoch": 1.7805084745762711, "grad_norm": 1.371813178062439, "learning_rate": 2.032485875706215e-05, "loss": 2.6455, "step": 2101 }, { "epoch": 1.7813559322033898, "grad_norm": 1.0844600200653076, "learning_rate": 2.031073446327684e-05, "loss": 2.6517, "step": 2102 }, { "epoch": 1.7822033898305085, "grad_norm": 1.1611357927322388, "learning_rate": 2.0296610169491525e-05, "loss": 2.7386, "step": 2103 }, { "epoch": 1.783050847457627, "grad_norm": 1.02379310131073, "learning_rate": 2.0282485875706215e-05, "loss": 2.8435, "step": 2104 }, { "epoch": 1.7838983050847457, "grad_norm": 0.9997214674949646, "learning_rate": 2.0268361581920905e-05, "loss": 2.7857, "step": 2105 }, { "epoch": 1.7847457627118644, "grad_norm": 1.022742748260498, "learning_rate": 2.0254237288135595e-05, "loss": 2.7433, "step": 2106 }, { "epoch": 1.785593220338983, "grad_norm": 1.0071974992752075, "learning_rate": 2.0240112994350282e-05, "loss": 2.8102, "step": 2107 }, { "epoch": 1.7864406779661017, "grad_norm": 1.0485122203826904, "learning_rate": 2.0225988700564975e-05, "loss": 2.748, "step": 2108 }, { "epoch": 1.7872881355932204, "grad_norm": 1.1566766500473022, "learning_rate": 2.0211864406779662e-05, "loss": 2.7775, "step": 2109 }, { "epoch": 1.788135593220339, "grad_norm": 1.1086699962615967, "learning_rate": 2.0197740112994352e-05, "loss": 2.6061, "step": 2110 }, { "epoch": 1.7889830508474578, "grad_norm": 1.3776793479919434, "learning_rate": 2.018361581920904e-05, "loss": 2.5477, "step": 2111 }, { "epoch": 1.7898305084745763, "grad_norm": 1.1087639331817627, "learning_rate": 2.0169491525423732e-05, "loss": 2.6624, "step": 2112 }, { "epoch": 1.790677966101695, "grad_norm": 1.152855396270752, "learning_rate": 2.015536723163842e-05, "loss": 2.7034, "step": 2113 }, { "epoch": 1.7915254237288134, "grad_norm": 1.210541844367981, "learning_rate": 2.014124293785311e-05, "loss": 2.6426, "step": 2114 }, { "epoch": 1.792372881355932, "grad_norm": 1.0268137454986572, "learning_rate": 2.0127118644067795e-05, "loss": 2.7474, "step": 2115 }, { "epoch": 1.7932203389830508, "grad_norm": 0.939582884311676, "learning_rate": 2.011299435028249e-05, "loss": 2.9162, "step": 2116 }, { "epoch": 1.7940677966101695, "grad_norm": 1.0411386489868164, "learning_rate": 2.0098870056497175e-05, "loss": 2.8323, "step": 2117 }, { "epoch": 1.7949152542372881, "grad_norm": 0.9271771907806396, "learning_rate": 2.0084745762711865e-05, "loss": 2.8062, "step": 2118 }, { "epoch": 1.7957627118644068, "grad_norm": 1.2471201419830322, "learning_rate": 2.0070621468926555e-05, "loss": 2.5155, "step": 2119 }, { "epoch": 1.7966101694915255, "grad_norm": 1.2951605319976807, "learning_rate": 2.0056497175141245e-05, "loss": 2.728, "step": 2120 }, { "epoch": 1.7974576271186442, "grad_norm": 1.287815809249878, "learning_rate": 2.0042372881355935e-05, "loss": 2.6545, "step": 2121 }, { "epoch": 1.7983050847457627, "grad_norm": 1.2197237014770508, "learning_rate": 2.002824858757062e-05, "loss": 2.7637, "step": 2122 }, { "epoch": 1.7991525423728814, "grad_norm": 1.4295421838760376, "learning_rate": 2.001412429378531e-05, "loss": 2.403, "step": 2123 }, { "epoch": 1.8, "grad_norm": 1.1763114929199219, "learning_rate": 2e-05, "loss": 2.7786, "step": 2124 }, { "epoch": 1.8008474576271185, "grad_norm": 1.3243646621704102, "learning_rate": 1.998587570621469e-05, "loss": 2.5181, "step": 2125 }, { "epoch": 1.8016949152542372, "grad_norm": 1.274304986000061, "learning_rate": 1.9971751412429378e-05, "loss": 2.694, "step": 2126 }, { "epoch": 1.8025423728813559, "grad_norm": 0.9818578958511353, "learning_rate": 1.9957627118644068e-05, "loss": 2.8525, "step": 2127 }, { "epoch": 1.8033898305084746, "grad_norm": 0.9407678842544556, "learning_rate": 1.9943502824858758e-05, "loss": 2.8587, "step": 2128 }, { "epoch": 1.8042372881355933, "grad_norm": 1.1443729400634766, "learning_rate": 1.9929378531073448e-05, "loss": 2.7655, "step": 2129 }, { "epoch": 1.805084745762712, "grad_norm": 1.6161749362945557, "learning_rate": 1.9915254237288135e-05, "loss": 2.4994, "step": 2130 }, { "epoch": 1.8059322033898306, "grad_norm": 0.8980959057807922, "learning_rate": 1.9901129943502825e-05, "loss": 2.9398, "step": 2131 }, { "epoch": 1.8067796610169493, "grad_norm": 1.040073037147522, "learning_rate": 1.9887005649717515e-05, "loss": 2.7628, "step": 2132 }, { "epoch": 1.8076271186440678, "grad_norm": 0.876245379447937, "learning_rate": 1.9872881355932205e-05, "loss": 2.8709, "step": 2133 }, { "epoch": 1.8084745762711865, "grad_norm": 1.258423089981079, "learning_rate": 1.9858757062146895e-05, "loss": 2.6829, "step": 2134 }, { "epoch": 1.809322033898305, "grad_norm": 1.1346708536148071, "learning_rate": 1.984463276836158e-05, "loss": 2.6062, "step": 2135 }, { "epoch": 1.8101694915254236, "grad_norm": 1.0326768159866333, "learning_rate": 1.9830508474576275e-05, "loss": 2.8353, "step": 2136 }, { "epoch": 1.8110169491525423, "grad_norm": 1.1588972806930542, "learning_rate": 1.981638418079096e-05, "loss": 2.7704, "step": 2137 }, { "epoch": 1.811864406779661, "grad_norm": 1.1882299184799194, "learning_rate": 1.980225988700565e-05, "loss": 2.756, "step": 2138 }, { "epoch": 1.8127118644067797, "grad_norm": 1.1389881372451782, "learning_rate": 1.9788135593220338e-05, "loss": 2.7587, "step": 2139 }, { "epoch": 1.8135593220338984, "grad_norm": 1.2022271156311035, "learning_rate": 1.977401129943503e-05, "loss": 2.7051, "step": 2140 }, { "epoch": 1.814406779661017, "grad_norm": 0.979529857635498, "learning_rate": 1.9759887005649718e-05, "loss": 2.7192, "step": 2141 }, { "epoch": 1.8152542372881357, "grad_norm": 1.2489396333694458, "learning_rate": 1.9745762711864408e-05, "loss": 2.6634, "step": 2142 }, { "epoch": 1.8161016949152542, "grad_norm": 1.022739291191101, "learning_rate": 1.9731638418079095e-05, "loss": 2.8749, "step": 2143 }, { "epoch": 1.8169491525423729, "grad_norm": 1.1680999994277954, "learning_rate": 1.9717514124293788e-05, "loss": 2.7571, "step": 2144 }, { "epoch": 1.8177966101694916, "grad_norm": 0.9415079355239868, "learning_rate": 1.9703389830508475e-05, "loss": 2.9925, "step": 2145 }, { "epoch": 1.81864406779661, "grad_norm": 1.0888499021530151, "learning_rate": 1.9689265536723165e-05, "loss": 2.8189, "step": 2146 }, { "epoch": 1.8194915254237287, "grad_norm": 1.1665993928909302, "learning_rate": 1.9675141242937855e-05, "loss": 2.7571, "step": 2147 }, { "epoch": 1.8203389830508474, "grad_norm": 0.8712019920349121, "learning_rate": 1.9661016949152545e-05, "loss": 2.8911, "step": 2148 }, { "epoch": 1.821186440677966, "grad_norm": 1.4699656963348389, "learning_rate": 1.964689265536723e-05, "loss": 2.4671, "step": 2149 }, { "epoch": 1.8220338983050848, "grad_norm": 1.2239631414413452, "learning_rate": 1.963276836158192e-05, "loss": 2.7121, "step": 2150 }, { "epoch": 1.8228813559322035, "grad_norm": 0.9783174395561218, "learning_rate": 1.961864406779661e-05, "loss": 2.7338, "step": 2151 }, { "epoch": 1.8237288135593221, "grad_norm": 1.070614218711853, "learning_rate": 1.96045197740113e-05, "loss": 2.7555, "step": 2152 }, { "epoch": 1.8245762711864408, "grad_norm": 1.1637415885925293, "learning_rate": 1.959039548022599e-05, "loss": 2.5712, "step": 2153 }, { "epoch": 1.8254237288135593, "grad_norm": 1.3556163311004639, "learning_rate": 1.9576271186440678e-05, "loss": 2.5231, "step": 2154 }, { "epoch": 1.826271186440678, "grad_norm": 1.3749444484710693, "learning_rate": 1.9562146892655368e-05, "loss": 2.6462, "step": 2155 }, { "epoch": 1.8271186440677964, "grad_norm": 0.9771831631660461, "learning_rate": 1.9548022598870058e-05, "loss": 2.7385, "step": 2156 }, { "epoch": 1.8279661016949151, "grad_norm": 1.2296011447906494, "learning_rate": 1.9533898305084748e-05, "loss": 2.6671, "step": 2157 }, { "epoch": 1.8288135593220338, "grad_norm": 1.0204658508300781, "learning_rate": 1.9519774011299434e-05, "loss": 2.7002, "step": 2158 }, { "epoch": 1.8296610169491525, "grad_norm": 1.3096648454666138, "learning_rate": 1.9505649717514124e-05, "loss": 2.6166, "step": 2159 }, { "epoch": 1.8305084745762712, "grad_norm": 1.095273733139038, "learning_rate": 1.9491525423728814e-05, "loss": 2.6907, "step": 2160 }, { "epoch": 1.8313559322033899, "grad_norm": 1.1508970260620117, "learning_rate": 1.9477401129943504e-05, "loss": 2.584, "step": 2161 }, { "epoch": 1.8322033898305086, "grad_norm": 1.021013617515564, "learning_rate": 1.946327683615819e-05, "loss": 2.7443, "step": 2162 }, { "epoch": 1.8330508474576273, "grad_norm": 1.1462894678115845, "learning_rate": 1.944915254237288e-05, "loss": 2.7603, "step": 2163 }, { "epoch": 1.8338983050847457, "grad_norm": 1.1708370447158813, "learning_rate": 1.943502824858757e-05, "loss": 2.7284, "step": 2164 }, { "epoch": 1.8347457627118644, "grad_norm": 1.0491888523101807, "learning_rate": 1.942090395480226e-05, "loss": 2.7925, "step": 2165 }, { "epoch": 1.835593220338983, "grad_norm": 1.0713671445846558, "learning_rate": 1.940677966101695e-05, "loss": 2.8049, "step": 2166 }, { "epoch": 1.8364406779661016, "grad_norm": 0.9071455001831055, "learning_rate": 1.9392655367231638e-05, "loss": 2.8179, "step": 2167 }, { "epoch": 1.8372881355932202, "grad_norm": 1.0441315174102783, "learning_rate": 1.937853107344633e-05, "loss": 2.7795, "step": 2168 }, { "epoch": 1.838135593220339, "grad_norm": 1.1267039775848389, "learning_rate": 1.9364406779661017e-05, "loss": 2.8345, "step": 2169 }, { "epoch": 1.8389830508474576, "grad_norm": 0.9966780543327332, "learning_rate": 1.9350282485875707e-05, "loss": 2.8733, "step": 2170 }, { "epoch": 1.8398305084745763, "grad_norm": 0.9944255352020264, "learning_rate": 1.9336158192090394e-05, "loss": 2.8219, "step": 2171 }, { "epoch": 1.840677966101695, "grad_norm": 1.210039734840393, "learning_rate": 1.9322033898305087e-05, "loss": 2.6556, "step": 2172 }, { "epoch": 1.8415254237288137, "grad_norm": 0.9125044941902161, "learning_rate": 1.9307909604519774e-05, "loss": 2.7901, "step": 2173 }, { "epoch": 1.8423728813559324, "grad_norm": 1.077428936958313, "learning_rate": 1.9293785310734464e-05, "loss": 2.6814, "step": 2174 }, { "epoch": 1.8432203389830508, "grad_norm": 1.1277340650558472, "learning_rate": 1.9279661016949154e-05, "loss": 2.7702, "step": 2175 }, { "epoch": 1.8440677966101695, "grad_norm": 0.9860967397689819, "learning_rate": 1.9265536723163844e-05, "loss": 2.8478, "step": 2176 }, { "epoch": 1.844915254237288, "grad_norm": 1.0616719722747803, "learning_rate": 1.925141242937853e-05, "loss": 2.6662, "step": 2177 }, { "epoch": 1.8457627118644067, "grad_norm": 1.067347526550293, "learning_rate": 1.923728813559322e-05, "loss": 2.6692, "step": 2178 }, { "epoch": 1.8466101694915253, "grad_norm": 1.049932837486267, "learning_rate": 1.922316384180791e-05, "loss": 2.9301, "step": 2179 }, { "epoch": 1.847457627118644, "grad_norm": 1.3768372535705566, "learning_rate": 1.92090395480226e-05, "loss": 2.6821, "step": 2180 }, { "epoch": 1.8483050847457627, "grad_norm": 1.0082629919052124, "learning_rate": 1.919491525423729e-05, "loss": 2.6566, "step": 2181 }, { "epoch": 1.8491525423728814, "grad_norm": 1.349716305732727, "learning_rate": 1.9180790960451977e-05, "loss": 2.5341, "step": 2182 }, { "epoch": 1.85, "grad_norm": 1.0393669605255127, "learning_rate": 1.9166666666666667e-05, "loss": 2.7813, "step": 2183 }, { "epoch": 1.8508474576271188, "grad_norm": 1.1654995679855347, "learning_rate": 1.9152542372881357e-05, "loss": 2.7206, "step": 2184 }, { "epoch": 1.8516949152542372, "grad_norm": 1.2510303258895874, "learning_rate": 1.9138418079096047e-05, "loss": 2.7678, "step": 2185 }, { "epoch": 1.852542372881356, "grad_norm": 1.1300528049468994, "learning_rate": 1.9124293785310734e-05, "loss": 2.8084, "step": 2186 }, { "epoch": 1.8533898305084746, "grad_norm": 1.061294436454773, "learning_rate": 1.9110169491525427e-05, "loss": 2.7987, "step": 2187 }, { "epoch": 1.854237288135593, "grad_norm": 1.3102290630340576, "learning_rate": 1.9096045197740114e-05, "loss": 2.717, "step": 2188 }, { "epoch": 1.8550847457627118, "grad_norm": 1.0602400302886963, "learning_rate": 1.9081920903954804e-05, "loss": 2.778, "step": 2189 }, { "epoch": 1.8559322033898304, "grad_norm": 1.071914553642273, "learning_rate": 1.906779661016949e-05, "loss": 2.6677, "step": 2190 }, { "epoch": 1.8567796610169491, "grad_norm": 1.200467824935913, "learning_rate": 1.9053672316384184e-05, "loss": 2.765, "step": 2191 }, { "epoch": 1.8576271186440678, "grad_norm": 1.106851577758789, "learning_rate": 1.903954802259887e-05, "loss": 2.7109, "step": 2192 }, { "epoch": 1.8584745762711865, "grad_norm": 1.20026433467865, "learning_rate": 1.902542372881356e-05, "loss": 2.6831, "step": 2193 }, { "epoch": 1.8593220338983052, "grad_norm": 1.107622742652893, "learning_rate": 1.9011299435028247e-05, "loss": 2.6599, "step": 2194 }, { "epoch": 1.8601694915254239, "grad_norm": 0.9286526441574097, "learning_rate": 1.899717514124294e-05, "loss": 2.8641, "step": 2195 }, { "epoch": 1.8610169491525423, "grad_norm": 1.001364827156067, "learning_rate": 1.8983050847457627e-05, "loss": 2.7642, "step": 2196 }, { "epoch": 1.861864406779661, "grad_norm": 1.1795566082000732, "learning_rate": 1.8968926553672317e-05, "loss": 2.7738, "step": 2197 }, { "epoch": 1.8627118644067797, "grad_norm": 0.9288800954818726, "learning_rate": 1.8954802259887007e-05, "loss": 2.7021, "step": 2198 }, { "epoch": 1.8635593220338982, "grad_norm": 1.2071678638458252, "learning_rate": 1.8940677966101697e-05, "loss": 2.7749, "step": 2199 }, { "epoch": 1.8644067796610169, "grad_norm": 1.0884045362472534, "learning_rate": 1.8926553672316387e-05, "loss": 2.7885, "step": 2200 }, { "epoch": 1.8652542372881356, "grad_norm": 1.0357964038848877, "learning_rate": 1.8912429378531074e-05, "loss": 2.7344, "step": 2201 }, { "epoch": 1.8661016949152542, "grad_norm": 1.1365445852279663, "learning_rate": 1.8898305084745764e-05, "loss": 2.7749, "step": 2202 }, { "epoch": 1.866949152542373, "grad_norm": 1.066388487815857, "learning_rate": 1.8884180790960454e-05, "loss": 2.7136, "step": 2203 }, { "epoch": 1.8677966101694916, "grad_norm": 1.2417033910751343, "learning_rate": 1.8870056497175144e-05, "loss": 2.6089, "step": 2204 }, { "epoch": 1.8686440677966103, "grad_norm": 1.0588102340698242, "learning_rate": 1.885593220338983e-05, "loss": 2.737, "step": 2205 }, { "epoch": 1.8694915254237288, "grad_norm": 1.0809686183929443, "learning_rate": 1.884180790960452e-05, "loss": 2.6101, "step": 2206 }, { "epoch": 1.8703389830508474, "grad_norm": 0.9519467353820801, "learning_rate": 1.882768361581921e-05, "loss": 2.8298, "step": 2207 }, { "epoch": 1.8711864406779661, "grad_norm": 0.9441993236541748, "learning_rate": 1.88135593220339e-05, "loss": 2.7523, "step": 2208 }, { "epoch": 1.8720338983050846, "grad_norm": 0.996285080909729, "learning_rate": 1.8799435028248587e-05, "loss": 2.8619, "step": 2209 }, { "epoch": 1.8728813559322033, "grad_norm": 1.322708010673523, "learning_rate": 1.8785310734463277e-05, "loss": 2.6528, "step": 2210 }, { "epoch": 1.873728813559322, "grad_norm": 1.1378732919692993, "learning_rate": 1.8771186440677967e-05, "loss": 2.6988, "step": 2211 }, { "epoch": 1.8745762711864407, "grad_norm": 1.2078667879104614, "learning_rate": 1.8757062146892657e-05, "loss": 2.6782, "step": 2212 }, { "epoch": 1.8754237288135593, "grad_norm": 1.134391188621521, "learning_rate": 1.8742937853107347e-05, "loss": 2.701, "step": 2213 }, { "epoch": 1.876271186440678, "grad_norm": 0.9816274642944336, "learning_rate": 1.8728813559322033e-05, "loss": 2.6723, "step": 2214 }, { "epoch": 1.8771186440677967, "grad_norm": 1.3717323541641235, "learning_rate": 1.8714689265536727e-05, "loss": 2.4816, "step": 2215 }, { "epoch": 1.8779661016949154, "grad_norm": 1.190843105316162, "learning_rate": 1.8700564971751413e-05, "loss": 2.6468, "step": 2216 }, { "epoch": 1.8788135593220339, "grad_norm": 0.9955922961235046, "learning_rate": 1.8686440677966103e-05, "loss": 2.7169, "step": 2217 }, { "epoch": 1.8796610169491526, "grad_norm": 1.3320982456207275, "learning_rate": 1.867231638418079e-05, "loss": 2.6252, "step": 2218 }, { "epoch": 1.8805084745762712, "grad_norm": 1.1454169750213623, "learning_rate": 1.8658192090395483e-05, "loss": 2.7791, "step": 2219 }, { "epoch": 1.8813559322033897, "grad_norm": 1.174757480621338, "learning_rate": 1.864406779661017e-05, "loss": 2.7574, "step": 2220 }, { "epoch": 1.8822033898305084, "grad_norm": 1.0740658044815063, "learning_rate": 1.862994350282486e-05, "loss": 2.6172, "step": 2221 }, { "epoch": 1.883050847457627, "grad_norm": 1.2753480672836304, "learning_rate": 1.8615819209039547e-05, "loss": 2.8159, "step": 2222 }, { "epoch": 1.8838983050847458, "grad_norm": 1.1446774005889893, "learning_rate": 1.860169491525424e-05, "loss": 2.7301, "step": 2223 }, { "epoch": 1.8847457627118644, "grad_norm": 1.1188921928405762, "learning_rate": 1.8587570621468927e-05, "loss": 2.6202, "step": 2224 }, { "epoch": 1.8855932203389831, "grad_norm": 0.9338502883911133, "learning_rate": 1.8573446327683617e-05, "loss": 2.7956, "step": 2225 }, { "epoch": 1.8864406779661018, "grad_norm": 1.2575122117996216, "learning_rate": 1.8559322033898307e-05, "loss": 2.613, "step": 2226 }, { "epoch": 1.8872881355932203, "grad_norm": 1.1470165252685547, "learning_rate": 1.8545197740112996e-05, "loss": 2.6774, "step": 2227 }, { "epoch": 1.888135593220339, "grad_norm": 1.2760436534881592, "learning_rate": 1.8531073446327683e-05, "loss": 2.5741, "step": 2228 }, { "epoch": 1.8889830508474577, "grad_norm": 1.1932239532470703, "learning_rate": 1.8516949152542373e-05, "loss": 2.5973, "step": 2229 }, { "epoch": 1.8898305084745761, "grad_norm": 1.0808064937591553, "learning_rate": 1.8502824858757063e-05, "loss": 2.7211, "step": 2230 }, { "epoch": 1.8906779661016948, "grad_norm": 1.3394954204559326, "learning_rate": 1.8488700564971753e-05, "loss": 2.6208, "step": 2231 }, { "epoch": 1.8915254237288135, "grad_norm": 1.260796070098877, "learning_rate": 1.8474576271186443e-05, "loss": 2.675, "step": 2232 }, { "epoch": 1.8923728813559322, "grad_norm": 0.9751023054122925, "learning_rate": 1.846045197740113e-05, "loss": 2.8524, "step": 2233 }, { "epoch": 1.8932203389830509, "grad_norm": 0.94771409034729, "learning_rate": 1.844632768361582e-05, "loss": 2.7193, "step": 2234 }, { "epoch": 1.8940677966101696, "grad_norm": 1.106333613395691, "learning_rate": 1.843220338983051e-05, "loss": 2.736, "step": 2235 }, { "epoch": 1.8949152542372882, "grad_norm": 1.285568356513977, "learning_rate": 1.84180790960452e-05, "loss": 2.6821, "step": 2236 }, { "epoch": 1.895762711864407, "grad_norm": 1.1260167360305786, "learning_rate": 1.8403954802259886e-05, "loss": 2.7211, "step": 2237 }, { "epoch": 1.8966101694915254, "grad_norm": 1.0271403789520264, "learning_rate": 1.8389830508474576e-05, "loss": 2.723, "step": 2238 }, { "epoch": 1.897457627118644, "grad_norm": 1.2635908126831055, "learning_rate": 1.8375706214689266e-05, "loss": 2.6671, "step": 2239 }, { "epoch": 1.8983050847457628, "grad_norm": 1.036975622177124, "learning_rate": 1.8361581920903956e-05, "loss": 2.7593, "step": 2240 }, { "epoch": 1.8991525423728812, "grad_norm": 1.1501283645629883, "learning_rate": 1.8347457627118643e-05, "loss": 2.6194, "step": 2241 }, { "epoch": 1.9, "grad_norm": 1.1000744104385376, "learning_rate": 1.8333333333333333e-05, "loss": 2.742, "step": 2242 }, { "epoch": 1.9008474576271186, "grad_norm": 1.2108416557312012, "learning_rate": 1.8319209039548023e-05, "loss": 2.5673, "step": 2243 }, { "epoch": 1.9016949152542373, "grad_norm": 1.5413299798965454, "learning_rate": 1.8305084745762713e-05, "loss": 2.6373, "step": 2244 }, { "epoch": 1.902542372881356, "grad_norm": 1.1783409118652344, "learning_rate": 1.8290960451977403e-05, "loss": 2.7444, "step": 2245 }, { "epoch": 1.9033898305084747, "grad_norm": 1.1007132530212402, "learning_rate": 1.827683615819209e-05, "loss": 2.6802, "step": 2246 }, { "epoch": 1.9042372881355933, "grad_norm": 1.174351692199707, "learning_rate": 1.8262711864406783e-05, "loss": 2.5913, "step": 2247 }, { "epoch": 1.905084745762712, "grad_norm": 1.317480444908142, "learning_rate": 1.824858757062147e-05, "loss": 2.7326, "step": 2248 }, { "epoch": 1.9059322033898305, "grad_norm": 1.2611827850341797, "learning_rate": 1.823446327683616e-05, "loss": 2.5288, "step": 2249 }, { "epoch": 1.9067796610169492, "grad_norm": 1.1809849739074707, "learning_rate": 1.8220338983050846e-05, "loss": 2.7133, "step": 2250 }, { "epoch": 1.9076271186440676, "grad_norm": 1.1002089977264404, "learning_rate": 1.820621468926554e-05, "loss": 2.7633, "step": 2251 }, { "epoch": 1.9084745762711863, "grad_norm": 0.9564800262451172, "learning_rate": 1.8192090395480226e-05, "loss": 2.7799, "step": 2252 }, { "epoch": 1.909322033898305, "grad_norm": 1.0163698196411133, "learning_rate": 1.8177966101694916e-05, "loss": 2.8996, "step": 2253 }, { "epoch": 1.9101694915254237, "grad_norm": 1.151449203491211, "learning_rate": 1.8163841807909606e-05, "loss": 2.6482, "step": 2254 }, { "epoch": 1.9110169491525424, "grad_norm": 1.3174258470535278, "learning_rate": 1.8149717514124296e-05, "loss": 2.6334, "step": 2255 }, { "epoch": 1.911864406779661, "grad_norm": 1.4039971828460693, "learning_rate": 1.8135593220338983e-05, "loss": 2.6215, "step": 2256 }, { "epoch": 1.9127118644067798, "grad_norm": 1.1137800216674805, "learning_rate": 1.8121468926553673e-05, "loss": 2.653, "step": 2257 }, { "epoch": 1.9135593220338984, "grad_norm": 1.1515967845916748, "learning_rate": 1.8107344632768363e-05, "loss": 2.6147, "step": 2258 }, { "epoch": 1.914406779661017, "grad_norm": 1.1500734090805054, "learning_rate": 1.8093220338983053e-05, "loss": 2.6794, "step": 2259 }, { "epoch": 1.9152542372881356, "grad_norm": 1.1761561632156372, "learning_rate": 1.8079096045197743e-05, "loss": 2.6194, "step": 2260 }, { "epoch": 1.9161016949152543, "grad_norm": 1.298223853111267, "learning_rate": 1.806497175141243e-05, "loss": 2.5401, "step": 2261 }, { "epoch": 1.9169491525423727, "grad_norm": 1.3742104768753052, "learning_rate": 1.805084745762712e-05, "loss": 2.4979, "step": 2262 }, { "epoch": 1.9177966101694914, "grad_norm": 1.0494269132614136, "learning_rate": 1.803672316384181e-05, "loss": 2.692, "step": 2263 }, { "epoch": 1.9186440677966101, "grad_norm": 1.443848967552185, "learning_rate": 1.80225988700565e-05, "loss": 2.5217, "step": 2264 }, { "epoch": 1.9194915254237288, "grad_norm": 0.9262023568153381, "learning_rate": 1.8008474576271186e-05, "loss": 2.8329, "step": 2265 }, { "epoch": 1.9203389830508475, "grad_norm": 1.0921679735183716, "learning_rate": 1.799435028248588e-05, "loss": 2.7401, "step": 2266 }, { "epoch": 1.9211864406779662, "grad_norm": 0.9958825707435608, "learning_rate": 1.7980225988700566e-05, "loss": 2.7963, "step": 2267 }, { "epoch": 1.9220338983050849, "grad_norm": 1.287933349609375, "learning_rate": 1.7966101694915256e-05, "loss": 2.6219, "step": 2268 }, { "epoch": 1.9228813559322036, "grad_norm": 1.3278027772903442, "learning_rate": 1.7951977401129942e-05, "loss": 2.6627, "step": 2269 }, { "epoch": 1.923728813559322, "grad_norm": 1.1513268947601318, "learning_rate": 1.7937853107344636e-05, "loss": 2.6723, "step": 2270 }, { "epoch": 1.9245762711864407, "grad_norm": 1.1693884134292603, "learning_rate": 1.7923728813559322e-05, "loss": 2.7443, "step": 2271 }, { "epoch": 1.9254237288135592, "grad_norm": 1.3294950723648071, "learning_rate": 1.7909604519774012e-05, "loss": 2.7371, "step": 2272 }, { "epoch": 1.9262711864406779, "grad_norm": 1.154387354850769, "learning_rate": 1.78954802259887e-05, "loss": 2.644, "step": 2273 }, { "epoch": 1.9271186440677965, "grad_norm": 1.3097997903823853, "learning_rate": 1.7881355932203392e-05, "loss": 2.4605, "step": 2274 }, { "epoch": 1.9279661016949152, "grad_norm": 1.0979993343353271, "learning_rate": 1.786723163841808e-05, "loss": 2.823, "step": 2275 }, { "epoch": 1.928813559322034, "grad_norm": 1.0449799299240112, "learning_rate": 1.785310734463277e-05, "loss": 2.8048, "step": 2276 }, { "epoch": 1.9296610169491526, "grad_norm": 1.0796915292739868, "learning_rate": 1.783898305084746e-05, "loss": 2.6155, "step": 2277 }, { "epoch": 1.9305084745762713, "grad_norm": 1.142608880996704, "learning_rate": 1.782485875706215e-05, "loss": 2.7279, "step": 2278 }, { "epoch": 1.93135593220339, "grad_norm": 1.0652730464935303, "learning_rate": 1.781073446327684e-05, "loss": 2.7479, "step": 2279 }, { "epoch": 1.9322033898305084, "grad_norm": 1.1445574760437012, "learning_rate": 1.7796610169491526e-05, "loss": 2.7113, "step": 2280 }, { "epoch": 1.9330508474576271, "grad_norm": 1.0645530223846436, "learning_rate": 1.7782485875706216e-05, "loss": 2.6893, "step": 2281 }, { "epoch": 1.9338983050847458, "grad_norm": 1.1419038772583008, "learning_rate": 1.7768361581920906e-05, "loss": 2.9107, "step": 2282 }, { "epoch": 1.9347457627118643, "grad_norm": 1.1077309846878052, "learning_rate": 1.7754237288135596e-05, "loss": 2.6608, "step": 2283 }, { "epoch": 1.935593220338983, "grad_norm": 1.1196407079696655, "learning_rate": 1.7740112994350282e-05, "loss": 2.6925, "step": 2284 }, { "epoch": 1.9364406779661016, "grad_norm": 0.935629665851593, "learning_rate": 1.7725988700564972e-05, "loss": 2.8368, "step": 2285 }, { "epoch": 1.9372881355932203, "grad_norm": 1.159534215927124, "learning_rate": 1.7711864406779662e-05, "loss": 2.7368, "step": 2286 }, { "epoch": 1.938135593220339, "grad_norm": 1.1814593076705933, "learning_rate": 1.7697740112994352e-05, "loss": 2.6718, "step": 2287 }, { "epoch": 1.9389830508474577, "grad_norm": 1.2709976434707642, "learning_rate": 1.768361581920904e-05, "loss": 2.6549, "step": 2288 }, { "epoch": 1.9398305084745764, "grad_norm": 1.1238071918487549, "learning_rate": 1.766949152542373e-05, "loss": 2.7141, "step": 2289 }, { "epoch": 1.940677966101695, "grad_norm": 1.2803257703781128, "learning_rate": 1.765536723163842e-05, "loss": 2.629, "step": 2290 }, { "epoch": 1.9415254237288135, "grad_norm": 1.1455533504486084, "learning_rate": 1.764124293785311e-05, "loss": 2.8567, "step": 2291 }, { "epoch": 1.9423728813559322, "grad_norm": 0.9419334530830383, "learning_rate": 1.76271186440678e-05, "loss": 2.9034, "step": 2292 }, { "epoch": 1.9432203389830507, "grad_norm": 1.222684383392334, "learning_rate": 1.7612994350282485e-05, "loss": 2.7279, "step": 2293 }, { "epoch": 1.9440677966101694, "grad_norm": 1.0229870080947876, "learning_rate": 1.759887005649718e-05, "loss": 2.8089, "step": 2294 }, { "epoch": 1.944915254237288, "grad_norm": 0.9789454340934753, "learning_rate": 1.7584745762711865e-05, "loss": 2.7545, "step": 2295 }, { "epoch": 1.9457627118644067, "grad_norm": 1.1572662591934204, "learning_rate": 1.7570621468926555e-05, "loss": 2.7593, "step": 2296 }, { "epoch": 1.9466101694915254, "grad_norm": 1.4241892099380493, "learning_rate": 1.7556497175141242e-05, "loss": 2.6931, "step": 2297 }, { "epoch": 1.9474576271186441, "grad_norm": 1.0359416007995605, "learning_rate": 1.7542372881355935e-05, "loss": 2.7762, "step": 2298 }, { "epoch": 1.9483050847457628, "grad_norm": 1.3145811557769775, "learning_rate": 1.7528248587570622e-05, "loss": 2.5772, "step": 2299 }, { "epoch": 1.9491525423728815, "grad_norm": 1.0483804941177368, "learning_rate": 1.7514124293785312e-05, "loss": 2.8207, "step": 2300 }, { "epoch": 1.95, "grad_norm": 0.9640621542930603, "learning_rate": 1.75e-05, "loss": 2.8818, "step": 2301 }, { "epoch": 1.9508474576271186, "grad_norm": 1.356916069984436, "learning_rate": 1.7485875706214692e-05, "loss": 2.6609, "step": 2302 }, { "epoch": 1.9516949152542373, "grad_norm": 1.158837080001831, "learning_rate": 1.747175141242938e-05, "loss": 2.8008, "step": 2303 }, { "epoch": 1.9525423728813558, "grad_norm": 1.094359278678894, "learning_rate": 1.745762711864407e-05, "loss": 2.6449, "step": 2304 }, { "epoch": 1.9533898305084745, "grad_norm": 1.1630890369415283, "learning_rate": 1.744350282485876e-05, "loss": 2.7461, "step": 2305 }, { "epoch": 1.9542372881355932, "grad_norm": 1.2461740970611572, "learning_rate": 1.742937853107345e-05, "loss": 2.7222, "step": 2306 }, { "epoch": 1.9550847457627119, "grad_norm": 0.9834041595458984, "learning_rate": 1.7415254237288135e-05, "loss": 2.8727, "step": 2307 }, { "epoch": 1.9559322033898305, "grad_norm": 1.1653308868408203, "learning_rate": 1.7401129943502825e-05, "loss": 2.7133, "step": 2308 }, { "epoch": 1.9567796610169492, "grad_norm": 1.253213882446289, "learning_rate": 1.7387005649717515e-05, "loss": 2.614, "step": 2309 }, { "epoch": 1.957627118644068, "grad_norm": 1.2413033246994019, "learning_rate": 1.7372881355932205e-05, "loss": 2.793, "step": 2310 }, { "epoch": 1.9584745762711866, "grad_norm": 1.64323890209198, "learning_rate": 1.7358757062146895e-05, "loss": 2.3969, "step": 2311 }, { "epoch": 1.959322033898305, "grad_norm": 1.1061500310897827, "learning_rate": 1.734463276836158e-05, "loss": 2.6983, "step": 2312 }, { "epoch": 1.9601694915254237, "grad_norm": 1.1904529333114624, "learning_rate": 1.733050847457627e-05, "loss": 2.599, "step": 2313 }, { "epoch": 1.9610169491525422, "grad_norm": 0.9217993021011353, "learning_rate": 1.731638418079096e-05, "loss": 2.8319, "step": 2314 }, { "epoch": 1.961864406779661, "grad_norm": 1.1838812828063965, "learning_rate": 1.730225988700565e-05, "loss": 2.6076, "step": 2315 }, { "epoch": 1.9627118644067796, "grad_norm": 1.109630823135376, "learning_rate": 1.7288135593220338e-05, "loss": 2.7433, "step": 2316 }, { "epoch": 1.9635593220338983, "grad_norm": 1.0150429010391235, "learning_rate": 1.7274011299435028e-05, "loss": 2.8878, "step": 2317 }, { "epoch": 1.964406779661017, "grad_norm": 1.0494391918182373, "learning_rate": 1.7259887005649718e-05, "loss": 2.7438, "step": 2318 }, { "epoch": 1.9652542372881356, "grad_norm": 0.9451582431793213, "learning_rate": 1.7245762711864408e-05, "loss": 2.9528, "step": 2319 }, { "epoch": 1.9661016949152543, "grad_norm": 1.250891089439392, "learning_rate": 1.7231638418079095e-05, "loss": 2.7219, "step": 2320 }, { "epoch": 1.966949152542373, "grad_norm": 1.2979968786239624, "learning_rate": 1.7217514124293785e-05, "loss": 2.7372, "step": 2321 }, { "epoch": 1.9677966101694915, "grad_norm": 1.022067666053772, "learning_rate": 1.7203389830508475e-05, "loss": 2.7496, "step": 2322 }, { "epoch": 1.9686440677966102, "grad_norm": 1.2830743789672852, "learning_rate": 1.7189265536723165e-05, "loss": 2.6263, "step": 2323 }, { "epoch": 1.9694915254237289, "grad_norm": 0.9140318036079407, "learning_rate": 1.7175141242937855e-05, "loss": 2.8402, "step": 2324 }, { "epoch": 1.9703389830508473, "grad_norm": 0.993951141834259, "learning_rate": 1.716101694915254e-05, "loss": 2.82, "step": 2325 }, { "epoch": 1.971186440677966, "grad_norm": 1.0804479122161865, "learning_rate": 1.7146892655367235e-05, "loss": 2.7957, "step": 2326 }, { "epoch": 1.9720338983050847, "grad_norm": 1.2359389066696167, "learning_rate": 1.713276836158192e-05, "loss": 2.5425, "step": 2327 }, { "epoch": 1.9728813559322034, "grad_norm": 1.4272359609603882, "learning_rate": 1.711864406779661e-05, "loss": 2.4573, "step": 2328 }, { "epoch": 1.973728813559322, "grad_norm": 1.2127587795257568, "learning_rate": 1.7104519774011298e-05, "loss": 2.615, "step": 2329 }, { "epoch": 1.9745762711864407, "grad_norm": 1.5587515830993652, "learning_rate": 1.709039548022599e-05, "loss": 2.364, "step": 2330 }, { "epoch": 1.9754237288135594, "grad_norm": 1.2130502462387085, "learning_rate": 1.7076271186440678e-05, "loss": 2.6527, "step": 2331 }, { "epoch": 1.9762711864406781, "grad_norm": 0.9383020997047424, "learning_rate": 1.7062146892655368e-05, "loss": 2.9776, "step": 2332 }, { "epoch": 1.9771186440677966, "grad_norm": 1.0962392091751099, "learning_rate": 1.7048022598870055e-05, "loss": 2.6739, "step": 2333 }, { "epoch": 1.9779661016949153, "grad_norm": 1.1072427034378052, "learning_rate": 1.7033898305084748e-05, "loss": 2.5939, "step": 2334 }, { "epoch": 1.9788135593220337, "grad_norm": 1.2364929914474487, "learning_rate": 1.7019774011299435e-05, "loss": 2.6278, "step": 2335 }, { "epoch": 1.9796610169491524, "grad_norm": 1.187670350074768, "learning_rate": 1.7005649717514125e-05, "loss": 2.6159, "step": 2336 }, { "epoch": 1.980508474576271, "grad_norm": 1.0247775316238403, "learning_rate": 1.6991525423728815e-05, "loss": 2.6774, "step": 2337 }, { "epoch": 1.9813559322033898, "grad_norm": 0.7927355766296387, "learning_rate": 1.6977401129943505e-05, "loss": 2.8451, "step": 2338 }, { "epoch": 1.9822033898305085, "grad_norm": 1.3290356397628784, "learning_rate": 1.6963276836158195e-05, "loss": 2.7093, "step": 2339 }, { "epoch": 1.9830508474576272, "grad_norm": 1.2458208799362183, "learning_rate": 1.694915254237288e-05, "loss": 2.6228, "step": 2340 }, { "epoch": 1.9838983050847459, "grad_norm": 0.9700512290000916, "learning_rate": 1.693502824858757e-05, "loss": 2.7374, "step": 2341 }, { "epoch": 1.9847457627118645, "grad_norm": 0.9352066516876221, "learning_rate": 1.692090395480226e-05, "loss": 3.0912, "step": 2342 }, { "epoch": 1.985593220338983, "grad_norm": 1.1421597003936768, "learning_rate": 1.690677966101695e-05, "loss": 2.6697, "step": 2343 }, { "epoch": 1.9864406779661017, "grad_norm": 1.1418193578720093, "learning_rate": 1.6892655367231638e-05, "loss": 2.8203, "step": 2344 }, { "epoch": 1.9872881355932204, "grad_norm": 1.2710899114608765, "learning_rate": 1.687853107344633e-05, "loss": 2.7604, "step": 2345 }, { "epoch": 1.9881355932203388, "grad_norm": 1.2632701396942139, "learning_rate": 1.6864406779661018e-05, "loss": 2.7564, "step": 2346 }, { "epoch": 1.9889830508474575, "grad_norm": 0.9911966323852539, "learning_rate": 1.6850282485875708e-05, "loss": 2.9501, "step": 2347 }, { "epoch": 1.9898305084745762, "grad_norm": 1.1990536451339722, "learning_rate": 1.6836158192090394e-05, "loss": 2.6537, "step": 2348 }, { "epoch": 1.990677966101695, "grad_norm": 1.1346081495285034, "learning_rate": 1.6822033898305088e-05, "loss": 2.7558, "step": 2349 }, { "epoch": 1.9915254237288136, "grad_norm": 1.1950312852859497, "learning_rate": 1.6807909604519774e-05, "loss": 2.6231, "step": 2350 }, { "epoch": 1.9923728813559323, "grad_norm": 1.1291497945785522, "learning_rate": 1.6793785310734464e-05, "loss": 2.6095, "step": 2351 }, { "epoch": 1.993220338983051, "grad_norm": 1.2912720441818237, "learning_rate": 1.677966101694915e-05, "loss": 2.5782, "step": 2352 }, { "epoch": 1.9940677966101696, "grad_norm": 1.1213643550872803, "learning_rate": 1.6765536723163844e-05, "loss": 2.6206, "step": 2353 }, { "epoch": 1.994915254237288, "grad_norm": 1.0405361652374268, "learning_rate": 1.675141242937853e-05, "loss": 2.8375, "step": 2354 }, { "epoch": 1.9957627118644068, "grad_norm": 1.1870661973953247, "learning_rate": 1.673728813559322e-05, "loss": 2.6768, "step": 2355 }, { "epoch": 1.9966101694915255, "grad_norm": 1.1844818592071533, "learning_rate": 1.672316384180791e-05, "loss": 2.7363, "step": 2356 }, { "epoch": 1.997457627118644, "grad_norm": 1.2449629306793213, "learning_rate": 1.67090395480226e-05, "loss": 2.7136, "step": 2357 }, { "epoch": 1.9983050847457626, "grad_norm": 0.9491821527481079, "learning_rate": 1.669491525423729e-05, "loss": 2.8445, "step": 2358 }, { "epoch": 1.9991525423728813, "grad_norm": 1.2417043447494507, "learning_rate": 1.6680790960451977e-05, "loss": 2.7803, "step": 2359 }, { "epoch": 2.0, "grad_norm": 1.399558663368225, "learning_rate": 1.6666666666666667e-05, "loss": 2.5864, "step": 2360 }, { "epoch": 2.0008474576271187, "grad_norm": 1.954841136932373, "learning_rate": 1.6652542372881357e-05, "loss": 2.4323, "step": 2361 }, { "epoch": 2.0016949152542374, "grad_norm": 1.9575591087341309, "learning_rate": 1.6638418079096047e-05, "loss": 2.2182, "step": 2362 }, { "epoch": 2.002542372881356, "grad_norm": 1.880661964416504, "learning_rate": 1.6624293785310734e-05, "loss": 2.3444, "step": 2363 }, { "epoch": 2.0033898305084747, "grad_norm": 2.620041847229004, "learning_rate": 1.6610169491525424e-05, "loss": 2.0165, "step": 2364 }, { "epoch": 2.0042372881355934, "grad_norm": 1.9008338451385498, "learning_rate": 1.6596045197740114e-05, "loss": 2.3014, "step": 2365 }, { "epoch": 2.0050847457627117, "grad_norm": 1.9222924709320068, "learning_rate": 1.6581920903954804e-05, "loss": 2.4128, "step": 2366 }, { "epoch": 2.0059322033898304, "grad_norm": 1.6511833667755127, "learning_rate": 1.656779661016949e-05, "loss": 2.3422, "step": 2367 }, { "epoch": 2.006779661016949, "grad_norm": 1.7319014072418213, "learning_rate": 1.655367231638418e-05, "loss": 2.0119, "step": 2368 }, { "epoch": 2.0076271186440677, "grad_norm": 1.5663323402404785, "learning_rate": 1.653954802259887e-05, "loss": 2.111, "step": 2369 }, { "epoch": 2.0084745762711864, "grad_norm": 2.0640721321105957, "learning_rate": 1.652542372881356e-05, "loss": 1.8947, "step": 2370 }, { "epoch": 2.009322033898305, "grad_norm": 1.609630823135376, "learning_rate": 1.651129943502825e-05, "loss": 2.1651, "step": 2371 }, { "epoch": 2.010169491525424, "grad_norm": 1.3497323989868164, "learning_rate": 1.6497175141242937e-05, "loss": 2.6179, "step": 2372 }, { "epoch": 2.0110169491525425, "grad_norm": 2.0327656269073486, "learning_rate": 1.648305084745763e-05, "loss": 1.9496, "step": 2373 }, { "epoch": 2.011864406779661, "grad_norm": 1.9299832582473755, "learning_rate": 1.6468926553672317e-05, "loss": 2.1082, "step": 2374 }, { "epoch": 2.01271186440678, "grad_norm": 2.3567018508911133, "learning_rate": 1.6454802259887007e-05, "loss": 1.81, "step": 2375 }, { "epoch": 2.013559322033898, "grad_norm": 1.9467577934265137, "learning_rate": 1.6440677966101694e-05, "loss": 2.2086, "step": 2376 }, { "epoch": 2.0144067796610168, "grad_norm": 2.413874864578247, "learning_rate": 1.6426553672316387e-05, "loss": 2.0452, "step": 2377 }, { "epoch": 2.0152542372881355, "grad_norm": 2.1449880599975586, "learning_rate": 1.6412429378531074e-05, "loss": 2.3787, "step": 2378 }, { "epoch": 2.016101694915254, "grad_norm": 2.0846853256225586, "learning_rate": 1.6398305084745764e-05, "loss": 1.9526, "step": 2379 }, { "epoch": 2.016949152542373, "grad_norm": 2.0826375484466553, "learning_rate": 1.638418079096045e-05, "loss": 2.0344, "step": 2380 }, { "epoch": 2.0177966101694915, "grad_norm": 2.189852237701416, "learning_rate": 1.6370056497175144e-05, "loss": 1.6708, "step": 2381 }, { "epoch": 2.01864406779661, "grad_norm": 1.6226427555084229, "learning_rate": 1.635593220338983e-05, "loss": 2.3535, "step": 2382 }, { "epoch": 2.019491525423729, "grad_norm": 1.3864531517028809, "learning_rate": 1.634180790960452e-05, "loss": 2.4154, "step": 2383 }, { "epoch": 2.0203389830508476, "grad_norm": 1.4276227951049805, "learning_rate": 1.6327683615819207e-05, "loss": 2.2698, "step": 2384 }, { "epoch": 2.0211864406779663, "grad_norm": 1.8648180961608887, "learning_rate": 1.63135593220339e-05, "loss": 1.7603, "step": 2385 }, { "epoch": 2.022033898305085, "grad_norm": 1.316807746887207, "learning_rate": 1.6299435028248587e-05, "loss": 2.3367, "step": 2386 }, { "epoch": 2.022881355932203, "grad_norm": 2.0323164463043213, "learning_rate": 1.6285310734463277e-05, "loss": 1.7487, "step": 2387 }, { "epoch": 2.023728813559322, "grad_norm": 1.241205096244812, "learning_rate": 1.6271186440677967e-05, "loss": 2.426, "step": 2388 }, { "epoch": 2.0245762711864406, "grad_norm": 1.3961604833602905, "learning_rate": 1.6257062146892657e-05, "loss": 2.1473, "step": 2389 }, { "epoch": 2.0254237288135593, "grad_norm": 1.5464637279510498, "learning_rate": 1.6242937853107347e-05, "loss": 2.1547, "step": 2390 }, { "epoch": 2.026271186440678, "grad_norm": 1.2724565267562866, "learning_rate": 1.6228813559322034e-05, "loss": 2.3671, "step": 2391 }, { "epoch": 2.0271186440677966, "grad_norm": 1.15665602684021, "learning_rate": 1.6214689265536724e-05, "loss": 2.3736, "step": 2392 }, { "epoch": 2.0279661016949153, "grad_norm": 1.5079132318496704, "learning_rate": 1.6200564971751414e-05, "loss": 2.0668, "step": 2393 }, { "epoch": 2.028813559322034, "grad_norm": 1.650124430656433, "learning_rate": 1.6186440677966104e-05, "loss": 1.9998, "step": 2394 }, { "epoch": 2.0296610169491527, "grad_norm": 1.4562724828720093, "learning_rate": 1.617231638418079e-05, "loss": 2.3713, "step": 2395 }, { "epoch": 2.0305084745762714, "grad_norm": 1.5359783172607422, "learning_rate": 1.615819209039548e-05, "loss": 2.2703, "step": 2396 }, { "epoch": 2.0313559322033896, "grad_norm": 1.863041877746582, "learning_rate": 1.614406779661017e-05, "loss": 2.0951, "step": 2397 }, { "epoch": 2.0322033898305083, "grad_norm": 1.913000226020813, "learning_rate": 1.612994350282486e-05, "loss": 1.8643, "step": 2398 }, { "epoch": 2.033050847457627, "grad_norm": 1.6459336280822754, "learning_rate": 1.6115819209039547e-05, "loss": 2.2146, "step": 2399 }, { "epoch": 2.0338983050847457, "grad_norm": 1.598587989807129, "learning_rate": 1.6101694915254237e-05, "loss": 2.2268, "step": 2400 }, { "epoch": 2.0347457627118644, "grad_norm": 1.9659417867660522, "learning_rate": 1.6087570621468927e-05, "loss": 2.0437, "step": 2401 }, { "epoch": 2.035593220338983, "grad_norm": 1.3990250825881958, "learning_rate": 1.6073446327683617e-05, "loss": 2.1897, "step": 2402 }, { "epoch": 2.0364406779661017, "grad_norm": 1.0709125995635986, "learning_rate": 1.6059322033898307e-05, "loss": 2.4127, "step": 2403 }, { "epoch": 2.0372881355932204, "grad_norm": 1.5688316822052002, "learning_rate": 1.6045197740112993e-05, "loss": 2.1076, "step": 2404 }, { "epoch": 2.038135593220339, "grad_norm": 1.3257402181625366, "learning_rate": 1.6031073446327687e-05, "loss": 2.3743, "step": 2405 }, { "epoch": 2.038983050847458, "grad_norm": 1.7270565032958984, "learning_rate": 1.6016949152542373e-05, "loss": 2.0426, "step": 2406 }, { "epoch": 2.0398305084745765, "grad_norm": 1.4624197483062744, "learning_rate": 1.6002824858757063e-05, "loss": 2.0831, "step": 2407 }, { "epoch": 2.0406779661016947, "grad_norm": 2.149177312850952, "learning_rate": 1.598870056497175e-05, "loss": 1.8567, "step": 2408 }, { "epoch": 2.0415254237288134, "grad_norm": 1.4796191453933716, "learning_rate": 1.5974576271186443e-05, "loss": 2.3493, "step": 2409 }, { "epoch": 2.042372881355932, "grad_norm": 1.7318049669265747, "learning_rate": 1.596045197740113e-05, "loss": 2.0822, "step": 2410 }, { "epoch": 2.0432203389830508, "grad_norm": 1.522325038909912, "learning_rate": 1.594632768361582e-05, "loss": 2.1485, "step": 2411 }, { "epoch": 2.0440677966101695, "grad_norm": 1.7488744258880615, "learning_rate": 1.5932203389830507e-05, "loss": 2.0617, "step": 2412 }, { "epoch": 2.044915254237288, "grad_norm": 1.9676945209503174, "learning_rate": 1.59180790960452e-05, "loss": 1.8596, "step": 2413 }, { "epoch": 2.045762711864407, "grad_norm": 1.7921208143234253, "learning_rate": 1.5903954802259886e-05, "loss": 2.1513, "step": 2414 }, { "epoch": 2.0466101694915255, "grad_norm": 1.465053915977478, "learning_rate": 1.5889830508474576e-05, "loss": 2.0812, "step": 2415 }, { "epoch": 2.047457627118644, "grad_norm": 2.2215895652770996, "learning_rate": 1.5875706214689266e-05, "loss": 2.2899, "step": 2416 }, { "epoch": 2.048305084745763, "grad_norm": 1.323302984237671, "learning_rate": 1.5861581920903956e-05, "loss": 2.3655, "step": 2417 }, { "epoch": 2.049152542372881, "grad_norm": 1.572412371635437, "learning_rate": 1.5847457627118646e-05, "loss": 2.0031, "step": 2418 }, { "epoch": 2.05, "grad_norm": 1.5636740922927856, "learning_rate": 1.5833333333333333e-05, "loss": 2.2966, "step": 2419 }, { "epoch": 2.0508474576271185, "grad_norm": 1.3185337781906128, "learning_rate": 1.5819209039548023e-05, "loss": 2.2967, "step": 2420 }, { "epoch": 2.051694915254237, "grad_norm": 1.592160940170288, "learning_rate": 1.5805084745762713e-05, "loss": 2.2037, "step": 2421 }, { "epoch": 2.052542372881356, "grad_norm": 2.0628747940063477, "learning_rate": 1.5790960451977403e-05, "loss": 1.652, "step": 2422 }, { "epoch": 2.0533898305084746, "grad_norm": 2.0035552978515625, "learning_rate": 1.577683615819209e-05, "loss": 2.0703, "step": 2423 }, { "epoch": 2.0542372881355933, "grad_norm": 1.6242327690124512, "learning_rate": 1.5762711864406783e-05, "loss": 2.1453, "step": 2424 }, { "epoch": 2.055084745762712, "grad_norm": 1.9367742538452148, "learning_rate": 1.574858757062147e-05, "loss": 2.0133, "step": 2425 }, { "epoch": 2.0559322033898306, "grad_norm": 1.5298945903778076, "learning_rate": 1.573446327683616e-05, "loss": 2.0661, "step": 2426 }, { "epoch": 2.0567796610169493, "grad_norm": 1.9443947076797485, "learning_rate": 1.5720338983050846e-05, "loss": 1.7775, "step": 2427 }, { "epoch": 2.057627118644068, "grad_norm": 1.8264436721801758, "learning_rate": 1.570621468926554e-05, "loss": 1.694, "step": 2428 }, { "epoch": 2.0584745762711862, "grad_norm": 1.9748023748397827, "learning_rate": 1.5692090395480226e-05, "loss": 1.6182, "step": 2429 }, { "epoch": 2.059322033898305, "grad_norm": 1.5430885553359985, "learning_rate": 1.5677966101694916e-05, "loss": 2.3806, "step": 2430 }, { "epoch": 2.0601694915254236, "grad_norm": 1.8126107454299927, "learning_rate": 1.5663841807909603e-05, "loss": 1.9787, "step": 2431 }, { "epoch": 2.0610169491525423, "grad_norm": 2.0328750610351562, "learning_rate": 1.5649717514124296e-05, "loss": 1.864, "step": 2432 }, { "epoch": 2.061864406779661, "grad_norm": 1.8950706720352173, "learning_rate": 1.5635593220338983e-05, "loss": 1.7348, "step": 2433 }, { "epoch": 2.0627118644067797, "grad_norm": 1.7175761461257935, "learning_rate": 1.5621468926553673e-05, "loss": 2.2092, "step": 2434 }, { "epoch": 2.0635593220338984, "grad_norm": 1.7328435182571411, "learning_rate": 1.5607344632768363e-05, "loss": 1.9453, "step": 2435 }, { "epoch": 2.064406779661017, "grad_norm": 1.4919472932815552, "learning_rate": 1.5593220338983053e-05, "loss": 2.2208, "step": 2436 }, { "epoch": 2.0652542372881357, "grad_norm": 1.7060024738311768, "learning_rate": 1.5579096045197743e-05, "loss": 2.1301, "step": 2437 }, { "epoch": 2.0661016949152544, "grad_norm": 1.7842141389846802, "learning_rate": 1.556497175141243e-05, "loss": 2.0364, "step": 2438 }, { "epoch": 2.0669491525423727, "grad_norm": 1.7225085496902466, "learning_rate": 1.555084745762712e-05, "loss": 2.2352, "step": 2439 }, { "epoch": 2.0677966101694913, "grad_norm": 1.8794629573822021, "learning_rate": 1.553672316384181e-05, "loss": 2.077, "step": 2440 }, { "epoch": 2.06864406779661, "grad_norm": 2.137139320373535, "learning_rate": 1.55225988700565e-05, "loss": 1.9319, "step": 2441 }, { "epoch": 2.0694915254237287, "grad_norm": 1.8608564138412476, "learning_rate": 1.5508474576271186e-05, "loss": 1.8859, "step": 2442 }, { "epoch": 2.0703389830508474, "grad_norm": 1.515536904335022, "learning_rate": 1.5494350282485876e-05, "loss": 2.2575, "step": 2443 }, { "epoch": 2.071186440677966, "grad_norm": 1.615857481956482, "learning_rate": 1.5480225988700566e-05, "loss": 2.242, "step": 2444 }, { "epoch": 2.0720338983050848, "grad_norm": 1.7035619020462036, "learning_rate": 1.5466101694915256e-05, "loss": 2.0534, "step": 2445 }, { "epoch": 2.0728813559322035, "grad_norm": 1.9742575883865356, "learning_rate": 1.5451977401129943e-05, "loss": 1.8712, "step": 2446 }, { "epoch": 2.073728813559322, "grad_norm": 1.5173736810684204, "learning_rate": 1.5437853107344633e-05, "loss": 2.2006, "step": 2447 }, { "epoch": 2.074576271186441, "grad_norm": 1.8541594743728638, "learning_rate": 1.5423728813559323e-05, "loss": 2.1028, "step": 2448 }, { "epoch": 2.0754237288135595, "grad_norm": 1.481216549873352, "learning_rate": 1.5409604519774013e-05, "loss": 2.3778, "step": 2449 }, { "epoch": 2.0762711864406778, "grad_norm": 1.506359577178955, "learning_rate": 1.5395480225988703e-05, "loss": 2.2367, "step": 2450 }, { "epoch": 2.0771186440677964, "grad_norm": 1.9489935636520386, "learning_rate": 1.538135593220339e-05, "loss": 2.0066, "step": 2451 }, { "epoch": 2.077966101694915, "grad_norm": 1.7360435724258423, "learning_rate": 1.5367231638418083e-05, "loss": 2.1559, "step": 2452 }, { "epoch": 2.078813559322034, "grad_norm": 1.4112789630889893, "learning_rate": 1.535310734463277e-05, "loss": 2.2551, "step": 2453 }, { "epoch": 2.0796610169491525, "grad_norm": 1.9627602100372314, "learning_rate": 1.533898305084746e-05, "loss": 2.0402, "step": 2454 }, { "epoch": 2.080508474576271, "grad_norm": 1.8383172750473022, "learning_rate": 1.5324858757062146e-05, "loss": 2.1307, "step": 2455 }, { "epoch": 2.08135593220339, "grad_norm": 1.5599380731582642, "learning_rate": 1.531073446327684e-05, "loss": 2.3239, "step": 2456 }, { "epoch": 2.0822033898305086, "grad_norm": 2.331843852996826, "learning_rate": 1.5296610169491526e-05, "loss": 1.7808, "step": 2457 }, { "epoch": 2.0830508474576273, "grad_norm": 2.7220191955566406, "learning_rate": 1.5282485875706216e-05, "loss": 1.7593, "step": 2458 }, { "epoch": 2.083898305084746, "grad_norm": 1.6436541080474854, "learning_rate": 1.5268361581920902e-05, "loss": 2.1357, "step": 2459 }, { "epoch": 2.084745762711864, "grad_norm": 1.0616300106048584, "learning_rate": 1.5254237288135596e-05, "loss": 2.6704, "step": 2460 }, { "epoch": 2.085593220338983, "grad_norm": 1.5112767219543457, "learning_rate": 1.5240112994350284e-05, "loss": 2.3113, "step": 2461 }, { "epoch": 2.0864406779661016, "grad_norm": 1.6891629695892334, "learning_rate": 1.5225988700564972e-05, "loss": 1.9751, "step": 2462 }, { "epoch": 2.0872881355932202, "grad_norm": 1.5620174407958984, "learning_rate": 1.521186440677966e-05, "loss": 2.2542, "step": 2463 }, { "epoch": 2.088135593220339, "grad_norm": 1.851977825164795, "learning_rate": 1.5197740112994352e-05, "loss": 1.9648, "step": 2464 }, { "epoch": 2.0889830508474576, "grad_norm": 1.4661450386047363, "learning_rate": 1.518361581920904e-05, "loss": 2.1972, "step": 2465 }, { "epoch": 2.0898305084745763, "grad_norm": 1.6566978693008423, "learning_rate": 1.5169491525423729e-05, "loss": 2.2925, "step": 2466 }, { "epoch": 2.090677966101695, "grad_norm": 1.2233691215515137, "learning_rate": 1.5155367231638417e-05, "loss": 2.4515, "step": 2467 }, { "epoch": 2.0915254237288137, "grad_norm": 2.050008773803711, "learning_rate": 1.5141242937853109e-05, "loss": 1.6345, "step": 2468 }, { "epoch": 2.0923728813559324, "grad_norm": 1.3644006252288818, "learning_rate": 1.5127118644067797e-05, "loss": 2.2742, "step": 2469 }, { "epoch": 2.093220338983051, "grad_norm": 1.8843778371810913, "learning_rate": 1.5112994350282486e-05, "loss": 2.1363, "step": 2470 }, { "epoch": 2.0940677966101693, "grad_norm": 1.8734453916549683, "learning_rate": 1.5098870056497176e-05, "loss": 1.9785, "step": 2471 }, { "epoch": 2.094915254237288, "grad_norm": 1.614042043685913, "learning_rate": 1.5084745762711865e-05, "loss": 2.2247, "step": 2472 }, { "epoch": 2.0957627118644067, "grad_norm": 1.8866091966629028, "learning_rate": 1.5070621468926554e-05, "loss": 2.0338, "step": 2473 }, { "epoch": 2.0966101694915253, "grad_norm": 1.4197208881378174, "learning_rate": 1.5056497175141244e-05, "loss": 2.3567, "step": 2474 }, { "epoch": 2.097457627118644, "grad_norm": 1.4004616737365723, "learning_rate": 1.5042372881355932e-05, "loss": 2.2448, "step": 2475 }, { "epoch": 2.0983050847457627, "grad_norm": 1.7236913442611694, "learning_rate": 1.5028248587570624e-05, "loss": 1.9488, "step": 2476 }, { "epoch": 2.0991525423728814, "grad_norm": 1.5777957439422607, "learning_rate": 1.5014124293785312e-05, "loss": 2.1542, "step": 2477 }, { "epoch": 2.1, "grad_norm": 1.8451133966445923, "learning_rate": 1.5e-05, "loss": 2.1358, "step": 2478 }, { "epoch": 2.1008474576271188, "grad_norm": 3.029209852218628, "learning_rate": 1.4985875706214689e-05, "loss": 2.0149, "step": 2479 }, { "epoch": 2.1016949152542375, "grad_norm": 1.345157265663147, "learning_rate": 1.497175141242938e-05, "loss": 2.4342, "step": 2480 }, { "epoch": 2.1025423728813557, "grad_norm": 1.496124267578125, "learning_rate": 1.4957627118644069e-05, "loss": 2.1712, "step": 2481 }, { "epoch": 2.1033898305084744, "grad_norm": 1.9682401418685913, "learning_rate": 1.4943502824858757e-05, "loss": 1.7816, "step": 2482 }, { "epoch": 2.104237288135593, "grad_norm": 1.799791693687439, "learning_rate": 1.4929378531073445e-05, "loss": 1.8135, "step": 2483 }, { "epoch": 2.1050847457627118, "grad_norm": 1.8387713432312012, "learning_rate": 1.4915254237288137e-05, "loss": 1.9054, "step": 2484 }, { "epoch": 2.1059322033898304, "grad_norm": 1.6758382320404053, "learning_rate": 1.4901129943502825e-05, "loss": 2.3163, "step": 2485 }, { "epoch": 2.106779661016949, "grad_norm": 1.6890454292297363, "learning_rate": 1.4887005649717514e-05, "loss": 2.1173, "step": 2486 }, { "epoch": 2.107627118644068, "grad_norm": 1.4125128984451294, "learning_rate": 1.4872881355932204e-05, "loss": 2.2514, "step": 2487 }, { "epoch": 2.1084745762711865, "grad_norm": 1.3438292741775513, "learning_rate": 1.4858757062146894e-05, "loss": 2.4917, "step": 2488 }, { "epoch": 2.109322033898305, "grad_norm": 2.0984585285186768, "learning_rate": 1.4844632768361582e-05, "loss": 1.6638, "step": 2489 }, { "epoch": 2.110169491525424, "grad_norm": 1.9447909593582153, "learning_rate": 1.4830508474576272e-05, "loss": 1.9094, "step": 2490 }, { "epoch": 2.1110169491525426, "grad_norm": 1.397506833076477, "learning_rate": 1.481638418079096e-05, "loss": 2.385, "step": 2491 }, { "epoch": 2.111864406779661, "grad_norm": 1.4305623769760132, "learning_rate": 1.4802259887005652e-05, "loss": 2.5347, "step": 2492 }, { "epoch": 2.1127118644067795, "grad_norm": 1.444412350654602, "learning_rate": 1.478813559322034e-05, "loss": 2.2904, "step": 2493 }, { "epoch": 2.113559322033898, "grad_norm": 1.458781361579895, "learning_rate": 1.4774011299435028e-05, "loss": 2.3122, "step": 2494 }, { "epoch": 2.114406779661017, "grad_norm": 1.4855881929397583, "learning_rate": 1.475988700564972e-05, "loss": 2.3321, "step": 2495 }, { "epoch": 2.1152542372881356, "grad_norm": 1.5247066020965576, "learning_rate": 1.4745762711864408e-05, "loss": 2.1949, "step": 2496 }, { "epoch": 2.1161016949152542, "grad_norm": 1.168670415878296, "learning_rate": 1.4731638418079097e-05, "loss": 2.6419, "step": 2497 }, { "epoch": 2.116949152542373, "grad_norm": 1.9894770383834839, "learning_rate": 1.4717514124293785e-05, "loss": 1.7941, "step": 2498 }, { "epoch": 2.1177966101694916, "grad_norm": 1.8084489107131958, "learning_rate": 1.4703389830508477e-05, "loss": 1.9211, "step": 2499 }, { "epoch": 2.1186440677966103, "grad_norm": 1.9926161766052246, "learning_rate": 1.4689265536723165e-05, "loss": 1.9123, "step": 2500 }, { "epoch": 2.119491525423729, "grad_norm": 1.8883634805679321, "learning_rate": 1.4675141242937853e-05, "loss": 1.905, "step": 2501 }, { "epoch": 2.1203389830508477, "grad_norm": 2.07058048248291, "learning_rate": 1.4661016949152542e-05, "loss": 1.9945, "step": 2502 }, { "epoch": 2.121186440677966, "grad_norm": 1.820389747619629, "learning_rate": 1.4646892655367233e-05, "loss": 2.0952, "step": 2503 }, { "epoch": 2.1220338983050846, "grad_norm": 1.6159000396728516, "learning_rate": 1.4632768361581922e-05, "loss": 2.0177, "step": 2504 }, { "epoch": 2.1228813559322033, "grad_norm": 1.8930219411849976, "learning_rate": 1.4618644067796612e-05, "loss": 2.0866, "step": 2505 }, { "epoch": 2.123728813559322, "grad_norm": 1.8374148607254028, "learning_rate": 1.46045197740113e-05, "loss": 1.938, "step": 2506 }, { "epoch": 2.1245762711864407, "grad_norm": 1.598009467124939, "learning_rate": 1.459039548022599e-05, "loss": 2.299, "step": 2507 }, { "epoch": 2.1254237288135593, "grad_norm": 1.8127394914627075, "learning_rate": 1.457627118644068e-05, "loss": 2.1198, "step": 2508 }, { "epoch": 2.126271186440678, "grad_norm": 1.3069536685943604, "learning_rate": 1.4562146892655368e-05, "loss": 2.4244, "step": 2509 }, { "epoch": 2.1271186440677967, "grad_norm": 1.4885691404342651, "learning_rate": 1.4548022598870056e-05, "loss": 2.1462, "step": 2510 }, { "epoch": 2.1279661016949154, "grad_norm": 1.642301321029663, "learning_rate": 1.4533898305084748e-05, "loss": 2.1435, "step": 2511 }, { "epoch": 2.128813559322034, "grad_norm": 1.5486087799072266, "learning_rate": 1.4519774011299436e-05, "loss": 2.2388, "step": 2512 }, { "epoch": 2.1296610169491528, "grad_norm": 1.9132736921310425, "learning_rate": 1.4505649717514125e-05, "loss": 1.9723, "step": 2513 }, { "epoch": 2.130508474576271, "grad_norm": 1.6700730323791504, "learning_rate": 1.4491525423728813e-05, "loss": 2.1479, "step": 2514 }, { "epoch": 2.1313559322033897, "grad_norm": 1.5021833181381226, "learning_rate": 1.4477401129943505e-05, "loss": 2.3397, "step": 2515 }, { "epoch": 2.1322033898305084, "grad_norm": 2.378293991088867, "learning_rate": 1.4463276836158193e-05, "loss": 2.3639, "step": 2516 }, { "epoch": 2.133050847457627, "grad_norm": 1.5428694486618042, "learning_rate": 1.4449152542372881e-05, "loss": 2.2182, "step": 2517 }, { "epoch": 2.1338983050847458, "grad_norm": 1.633458137512207, "learning_rate": 1.443502824858757e-05, "loss": 2.3229, "step": 2518 }, { "epoch": 2.1347457627118644, "grad_norm": 1.5148998498916626, "learning_rate": 1.4420903954802261e-05, "loss": 2.2358, "step": 2519 }, { "epoch": 2.135593220338983, "grad_norm": 1.5872907638549805, "learning_rate": 1.440677966101695e-05, "loss": 2.2053, "step": 2520 }, { "epoch": 2.136440677966102, "grad_norm": 1.6889498233795166, "learning_rate": 1.439265536723164e-05, "loss": 2.2384, "step": 2521 }, { "epoch": 2.1372881355932205, "grad_norm": 1.6237796545028687, "learning_rate": 1.4378531073446328e-05, "loss": 2.2532, "step": 2522 }, { "epoch": 2.1381355932203387, "grad_norm": 1.9023722410202026, "learning_rate": 1.4364406779661018e-05, "loss": 1.9404, "step": 2523 }, { "epoch": 2.1389830508474574, "grad_norm": 1.763069987297058, "learning_rate": 1.4350282485875708e-05, "loss": 2.0033, "step": 2524 }, { "epoch": 2.139830508474576, "grad_norm": 1.5526543855667114, "learning_rate": 1.4336158192090396e-05, "loss": 2.0892, "step": 2525 }, { "epoch": 2.140677966101695, "grad_norm": 1.6335585117340088, "learning_rate": 1.4322033898305085e-05, "loss": 1.9705, "step": 2526 }, { "epoch": 2.1415254237288135, "grad_norm": 1.6628258228302002, "learning_rate": 1.4307909604519776e-05, "loss": 2.1448, "step": 2527 }, { "epoch": 2.142372881355932, "grad_norm": 1.4387049674987793, "learning_rate": 1.4293785310734465e-05, "loss": 2.2944, "step": 2528 }, { "epoch": 2.143220338983051, "grad_norm": 1.481052279472351, "learning_rate": 1.4279661016949153e-05, "loss": 2.2219, "step": 2529 }, { "epoch": 2.1440677966101696, "grad_norm": 1.803296446800232, "learning_rate": 1.4265536723163841e-05, "loss": 2.1046, "step": 2530 }, { "epoch": 2.1449152542372882, "grad_norm": 1.8609411716461182, "learning_rate": 1.4251412429378533e-05, "loss": 2.0763, "step": 2531 }, { "epoch": 2.145762711864407, "grad_norm": 1.9128785133361816, "learning_rate": 1.4237288135593221e-05, "loss": 1.8958, "step": 2532 }, { "epoch": 2.1466101694915256, "grad_norm": 1.9034333229064941, "learning_rate": 1.422316384180791e-05, "loss": 1.8461, "step": 2533 }, { "epoch": 2.1474576271186443, "grad_norm": 2.5008606910705566, "learning_rate": 1.4209039548022598e-05, "loss": 1.91, "step": 2534 }, { "epoch": 2.1483050847457625, "grad_norm": 1.623769760131836, "learning_rate": 1.419491525423729e-05, "loss": 2.2586, "step": 2535 }, { "epoch": 2.1491525423728812, "grad_norm": 2.1108202934265137, "learning_rate": 1.4180790960451978e-05, "loss": 1.9027, "step": 2536 }, { "epoch": 2.15, "grad_norm": 1.9891767501831055, "learning_rate": 1.4166666666666668e-05, "loss": 1.9846, "step": 2537 }, { "epoch": 2.1508474576271186, "grad_norm": 2.1394753456115723, "learning_rate": 1.4152542372881356e-05, "loss": 1.9395, "step": 2538 }, { "epoch": 2.1516949152542373, "grad_norm": 1.662566900253296, "learning_rate": 1.4138418079096048e-05, "loss": 2.3018, "step": 2539 }, { "epoch": 2.152542372881356, "grad_norm": 2.159883737564087, "learning_rate": 1.4124293785310736e-05, "loss": 1.9256, "step": 2540 }, { "epoch": 2.1533898305084747, "grad_norm": 2.0187926292419434, "learning_rate": 1.4110169491525424e-05, "loss": 1.7855, "step": 2541 }, { "epoch": 2.1542372881355933, "grad_norm": 1.3911402225494385, "learning_rate": 1.4096045197740113e-05, "loss": 2.3972, "step": 2542 }, { "epoch": 2.155084745762712, "grad_norm": 2.866255283355713, "learning_rate": 1.4081920903954804e-05, "loss": 2.0278, "step": 2543 }, { "epoch": 2.1559322033898307, "grad_norm": 1.668233036994934, "learning_rate": 1.4067796610169493e-05, "loss": 2.1187, "step": 2544 }, { "epoch": 2.156779661016949, "grad_norm": 1.8213480710983276, "learning_rate": 1.4053672316384181e-05, "loss": 1.9344, "step": 2545 }, { "epoch": 2.1576271186440676, "grad_norm": 1.5447547435760498, "learning_rate": 1.403954802259887e-05, "loss": 2.2768, "step": 2546 }, { "epoch": 2.1584745762711863, "grad_norm": 1.3066322803497314, "learning_rate": 1.4025423728813561e-05, "loss": 2.427, "step": 2547 }, { "epoch": 2.159322033898305, "grad_norm": 1.8616938591003418, "learning_rate": 1.4011299435028249e-05, "loss": 1.9183, "step": 2548 }, { "epoch": 2.1601694915254237, "grad_norm": 1.8417625427246094, "learning_rate": 1.3997175141242937e-05, "loss": 1.982, "step": 2549 }, { "epoch": 2.1610169491525424, "grad_norm": 1.9708733558654785, "learning_rate": 1.3983050847457627e-05, "loss": 1.935, "step": 2550 }, { "epoch": 2.161864406779661, "grad_norm": 1.737199068069458, "learning_rate": 1.3968926553672317e-05, "loss": 2.0792, "step": 2551 }, { "epoch": 2.1627118644067798, "grad_norm": 1.598211646080017, "learning_rate": 1.3954802259887006e-05, "loss": 2.2252, "step": 2552 }, { "epoch": 2.1635593220338984, "grad_norm": 1.5978015661239624, "learning_rate": 1.3940677966101696e-05, "loss": 2.2673, "step": 2553 }, { "epoch": 2.164406779661017, "grad_norm": 1.3255168199539185, "learning_rate": 1.3926553672316384e-05, "loss": 2.3453, "step": 2554 }, { "epoch": 2.165254237288136, "grad_norm": 1.6760737895965576, "learning_rate": 1.3912429378531076e-05, "loss": 2.2258, "step": 2555 }, { "epoch": 2.166101694915254, "grad_norm": 1.4257606267929077, "learning_rate": 1.3898305084745764e-05, "loss": 2.2405, "step": 2556 }, { "epoch": 2.1669491525423727, "grad_norm": 1.9469870328903198, "learning_rate": 1.3884180790960452e-05, "loss": 1.8691, "step": 2557 }, { "epoch": 2.1677966101694914, "grad_norm": 1.497591495513916, "learning_rate": 1.387005649717514e-05, "loss": 2.2402, "step": 2558 }, { "epoch": 2.16864406779661, "grad_norm": 1.9147684574127197, "learning_rate": 1.3855932203389832e-05, "loss": 1.7802, "step": 2559 }, { "epoch": 2.169491525423729, "grad_norm": 1.671188235282898, "learning_rate": 1.384180790960452e-05, "loss": 2.2023, "step": 2560 }, { "epoch": 2.1703389830508475, "grad_norm": 2.085939407348633, "learning_rate": 1.3827683615819209e-05, "loss": 1.6894, "step": 2561 }, { "epoch": 2.171186440677966, "grad_norm": 2.1118597984313965, "learning_rate": 1.3813559322033897e-05, "loss": 1.8384, "step": 2562 }, { "epoch": 2.172033898305085, "grad_norm": 1.3560526371002197, "learning_rate": 1.3799435028248589e-05, "loss": 2.4099, "step": 2563 }, { "epoch": 2.1728813559322036, "grad_norm": 1.557410478591919, "learning_rate": 1.3785310734463277e-05, "loss": 2.3363, "step": 2564 }, { "epoch": 2.1737288135593222, "grad_norm": 2.623718738555908, "learning_rate": 1.3771186440677965e-05, "loss": 1.5711, "step": 2565 }, { "epoch": 2.1745762711864405, "grad_norm": 1.4253687858581543, "learning_rate": 1.3757062146892655e-05, "loss": 2.3093, "step": 2566 }, { "epoch": 2.175423728813559, "grad_norm": 1.5429052114486694, "learning_rate": 1.3742937853107345e-05, "loss": 2.3963, "step": 2567 }, { "epoch": 2.176271186440678, "grad_norm": 1.3952724933624268, "learning_rate": 1.3728813559322034e-05, "loss": 2.3189, "step": 2568 }, { "epoch": 2.1771186440677965, "grad_norm": 2.0680289268493652, "learning_rate": 1.3714689265536724e-05, "loss": 1.8086, "step": 2569 }, { "epoch": 2.1779661016949152, "grad_norm": 2.042335271835327, "learning_rate": 1.3700564971751412e-05, "loss": 1.8987, "step": 2570 }, { "epoch": 2.178813559322034, "grad_norm": 1.3315010070800781, "learning_rate": 1.3686440677966104e-05, "loss": 2.3551, "step": 2571 }, { "epoch": 2.1796610169491526, "grad_norm": 1.4842523336410522, "learning_rate": 1.3672316384180792e-05, "loss": 2.42, "step": 2572 }, { "epoch": 2.1805084745762713, "grad_norm": 2.355437994003296, "learning_rate": 1.365819209039548e-05, "loss": 1.3587, "step": 2573 }, { "epoch": 2.18135593220339, "grad_norm": 1.5268384218215942, "learning_rate": 1.3644067796610169e-05, "loss": 2.135, "step": 2574 }, { "epoch": 2.1822033898305087, "grad_norm": 1.2764055728912354, "learning_rate": 1.362994350282486e-05, "loss": 2.2816, "step": 2575 }, { "epoch": 2.1830508474576273, "grad_norm": 1.74439537525177, "learning_rate": 1.3615819209039549e-05, "loss": 2.0551, "step": 2576 }, { "epoch": 2.1838983050847456, "grad_norm": 1.5508873462677002, "learning_rate": 1.3601694915254237e-05, "loss": 2.0705, "step": 2577 }, { "epoch": 2.1847457627118643, "grad_norm": 1.7426412105560303, "learning_rate": 1.3587570621468929e-05, "loss": 2.1942, "step": 2578 }, { "epoch": 2.185593220338983, "grad_norm": 1.6260688304901123, "learning_rate": 1.3573446327683617e-05, "loss": 2.1463, "step": 2579 }, { "epoch": 2.1864406779661016, "grad_norm": 2.0990731716156006, "learning_rate": 1.3559322033898305e-05, "loss": 1.9374, "step": 2580 }, { "epoch": 2.1872881355932203, "grad_norm": 1.7750979661941528, "learning_rate": 1.3545197740112994e-05, "loss": 2.0562, "step": 2581 }, { "epoch": 2.188135593220339, "grad_norm": 1.866346001625061, "learning_rate": 1.3531073446327685e-05, "loss": 2.0816, "step": 2582 }, { "epoch": 2.1889830508474577, "grad_norm": 1.5054261684417725, "learning_rate": 1.3516949152542374e-05, "loss": 2.4755, "step": 2583 }, { "epoch": 2.1898305084745764, "grad_norm": 1.2178317308425903, "learning_rate": 1.3502824858757064e-05, "loss": 2.6567, "step": 2584 }, { "epoch": 2.190677966101695, "grad_norm": 1.5665068626403809, "learning_rate": 1.3488700564971752e-05, "loss": 2.2267, "step": 2585 }, { "epoch": 2.1915254237288138, "grad_norm": 2.063912868499756, "learning_rate": 1.3474576271186442e-05, "loss": 1.9713, "step": 2586 }, { "epoch": 2.192372881355932, "grad_norm": 1.6485923528671265, "learning_rate": 1.3460451977401132e-05, "loss": 2.1568, "step": 2587 }, { "epoch": 2.1932203389830507, "grad_norm": 1.222883939743042, "learning_rate": 1.344632768361582e-05, "loss": 2.5162, "step": 2588 }, { "epoch": 2.1940677966101694, "grad_norm": 1.8396320343017578, "learning_rate": 1.3432203389830508e-05, "loss": 1.8913, "step": 2589 }, { "epoch": 2.194915254237288, "grad_norm": 1.6970138549804688, "learning_rate": 1.34180790960452e-05, "loss": 1.767, "step": 2590 }, { "epoch": 2.1957627118644067, "grad_norm": 1.7239935398101807, "learning_rate": 1.3403954802259888e-05, "loss": 1.7974, "step": 2591 }, { "epoch": 2.1966101694915254, "grad_norm": 1.249467134475708, "learning_rate": 1.3389830508474577e-05, "loss": 2.3644, "step": 2592 }, { "epoch": 2.197457627118644, "grad_norm": 1.3214976787567139, "learning_rate": 1.3375706214689265e-05, "loss": 2.3804, "step": 2593 }, { "epoch": 2.198305084745763, "grad_norm": 1.844478964805603, "learning_rate": 1.3361581920903957e-05, "loss": 2.0592, "step": 2594 }, { "epoch": 2.1991525423728815, "grad_norm": 1.8242712020874023, "learning_rate": 1.3347457627118645e-05, "loss": 1.9523, "step": 2595 }, { "epoch": 2.2, "grad_norm": 1.3069279193878174, "learning_rate": 1.3333333333333333e-05, "loss": 2.3387, "step": 2596 }, { "epoch": 2.200847457627119, "grad_norm": 1.7718732357025146, "learning_rate": 1.3319209039548022e-05, "loss": 2.3433, "step": 2597 }, { "epoch": 2.201694915254237, "grad_norm": 1.8796429634094238, "learning_rate": 1.3305084745762713e-05, "loss": 1.9381, "step": 2598 }, { "epoch": 2.202542372881356, "grad_norm": 1.8052655458450317, "learning_rate": 1.3290960451977402e-05, "loss": 2.1447, "step": 2599 }, { "epoch": 2.2033898305084745, "grad_norm": 2.3024721145629883, "learning_rate": 1.3276836158192092e-05, "loss": 1.614, "step": 2600 }, { "epoch": 2.204237288135593, "grad_norm": 1.120464563369751, "learning_rate": 1.326271186440678e-05, "loss": 2.7688, "step": 2601 }, { "epoch": 2.205084745762712, "grad_norm": 1.4297536611557007, "learning_rate": 1.324858757062147e-05, "loss": 2.2969, "step": 2602 }, { "epoch": 2.2059322033898305, "grad_norm": 1.6033844947814941, "learning_rate": 1.323446327683616e-05, "loss": 2.3372, "step": 2603 }, { "epoch": 2.2067796610169492, "grad_norm": 1.608870506286621, "learning_rate": 1.3220338983050848e-05, "loss": 2.1432, "step": 2604 }, { "epoch": 2.207627118644068, "grad_norm": 1.7524429559707642, "learning_rate": 1.3206214689265536e-05, "loss": 1.9324, "step": 2605 }, { "epoch": 2.2084745762711866, "grad_norm": 1.6078535318374634, "learning_rate": 1.3192090395480228e-05, "loss": 2.2075, "step": 2606 }, { "epoch": 2.2093220338983053, "grad_norm": 2.2441558837890625, "learning_rate": 1.3177966101694916e-05, "loss": 1.624, "step": 2607 }, { "epoch": 2.2101694915254235, "grad_norm": 1.5701043605804443, "learning_rate": 1.3163841807909605e-05, "loss": 2.3632, "step": 2608 }, { "epoch": 2.211016949152542, "grad_norm": 2.2938971519470215, "learning_rate": 1.3149717514124293e-05, "loss": 1.5606, "step": 2609 }, { "epoch": 2.211864406779661, "grad_norm": 1.898794174194336, "learning_rate": 1.3135593220338985e-05, "loss": 2.3264, "step": 2610 }, { "epoch": 2.2127118644067796, "grad_norm": 2.166476011276245, "learning_rate": 1.3121468926553673e-05, "loss": 1.6981, "step": 2611 }, { "epoch": 2.2135593220338983, "grad_norm": 2.0929105281829834, "learning_rate": 1.3107344632768361e-05, "loss": 1.7719, "step": 2612 }, { "epoch": 2.214406779661017, "grad_norm": 2.0681040287017822, "learning_rate": 1.309322033898305e-05, "loss": 1.7477, "step": 2613 }, { "epoch": 2.2152542372881356, "grad_norm": 1.704654574394226, "learning_rate": 1.3079096045197741e-05, "loss": 2.1921, "step": 2614 }, { "epoch": 2.2161016949152543, "grad_norm": 1.507063388824463, "learning_rate": 1.306497175141243e-05, "loss": 2.3369, "step": 2615 }, { "epoch": 2.216949152542373, "grad_norm": 1.3990044593811035, "learning_rate": 1.305084745762712e-05, "loss": 2.3672, "step": 2616 }, { "epoch": 2.2177966101694917, "grad_norm": 1.8505538702011108, "learning_rate": 1.3036723163841808e-05, "loss": 1.9728, "step": 2617 }, { "epoch": 2.2186440677966104, "grad_norm": 1.4994605779647827, "learning_rate": 1.30225988700565e-05, "loss": 2.3302, "step": 2618 }, { "epoch": 2.2194915254237286, "grad_norm": 1.9457956552505493, "learning_rate": 1.3008474576271188e-05, "loss": 2.0578, "step": 2619 }, { "epoch": 2.2203389830508473, "grad_norm": 2.009333848953247, "learning_rate": 1.2994350282485876e-05, "loss": 2.0096, "step": 2620 }, { "epoch": 2.221186440677966, "grad_norm": 1.5474363565444946, "learning_rate": 1.2980225988700565e-05, "loss": 2.3008, "step": 2621 }, { "epoch": 2.2220338983050847, "grad_norm": 1.6145862340927124, "learning_rate": 1.2966101694915256e-05, "loss": 2.144, "step": 2622 }, { "epoch": 2.2228813559322034, "grad_norm": 1.2498451471328735, "learning_rate": 1.2951977401129945e-05, "loss": 2.4407, "step": 2623 }, { "epoch": 2.223728813559322, "grad_norm": 1.3638949394226074, "learning_rate": 1.2937853107344633e-05, "loss": 2.2856, "step": 2624 }, { "epoch": 2.2245762711864407, "grad_norm": 1.553649663925171, "learning_rate": 1.2923728813559321e-05, "loss": 2.2411, "step": 2625 }, { "epoch": 2.2254237288135594, "grad_norm": 2.9169318675994873, "learning_rate": 1.2909604519774013e-05, "loss": 1.9925, "step": 2626 }, { "epoch": 2.226271186440678, "grad_norm": 1.6467723846435547, "learning_rate": 1.2895480225988701e-05, "loss": 2.073, "step": 2627 }, { "epoch": 2.227118644067797, "grad_norm": 1.6251808404922485, "learning_rate": 1.288135593220339e-05, "loss": 2.2559, "step": 2628 }, { "epoch": 2.227966101694915, "grad_norm": 1.448697566986084, "learning_rate": 1.286723163841808e-05, "loss": 2.4973, "step": 2629 }, { "epoch": 2.2288135593220337, "grad_norm": 1.5851848125457764, "learning_rate": 1.285310734463277e-05, "loss": 2.1272, "step": 2630 }, { "epoch": 2.2296610169491524, "grad_norm": 1.842235803604126, "learning_rate": 1.2838983050847458e-05, "loss": 1.9558, "step": 2631 }, { "epoch": 2.230508474576271, "grad_norm": 1.7956777811050415, "learning_rate": 1.2824858757062148e-05, "loss": 2.1902, "step": 2632 }, { "epoch": 2.23135593220339, "grad_norm": 1.7201855182647705, "learning_rate": 1.2810734463276836e-05, "loss": 2.1372, "step": 2633 }, { "epoch": 2.2322033898305085, "grad_norm": 2.0029139518737793, "learning_rate": 1.2796610169491528e-05, "loss": 1.9169, "step": 2634 }, { "epoch": 2.233050847457627, "grad_norm": 1.797213077545166, "learning_rate": 1.2782485875706216e-05, "loss": 2.0318, "step": 2635 }, { "epoch": 2.233898305084746, "grad_norm": 1.9128228425979614, "learning_rate": 1.2768361581920904e-05, "loss": 2.0825, "step": 2636 }, { "epoch": 2.2347457627118645, "grad_norm": 1.6019635200500488, "learning_rate": 1.2754237288135593e-05, "loss": 2.1612, "step": 2637 }, { "epoch": 2.2355932203389832, "grad_norm": 1.6599159240722656, "learning_rate": 1.2740112994350284e-05, "loss": 2.2443, "step": 2638 }, { "epoch": 2.236440677966102, "grad_norm": 1.4098831415176392, "learning_rate": 1.2725988700564973e-05, "loss": 2.2044, "step": 2639 }, { "epoch": 2.23728813559322, "grad_norm": 1.9883822202682495, "learning_rate": 1.2711864406779661e-05, "loss": 1.9197, "step": 2640 }, { "epoch": 2.238135593220339, "grad_norm": 1.4971897602081299, "learning_rate": 1.2697740112994349e-05, "loss": 2.3918, "step": 2641 }, { "epoch": 2.2389830508474575, "grad_norm": 1.145992636680603, "learning_rate": 1.268361581920904e-05, "loss": 2.6599, "step": 2642 }, { "epoch": 2.239830508474576, "grad_norm": 1.6472599506378174, "learning_rate": 1.2669491525423729e-05, "loss": 2.2362, "step": 2643 }, { "epoch": 2.240677966101695, "grad_norm": 1.640352487564087, "learning_rate": 1.2655367231638417e-05, "loss": 1.9791, "step": 2644 }, { "epoch": 2.2415254237288136, "grad_norm": 1.8979905843734741, "learning_rate": 1.2641242937853107e-05, "loss": 1.9645, "step": 2645 }, { "epoch": 2.2423728813559323, "grad_norm": 1.440248966217041, "learning_rate": 1.2627118644067797e-05, "loss": 2.3772, "step": 2646 }, { "epoch": 2.243220338983051, "grad_norm": 1.2969114780426025, "learning_rate": 1.2612994350282486e-05, "loss": 2.4062, "step": 2647 }, { "epoch": 2.2440677966101696, "grad_norm": 1.9726459980010986, "learning_rate": 1.2598870056497176e-05, "loss": 1.7231, "step": 2648 }, { "epoch": 2.2449152542372883, "grad_norm": 1.451680064201355, "learning_rate": 1.2584745762711864e-05, "loss": 2.4602, "step": 2649 }, { "epoch": 2.2457627118644066, "grad_norm": 2.0853891372680664, "learning_rate": 1.2570621468926556e-05, "loss": 1.9044, "step": 2650 }, { "epoch": 2.2466101694915253, "grad_norm": 1.7201420068740845, "learning_rate": 1.2556497175141244e-05, "loss": 1.9594, "step": 2651 }, { "epoch": 2.247457627118644, "grad_norm": 1.182131290435791, "learning_rate": 1.2542372881355932e-05, "loss": 2.5289, "step": 2652 }, { "epoch": 2.2483050847457626, "grad_norm": 1.6462275981903076, "learning_rate": 1.252824858757062e-05, "loss": 2.2147, "step": 2653 }, { "epoch": 2.2491525423728813, "grad_norm": 1.8890339136123657, "learning_rate": 1.2514124293785312e-05, "loss": 1.7884, "step": 2654 }, { "epoch": 2.25, "grad_norm": 1.6973693370819092, "learning_rate": 1.25e-05, "loss": 2.2188, "step": 2655 }, { "epoch": 2.2508474576271187, "grad_norm": 1.3676884174346924, "learning_rate": 1.248587570621469e-05, "loss": 2.3783, "step": 2656 }, { "epoch": 2.2516949152542374, "grad_norm": 1.6827466487884521, "learning_rate": 1.2471751412429379e-05, "loss": 2.2626, "step": 2657 }, { "epoch": 2.252542372881356, "grad_norm": 2.5459835529327393, "learning_rate": 1.2457627118644069e-05, "loss": 1.4011, "step": 2658 }, { "epoch": 2.2533898305084747, "grad_norm": 1.1796575784683228, "learning_rate": 1.2443502824858757e-05, "loss": 2.6082, "step": 2659 }, { "epoch": 2.2542372881355934, "grad_norm": 2.302685499191284, "learning_rate": 1.2429378531073447e-05, "loss": 1.6164, "step": 2660 }, { "epoch": 2.2550847457627117, "grad_norm": 1.9911892414093018, "learning_rate": 1.2415254237288135e-05, "loss": 2.0376, "step": 2661 }, { "epoch": 2.2559322033898304, "grad_norm": 1.6822550296783447, "learning_rate": 1.2401129943502825e-05, "loss": 2.1804, "step": 2662 }, { "epoch": 2.256779661016949, "grad_norm": 1.8066636323928833, "learning_rate": 1.2387005649717515e-05, "loss": 2.1502, "step": 2663 }, { "epoch": 2.2576271186440677, "grad_norm": 1.7069318294525146, "learning_rate": 1.2372881355932204e-05, "loss": 2.0484, "step": 2664 }, { "epoch": 2.2584745762711864, "grad_norm": 1.8507767915725708, "learning_rate": 1.2358757062146894e-05, "loss": 2.0893, "step": 2665 }, { "epoch": 2.259322033898305, "grad_norm": 1.5114821195602417, "learning_rate": 1.2344632768361584e-05, "loss": 2.3222, "step": 2666 }, { "epoch": 2.260169491525424, "grad_norm": 1.617348074913025, "learning_rate": 1.2330508474576272e-05, "loss": 2.2411, "step": 2667 }, { "epoch": 2.2610169491525425, "grad_norm": 1.4618602991104126, "learning_rate": 1.2316384180790962e-05, "loss": 2.4252, "step": 2668 }, { "epoch": 2.261864406779661, "grad_norm": 1.623011589050293, "learning_rate": 1.230225988700565e-05, "loss": 2.1134, "step": 2669 }, { "epoch": 2.26271186440678, "grad_norm": 1.7612273693084717, "learning_rate": 1.228813559322034e-05, "loss": 2.0647, "step": 2670 }, { "epoch": 2.263559322033898, "grad_norm": 1.4703688621520996, "learning_rate": 1.2274011299435029e-05, "loss": 2.2724, "step": 2671 }, { "epoch": 2.2644067796610168, "grad_norm": 1.3753740787506104, "learning_rate": 1.2259887005649719e-05, "loss": 2.3529, "step": 2672 }, { "epoch": 2.2652542372881355, "grad_norm": 1.1475927829742432, "learning_rate": 1.2245762711864407e-05, "loss": 2.5087, "step": 2673 }, { "epoch": 2.266101694915254, "grad_norm": 1.6536250114440918, "learning_rate": 1.2231638418079097e-05, "loss": 2.065, "step": 2674 }, { "epoch": 2.266949152542373, "grad_norm": 1.6411614418029785, "learning_rate": 1.2217514124293785e-05, "loss": 2.2422, "step": 2675 }, { "epoch": 2.2677966101694915, "grad_norm": 1.7272388935089111, "learning_rate": 1.2203389830508475e-05, "loss": 2.053, "step": 2676 }, { "epoch": 2.26864406779661, "grad_norm": 1.8547290563583374, "learning_rate": 1.2189265536723164e-05, "loss": 1.9623, "step": 2677 }, { "epoch": 2.269491525423729, "grad_norm": 1.6145331859588623, "learning_rate": 1.2175141242937854e-05, "loss": 2.1809, "step": 2678 }, { "epoch": 2.2703389830508476, "grad_norm": 1.6128720045089722, "learning_rate": 1.2161016949152544e-05, "loss": 2.0409, "step": 2679 }, { "epoch": 2.2711864406779663, "grad_norm": 1.833178162574768, "learning_rate": 1.2146892655367234e-05, "loss": 2.043, "step": 2680 }, { "epoch": 2.272033898305085, "grad_norm": 1.2633790969848633, "learning_rate": 1.2132768361581922e-05, "loss": 2.5096, "step": 2681 }, { "epoch": 2.272881355932203, "grad_norm": 1.8555970191955566, "learning_rate": 1.2118644067796612e-05, "loss": 2.0319, "step": 2682 }, { "epoch": 2.273728813559322, "grad_norm": 1.6284433603286743, "learning_rate": 1.21045197740113e-05, "loss": 2.2383, "step": 2683 }, { "epoch": 2.2745762711864406, "grad_norm": 1.8158788681030273, "learning_rate": 1.209039548022599e-05, "loss": 2.0611, "step": 2684 }, { "epoch": 2.2754237288135593, "grad_norm": 1.6966010332107544, "learning_rate": 1.2076271186440678e-05, "loss": 2.0186, "step": 2685 }, { "epoch": 2.276271186440678, "grad_norm": 1.9948965311050415, "learning_rate": 1.2062146892655368e-05, "loss": 1.7529, "step": 2686 }, { "epoch": 2.2771186440677966, "grad_norm": 2.175189733505249, "learning_rate": 1.2048022598870057e-05, "loss": 1.7595, "step": 2687 }, { "epoch": 2.2779661016949153, "grad_norm": 1.519124150276184, "learning_rate": 1.2033898305084747e-05, "loss": 2.2935, "step": 2688 }, { "epoch": 2.278813559322034, "grad_norm": 1.118815541267395, "learning_rate": 1.2019774011299435e-05, "loss": 2.5004, "step": 2689 }, { "epoch": 2.2796610169491527, "grad_norm": 1.5908997058868408, "learning_rate": 1.2005649717514125e-05, "loss": 2.2099, "step": 2690 }, { "epoch": 2.2805084745762714, "grad_norm": 1.539771318435669, "learning_rate": 1.1991525423728813e-05, "loss": 2.1345, "step": 2691 }, { "epoch": 2.2813559322033896, "grad_norm": 1.9502480030059814, "learning_rate": 1.1977401129943503e-05, "loss": 1.8979, "step": 2692 }, { "epoch": 2.2822033898305083, "grad_norm": 1.8579846620559692, "learning_rate": 1.1963276836158192e-05, "loss": 2.2162, "step": 2693 }, { "epoch": 2.283050847457627, "grad_norm": 1.2290531396865845, "learning_rate": 1.1949152542372882e-05, "loss": 2.5044, "step": 2694 }, { "epoch": 2.2838983050847457, "grad_norm": 1.667528510093689, "learning_rate": 1.1935028248587572e-05, "loss": 2.0949, "step": 2695 }, { "epoch": 2.2847457627118644, "grad_norm": 1.9315160512924194, "learning_rate": 1.1920903954802262e-05, "loss": 1.864, "step": 2696 }, { "epoch": 2.285593220338983, "grad_norm": 1.8570200204849243, "learning_rate": 1.190677966101695e-05, "loss": 2.2182, "step": 2697 }, { "epoch": 2.2864406779661017, "grad_norm": 2.0234036445617676, "learning_rate": 1.189265536723164e-05, "loss": 1.8277, "step": 2698 }, { "epoch": 2.2872881355932204, "grad_norm": 1.6580219268798828, "learning_rate": 1.1878531073446328e-05, "loss": 2.0986, "step": 2699 }, { "epoch": 2.288135593220339, "grad_norm": 1.8003672361373901, "learning_rate": 1.1864406779661018e-05, "loss": 1.9952, "step": 2700 }, { "epoch": 2.288983050847458, "grad_norm": 2.0319578647613525, "learning_rate": 1.1850282485875706e-05, "loss": 1.8889, "step": 2701 }, { "epoch": 2.2898305084745765, "grad_norm": 2.218423843383789, "learning_rate": 1.1836158192090396e-05, "loss": 1.6844, "step": 2702 }, { "epoch": 2.290677966101695, "grad_norm": 1.7018412351608276, "learning_rate": 1.1822033898305085e-05, "loss": 2.0824, "step": 2703 }, { "epoch": 2.2915254237288134, "grad_norm": 1.8618487119674683, "learning_rate": 1.1807909604519775e-05, "loss": 1.9784, "step": 2704 }, { "epoch": 2.292372881355932, "grad_norm": 1.6080292463302612, "learning_rate": 1.1793785310734463e-05, "loss": 2.2731, "step": 2705 }, { "epoch": 2.2932203389830508, "grad_norm": 2.1574764251708984, "learning_rate": 1.1779661016949153e-05, "loss": 2.0009, "step": 2706 }, { "epoch": 2.2940677966101695, "grad_norm": 1.6515130996704102, "learning_rate": 1.1765536723163841e-05, "loss": 2.1144, "step": 2707 }, { "epoch": 2.294915254237288, "grad_norm": 1.5661877393722534, "learning_rate": 1.1751412429378531e-05, "loss": 2.2562, "step": 2708 }, { "epoch": 2.295762711864407, "grad_norm": 2.0387375354766846, "learning_rate": 1.173728813559322e-05, "loss": 1.8794, "step": 2709 }, { "epoch": 2.2966101694915255, "grad_norm": 1.8044418096542358, "learning_rate": 1.172316384180791e-05, "loss": 2.1209, "step": 2710 }, { "epoch": 2.297457627118644, "grad_norm": 1.8447290658950806, "learning_rate": 1.17090395480226e-05, "loss": 2.0123, "step": 2711 }, { "epoch": 2.298305084745763, "grad_norm": 1.7493408918380737, "learning_rate": 1.169491525423729e-05, "loss": 2.1515, "step": 2712 }, { "epoch": 2.299152542372881, "grad_norm": 1.6475424766540527, "learning_rate": 1.1680790960451978e-05, "loss": 2.2462, "step": 2713 }, { "epoch": 2.3, "grad_norm": 1.6466798782348633, "learning_rate": 1.1666666666666668e-05, "loss": 2.2663, "step": 2714 }, { "epoch": 2.3008474576271185, "grad_norm": 1.8530153036117554, "learning_rate": 1.1652542372881356e-05, "loss": 2.0214, "step": 2715 }, { "epoch": 2.301694915254237, "grad_norm": 1.6622998714447021, "learning_rate": 1.1638418079096046e-05, "loss": 2.1939, "step": 2716 }, { "epoch": 2.302542372881356, "grad_norm": 1.883543610572815, "learning_rate": 1.1624293785310734e-05, "loss": 1.9494, "step": 2717 }, { "epoch": 2.3033898305084746, "grad_norm": 1.7924774885177612, "learning_rate": 1.1610169491525424e-05, "loss": 2.0853, "step": 2718 }, { "epoch": 2.3042372881355933, "grad_norm": 1.6783621311187744, "learning_rate": 1.1596045197740113e-05, "loss": 2.1701, "step": 2719 }, { "epoch": 2.305084745762712, "grad_norm": 1.4182416200637817, "learning_rate": 1.1581920903954803e-05, "loss": 2.4056, "step": 2720 }, { "epoch": 2.3059322033898306, "grad_norm": 2.2613394260406494, "learning_rate": 1.1567796610169491e-05, "loss": 1.9061, "step": 2721 }, { "epoch": 2.3067796610169493, "grad_norm": 2.1956305503845215, "learning_rate": 1.1553672316384181e-05, "loss": 1.7216, "step": 2722 }, { "epoch": 2.307627118644068, "grad_norm": 1.333406686782837, "learning_rate": 1.153954802259887e-05, "loss": 2.3645, "step": 2723 }, { "epoch": 2.3084745762711867, "grad_norm": 1.9453949928283691, "learning_rate": 1.152542372881356e-05, "loss": 1.7782, "step": 2724 }, { "epoch": 2.309322033898305, "grad_norm": 1.9100425243377686, "learning_rate": 1.151129943502825e-05, "loss": 1.8895, "step": 2725 }, { "epoch": 2.3101694915254236, "grad_norm": 1.7991353273391724, "learning_rate": 1.1497175141242938e-05, "loss": 2.125, "step": 2726 }, { "epoch": 2.3110169491525423, "grad_norm": 1.6553651094436646, "learning_rate": 1.1483050847457628e-05, "loss": 2.0838, "step": 2727 }, { "epoch": 2.311864406779661, "grad_norm": 1.5747385025024414, "learning_rate": 1.1468926553672318e-05, "loss": 1.9827, "step": 2728 }, { "epoch": 2.3127118644067797, "grad_norm": 1.8182562589645386, "learning_rate": 1.1454802259887006e-05, "loss": 2.1982, "step": 2729 }, { "epoch": 2.3135593220338984, "grad_norm": 2.0031235218048096, "learning_rate": 1.1440677966101696e-05, "loss": 1.7508, "step": 2730 }, { "epoch": 2.314406779661017, "grad_norm": 1.7654558420181274, "learning_rate": 1.1426553672316384e-05, "loss": 2.1223, "step": 2731 }, { "epoch": 2.3152542372881357, "grad_norm": 1.3843965530395508, "learning_rate": 1.1412429378531074e-05, "loss": 2.3811, "step": 2732 }, { "epoch": 2.3161016949152544, "grad_norm": 2.099733352661133, "learning_rate": 1.1398305084745763e-05, "loss": 1.8498, "step": 2733 }, { "epoch": 2.3169491525423727, "grad_norm": 2.0038676261901855, "learning_rate": 1.1384180790960453e-05, "loss": 2.2145, "step": 2734 }, { "epoch": 2.3177966101694913, "grad_norm": 2.0445220470428467, "learning_rate": 1.137005649717514e-05, "loss": 1.9238, "step": 2735 }, { "epoch": 2.31864406779661, "grad_norm": 1.8142948150634766, "learning_rate": 1.135593220338983e-05, "loss": 2.0539, "step": 2736 }, { "epoch": 2.3194915254237287, "grad_norm": 2.1312167644500732, "learning_rate": 1.134180790960452e-05, "loss": 1.8662, "step": 2737 }, { "epoch": 2.3203389830508474, "grad_norm": 1.7986578941345215, "learning_rate": 1.1327683615819209e-05, "loss": 2.1375, "step": 2738 }, { "epoch": 2.321186440677966, "grad_norm": 1.3996113538742065, "learning_rate": 1.1313559322033899e-05, "loss": 2.3352, "step": 2739 }, { "epoch": 2.3220338983050848, "grad_norm": 1.4943941831588745, "learning_rate": 1.1299435028248587e-05, "loss": 2.3732, "step": 2740 }, { "epoch": 2.3228813559322035, "grad_norm": 1.417456865310669, "learning_rate": 1.1285310734463277e-05, "loss": 2.3092, "step": 2741 }, { "epoch": 2.323728813559322, "grad_norm": 1.2574368715286255, "learning_rate": 1.1271186440677967e-05, "loss": 2.6752, "step": 2742 }, { "epoch": 2.324576271186441, "grad_norm": 1.65830659866333, "learning_rate": 1.1257062146892656e-05, "loss": 2.2794, "step": 2743 }, { "epoch": 2.3254237288135595, "grad_norm": 1.9377236366271973, "learning_rate": 1.1242937853107346e-05, "loss": 1.8714, "step": 2744 }, { "epoch": 2.326271186440678, "grad_norm": 1.7150442600250244, "learning_rate": 1.1228813559322036e-05, "loss": 2.3143, "step": 2745 }, { "epoch": 2.3271186440677964, "grad_norm": 1.9324195384979248, "learning_rate": 1.1214689265536724e-05, "loss": 2.1327, "step": 2746 }, { "epoch": 2.327966101694915, "grad_norm": 1.2602370977401733, "learning_rate": 1.1200564971751414e-05, "loss": 2.4212, "step": 2747 }, { "epoch": 2.328813559322034, "grad_norm": 1.5296436548233032, "learning_rate": 1.1186440677966102e-05, "loss": 2.2718, "step": 2748 }, { "epoch": 2.3296610169491525, "grad_norm": 1.7456475496292114, "learning_rate": 1.1172316384180792e-05, "loss": 2.1383, "step": 2749 }, { "epoch": 2.330508474576271, "grad_norm": 1.623611330986023, "learning_rate": 1.115819209039548e-05, "loss": 2.1878, "step": 2750 }, { "epoch": 2.33135593220339, "grad_norm": 1.6439287662506104, "learning_rate": 1.114406779661017e-05, "loss": 2.1031, "step": 2751 }, { "epoch": 2.3322033898305086, "grad_norm": 1.5060378313064575, "learning_rate": 1.1129943502824859e-05, "loss": 2.2501, "step": 2752 }, { "epoch": 2.3330508474576273, "grad_norm": 1.630726933479309, "learning_rate": 1.1115819209039549e-05, "loss": 2.0935, "step": 2753 }, { "epoch": 2.333898305084746, "grad_norm": 1.8235399723052979, "learning_rate": 1.1101694915254237e-05, "loss": 1.8763, "step": 2754 }, { "epoch": 2.334745762711864, "grad_norm": 1.6734403371810913, "learning_rate": 1.1087570621468927e-05, "loss": 2.1593, "step": 2755 }, { "epoch": 2.335593220338983, "grad_norm": 1.6418523788452148, "learning_rate": 1.1073446327683615e-05, "loss": 2.3486, "step": 2756 }, { "epoch": 2.3364406779661016, "grad_norm": 1.5744705200195312, "learning_rate": 1.1059322033898305e-05, "loss": 2.3483, "step": 2757 }, { "epoch": 2.3372881355932202, "grad_norm": 2.012573719024658, "learning_rate": 1.1045197740112995e-05, "loss": 2.081, "step": 2758 }, { "epoch": 2.338135593220339, "grad_norm": 1.7633767127990723, "learning_rate": 1.1031073446327685e-05, "loss": 2.0997, "step": 2759 }, { "epoch": 2.3389830508474576, "grad_norm": 1.5577518939971924, "learning_rate": 1.1016949152542374e-05, "loss": 2.1127, "step": 2760 }, { "epoch": 2.3398305084745763, "grad_norm": 1.6724456548690796, "learning_rate": 1.1002824858757064e-05, "loss": 2.2547, "step": 2761 }, { "epoch": 2.340677966101695, "grad_norm": 2.2773401737213135, "learning_rate": 1.0988700564971752e-05, "loss": 1.7376, "step": 2762 }, { "epoch": 2.3415254237288137, "grad_norm": 1.601205825805664, "learning_rate": 1.0974576271186442e-05, "loss": 2.3381, "step": 2763 }, { "epoch": 2.3423728813559324, "grad_norm": 1.7957279682159424, "learning_rate": 1.096045197740113e-05, "loss": 2.246, "step": 2764 }, { "epoch": 2.343220338983051, "grad_norm": 2.417149305343628, "learning_rate": 1.094632768361582e-05, "loss": 1.99, "step": 2765 }, { "epoch": 2.3440677966101697, "grad_norm": 1.5688859224319458, "learning_rate": 1.0932203389830509e-05, "loss": 2.2738, "step": 2766 }, { "epoch": 2.344915254237288, "grad_norm": 1.453046441078186, "learning_rate": 1.0918079096045199e-05, "loss": 2.3192, "step": 2767 }, { "epoch": 2.3457627118644067, "grad_norm": 2.468358039855957, "learning_rate": 1.0903954802259887e-05, "loss": 1.6678, "step": 2768 }, { "epoch": 2.3466101694915253, "grad_norm": 1.8789864778518677, "learning_rate": 1.0889830508474577e-05, "loss": 2.1603, "step": 2769 }, { "epoch": 2.347457627118644, "grad_norm": 1.5327998399734497, "learning_rate": 1.0875706214689265e-05, "loss": 2.262, "step": 2770 }, { "epoch": 2.3483050847457627, "grad_norm": 1.5479819774627686, "learning_rate": 1.0861581920903955e-05, "loss": 2.2451, "step": 2771 }, { "epoch": 2.3491525423728814, "grad_norm": 1.2894599437713623, "learning_rate": 1.0847457627118644e-05, "loss": 2.4146, "step": 2772 }, { "epoch": 2.35, "grad_norm": 1.3935586214065552, "learning_rate": 1.0833333333333334e-05, "loss": 2.4342, "step": 2773 }, { "epoch": 2.3508474576271188, "grad_norm": 1.552455186843872, "learning_rate": 1.0819209039548024e-05, "loss": 2.2017, "step": 2774 }, { "epoch": 2.3516949152542375, "grad_norm": 1.7251968383789062, "learning_rate": 1.0805084745762714e-05, "loss": 2.0838, "step": 2775 }, { "epoch": 2.3525423728813557, "grad_norm": 1.8735079765319824, "learning_rate": 1.0790960451977402e-05, "loss": 1.9269, "step": 2776 }, { "epoch": 2.3533898305084744, "grad_norm": 1.9194589853286743, "learning_rate": 1.0776836158192092e-05, "loss": 1.9188, "step": 2777 }, { "epoch": 2.354237288135593, "grad_norm": 1.724419116973877, "learning_rate": 1.076271186440678e-05, "loss": 1.8482, "step": 2778 }, { "epoch": 2.3550847457627118, "grad_norm": 1.979636549949646, "learning_rate": 1.074858757062147e-05, "loss": 2.1036, "step": 2779 }, { "epoch": 2.3559322033898304, "grad_norm": 2.013507843017578, "learning_rate": 1.0734463276836158e-05, "loss": 1.9052, "step": 2780 }, { "epoch": 2.356779661016949, "grad_norm": 1.5404797792434692, "learning_rate": 1.0720338983050848e-05, "loss": 2.2025, "step": 2781 }, { "epoch": 2.357627118644068, "grad_norm": 1.9583626985549927, "learning_rate": 1.0706214689265537e-05, "loss": 2.0116, "step": 2782 }, { "epoch": 2.3584745762711865, "grad_norm": 1.8386528491973877, "learning_rate": 1.0692090395480227e-05, "loss": 2.1348, "step": 2783 }, { "epoch": 2.359322033898305, "grad_norm": 1.9649231433868408, "learning_rate": 1.0677966101694915e-05, "loss": 1.7567, "step": 2784 }, { "epoch": 2.360169491525424, "grad_norm": 1.9654440879821777, "learning_rate": 1.0663841807909605e-05, "loss": 2.0183, "step": 2785 }, { "epoch": 2.3610169491525426, "grad_norm": 1.9445511102676392, "learning_rate": 1.0649717514124293e-05, "loss": 2.1337, "step": 2786 }, { "epoch": 2.3618644067796613, "grad_norm": 1.9277061223983765, "learning_rate": 1.0635593220338983e-05, "loss": 1.8659, "step": 2787 }, { "epoch": 2.3627118644067795, "grad_norm": 1.9481409788131714, "learning_rate": 1.0621468926553672e-05, "loss": 1.9901, "step": 2788 }, { "epoch": 2.363559322033898, "grad_norm": 1.097215175628662, "learning_rate": 1.0607344632768362e-05, "loss": 2.5807, "step": 2789 }, { "epoch": 2.364406779661017, "grad_norm": 2.107255220413208, "learning_rate": 1.0593220338983052e-05, "loss": 1.6506, "step": 2790 }, { "epoch": 2.3652542372881356, "grad_norm": 1.51083242893219, "learning_rate": 1.0579096045197742e-05, "loss": 2.2698, "step": 2791 }, { "epoch": 2.3661016949152542, "grad_norm": 2.2006630897521973, "learning_rate": 1.056497175141243e-05, "loss": 1.6796, "step": 2792 }, { "epoch": 2.366949152542373, "grad_norm": 1.544604778289795, "learning_rate": 1.055084745762712e-05, "loss": 2.2162, "step": 2793 }, { "epoch": 2.3677966101694916, "grad_norm": 1.856799840927124, "learning_rate": 1.0536723163841808e-05, "loss": 1.8844, "step": 2794 }, { "epoch": 2.3686440677966103, "grad_norm": 1.3587238788604736, "learning_rate": 1.0522598870056498e-05, "loss": 2.2896, "step": 2795 }, { "epoch": 2.369491525423729, "grad_norm": 1.6245845556259155, "learning_rate": 1.0508474576271186e-05, "loss": 2.2257, "step": 2796 }, { "epoch": 2.3703389830508472, "grad_norm": 1.266310453414917, "learning_rate": 1.0494350282485876e-05, "loss": 2.4608, "step": 2797 }, { "epoch": 2.371186440677966, "grad_norm": 2.4574804306030273, "learning_rate": 1.0480225988700565e-05, "loss": 1.6334, "step": 2798 }, { "epoch": 2.3720338983050846, "grad_norm": 1.696324110031128, "learning_rate": 1.0466101694915255e-05, "loss": 2.1779, "step": 2799 }, { "epoch": 2.3728813559322033, "grad_norm": 1.362596035003662, "learning_rate": 1.0451977401129943e-05, "loss": 2.5589, "step": 2800 }, { "epoch": 2.373728813559322, "grad_norm": 1.9849655628204346, "learning_rate": 1.0437853107344633e-05, "loss": 2.1233, "step": 2801 }, { "epoch": 2.3745762711864407, "grad_norm": 2.01254940032959, "learning_rate": 1.0423728813559321e-05, "loss": 2.1137, "step": 2802 }, { "epoch": 2.3754237288135593, "grad_norm": 1.8358657360076904, "learning_rate": 1.0409604519774011e-05, "loss": 2.0911, "step": 2803 }, { "epoch": 2.376271186440678, "grad_norm": 1.8649876117706299, "learning_rate": 1.0395480225988701e-05, "loss": 1.9854, "step": 2804 }, { "epoch": 2.3771186440677967, "grad_norm": 2.0961883068084717, "learning_rate": 1.038135593220339e-05, "loss": 2.0988, "step": 2805 }, { "epoch": 2.3779661016949154, "grad_norm": 1.7328708171844482, "learning_rate": 1.036723163841808e-05, "loss": 2.2116, "step": 2806 }, { "epoch": 2.378813559322034, "grad_norm": 1.34184730052948, "learning_rate": 1.035310734463277e-05, "loss": 2.3053, "step": 2807 }, { "epoch": 2.3796610169491528, "grad_norm": 1.623054027557373, "learning_rate": 1.0338983050847458e-05, "loss": 2.3248, "step": 2808 }, { "epoch": 2.380508474576271, "grad_norm": 1.5115116834640503, "learning_rate": 1.0324858757062148e-05, "loss": 2.3151, "step": 2809 }, { "epoch": 2.3813559322033897, "grad_norm": 2.490647792816162, "learning_rate": 1.0310734463276836e-05, "loss": 1.6813, "step": 2810 }, { "epoch": 2.3822033898305084, "grad_norm": 1.9092079401016235, "learning_rate": 1.0296610169491526e-05, "loss": 1.9132, "step": 2811 }, { "epoch": 2.383050847457627, "grad_norm": 1.6100598573684692, "learning_rate": 1.0282485875706214e-05, "loss": 2.1598, "step": 2812 }, { "epoch": 2.3838983050847458, "grad_norm": 1.8987376689910889, "learning_rate": 1.0268361581920904e-05, "loss": 1.9817, "step": 2813 }, { "epoch": 2.3847457627118644, "grad_norm": 1.4695061445236206, "learning_rate": 1.0254237288135593e-05, "loss": 2.328, "step": 2814 }, { "epoch": 2.385593220338983, "grad_norm": 2.067765712738037, "learning_rate": 1.0240112994350283e-05, "loss": 1.8811, "step": 2815 }, { "epoch": 2.386440677966102, "grad_norm": 1.7795380353927612, "learning_rate": 1.0225988700564973e-05, "loss": 2.1139, "step": 2816 }, { "epoch": 2.3872881355932205, "grad_norm": 1.671406865119934, "learning_rate": 1.0211864406779661e-05, "loss": 2.1007, "step": 2817 }, { "epoch": 2.3881355932203387, "grad_norm": 2.126786947250366, "learning_rate": 1.0197740112994351e-05, "loss": 1.8009, "step": 2818 }, { "epoch": 2.3889830508474574, "grad_norm": 1.7504265308380127, "learning_rate": 1.018361581920904e-05, "loss": 2.0789, "step": 2819 }, { "epoch": 2.389830508474576, "grad_norm": 1.5832493305206299, "learning_rate": 1.016949152542373e-05, "loss": 2.2497, "step": 2820 }, { "epoch": 2.390677966101695, "grad_norm": 1.560479760169983, "learning_rate": 1.015536723163842e-05, "loss": 2.3253, "step": 2821 }, { "epoch": 2.3915254237288135, "grad_norm": 1.9991607666015625, "learning_rate": 1.0141242937853108e-05, "loss": 1.9794, "step": 2822 }, { "epoch": 2.392372881355932, "grad_norm": 1.3287773132324219, "learning_rate": 1.0127118644067798e-05, "loss": 2.4257, "step": 2823 }, { "epoch": 2.393220338983051, "grad_norm": 1.7449588775634766, "learning_rate": 1.0112994350282488e-05, "loss": 2.2996, "step": 2824 }, { "epoch": 2.3940677966101696, "grad_norm": 1.979357361793518, "learning_rate": 1.0098870056497176e-05, "loss": 2.0135, "step": 2825 }, { "epoch": 2.3949152542372882, "grad_norm": 1.8858957290649414, "learning_rate": 1.0084745762711866e-05, "loss": 1.8583, "step": 2826 }, { "epoch": 2.395762711864407, "grad_norm": 1.7009750604629517, "learning_rate": 1.0070621468926554e-05, "loss": 2.0953, "step": 2827 }, { "epoch": 2.3966101694915256, "grad_norm": 1.5228315591812134, "learning_rate": 1.0056497175141244e-05, "loss": 2.2899, "step": 2828 }, { "epoch": 2.3974576271186443, "grad_norm": 1.3745592832565308, "learning_rate": 1.0042372881355933e-05, "loss": 2.454, "step": 2829 }, { "epoch": 2.3983050847457625, "grad_norm": 2.0505104064941406, "learning_rate": 1.0028248587570623e-05, "loss": 1.6602, "step": 2830 }, { "epoch": 2.3991525423728812, "grad_norm": 2.1742746829986572, "learning_rate": 1.001412429378531e-05, "loss": 1.5815, "step": 2831 }, { "epoch": 2.4, "grad_norm": 1.5642485618591309, "learning_rate": 1e-05, "loss": 2.2674, "step": 2832 }, { "epoch": 2.4008474576271186, "grad_norm": 1.7752679586410522, "learning_rate": 9.985875706214689e-06, "loss": 2.2217, "step": 2833 }, { "epoch": 2.4016949152542373, "grad_norm": 1.7923104763031006, "learning_rate": 9.971751412429379e-06, "loss": 2.1723, "step": 2834 }, { "epoch": 2.402542372881356, "grad_norm": 1.6144061088562012, "learning_rate": 9.957627118644067e-06, "loss": 2.09, "step": 2835 }, { "epoch": 2.4033898305084747, "grad_norm": 1.3840607404708862, "learning_rate": 9.943502824858757e-06, "loss": 2.3489, "step": 2836 }, { "epoch": 2.4042372881355933, "grad_norm": 1.6972010135650635, "learning_rate": 9.929378531073447e-06, "loss": 2.1179, "step": 2837 }, { "epoch": 2.405084745762712, "grad_norm": 1.6566342115402222, "learning_rate": 9.915254237288137e-06, "loss": 2.3024, "step": 2838 }, { "epoch": 2.4059322033898303, "grad_norm": 1.9090768098831177, "learning_rate": 9.901129943502826e-06, "loss": 1.861, "step": 2839 }, { "epoch": 2.406779661016949, "grad_norm": 3.16331148147583, "learning_rate": 9.887005649717516e-06, "loss": 2.2437, "step": 2840 }, { "epoch": 2.4076271186440676, "grad_norm": 1.7605934143066406, "learning_rate": 9.872881355932204e-06, "loss": 1.996, "step": 2841 }, { "epoch": 2.4084745762711863, "grad_norm": 1.681734561920166, "learning_rate": 9.858757062146894e-06, "loss": 2.1829, "step": 2842 }, { "epoch": 2.409322033898305, "grad_norm": 1.2623953819274902, "learning_rate": 9.844632768361582e-06, "loss": 2.5751, "step": 2843 }, { "epoch": 2.4101694915254237, "grad_norm": 1.6833974123001099, "learning_rate": 9.830508474576272e-06, "loss": 2.3035, "step": 2844 }, { "epoch": 2.4110169491525424, "grad_norm": 1.344804048538208, "learning_rate": 9.81638418079096e-06, "loss": 2.5304, "step": 2845 }, { "epoch": 2.411864406779661, "grad_norm": 2.0134329795837402, "learning_rate": 9.80225988700565e-06, "loss": 1.9028, "step": 2846 }, { "epoch": 2.4127118644067798, "grad_norm": 1.6806460618972778, "learning_rate": 9.788135593220339e-06, "loss": 2.2782, "step": 2847 }, { "epoch": 2.4135593220338984, "grad_norm": 1.6235787868499756, "learning_rate": 9.774011299435029e-06, "loss": 2.3137, "step": 2848 }, { "epoch": 2.414406779661017, "grad_norm": 1.7612007856369019, "learning_rate": 9.759887005649717e-06, "loss": 1.9919, "step": 2849 }, { "epoch": 2.415254237288136, "grad_norm": 2.030646800994873, "learning_rate": 9.745762711864407e-06, "loss": 1.812, "step": 2850 }, { "epoch": 2.416101694915254, "grad_norm": 1.82063627243042, "learning_rate": 9.731638418079095e-06, "loss": 2.0613, "step": 2851 }, { "epoch": 2.4169491525423727, "grad_norm": 1.7056388854980469, "learning_rate": 9.717514124293785e-06, "loss": 2.1565, "step": 2852 }, { "epoch": 2.4177966101694914, "grad_norm": 1.4293837547302246, "learning_rate": 9.703389830508475e-06, "loss": 2.2925, "step": 2853 }, { "epoch": 2.41864406779661, "grad_norm": 2.1872732639312744, "learning_rate": 9.689265536723165e-06, "loss": 1.9586, "step": 2854 }, { "epoch": 2.419491525423729, "grad_norm": 1.6308726072311401, "learning_rate": 9.675141242937854e-06, "loss": 2.1272, "step": 2855 }, { "epoch": 2.4203389830508475, "grad_norm": 2.1899256706237793, "learning_rate": 9.661016949152544e-06, "loss": 2.0633, "step": 2856 }, { "epoch": 2.421186440677966, "grad_norm": 1.827673077583313, "learning_rate": 9.646892655367232e-06, "loss": 2.0225, "step": 2857 }, { "epoch": 2.422033898305085, "grad_norm": 1.8376131057739258, "learning_rate": 9.632768361581922e-06, "loss": 2.1096, "step": 2858 }, { "epoch": 2.4228813559322036, "grad_norm": 1.9163520336151123, "learning_rate": 9.61864406779661e-06, "loss": 1.8362, "step": 2859 }, { "epoch": 2.423728813559322, "grad_norm": 2.1687750816345215, "learning_rate": 9.6045197740113e-06, "loss": 1.6314, "step": 2860 }, { "epoch": 2.4245762711864405, "grad_norm": 1.5438897609710693, "learning_rate": 9.590395480225989e-06, "loss": 2.3105, "step": 2861 }, { "epoch": 2.425423728813559, "grad_norm": 2.1108450889587402, "learning_rate": 9.576271186440679e-06, "loss": 1.9747, "step": 2862 }, { "epoch": 2.426271186440678, "grad_norm": 1.5420048236846924, "learning_rate": 9.562146892655367e-06, "loss": 2.2647, "step": 2863 }, { "epoch": 2.4271186440677965, "grad_norm": 1.6626832485198975, "learning_rate": 9.548022598870057e-06, "loss": 2.0965, "step": 2864 }, { "epoch": 2.4279661016949152, "grad_norm": 1.7532156705856323, "learning_rate": 9.533898305084745e-06, "loss": 2.1138, "step": 2865 }, { "epoch": 2.428813559322034, "grad_norm": 1.538637638092041, "learning_rate": 9.519774011299435e-06, "loss": 2.376, "step": 2866 }, { "epoch": 2.4296610169491526, "grad_norm": 1.618196964263916, "learning_rate": 9.505649717514124e-06, "loss": 2.1637, "step": 2867 }, { "epoch": 2.4305084745762713, "grad_norm": 1.9505409002304077, "learning_rate": 9.491525423728814e-06, "loss": 1.932, "step": 2868 }, { "epoch": 2.43135593220339, "grad_norm": 2.0378029346466064, "learning_rate": 9.477401129943503e-06, "loss": 1.9573, "step": 2869 }, { "epoch": 2.4322033898305087, "grad_norm": 2.2740654945373535, "learning_rate": 9.463276836158193e-06, "loss": 1.6023, "step": 2870 }, { "epoch": 2.4330508474576273, "grad_norm": 2.0650203227996826, "learning_rate": 9.449152542372882e-06, "loss": 1.7876, "step": 2871 }, { "epoch": 2.4338983050847456, "grad_norm": 1.5308619737625122, "learning_rate": 9.435028248587572e-06, "loss": 2.3185, "step": 2872 }, { "epoch": 2.4347457627118643, "grad_norm": 1.7073314189910889, "learning_rate": 9.42090395480226e-06, "loss": 2.2802, "step": 2873 }, { "epoch": 2.435593220338983, "grad_norm": 1.7173891067504883, "learning_rate": 9.40677966101695e-06, "loss": 2.1891, "step": 2874 }, { "epoch": 2.4364406779661016, "grad_norm": 1.8193533420562744, "learning_rate": 9.392655367231638e-06, "loss": 2.2416, "step": 2875 }, { "epoch": 2.4372881355932203, "grad_norm": 1.5303488969802856, "learning_rate": 9.378531073446328e-06, "loss": 2.2405, "step": 2876 }, { "epoch": 2.438135593220339, "grad_norm": 1.632622480392456, "learning_rate": 9.364406779661017e-06, "loss": 2.1976, "step": 2877 }, { "epoch": 2.4389830508474577, "grad_norm": 1.5927083492279053, "learning_rate": 9.350282485875707e-06, "loss": 2.012, "step": 2878 }, { "epoch": 2.4398305084745764, "grad_norm": 1.881761074066162, "learning_rate": 9.336158192090395e-06, "loss": 1.9449, "step": 2879 }, { "epoch": 2.440677966101695, "grad_norm": 1.309574842453003, "learning_rate": 9.322033898305085e-06, "loss": 2.5524, "step": 2880 }, { "epoch": 2.4415254237288133, "grad_norm": 1.5436817407608032, "learning_rate": 9.307909604519773e-06, "loss": 2.2504, "step": 2881 }, { "epoch": 2.442372881355932, "grad_norm": 1.769839882850647, "learning_rate": 9.293785310734463e-06, "loss": 2.0125, "step": 2882 }, { "epoch": 2.4432203389830507, "grad_norm": 1.6599061489105225, "learning_rate": 9.279661016949153e-06, "loss": 2.25, "step": 2883 }, { "epoch": 2.4440677966101694, "grad_norm": 1.8840241432189941, "learning_rate": 9.265536723163842e-06, "loss": 2.2019, "step": 2884 }, { "epoch": 2.444915254237288, "grad_norm": 1.6114009618759155, "learning_rate": 9.251412429378532e-06, "loss": 2.1258, "step": 2885 }, { "epoch": 2.4457627118644067, "grad_norm": 1.8707669973373413, "learning_rate": 9.237288135593222e-06, "loss": 2.0681, "step": 2886 }, { "epoch": 2.4466101694915254, "grad_norm": 2.1916449069976807, "learning_rate": 9.22316384180791e-06, "loss": 1.9212, "step": 2887 }, { "epoch": 2.447457627118644, "grad_norm": 1.7218174934387207, "learning_rate": 9.2090395480226e-06, "loss": 2.1402, "step": 2888 }, { "epoch": 2.448305084745763, "grad_norm": 1.666730284690857, "learning_rate": 9.194915254237288e-06, "loss": 2.2446, "step": 2889 }, { "epoch": 2.4491525423728815, "grad_norm": 2.093885660171509, "learning_rate": 9.180790960451978e-06, "loss": 1.6503, "step": 2890 }, { "epoch": 2.45, "grad_norm": 2.076740026473999, "learning_rate": 9.166666666666666e-06, "loss": 1.7412, "step": 2891 }, { "epoch": 2.450847457627119, "grad_norm": 1.4801702499389648, "learning_rate": 9.152542372881356e-06, "loss": 2.4669, "step": 2892 }, { "epoch": 2.451694915254237, "grad_norm": 1.8929673433303833, "learning_rate": 9.138418079096045e-06, "loss": 1.9661, "step": 2893 }, { "epoch": 2.452542372881356, "grad_norm": 1.5265833139419556, "learning_rate": 9.124293785310735e-06, "loss": 2.2726, "step": 2894 }, { "epoch": 2.4533898305084745, "grad_norm": 1.1610336303710938, "learning_rate": 9.110169491525423e-06, "loss": 2.4884, "step": 2895 }, { "epoch": 2.454237288135593, "grad_norm": 1.2153184413909912, "learning_rate": 9.096045197740113e-06, "loss": 2.4264, "step": 2896 }, { "epoch": 2.455084745762712, "grad_norm": 1.9768568277359009, "learning_rate": 9.081920903954803e-06, "loss": 2.0703, "step": 2897 }, { "epoch": 2.4559322033898305, "grad_norm": 1.6819117069244385, "learning_rate": 9.067796610169491e-06, "loss": 2.1022, "step": 2898 }, { "epoch": 2.4567796610169492, "grad_norm": 1.7616136074066162, "learning_rate": 9.053672316384181e-06, "loss": 2.1575, "step": 2899 }, { "epoch": 2.457627118644068, "grad_norm": 1.7328168153762817, "learning_rate": 9.039548022598871e-06, "loss": 2.0504, "step": 2900 }, { "epoch": 2.4584745762711866, "grad_norm": 1.7471535205841064, "learning_rate": 9.02542372881356e-06, "loss": 2.061, "step": 2901 }, { "epoch": 2.459322033898305, "grad_norm": 1.6638541221618652, "learning_rate": 9.01129943502825e-06, "loss": 2.3405, "step": 2902 }, { "epoch": 2.4601694915254235, "grad_norm": 1.5824464559555054, "learning_rate": 8.99717514124294e-06, "loss": 2.1088, "step": 2903 }, { "epoch": 2.461016949152542, "grad_norm": 1.5538772344589233, "learning_rate": 8.983050847457628e-06, "loss": 2.1619, "step": 2904 }, { "epoch": 2.461864406779661, "grad_norm": 2.0436947345733643, "learning_rate": 8.968926553672318e-06, "loss": 1.7712, "step": 2905 }, { "epoch": 2.4627118644067796, "grad_norm": 1.3412665128707886, "learning_rate": 8.954802259887006e-06, "loss": 2.4668, "step": 2906 }, { "epoch": 2.4635593220338983, "grad_norm": 1.483189344406128, "learning_rate": 8.940677966101696e-06, "loss": 2.3362, "step": 2907 }, { "epoch": 2.464406779661017, "grad_norm": 1.8103327751159668, "learning_rate": 8.926553672316384e-06, "loss": 2.0242, "step": 2908 }, { "epoch": 2.4652542372881356, "grad_norm": 2.065345048904419, "learning_rate": 8.912429378531074e-06, "loss": 2.0116, "step": 2909 }, { "epoch": 2.4661016949152543, "grad_norm": 1.9503544569015503, "learning_rate": 8.898305084745763e-06, "loss": 2.0927, "step": 2910 }, { "epoch": 2.466949152542373, "grad_norm": 1.9558987617492676, "learning_rate": 8.884180790960453e-06, "loss": 2.0382, "step": 2911 }, { "epoch": 2.4677966101694917, "grad_norm": 1.5871424674987793, "learning_rate": 8.870056497175141e-06, "loss": 2.2568, "step": 2912 }, { "epoch": 2.4686440677966104, "grad_norm": 1.7679028511047363, "learning_rate": 8.855932203389831e-06, "loss": 1.9557, "step": 2913 }, { "epoch": 2.4694915254237286, "grad_norm": 1.8938539028167725, "learning_rate": 8.84180790960452e-06, "loss": 2.0015, "step": 2914 }, { "epoch": 2.4703389830508473, "grad_norm": 1.2811024188995361, "learning_rate": 8.82768361581921e-06, "loss": 2.4041, "step": 2915 }, { "epoch": 2.471186440677966, "grad_norm": 1.781759262084961, "learning_rate": 8.8135593220339e-06, "loss": 2.0379, "step": 2916 }, { "epoch": 2.4720338983050847, "grad_norm": 1.2422384023666382, "learning_rate": 8.79943502824859e-06, "loss": 2.4718, "step": 2917 }, { "epoch": 2.4728813559322034, "grad_norm": 1.4767169952392578, "learning_rate": 8.785310734463278e-06, "loss": 2.4039, "step": 2918 }, { "epoch": 2.473728813559322, "grad_norm": 1.2458562850952148, "learning_rate": 8.771186440677968e-06, "loss": 2.5647, "step": 2919 }, { "epoch": 2.4745762711864407, "grad_norm": 2.1160736083984375, "learning_rate": 8.757062146892656e-06, "loss": 1.7902, "step": 2920 }, { "epoch": 2.4754237288135594, "grad_norm": 1.473446011543274, "learning_rate": 8.742937853107346e-06, "loss": 2.2131, "step": 2921 }, { "epoch": 2.476271186440678, "grad_norm": 1.4707233905792236, "learning_rate": 8.728813559322034e-06, "loss": 2.1728, "step": 2922 }, { "epoch": 2.477118644067797, "grad_norm": 1.497360348701477, "learning_rate": 8.714689265536724e-06, "loss": 2.2662, "step": 2923 }, { "epoch": 2.477966101694915, "grad_norm": 1.9382723569869995, "learning_rate": 8.700564971751413e-06, "loss": 1.8265, "step": 2924 }, { "epoch": 2.4788135593220337, "grad_norm": 1.7174965143203735, "learning_rate": 8.686440677966103e-06, "loss": 2.1343, "step": 2925 }, { "epoch": 2.4796610169491524, "grad_norm": 1.9615626335144043, "learning_rate": 8.67231638418079e-06, "loss": 2.1523, "step": 2926 }, { "epoch": 2.480508474576271, "grad_norm": 2.1843740940093994, "learning_rate": 8.65819209039548e-06, "loss": 1.8332, "step": 2927 }, { "epoch": 2.48135593220339, "grad_norm": 1.793603539466858, "learning_rate": 8.644067796610169e-06, "loss": 2.2019, "step": 2928 }, { "epoch": 2.4822033898305085, "grad_norm": 2.0399248600006104, "learning_rate": 8.629943502824859e-06, "loss": 2.0296, "step": 2929 }, { "epoch": 2.483050847457627, "grad_norm": 2.1520917415618896, "learning_rate": 8.615819209039547e-06, "loss": 2.2958, "step": 2930 }, { "epoch": 2.483898305084746, "grad_norm": 1.677255630493164, "learning_rate": 8.601694915254237e-06, "loss": 2.2038, "step": 2931 }, { "epoch": 2.4847457627118645, "grad_norm": 1.4223687648773193, "learning_rate": 8.587570621468927e-06, "loss": 2.4918, "step": 2932 }, { "epoch": 2.4855932203389832, "grad_norm": 1.4818404912948608, "learning_rate": 8.573446327683617e-06, "loss": 2.4961, "step": 2933 }, { "epoch": 2.486440677966102, "grad_norm": 1.8745479583740234, "learning_rate": 8.559322033898306e-06, "loss": 2.1015, "step": 2934 }, { "epoch": 2.48728813559322, "grad_norm": 2.2350783348083496, "learning_rate": 8.545197740112996e-06, "loss": 2.0184, "step": 2935 }, { "epoch": 2.488135593220339, "grad_norm": 1.8060007095336914, "learning_rate": 8.531073446327684e-06, "loss": 2.095, "step": 2936 }, { "epoch": 2.4889830508474575, "grad_norm": 1.8137600421905518, "learning_rate": 8.516949152542374e-06, "loss": 2.0185, "step": 2937 }, { "epoch": 2.489830508474576, "grad_norm": 1.374155879020691, "learning_rate": 8.502824858757062e-06, "loss": 2.3764, "step": 2938 }, { "epoch": 2.490677966101695, "grad_norm": 1.761371374130249, "learning_rate": 8.488700564971752e-06, "loss": 2.0937, "step": 2939 }, { "epoch": 2.4915254237288136, "grad_norm": 2.1312267780303955, "learning_rate": 8.47457627118644e-06, "loss": 1.8166, "step": 2940 }, { "epoch": 2.4923728813559323, "grad_norm": 1.6262218952178955, "learning_rate": 8.46045197740113e-06, "loss": 2.2838, "step": 2941 }, { "epoch": 2.493220338983051, "grad_norm": 1.7206265926361084, "learning_rate": 8.446327683615819e-06, "loss": 2.0711, "step": 2942 }, { "epoch": 2.4940677966101696, "grad_norm": 1.7607817649841309, "learning_rate": 8.432203389830509e-06, "loss": 2.0993, "step": 2943 }, { "epoch": 2.4949152542372883, "grad_norm": 2.1898818016052246, "learning_rate": 8.418079096045197e-06, "loss": 1.6373, "step": 2944 }, { "epoch": 2.4957627118644066, "grad_norm": 1.7678654193878174, "learning_rate": 8.403954802259887e-06, "loss": 2.1646, "step": 2945 }, { "epoch": 2.4966101694915253, "grad_norm": 1.371290922164917, "learning_rate": 8.389830508474575e-06, "loss": 2.3973, "step": 2946 }, { "epoch": 2.497457627118644, "grad_norm": 1.9622776508331299, "learning_rate": 8.375706214689265e-06, "loss": 1.8268, "step": 2947 }, { "epoch": 2.4983050847457626, "grad_norm": 1.8486266136169434, "learning_rate": 8.361581920903955e-06, "loss": 2.1427, "step": 2948 }, { "epoch": 2.4991525423728813, "grad_norm": 1.2044062614440918, "learning_rate": 8.347457627118645e-06, "loss": 2.6267, "step": 2949 }, { "epoch": 2.5, "grad_norm": 2.120523452758789, "learning_rate": 8.333333333333334e-06, "loss": 1.887, "step": 2950 }, { "epoch": 2.5008474576271187, "grad_norm": 1.5874050855636597, "learning_rate": 8.319209039548024e-06, "loss": 2.303, "step": 2951 }, { "epoch": 2.5016949152542374, "grad_norm": 1.2719577550888062, "learning_rate": 8.305084745762712e-06, "loss": 2.3387, "step": 2952 }, { "epoch": 2.502542372881356, "grad_norm": 1.9110260009765625, "learning_rate": 8.290960451977402e-06, "loss": 2.3022, "step": 2953 }, { "epoch": 2.5033898305084747, "grad_norm": 1.998567819595337, "learning_rate": 8.27683615819209e-06, "loss": 2.0504, "step": 2954 }, { "epoch": 2.5042372881355934, "grad_norm": 1.2988436222076416, "learning_rate": 8.26271186440678e-06, "loss": 2.4504, "step": 2955 }, { "epoch": 2.505084745762712, "grad_norm": 1.398335576057434, "learning_rate": 8.248587570621469e-06, "loss": 2.273, "step": 2956 }, { "epoch": 2.5059322033898304, "grad_norm": 1.8977516889572144, "learning_rate": 8.234463276836159e-06, "loss": 2.1891, "step": 2957 }, { "epoch": 2.506779661016949, "grad_norm": 1.8704180717468262, "learning_rate": 8.220338983050847e-06, "loss": 2.0068, "step": 2958 }, { "epoch": 2.5076271186440677, "grad_norm": 1.4644842147827148, "learning_rate": 8.206214689265537e-06, "loss": 2.2268, "step": 2959 }, { "epoch": 2.5084745762711864, "grad_norm": 1.4621210098266602, "learning_rate": 8.192090395480225e-06, "loss": 2.3169, "step": 2960 }, { "epoch": 2.509322033898305, "grad_norm": 1.5093555450439453, "learning_rate": 8.177966101694915e-06, "loss": 2.3677, "step": 2961 }, { "epoch": 2.510169491525424, "grad_norm": 1.8150832653045654, "learning_rate": 8.163841807909603e-06, "loss": 1.9276, "step": 2962 }, { "epoch": 2.5110169491525425, "grad_norm": 1.834709644317627, "learning_rate": 8.149717514124293e-06, "loss": 2.046, "step": 2963 }, { "epoch": 2.511864406779661, "grad_norm": 2.189105749130249, "learning_rate": 8.135593220338983e-06, "loss": 1.4947, "step": 2964 }, { "epoch": 2.5127118644067794, "grad_norm": 1.4535638093948364, "learning_rate": 8.121468926553673e-06, "loss": 2.1777, "step": 2965 }, { "epoch": 2.513559322033898, "grad_norm": 1.4188885688781738, "learning_rate": 8.107344632768362e-06, "loss": 2.4833, "step": 2966 }, { "epoch": 2.5144067796610168, "grad_norm": 1.7854934930801392, "learning_rate": 8.093220338983052e-06, "loss": 2.098, "step": 2967 }, { "epoch": 2.5152542372881355, "grad_norm": 1.513467788696289, "learning_rate": 8.07909604519774e-06, "loss": 2.2624, "step": 2968 }, { "epoch": 2.516101694915254, "grad_norm": 1.996180534362793, "learning_rate": 8.06497175141243e-06, "loss": 1.7446, "step": 2969 }, { "epoch": 2.516949152542373, "grad_norm": 2.0471272468566895, "learning_rate": 8.050847457627118e-06, "loss": 1.782, "step": 2970 }, { "epoch": 2.5177966101694915, "grad_norm": 1.7185624837875366, "learning_rate": 8.036723163841808e-06, "loss": 2.1194, "step": 2971 }, { "epoch": 2.51864406779661, "grad_norm": 1.852493166923523, "learning_rate": 8.022598870056497e-06, "loss": 1.9063, "step": 2972 }, { "epoch": 2.519491525423729, "grad_norm": 1.630000114440918, "learning_rate": 8.008474576271187e-06, "loss": 2.2022, "step": 2973 }, { "epoch": 2.5203389830508476, "grad_norm": 1.9185245037078857, "learning_rate": 7.994350282485875e-06, "loss": 2.1009, "step": 2974 }, { "epoch": 2.5211864406779663, "grad_norm": 2.0377321243286133, "learning_rate": 7.980225988700565e-06, "loss": 1.938, "step": 2975 }, { "epoch": 2.522033898305085, "grad_norm": 1.6007628440856934, "learning_rate": 7.966101694915253e-06, "loss": 2.1577, "step": 2976 }, { "epoch": 2.5228813559322036, "grad_norm": 2.1180145740509033, "learning_rate": 7.951977401129943e-06, "loss": 1.9347, "step": 2977 }, { "epoch": 2.523728813559322, "grad_norm": 1.706475853919983, "learning_rate": 7.937853107344633e-06, "loss": 2.1324, "step": 2978 }, { "epoch": 2.5245762711864406, "grad_norm": 1.9764870405197144, "learning_rate": 7.923728813559323e-06, "loss": 1.9918, "step": 2979 }, { "epoch": 2.5254237288135593, "grad_norm": 1.5780428647994995, "learning_rate": 7.909604519774012e-06, "loss": 2.238, "step": 2980 }, { "epoch": 2.526271186440678, "grad_norm": 2.1836647987365723, "learning_rate": 7.895480225988702e-06, "loss": 1.6425, "step": 2981 }, { "epoch": 2.5271186440677966, "grad_norm": 1.736869215965271, "learning_rate": 7.881355932203392e-06, "loss": 2.056, "step": 2982 }, { "epoch": 2.5279661016949153, "grad_norm": 1.3528568744659424, "learning_rate": 7.86723163841808e-06, "loss": 2.3249, "step": 2983 }, { "epoch": 2.528813559322034, "grad_norm": 1.4063446521759033, "learning_rate": 7.85310734463277e-06, "loss": 2.3622, "step": 2984 }, { "epoch": 2.5296610169491527, "grad_norm": 1.8569039106369019, "learning_rate": 7.838983050847458e-06, "loss": 2.0365, "step": 2985 }, { "epoch": 2.530508474576271, "grad_norm": 1.8357794284820557, "learning_rate": 7.824858757062148e-06, "loss": 1.8323, "step": 2986 }, { "epoch": 2.5313559322033896, "grad_norm": 1.867386817932129, "learning_rate": 7.810734463276836e-06, "loss": 1.8655, "step": 2987 }, { "epoch": 2.5322033898305083, "grad_norm": 1.9853345155715942, "learning_rate": 7.796610169491526e-06, "loss": 1.9758, "step": 2988 }, { "epoch": 2.533050847457627, "grad_norm": 1.244191288948059, "learning_rate": 7.782485875706215e-06, "loss": 2.3999, "step": 2989 }, { "epoch": 2.5338983050847457, "grad_norm": 1.669678807258606, "learning_rate": 7.768361581920905e-06, "loss": 1.9253, "step": 2990 }, { "epoch": 2.5347457627118644, "grad_norm": 2.0847461223602295, "learning_rate": 7.754237288135593e-06, "loss": 2.0371, "step": 2991 }, { "epoch": 2.535593220338983, "grad_norm": 1.7110786437988281, "learning_rate": 7.740112994350283e-06, "loss": 2.207, "step": 2992 }, { "epoch": 2.5364406779661017, "grad_norm": 1.4931682348251343, "learning_rate": 7.725988700564971e-06, "loss": 2.2115, "step": 2993 }, { "epoch": 2.5372881355932204, "grad_norm": 1.9188610315322876, "learning_rate": 7.711864406779661e-06, "loss": 1.7046, "step": 2994 }, { "epoch": 2.538135593220339, "grad_norm": 2.1624057292938232, "learning_rate": 7.697740112994351e-06, "loss": 1.8313, "step": 2995 }, { "epoch": 2.538983050847458, "grad_norm": 1.8246220350265503, "learning_rate": 7.683615819209041e-06, "loss": 1.9271, "step": 2996 }, { "epoch": 2.5398305084745765, "grad_norm": 2.056644916534424, "learning_rate": 7.66949152542373e-06, "loss": 1.677, "step": 2997 }, { "epoch": 2.540677966101695, "grad_norm": 2.052074670791626, "learning_rate": 7.65536723163842e-06, "loss": 1.76, "step": 2998 }, { "epoch": 2.5415254237288134, "grad_norm": 1.3644156455993652, "learning_rate": 7.641242937853108e-06, "loss": 2.3393, "step": 2999 }, { "epoch": 2.542372881355932, "grad_norm": 1.7320497035980225, "learning_rate": 7.627118644067798e-06, "loss": 2.1087, "step": 3000 }, { "epoch": 2.5432203389830508, "grad_norm": 1.5821408033370972, "learning_rate": 7.612994350282486e-06, "loss": 2.1904, "step": 3001 }, { "epoch": 2.5440677966101695, "grad_norm": 1.8494126796722412, "learning_rate": 7.598870056497176e-06, "loss": 2.0945, "step": 3002 }, { "epoch": 2.544915254237288, "grad_norm": 1.8187628984451294, "learning_rate": 7.5847457627118645e-06, "loss": 2.1857, "step": 3003 }, { "epoch": 2.545762711864407, "grad_norm": 1.6754740476608276, "learning_rate": 7.5706214689265545e-06, "loss": 1.7984, "step": 3004 }, { "epoch": 2.5466101694915255, "grad_norm": 1.7630802392959595, "learning_rate": 7.556497175141243e-06, "loss": 2.0151, "step": 3005 }, { "epoch": 2.547457627118644, "grad_norm": 2.1192195415496826, "learning_rate": 7.542372881355933e-06, "loss": 2.0071, "step": 3006 }, { "epoch": 2.5483050847457624, "grad_norm": 1.8960492610931396, "learning_rate": 7.528248587570622e-06, "loss": 2.0779, "step": 3007 }, { "epoch": 2.549152542372881, "grad_norm": 1.5422511100769043, "learning_rate": 7.514124293785312e-06, "loss": 2.2483, "step": 3008 }, { "epoch": 2.55, "grad_norm": 1.4536371231079102, "learning_rate": 7.5e-06, "loss": 2.2602, "step": 3009 }, { "epoch": 2.5508474576271185, "grad_norm": 1.515049695968628, "learning_rate": 7.48587570621469e-06, "loss": 2.3488, "step": 3010 }, { "epoch": 2.551694915254237, "grad_norm": 1.879209280014038, "learning_rate": 7.4717514124293785e-06, "loss": 1.8461, "step": 3011 }, { "epoch": 2.552542372881356, "grad_norm": 1.7145729064941406, "learning_rate": 7.4576271186440685e-06, "loss": 2.1602, "step": 3012 }, { "epoch": 2.5533898305084746, "grad_norm": 1.601664662361145, "learning_rate": 7.443502824858757e-06, "loss": 2.106, "step": 3013 }, { "epoch": 2.5542372881355933, "grad_norm": 1.810798168182373, "learning_rate": 7.429378531073447e-06, "loss": 2.2859, "step": 3014 }, { "epoch": 2.555084745762712, "grad_norm": 1.6008425951004028, "learning_rate": 7.415254237288136e-06, "loss": 2.1538, "step": 3015 }, { "epoch": 2.5559322033898306, "grad_norm": 1.68952476978302, "learning_rate": 7.401129943502826e-06, "loss": 2.0608, "step": 3016 }, { "epoch": 2.5567796610169493, "grad_norm": 1.606894850730896, "learning_rate": 7.387005649717514e-06, "loss": 2.2093, "step": 3017 }, { "epoch": 2.557627118644068, "grad_norm": 1.3230990171432495, "learning_rate": 7.372881355932204e-06, "loss": 2.3442, "step": 3018 }, { "epoch": 2.5584745762711867, "grad_norm": 1.8088946342468262, "learning_rate": 7.3587570621468925e-06, "loss": 2.0649, "step": 3019 }, { "epoch": 2.559322033898305, "grad_norm": 1.5568079948425293, "learning_rate": 7.3446327683615825e-06, "loss": 2.2807, "step": 3020 }, { "epoch": 2.5601694915254236, "grad_norm": 2.1737327575683594, "learning_rate": 7.330508474576271e-06, "loss": 1.6519, "step": 3021 }, { "epoch": 2.5610169491525423, "grad_norm": 1.812609076499939, "learning_rate": 7.316384180790961e-06, "loss": 2.1693, "step": 3022 }, { "epoch": 2.561864406779661, "grad_norm": 1.5055807828903198, "learning_rate": 7.30225988700565e-06, "loss": 2.3576, "step": 3023 }, { "epoch": 2.5627118644067797, "grad_norm": 2.2145583629608154, "learning_rate": 7.28813559322034e-06, "loss": 1.9627, "step": 3024 }, { "epoch": 2.5635593220338984, "grad_norm": 1.6984370946884155, "learning_rate": 7.274011299435028e-06, "loss": 2.0201, "step": 3025 }, { "epoch": 2.564406779661017, "grad_norm": 1.5419265031814575, "learning_rate": 7.259887005649718e-06, "loss": 2.2826, "step": 3026 }, { "epoch": 2.5652542372881357, "grad_norm": 2.3023569583892822, "learning_rate": 7.2457627118644065e-06, "loss": 1.592, "step": 3027 }, { "epoch": 2.566101694915254, "grad_norm": 1.7549352645874023, "learning_rate": 7.2316384180790965e-06, "loss": 2.058, "step": 3028 }, { "epoch": 2.5669491525423727, "grad_norm": 1.3474128246307373, "learning_rate": 7.217514124293785e-06, "loss": 2.3685, "step": 3029 }, { "epoch": 2.5677966101694913, "grad_norm": 1.5920982360839844, "learning_rate": 7.203389830508475e-06, "loss": 2.1354, "step": 3030 }, { "epoch": 2.56864406779661, "grad_norm": 1.834998607635498, "learning_rate": 7.189265536723164e-06, "loss": 2.0803, "step": 3031 }, { "epoch": 2.5694915254237287, "grad_norm": 1.6628954410552979, "learning_rate": 7.175141242937854e-06, "loss": 2.0229, "step": 3032 }, { "epoch": 2.5703389830508474, "grad_norm": 1.7421000003814697, "learning_rate": 7.161016949152542e-06, "loss": 2.2294, "step": 3033 }, { "epoch": 2.571186440677966, "grad_norm": 2.0664875507354736, "learning_rate": 7.146892655367232e-06, "loss": 1.8082, "step": 3034 }, { "epoch": 2.5720338983050848, "grad_norm": 1.7396173477172852, "learning_rate": 7.1327683615819206e-06, "loss": 2.1463, "step": 3035 }, { "epoch": 2.5728813559322035, "grad_norm": 1.789170503616333, "learning_rate": 7.1186440677966106e-06, "loss": 2.019, "step": 3036 }, { "epoch": 2.573728813559322, "grad_norm": 1.9018986225128174, "learning_rate": 7.104519774011299e-06, "loss": 2.0897, "step": 3037 }, { "epoch": 2.574576271186441, "grad_norm": 1.6295945644378662, "learning_rate": 7.090395480225989e-06, "loss": 1.898, "step": 3038 }, { "epoch": 2.5754237288135595, "grad_norm": 1.4587124586105347, "learning_rate": 7.076271186440678e-06, "loss": 2.4802, "step": 3039 }, { "epoch": 2.576271186440678, "grad_norm": 2.1292927265167236, "learning_rate": 7.062146892655368e-06, "loss": 1.8714, "step": 3040 }, { "epoch": 2.5771186440677964, "grad_norm": 1.8581408262252808, "learning_rate": 7.048022598870056e-06, "loss": 1.9625, "step": 3041 }, { "epoch": 2.577966101694915, "grad_norm": 1.1935030221939087, "learning_rate": 7.033898305084746e-06, "loss": 2.4388, "step": 3042 }, { "epoch": 2.578813559322034, "grad_norm": 1.8859221935272217, "learning_rate": 7.019774011299435e-06, "loss": 1.939, "step": 3043 }, { "epoch": 2.5796610169491525, "grad_norm": 1.7914434671401978, "learning_rate": 7.0056497175141246e-06, "loss": 2.0808, "step": 3044 }, { "epoch": 2.580508474576271, "grad_norm": 1.882596731185913, "learning_rate": 6.991525423728814e-06, "loss": 1.8391, "step": 3045 }, { "epoch": 2.58135593220339, "grad_norm": 2.0972578525543213, "learning_rate": 6.977401129943503e-06, "loss": 1.9076, "step": 3046 }, { "epoch": 2.5822033898305086, "grad_norm": 1.6736787557601929, "learning_rate": 6.963276836158192e-06, "loss": 2.3895, "step": 3047 }, { "epoch": 2.5830508474576273, "grad_norm": 2.118112802505493, "learning_rate": 6.949152542372882e-06, "loss": 1.8, "step": 3048 }, { "epoch": 2.5838983050847455, "grad_norm": 1.4006154537200928, "learning_rate": 6.93502824858757e-06, "loss": 2.2656, "step": 3049 }, { "epoch": 2.584745762711864, "grad_norm": 1.7507838010787964, "learning_rate": 6.92090395480226e-06, "loss": 1.9411, "step": 3050 }, { "epoch": 2.585593220338983, "grad_norm": 1.6378151178359985, "learning_rate": 6.906779661016949e-06, "loss": 2.1043, "step": 3051 }, { "epoch": 2.5864406779661016, "grad_norm": 2.0987699031829834, "learning_rate": 6.892655367231639e-06, "loss": 1.7536, "step": 3052 }, { "epoch": 2.5872881355932202, "grad_norm": 1.8500317335128784, "learning_rate": 6.878531073446328e-06, "loss": 1.9234, "step": 3053 }, { "epoch": 2.588135593220339, "grad_norm": 1.954512119293213, "learning_rate": 6.864406779661017e-06, "loss": 2.1282, "step": 3054 }, { "epoch": 2.5889830508474576, "grad_norm": 1.7337831258773804, "learning_rate": 6.850282485875706e-06, "loss": 2.1146, "step": 3055 }, { "epoch": 2.5898305084745763, "grad_norm": 1.5660083293914795, "learning_rate": 6.836158192090396e-06, "loss": 2.3264, "step": 3056 }, { "epoch": 2.590677966101695, "grad_norm": 1.435356855392456, "learning_rate": 6.822033898305084e-06, "loss": 2.457, "step": 3057 }, { "epoch": 2.5915254237288137, "grad_norm": 2.0490822792053223, "learning_rate": 6.807909604519774e-06, "loss": 1.7802, "step": 3058 }, { "epoch": 2.5923728813559324, "grad_norm": 1.5111957788467407, "learning_rate": 6.793785310734464e-06, "loss": 2.2187, "step": 3059 }, { "epoch": 2.593220338983051, "grad_norm": 1.6086337566375732, "learning_rate": 6.779661016949153e-06, "loss": 2.1684, "step": 3060 }, { "epoch": 2.5940677966101697, "grad_norm": 1.8838337659835815, "learning_rate": 6.765536723163843e-06, "loss": 2.1587, "step": 3061 }, { "epoch": 2.594915254237288, "grad_norm": 1.6110546588897705, "learning_rate": 6.751412429378532e-06, "loss": 2.225, "step": 3062 }, { "epoch": 2.5957627118644067, "grad_norm": 2.178858995437622, "learning_rate": 6.737288135593221e-06, "loss": 1.9288, "step": 3063 }, { "epoch": 2.5966101694915253, "grad_norm": 1.4706746339797974, "learning_rate": 6.72316384180791e-06, "loss": 2.3295, "step": 3064 }, { "epoch": 2.597457627118644, "grad_norm": 1.2675209045410156, "learning_rate": 6.7090395480226e-06, "loss": 2.4013, "step": 3065 }, { "epoch": 2.5983050847457627, "grad_norm": 1.82991361618042, "learning_rate": 6.694915254237288e-06, "loss": 2.0283, "step": 3066 }, { "epoch": 2.5991525423728814, "grad_norm": 1.9969542026519775, "learning_rate": 6.680790960451978e-06, "loss": 1.9261, "step": 3067 }, { "epoch": 2.6, "grad_norm": 1.5021088123321533, "learning_rate": 6.666666666666667e-06, "loss": 2.3465, "step": 3068 }, { "epoch": 2.6008474576271188, "grad_norm": 1.1992968320846558, "learning_rate": 6.652542372881357e-06, "loss": 2.4956, "step": 3069 }, { "epoch": 2.601694915254237, "grad_norm": 1.1849461793899536, "learning_rate": 6.638418079096046e-06, "loss": 2.6337, "step": 3070 }, { "epoch": 2.6025423728813557, "grad_norm": 1.8810172080993652, "learning_rate": 6.624293785310735e-06, "loss": 2.0795, "step": 3071 }, { "epoch": 2.6033898305084744, "grad_norm": 1.9608169794082642, "learning_rate": 6.610169491525424e-06, "loss": 2.0597, "step": 3072 }, { "epoch": 2.604237288135593, "grad_norm": 1.8074970245361328, "learning_rate": 6.596045197740114e-06, "loss": 1.9833, "step": 3073 }, { "epoch": 2.6050847457627118, "grad_norm": 1.6962586641311646, "learning_rate": 6.581920903954802e-06, "loss": 2.2267, "step": 3074 }, { "epoch": 2.6059322033898304, "grad_norm": 1.7375556230545044, "learning_rate": 6.567796610169492e-06, "loss": 2.101, "step": 3075 }, { "epoch": 2.606779661016949, "grad_norm": 1.7727997303009033, "learning_rate": 6.553672316384181e-06, "loss": 2.036, "step": 3076 }, { "epoch": 2.607627118644068, "grad_norm": 1.2793000936508179, "learning_rate": 6.539548022598871e-06, "loss": 2.4457, "step": 3077 }, { "epoch": 2.6084745762711865, "grad_norm": 1.711143136024475, "learning_rate": 6.52542372881356e-06, "loss": 2.1449, "step": 3078 }, { "epoch": 2.609322033898305, "grad_norm": 2.098222255706787, "learning_rate": 6.51129943502825e-06, "loss": 1.7625, "step": 3079 }, { "epoch": 2.610169491525424, "grad_norm": 2.062216281890869, "learning_rate": 6.497175141242938e-06, "loss": 1.8931, "step": 3080 }, { "epoch": 2.6110169491525426, "grad_norm": 2.228379487991333, "learning_rate": 6.483050847457628e-06, "loss": 1.8649, "step": 3081 }, { "epoch": 2.6118644067796613, "grad_norm": 1.9092179536819458, "learning_rate": 6.468926553672316e-06, "loss": 1.985, "step": 3082 }, { "epoch": 2.61271186440678, "grad_norm": 1.9925532341003418, "learning_rate": 6.454802259887006e-06, "loss": 2.0447, "step": 3083 }, { "epoch": 2.613559322033898, "grad_norm": 1.8609484434127808, "learning_rate": 6.440677966101695e-06, "loss": 2.1578, "step": 3084 }, { "epoch": 2.614406779661017, "grad_norm": 1.516598105430603, "learning_rate": 6.426553672316385e-06, "loss": 2.2194, "step": 3085 }, { "epoch": 2.6152542372881356, "grad_norm": 1.7448123693466187, "learning_rate": 6.412429378531074e-06, "loss": 2.1428, "step": 3086 }, { "epoch": 2.6161016949152542, "grad_norm": 2.107908248901367, "learning_rate": 6.398305084745764e-06, "loss": 1.7867, "step": 3087 }, { "epoch": 2.616949152542373, "grad_norm": 1.9045662879943848, "learning_rate": 6.384180790960452e-06, "loss": 2.0377, "step": 3088 }, { "epoch": 2.6177966101694916, "grad_norm": 1.5822416543960571, "learning_rate": 6.370056497175142e-06, "loss": 2.3111, "step": 3089 }, { "epoch": 2.6186440677966103, "grad_norm": 1.8819859027862549, "learning_rate": 6.3559322033898304e-06, "loss": 2.1015, "step": 3090 }, { "epoch": 2.6194915254237285, "grad_norm": 2.2771382331848145, "learning_rate": 6.34180790960452e-06, "loss": 1.7219, "step": 3091 }, { "epoch": 2.6203389830508472, "grad_norm": 1.9226919412612915, "learning_rate": 6.327683615819209e-06, "loss": 1.934, "step": 3092 }, { "epoch": 2.621186440677966, "grad_norm": 2.1976590156555176, "learning_rate": 6.313559322033899e-06, "loss": 1.7882, "step": 3093 }, { "epoch": 2.6220338983050846, "grad_norm": 1.9256106615066528, "learning_rate": 6.299435028248588e-06, "loss": 1.9117, "step": 3094 }, { "epoch": 2.6228813559322033, "grad_norm": 2.0661087036132812, "learning_rate": 6.285310734463278e-06, "loss": 1.9503, "step": 3095 }, { "epoch": 2.623728813559322, "grad_norm": 2.0507121086120605, "learning_rate": 6.271186440677966e-06, "loss": 2.1173, "step": 3096 }, { "epoch": 2.6245762711864407, "grad_norm": 1.5374890565872192, "learning_rate": 6.257062146892656e-06, "loss": 2.2263, "step": 3097 }, { "epoch": 2.6254237288135593, "grad_norm": 1.3915599584579468, "learning_rate": 6.242937853107345e-06, "loss": 2.2953, "step": 3098 }, { "epoch": 2.626271186440678, "grad_norm": 1.79976487159729, "learning_rate": 6.2288135593220344e-06, "loss": 2.0337, "step": 3099 }, { "epoch": 2.6271186440677967, "grad_norm": 2.077894926071167, "learning_rate": 6.214689265536724e-06, "loss": 2.0656, "step": 3100 }, { "epoch": 2.6279661016949154, "grad_norm": 1.538857340812683, "learning_rate": 6.200564971751413e-06, "loss": 2.3387, "step": 3101 }, { "epoch": 2.628813559322034, "grad_norm": 1.6980963945388794, "learning_rate": 6.186440677966102e-06, "loss": 2.1492, "step": 3102 }, { "epoch": 2.6296610169491528, "grad_norm": 1.9201914072036743, "learning_rate": 6.172316384180792e-06, "loss": 2.0327, "step": 3103 }, { "epoch": 2.6305084745762715, "grad_norm": 1.8236585855484009, "learning_rate": 6.158192090395481e-06, "loss": 2.121, "step": 3104 }, { "epoch": 2.6313559322033897, "grad_norm": 2.30562162399292, "learning_rate": 6.14406779661017e-06, "loss": 2.1442, "step": 3105 }, { "epoch": 2.6322033898305084, "grad_norm": 1.5762290954589844, "learning_rate": 6.129943502824859e-06, "loss": 2.27, "step": 3106 }, { "epoch": 2.633050847457627, "grad_norm": 2.0478971004486084, "learning_rate": 6.1158192090395485e-06, "loss": 1.5608, "step": 3107 }, { "epoch": 2.6338983050847458, "grad_norm": 2.074697494506836, "learning_rate": 6.101694915254238e-06, "loss": 1.807, "step": 3108 }, { "epoch": 2.6347457627118644, "grad_norm": 1.7831599712371826, "learning_rate": 6.087570621468927e-06, "loss": 2.147, "step": 3109 }, { "epoch": 2.635593220338983, "grad_norm": 1.6036802530288696, "learning_rate": 6.073446327683617e-06, "loss": 2.1833, "step": 3110 }, { "epoch": 2.636440677966102, "grad_norm": 1.7700417041778564, "learning_rate": 6.059322033898306e-06, "loss": 2.1094, "step": 3111 }, { "epoch": 2.63728813559322, "grad_norm": 1.8165521621704102, "learning_rate": 6.045197740112995e-06, "loss": 1.9511, "step": 3112 }, { "epoch": 2.6381355932203387, "grad_norm": 1.8041558265686035, "learning_rate": 6.031073446327684e-06, "loss": 2.0213, "step": 3113 }, { "epoch": 2.6389830508474574, "grad_norm": 1.56833016872406, "learning_rate": 6.016949152542373e-06, "loss": 2.1927, "step": 3114 }, { "epoch": 2.639830508474576, "grad_norm": 1.8998544216156006, "learning_rate": 6.0028248587570625e-06, "loss": 2.1135, "step": 3115 }, { "epoch": 2.640677966101695, "grad_norm": 1.405078649520874, "learning_rate": 5.988700564971752e-06, "loss": 2.4591, "step": 3116 }, { "epoch": 2.6415254237288135, "grad_norm": 1.8880126476287842, "learning_rate": 5.974576271186441e-06, "loss": 1.8116, "step": 3117 }, { "epoch": 2.642372881355932, "grad_norm": 1.917741060256958, "learning_rate": 5.960451977401131e-06, "loss": 2.0983, "step": 3118 }, { "epoch": 2.643220338983051, "grad_norm": 1.492091178894043, "learning_rate": 5.94632768361582e-06, "loss": 2.3454, "step": 3119 }, { "epoch": 2.6440677966101696, "grad_norm": 1.7252168655395508, "learning_rate": 5.932203389830509e-06, "loss": 2.3544, "step": 3120 }, { "epoch": 2.6449152542372882, "grad_norm": 2.009967803955078, "learning_rate": 5.918079096045198e-06, "loss": 2.1103, "step": 3121 }, { "epoch": 2.645762711864407, "grad_norm": 1.4831916093826294, "learning_rate": 5.903954802259887e-06, "loss": 2.4132, "step": 3122 }, { "epoch": 2.6466101694915256, "grad_norm": 1.9462958574295044, "learning_rate": 5.8898305084745765e-06, "loss": 1.9792, "step": 3123 }, { "epoch": 2.6474576271186443, "grad_norm": 1.7218736410140991, "learning_rate": 5.875706214689266e-06, "loss": 2.1313, "step": 3124 }, { "epoch": 2.648305084745763, "grad_norm": 1.5944900512695312, "learning_rate": 5.861581920903955e-06, "loss": 2.3315, "step": 3125 }, { "epoch": 2.6491525423728812, "grad_norm": 1.5159193277359009, "learning_rate": 5.847457627118645e-06, "loss": 2.2296, "step": 3126 }, { "epoch": 2.65, "grad_norm": 1.9195917844772339, "learning_rate": 5.833333333333334e-06, "loss": 1.8245, "step": 3127 }, { "epoch": 2.6508474576271186, "grad_norm": 1.477134108543396, "learning_rate": 5.819209039548023e-06, "loss": 2.3509, "step": 3128 }, { "epoch": 2.6516949152542373, "grad_norm": 2.3269686698913574, "learning_rate": 5.805084745762712e-06, "loss": 1.9118, "step": 3129 }, { "epoch": 2.652542372881356, "grad_norm": 1.980559229850769, "learning_rate": 5.790960451977401e-06, "loss": 1.7757, "step": 3130 }, { "epoch": 2.6533898305084747, "grad_norm": 1.5287688970565796, "learning_rate": 5.7768361581920905e-06, "loss": 2.1286, "step": 3131 }, { "epoch": 2.6542372881355933, "grad_norm": 1.9623382091522217, "learning_rate": 5.76271186440678e-06, "loss": 1.7988, "step": 3132 }, { "epoch": 2.655084745762712, "grad_norm": 1.6550776958465576, "learning_rate": 5.748587570621469e-06, "loss": 2.1933, "step": 3133 }, { "epoch": 2.6559322033898303, "grad_norm": 1.4529967308044434, "learning_rate": 5.734463276836159e-06, "loss": 2.4504, "step": 3134 }, { "epoch": 2.656779661016949, "grad_norm": 1.5573678016662598, "learning_rate": 5.720338983050848e-06, "loss": 1.8805, "step": 3135 }, { "epoch": 2.6576271186440676, "grad_norm": 1.2603721618652344, "learning_rate": 5.706214689265537e-06, "loss": 2.4311, "step": 3136 }, { "epoch": 2.6584745762711863, "grad_norm": 1.6075152158737183, "learning_rate": 5.692090395480226e-06, "loss": 2.1516, "step": 3137 }, { "epoch": 2.659322033898305, "grad_norm": 1.4322408437728882, "learning_rate": 5.677966101694915e-06, "loss": 2.2252, "step": 3138 }, { "epoch": 2.6601694915254237, "grad_norm": 1.7860020399093628, "learning_rate": 5.6638418079096046e-06, "loss": 1.9393, "step": 3139 }, { "epoch": 2.6610169491525424, "grad_norm": 1.843631386756897, "learning_rate": 5.649717514124294e-06, "loss": 2.0904, "step": 3140 }, { "epoch": 2.661864406779661, "grad_norm": 2.062152147293091, "learning_rate": 5.635593220338984e-06, "loss": 2.1247, "step": 3141 }, { "epoch": 2.6627118644067798, "grad_norm": 1.5441206693649292, "learning_rate": 5.621468926553673e-06, "loss": 2.2471, "step": 3142 }, { "epoch": 2.6635593220338984, "grad_norm": 1.609926462173462, "learning_rate": 5.607344632768362e-06, "loss": 2.4538, "step": 3143 }, { "epoch": 2.664406779661017, "grad_norm": 1.4819270372390747, "learning_rate": 5.593220338983051e-06, "loss": 2.3786, "step": 3144 }, { "epoch": 2.665254237288136, "grad_norm": 1.875760555267334, "learning_rate": 5.57909604519774e-06, "loss": 2.1711, "step": 3145 }, { "epoch": 2.6661016949152545, "grad_norm": 2.2035133838653564, "learning_rate": 5.5649717514124294e-06, "loss": 2.0346, "step": 3146 }, { "epoch": 2.6669491525423727, "grad_norm": 1.886523723602295, "learning_rate": 5.550847457627119e-06, "loss": 2.1007, "step": 3147 }, { "epoch": 2.6677966101694914, "grad_norm": 2.027946949005127, "learning_rate": 5.536723163841808e-06, "loss": 2.0433, "step": 3148 }, { "epoch": 2.66864406779661, "grad_norm": 1.673337459564209, "learning_rate": 5.522598870056498e-06, "loss": 2.2676, "step": 3149 }, { "epoch": 2.669491525423729, "grad_norm": 1.5989969968795776, "learning_rate": 5.508474576271187e-06, "loss": 2.0769, "step": 3150 }, { "epoch": 2.6703389830508475, "grad_norm": 1.9226703643798828, "learning_rate": 5.494350282485876e-06, "loss": 2.0417, "step": 3151 }, { "epoch": 2.671186440677966, "grad_norm": 1.5872721672058105, "learning_rate": 5.480225988700565e-06, "loss": 2.3024, "step": 3152 }, { "epoch": 2.672033898305085, "grad_norm": 1.6903926134109497, "learning_rate": 5.466101694915254e-06, "loss": 2.015, "step": 3153 }, { "epoch": 2.6728813559322036, "grad_norm": 1.5938266515731812, "learning_rate": 5.4519774011299435e-06, "loss": 1.9506, "step": 3154 }, { "epoch": 2.673728813559322, "grad_norm": 1.930178165435791, "learning_rate": 5.437853107344633e-06, "loss": 2.0749, "step": 3155 }, { "epoch": 2.6745762711864405, "grad_norm": 1.697782039642334, "learning_rate": 5.423728813559322e-06, "loss": 1.8642, "step": 3156 }, { "epoch": 2.675423728813559, "grad_norm": 2.4836606979370117, "learning_rate": 5.409604519774012e-06, "loss": 1.6492, "step": 3157 }, { "epoch": 2.676271186440678, "grad_norm": 1.9653449058532715, "learning_rate": 5.395480225988701e-06, "loss": 1.9011, "step": 3158 }, { "epoch": 2.6771186440677965, "grad_norm": 1.3556913137435913, "learning_rate": 5.38135593220339e-06, "loss": 2.4513, "step": 3159 }, { "epoch": 2.6779661016949152, "grad_norm": 1.405932068824768, "learning_rate": 5.367231638418079e-06, "loss": 2.3086, "step": 3160 }, { "epoch": 2.678813559322034, "grad_norm": 1.8774460554122925, "learning_rate": 5.353107344632768e-06, "loss": 2.0499, "step": 3161 }, { "epoch": 2.6796610169491526, "grad_norm": 1.4726383686065674, "learning_rate": 5.3389830508474575e-06, "loss": 2.2896, "step": 3162 }, { "epoch": 2.6805084745762713, "grad_norm": 2.240968704223633, "learning_rate": 5.324858757062147e-06, "loss": 1.809, "step": 3163 }, { "epoch": 2.68135593220339, "grad_norm": 1.4824795722961426, "learning_rate": 5.310734463276836e-06, "loss": 2.391, "step": 3164 }, { "epoch": 2.6822033898305087, "grad_norm": 1.6574009656906128, "learning_rate": 5.296610169491526e-06, "loss": 2.0352, "step": 3165 }, { "epoch": 2.6830508474576273, "grad_norm": 1.8091789484024048, "learning_rate": 5.282485875706215e-06, "loss": 2.1136, "step": 3166 }, { "epoch": 2.683898305084746, "grad_norm": 1.7391529083251953, "learning_rate": 5.268361581920904e-06, "loss": 2.1191, "step": 3167 }, { "epoch": 2.6847457627118643, "grad_norm": 1.2865487337112427, "learning_rate": 5.254237288135593e-06, "loss": 2.4338, "step": 3168 }, { "epoch": 2.685593220338983, "grad_norm": 1.9282454252243042, "learning_rate": 5.240112994350282e-06, "loss": 2.0678, "step": 3169 }, { "epoch": 2.6864406779661016, "grad_norm": 1.9156838655471802, "learning_rate": 5.2259887005649715e-06, "loss": 2.0578, "step": 3170 }, { "epoch": 2.6872881355932203, "grad_norm": 1.5116440057754517, "learning_rate": 5.211864406779661e-06, "loss": 2.1673, "step": 3171 }, { "epoch": 2.688135593220339, "grad_norm": 1.5679614543914795, "learning_rate": 5.197740112994351e-06, "loss": 2.2779, "step": 3172 }, { "epoch": 2.6889830508474577, "grad_norm": 1.3827556371688843, "learning_rate": 5.18361581920904e-06, "loss": 2.4234, "step": 3173 }, { "epoch": 2.6898305084745764, "grad_norm": 2.056248426437378, "learning_rate": 5.169491525423729e-06, "loss": 1.9765, "step": 3174 }, { "epoch": 2.690677966101695, "grad_norm": 1.9912073612213135, "learning_rate": 5.155367231638418e-06, "loss": 1.8635, "step": 3175 }, { "epoch": 2.6915254237288133, "grad_norm": 2.0789661407470703, "learning_rate": 5.141242937853107e-06, "loss": 1.8654, "step": 3176 }, { "epoch": 2.692372881355932, "grad_norm": 1.617958903312683, "learning_rate": 5.127118644067796e-06, "loss": 2.3341, "step": 3177 }, { "epoch": 2.6932203389830507, "grad_norm": 1.8623449802398682, "learning_rate": 5.112994350282486e-06, "loss": 2.0931, "step": 3178 }, { "epoch": 2.6940677966101694, "grad_norm": 1.7776962518692017, "learning_rate": 5.0988700564971755e-06, "loss": 1.9738, "step": 3179 }, { "epoch": 2.694915254237288, "grad_norm": 1.5862172842025757, "learning_rate": 5.084745762711865e-06, "loss": 2.1484, "step": 3180 }, { "epoch": 2.6957627118644067, "grad_norm": 1.8234288692474365, "learning_rate": 5.070621468926554e-06, "loss": 1.9512, "step": 3181 }, { "epoch": 2.6966101694915254, "grad_norm": 1.7137696743011475, "learning_rate": 5.056497175141244e-06, "loss": 2.1188, "step": 3182 }, { "epoch": 2.697457627118644, "grad_norm": 1.5230895280838013, "learning_rate": 5.042372881355933e-06, "loss": 2.3191, "step": 3183 }, { "epoch": 2.698305084745763, "grad_norm": 1.7367206811904907, "learning_rate": 5.028248587570622e-06, "loss": 2.0591, "step": 3184 }, { "epoch": 2.6991525423728815, "grad_norm": 1.7793805599212646, "learning_rate": 5.014124293785311e-06, "loss": 2.2335, "step": 3185 }, { "epoch": 2.7, "grad_norm": 1.5189614295959473, "learning_rate": 5e-06, "loss": 2.4363, "step": 3186 }, { "epoch": 2.700847457627119, "grad_norm": 1.3897454738616943, "learning_rate": 4.9858757062146896e-06, "loss": 2.4523, "step": 3187 }, { "epoch": 2.7016949152542376, "grad_norm": 1.6624294519424438, "learning_rate": 4.971751412429379e-06, "loss": 1.985, "step": 3188 }, { "epoch": 2.702542372881356, "grad_norm": 1.8261654376983643, "learning_rate": 4.957627118644069e-06, "loss": 2.2007, "step": 3189 }, { "epoch": 2.7033898305084745, "grad_norm": 1.8060328960418701, "learning_rate": 4.943502824858758e-06, "loss": 2.1156, "step": 3190 }, { "epoch": 2.704237288135593, "grad_norm": 1.6717157363891602, "learning_rate": 4.929378531073447e-06, "loss": 2.1968, "step": 3191 }, { "epoch": 2.705084745762712, "grad_norm": 1.9698718786239624, "learning_rate": 4.915254237288136e-06, "loss": 1.9488, "step": 3192 }, { "epoch": 2.7059322033898305, "grad_norm": 1.6100242137908936, "learning_rate": 4.901129943502825e-06, "loss": 2.2336, "step": 3193 }, { "epoch": 2.7067796610169492, "grad_norm": 1.6457633972167969, "learning_rate": 4.8870056497175144e-06, "loss": 2.0917, "step": 3194 }, { "epoch": 2.707627118644068, "grad_norm": 1.5893149375915527, "learning_rate": 4.872881355932204e-06, "loss": 2.3023, "step": 3195 }, { "epoch": 2.7084745762711866, "grad_norm": 1.736437201499939, "learning_rate": 4.858757062146893e-06, "loss": 2.226, "step": 3196 }, { "epoch": 2.709322033898305, "grad_norm": 2.0657811164855957, "learning_rate": 4.844632768361583e-06, "loss": 2.1206, "step": 3197 }, { "epoch": 2.7101694915254235, "grad_norm": 1.9966922998428345, "learning_rate": 4.830508474576272e-06, "loss": 1.9236, "step": 3198 }, { "epoch": 2.711016949152542, "grad_norm": 1.7514450550079346, "learning_rate": 4.816384180790961e-06, "loss": 2.1602, "step": 3199 }, { "epoch": 2.711864406779661, "grad_norm": 1.5922361612319946, "learning_rate": 4.80225988700565e-06, "loss": 2.2008, "step": 3200 }, { "epoch": 2.7127118644067796, "grad_norm": 1.90059494972229, "learning_rate": 4.788135593220339e-06, "loss": 1.925, "step": 3201 }, { "epoch": 2.7135593220338983, "grad_norm": 1.9019501209259033, "learning_rate": 4.7740112994350285e-06, "loss": 1.959, "step": 3202 }, { "epoch": 2.714406779661017, "grad_norm": 1.727725863456726, "learning_rate": 4.759887005649718e-06, "loss": 2.4346, "step": 3203 }, { "epoch": 2.7152542372881356, "grad_norm": 1.9527908563613892, "learning_rate": 4.745762711864407e-06, "loss": 1.798, "step": 3204 }, { "epoch": 2.7161016949152543, "grad_norm": 1.6068388223648071, "learning_rate": 4.731638418079097e-06, "loss": 2.1341, "step": 3205 }, { "epoch": 2.716949152542373, "grad_norm": 1.9126149415969849, "learning_rate": 4.717514124293786e-06, "loss": 1.899, "step": 3206 }, { "epoch": 2.7177966101694917, "grad_norm": 1.710034966468811, "learning_rate": 4.703389830508475e-06, "loss": 2.0316, "step": 3207 }, { "epoch": 2.7186440677966104, "grad_norm": 1.5138301849365234, "learning_rate": 4.689265536723164e-06, "loss": 2.3638, "step": 3208 }, { "epoch": 2.719491525423729, "grad_norm": 2.125113010406494, "learning_rate": 4.675141242937853e-06, "loss": 1.8224, "step": 3209 }, { "epoch": 2.7203389830508473, "grad_norm": 1.3093822002410889, "learning_rate": 4.6610169491525425e-06, "loss": 2.4209, "step": 3210 }, { "epoch": 2.721186440677966, "grad_norm": 1.7390575408935547, "learning_rate": 4.646892655367232e-06, "loss": 2.1579, "step": 3211 }, { "epoch": 2.7220338983050847, "grad_norm": 1.7444908618927002, "learning_rate": 4.632768361581921e-06, "loss": 2.22, "step": 3212 }, { "epoch": 2.7228813559322034, "grad_norm": 1.6553432941436768, "learning_rate": 4.618644067796611e-06, "loss": 2.0794, "step": 3213 }, { "epoch": 2.723728813559322, "grad_norm": 1.4758368730545044, "learning_rate": 4.6045197740113e-06, "loss": 2.2501, "step": 3214 }, { "epoch": 2.7245762711864407, "grad_norm": 1.5900572538375854, "learning_rate": 4.590395480225989e-06, "loss": 2.1464, "step": 3215 }, { "epoch": 2.7254237288135594, "grad_norm": 2.284975290298462, "learning_rate": 4.576271186440678e-06, "loss": 1.6978, "step": 3216 }, { "epoch": 2.726271186440678, "grad_norm": 1.8749831914901733, "learning_rate": 4.562146892655367e-06, "loss": 2.1577, "step": 3217 }, { "epoch": 2.7271186440677964, "grad_norm": 1.7108556032180786, "learning_rate": 4.5480225988700565e-06, "loss": 2.1418, "step": 3218 }, { "epoch": 2.727966101694915, "grad_norm": 1.5814942121505737, "learning_rate": 4.533898305084746e-06, "loss": 2.2746, "step": 3219 }, { "epoch": 2.7288135593220337, "grad_norm": 1.5617300271987915, "learning_rate": 4.519774011299436e-06, "loss": 2.4608, "step": 3220 }, { "epoch": 2.7296610169491524, "grad_norm": 1.6508846282958984, "learning_rate": 4.505649717514125e-06, "loss": 2.21, "step": 3221 }, { "epoch": 2.730508474576271, "grad_norm": 1.6110411882400513, "learning_rate": 4.491525423728814e-06, "loss": 2.3215, "step": 3222 }, { "epoch": 2.73135593220339, "grad_norm": 1.7519724369049072, "learning_rate": 4.477401129943503e-06, "loss": 2.2066, "step": 3223 }, { "epoch": 2.7322033898305085, "grad_norm": 1.8455437421798706, "learning_rate": 4.463276836158192e-06, "loss": 2.2371, "step": 3224 }, { "epoch": 2.733050847457627, "grad_norm": 2.0526866912841797, "learning_rate": 4.449152542372881e-06, "loss": 1.9398, "step": 3225 }, { "epoch": 2.733898305084746, "grad_norm": 1.8807004690170288, "learning_rate": 4.4350282485875705e-06, "loss": 2.1996, "step": 3226 }, { "epoch": 2.7347457627118645, "grad_norm": 1.7172704935073853, "learning_rate": 4.42090395480226e-06, "loss": 1.9544, "step": 3227 }, { "epoch": 2.7355932203389832, "grad_norm": 2.149353265762329, "learning_rate": 4.40677966101695e-06, "loss": 1.5812, "step": 3228 }, { "epoch": 2.736440677966102, "grad_norm": 1.7039453983306885, "learning_rate": 4.392655367231639e-06, "loss": 2.0823, "step": 3229 }, { "epoch": 2.7372881355932206, "grad_norm": 1.71449875831604, "learning_rate": 4.378531073446328e-06, "loss": 2.23, "step": 3230 }, { "epoch": 2.738135593220339, "grad_norm": 1.9371012449264526, "learning_rate": 4.364406779661017e-06, "loss": 1.8363, "step": 3231 }, { "epoch": 2.7389830508474575, "grad_norm": 1.851549506187439, "learning_rate": 4.350282485875706e-06, "loss": 2.0805, "step": 3232 }, { "epoch": 2.739830508474576, "grad_norm": 1.7192928791046143, "learning_rate": 4.336158192090395e-06, "loss": 2.1875, "step": 3233 }, { "epoch": 2.740677966101695, "grad_norm": 1.9296584129333496, "learning_rate": 4.3220338983050846e-06, "loss": 1.938, "step": 3234 }, { "epoch": 2.7415254237288136, "grad_norm": 1.6388558149337769, "learning_rate": 4.307909604519774e-06, "loss": 2.335, "step": 3235 }, { "epoch": 2.7423728813559323, "grad_norm": 1.5845305919647217, "learning_rate": 4.293785310734464e-06, "loss": 2.19, "step": 3236 }, { "epoch": 2.743220338983051, "grad_norm": 2.0693283081054688, "learning_rate": 4.279661016949153e-06, "loss": 1.8567, "step": 3237 }, { "epoch": 2.7440677966101696, "grad_norm": 1.8741554021835327, "learning_rate": 4.265536723163842e-06, "loss": 2.0643, "step": 3238 }, { "epoch": 2.744915254237288, "grad_norm": 1.7262877225875854, "learning_rate": 4.251412429378531e-06, "loss": 2.2332, "step": 3239 }, { "epoch": 2.7457627118644066, "grad_norm": 1.424884557723999, "learning_rate": 4.23728813559322e-06, "loss": 2.2643, "step": 3240 }, { "epoch": 2.7466101694915253, "grad_norm": 1.7748403549194336, "learning_rate": 4.2231638418079094e-06, "loss": 1.8507, "step": 3241 }, { "epoch": 2.747457627118644, "grad_norm": 2.2172834873199463, "learning_rate": 4.209039548022599e-06, "loss": 1.6501, "step": 3242 }, { "epoch": 2.7483050847457626, "grad_norm": 1.8971115350723267, "learning_rate": 4.194915254237288e-06, "loss": 1.7921, "step": 3243 }, { "epoch": 2.7491525423728813, "grad_norm": 2.0673277378082275, "learning_rate": 4.180790960451978e-06, "loss": 1.891, "step": 3244 }, { "epoch": 2.75, "grad_norm": 1.614029049873352, "learning_rate": 4.166666666666667e-06, "loss": 2.3923, "step": 3245 }, { "epoch": 2.7508474576271187, "grad_norm": 1.8733458518981934, "learning_rate": 4.152542372881356e-06, "loss": 1.9396, "step": 3246 }, { "epoch": 2.7516949152542374, "grad_norm": 1.8691768646240234, "learning_rate": 4.138418079096045e-06, "loss": 1.9589, "step": 3247 }, { "epoch": 2.752542372881356, "grad_norm": 1.7177436351776123, "learning_rate": 4.124293785310734e-06, "loss": 2.2102, "step": 3248 }, { "epoch": 2.7533898305084747, "grad_norm": 1.5605498552322388, "learning_rate": 4.1101694915254235e-06, "loss": 2.2768, "step": 3249 }, { "epoch": 2.7542372881355934, "grad_norm": 1.9510782957077026, "learning_rate": 4.096045197740113e-06, "loss": 2.0669, "step": 3250 }, { "epoch": 2.755084745762712, "grad_norm": 1.723276138305664, "learning_rate": 4.081920903954802e-06, "loss": 1.8774, "step": 3251 }, { "epoch": 2.7559322033898304, "grad_norm": 1.8737914562225342, "learning_rate": 4.067796610169492e-06, "loss": 1.9715, "step": 3252 }, { "epoch": 2.756779661016949, "grad_norm": 1.9561488628387451, "learning_rate": 4.053672316384181e-06, "loss": 2.0171, "step": 3253 }, { "epoch": 2.7576271186440677, "grad_norm": 1.9660571813583374, "learning_rate": 4.03954802259887e-06, "loss": 2.0788, "step": 3254 }, { "epoch": 2.7584745762711864, "grad_norm": 1.5801583528518677, "learning_rate": 4.025423728813559e-06, "loss": 2.2551, "step": 3255 }, { "epoch": 2.759322033898305, "grad_norm": 1.958560585975647, "learning_rate": 4.011299435028248e-06, "loss": 2.0017, "step": 3256 }, { "epoch": 2.760169491525424, "grad_norm": 2.0370123386383057, "learning_rate": 3.9971751412429375e-06, "loss": 1.9183, "step": 3257 }, { "epoch": 2.7610169491525425, "grad_norm": 1.924447774887085, "learning_rate": 3.983050847457627e-06, "loss": 1.9518, "step": 3258 }, { "epoch": 2.761864406779661, "grad_norm": 1.909886121749878, "learning_rate": 3.968926553672317e-06, "loss": 1.9164, "step": 3259 }, { "epoch": 2.7627118644067794, "grad_norm": 1.5260083675384521, "learning_rate": 3.954802259887006e-06, "loss": 2.2727, "step": 3260 }, { "epoch": 2.763559322033898, "grad_norm": 1.8427681922912598, "learning_rate": 3.940677966101696e-06, "loss": 1.8079, "step": 3261 }, { "epoch": 2.7644067796610168, "grad_norm": 1.7439347505569458, "learning_rate": 3.926553672316385e-06, "loss": 2.2202, "step": 3262 }, { "epoch": 2.7652542372881355, "grad_norm": 1.720460295677185, "learning_rate": 3.912429378531074e-06, "loss": 2.0204, "step": 3263 }, { "epoch": 2.766101694915254, "grad_norm": 1.6500192880630493, "learning_rate": 3.898305084745763e-06, "loss": 2.5506, "step": 3264 }, { "epoch": 2.766949152542373, "grad_norm": 1.5738345384597778, "learning_rate": 3.884180790960452e-06, "loss": 2.217, "step": 3265 }, { "epoch": 2.7677966101694915, "grad_norm": 1.842882752418518, "learning_rate": 3.8700564971751415e-06, "loss": 2.0424, "step": 3266 }, { "epoch": 2.76864406779661, "grad_norm": 1.4322773218154907, "learning_rate": 3.855932203389831e-06, "loss": 2.4257, "step": 3267 }, { "epoch": 2.769491525423729, "grad_norm": 1.6227883100509644, "learning_rate": 3.841807909604521e-06, "loss": 2.3145, "step": 3268 }, { "epoch": 2.7703389830508476, "grad_norm": 1.8622524738311768, "learning_rate": 3.82768361581921e-06, "loss": 2.1128, "step": 3269 }, { "epoch": 2.7711864406779663, "grad_norm": 1.8271149396896362, "learning_rate": 3.813559322033899e-06, "loss": 2.1978, "step": 3270 }, { "epoch": 2.772033898305085, "grad_norm": 1.6806551218032837, "learning_rate": 3.799435028248588e-06, "loss": 2.25, "step": 3271 }, { "epoch": 2.7728813559322036, "grad_norm": 2.082566976547241, "learning_rate": 3.7853107344632772e-06, "loss": 1.7762, "step": 3272 }, { "epoch": 2.773728813559322, "grad_norm": 2.108517646789551, "learning_rate": 3.7711864406779664e-06, "loss": 1.7451, "step": 3273 }, { "epoch": 2.7745762711864406, "grad_norm": 1.8154159784317017, "learning_rate": 3.757062146892656e-06, "loss": 2.2876, "step": 3274 }, { "epoch": 2.7754237288135593, "grad_norm": 1.8693562746047974, "learning_rate": 3.742937853107345e-06, "loss": 1.9695, "step": 3275 }, { "epoch": 2.776271186440678, "grad_norm": 1.78440523147583, "learning_rate": 3.7288135593220342e-06, "loss": 2.0366, "step": 3276 }, { "epoch": 2.7771186440677966, "grad_norm": 1.734572410583496, "learning_rate": 3.7146892655367234e-06, "loss": 2.0626, "step": 3277 }, { "epoch": 2.7779661016949153, "grad_norm": 2.122291326522827, "learning_rate": 3.700564971751413e-06, "loss": 1.8098, "step": 3278 }, { "epoch": 2.778813559322034, "grad_norm": 1.1030203104019165, "learning_rate": 3.686440677966102e-06, "loss": 2.5436, "step": 3279 }, { "epoch": 2.7796610169491527, "grad_norm": 1.6981253623962402, "learning_rate": 3.6723163841807913e-06, "loss": 2.0083, "step": 3280 }, { "epoch": 2.780508474576271, "grad_norm": 1.5894949436187744, "learning_rate": 3.6581920903954804e-06, "loss": 2.238, "step": 3281 }, { "epoch": 2.7813559322033896, "grad_norm": 1.549978256225586, "learning_rate": 3.64406779661017e-06, "loss": 2.2281, "step": 3282 }, { "epoch": 2.7822033898305083, "grad_norm": 1.5143518447875977, "learning_rate": 3.629943502824859e-06, "loss": 2.3051, "step": 3283 }, { "epoch": 2.783050847457627, "grad_norm": 1.7828015089035034, "learning_rate": 3.6158192090395483e-06, "loss": 2.1792, "step": 3284 }, { "epoch": 2.7838983050847457, "grad_norm": 1.6845375299453735, "learning_rate": 3.6016949152542374e-06, "loss": 2.1513, "step": 3285 }, { "epoch": 2.7847457627118644, "grad_norm": 1.6669672727584839, "learning_rate": 3.587570621468927e-06, "loss": 2.2889, "step": 3286 }, { "epoch": 2.785593220338983, "grad_norm": 1.5005353689193726, "learning_rate": 3.573446327683616e-06, "loss": 2.3482, "step": 3287 }, { "epoch": 2.7864406779661017, "grad_norm": 1.8809428215026855, "learning_rate": 3.5593220338983053e-06, "loss": 1.9747, "step": 3288 }, { "epoch": 2.7872881355932204, "grad_norm": 1.5482172966003418, "learning_rate": 3.5451977401129944e-06, "loss": 2.5255, "step": 3289 }, { "epoch": 2.788135593220339, "grad_norm": 1.8034367561340332, "learning_rate": 3.531073446327684e-06, "loss": 2.1528, "step": 3290 }, { "epoch": 2.788983050847458, "grad_norm": 1.7874549627304077, "learning_rate": 3.516949152542373e-06, "loss": 2.2353, "step": 3291 }, { "epoch": 2.7898305084745765, "grad_norm": 1.96170175075531, "learning_rate": 3.5028248587570623e-06, "loss": 1.8544, "step": 3292 }, { "epoch": 2.790677966101695, "grad_norm": 1.7292556762695312, "learning_rate": 3.4887005649717514e-06, "loss": 2.2137, "step": 3293 }, { "epoch": 2.7915254237288134, "grad_norm": 1.6710186004638672, "learning_rate": 3.474576271186441e-06, "loss": 2.413, "step": 3294 }, { "epoch": 2.792372881355932, "grad_norm": 1.7541210651397705, "learning_rate": 3.46045197740113e-06, "loss": 2.1582, "step": 3295 }, { "epoch": 2.7932203389830508, "grad_norm": 1.61122727394104, "learning_rate": 3.4463276836158193e-06, "loss": 2.1661, "step": 3296 }, { "epoch": 2.7940677966101695, "grad_norm": 1.8008424043655396, "learning_rate": 3.4322033898305084e-06, "loss": 2.1737, "step": 3297 }, { "epoch": 2.794915254237288, "grad_norm": 1.6333184242248535, "learning_rate": 3.418079096045198e-06, "loss": 1.9677, "step": 3298 }, { "epoch": 2.795762711864407, "grad_norm": 1.569275975227356, "learning_rate": 3.403954802259887e-06, "loss": 2.3139, "step": 3299 }, { "epoch": 2.7966101694915255, "grad_norm": 1.9034026861190796, "learning_rate": 3.3898305084745763e-06, "loss": 1.8646, "step": 3300 }, { "epoch": 2.797457627118644, "grad_norm": 1.9896222352981567, "learning_rate": 3.375706214689266e-06, "loss": 1.6079, "step": 3301 }, { "epoch": 2.7983050847457624, "grad_norm": 1.4076114892959595, "learning_rate": 3.361581920903955e-06, "loss": 2.3119, "step": 3302 }, { "epoch": 2.799152542372881, "grad_norm": 1.6879991292953491, "learning_rate": 3.347457627118644e-06, "loss": 2.2824, "step": 3303 }, { "epoch": 2.8, "grad_norm": 1.23570716381073, "learning_rate": 3.3333333333333333e-06, "loss": 2.5017, "step": 3304 }, { "epoch": 2.8008474576271185, "grad_norm": 1.734196662902832, "learning_rate": 3.319209039548023e-06, "loss": 2.0933, "step": 3305 }, { "epoch": 2.801694915254237, "grad_norm": 1.900134563446045, "learning_rate": 3.305084745762712e-06, "loss": 2.155, "step": 3306 }, { "epoch": 2.802542372881356, "grad_norm": 2.1223700046539307, "learning_rate": 3.290960451977401e-06, "loss": 1.8647, "step": 3307 }, { "epoch": 2.8033898305084746, "grad_norm": 1.3897490501403809, "learning_rate": 3.2768361581920903e-06, "loss": 2.425, "step": 3308 }, { "epoch": 2.8042372881355933, "grad_norm": 1.7049990892410278, "learning_rate": 3.26271186440678e-06, "loss": 1.8893, "step": 3309 }, { "epoch": 2.805084745762712, "grad_norm": 1.7432835102081299, "learning_rate": 3.248587570621469e-06, "loss": 2.0095, "step": 3310 }, { "epoch": 2.8059322033898306, "grad_norm": 1.9104880094528198, "learning_rate": 3.234463276836158e-06, "loss": 1.9359, "step": 3311 }, { "epoch": 2.8067796610169493, "grad_norm": 1.3656202554702759, "learning_rate": 3.2203389830508473e-06, "loss": 2.3345, "step": 3312 }, { "epoch": 2.807627118644068, "grad_norm": 1.4636744260787964, "learning_rate": 3.206214689265537e-06, "loss": 2.2708, "step": 3313 }, { "epoch": 2.8084745762711867, "grad_norm": 1.8310048580169678, "learning_rate": 3.192090395480226e-06, "loss": 2.1519, "step": 3314 }, { "epoch": 2.809322033898305, "grad_norm": 1.9034868478775024, "learning_rate": 3.1779661016949152e-06, "loss": 2.0292, "step": 3315 }, { "epoch": 2.8101694915254236, "grad_norm": 1.5066553354263306, "learning_rate": 3.1638418079096044e-06, "loss": 2.4282, "step": 3316 }, { "epoch": 2.8110169491525423, "grad_norm": 1.8293631076812744, "learning_rate": 3.149717514124294e-06, "loss": 1.9231, "step": 3317 }, { "epoch": 2.811864406779661, "grad_norm": 1.886911153793335, "learning_rate": 3.135593220338983e-06, "loss": 2.0372, "step": 3318 }, { "epoch": 2.8127118644067797, "grad_norm": 1.9897105693817139, "learning_rate": 3.1214689265536726e-06, "loss": 2.034, "step": 3319 }, { "epoch": 2.8135593220338984, "grad_norm": 1.4622457027435303, "learning_rate": 3.107344632768362e-06, "loss": 2.3314, "step": 3320 }, { "epoch": 2.814406779661017, "grad_norm": 1.5068597793579102, "learning_rate": 3.093220338983051e-06, "loss": 2.3699, "step": 3321 }, { "epoch": 2.8152542372881357, "grad_norm": 1.9084815979003906, "learning_rate": 3.0790960451977405e-06, "loss": 1.9478, "step": 3322 }, { "epoch": 2.816101694915254, "grad_norm": 1.5619968175888062, "learning_rate": 3.0649717514124297e-06, "loss": 2.1414, "step": 3323 }, { "epoch": 2.8169491525423727, "grad_norm": 1.6689423322677612, "learning_rate": 3.050847457627119e-06, "loss": 2.2752, "step": 3324 }, { "epoch": 2.8177966101694913, "grad_norm": 1.325331449508667, "learning_rate": 3.0367231638418084e-06, "loss": 2.4943, "step": 3325 }, { "epoch": 2.81864406779661, "grad_norm": 1.5005288124084473, "learning_rate": 3.0225988700564975e-06, "loss": 2.3628, "step": 3326 }, { "epoch": 2.8194915254237287, "grad_norm": 1.8637874126434326, "learning_rate": 3.0084745762711867e-06, "loss": 2.0922, "step": 3327 }, { "epoch": 2.8203389830508474, "grad_norm": 2.4057538509368896, "learning_rate": 2.994350282485876e-06, "loss": 1.6564, "step": 3328 }, { "epoch": 2.821186440677966, "grad_norm": 2.0353689193725586, "learning_rate": 2.9802259887005654e-06, "loss": 1.9043, "step": 3329 }, { "epoch": 2.8220338983050848, "grad_norm": 1.7443592548370361, "learning_rate": 2.9661016949152545e-06, "loss": 2.1515, "step": 3330 }, { "epoch": 2.8228813559322035, "grad_norm": 1.5564725399017334, "learning_rate": 2.9519774011299437e-06, "loss": 2.3661, "step": 3331 }, { "epoch": 2.823728813559322, "grad_norm": 1.7410897016525269, "learning_rate": 2.937853107344633e-06, "loss": 2.1678, "step": 3332 }, { "epoch": 2.824576271186441, "grad_norm": 1.715358018875122, "learning_rate": 2.9237288135593224e-06, "loss": 2.0724, "step": 3333 }, { "epoch": 2.8254237288135595, "grad_norm": 2.0631182193756104, "learning_rate": 2.9096045197740115e-06, "loss": 1.8538, "step": 3334 }, { "epoch": 2.826271186440678, "grad_norm": 1.677146077156067, "learning_rate": 2.8954802259887007e-06, "loss": 2.1019, "step": 3335 }, { "epoch": 2.8271186440677964, "grad_norm": 2.1181676387786865, "learning_rate": 2.88135593220339e-06, "loss": 2.0057, "step": 3336 }, { "epoch": 2.827966101694915, "grad_norm": 1.9416948556900024, "learning_rate": 2.8672316384180794e-06, "loss": 1.775, "step": 3337 }, { "epoch": 2.828813559322034, "grad_norm": 1.7427901029586792, "learning_rate": 2.8531073446327686e-06, "loss": 2.0498, "step": 3338 }, { "epoch": 2.8296610169491525, "grad_norm": 1.6878767013549805, "learning_rate": 2.8389830508474577e-06, "loss": 2.1399, "step": 3339 }, { "epoch": 2.830508474576271, "grad_norm": 1.4075371026992798, "learning_rate": 2.824858757062147e-06, "loss": 2.2619, "step": 3340 }, { "epoch": 2.83135593220339, "grad_norm": 1.383636236190796, "learning_rate": 2.8107344632768364e-06, "loss": 2.3308, "step": 3341 }, { "epoch": 2.8322033898305086, "grad_norm": 1.1737382411956787, "learning_rate": 2.7966101694915256e-06, "loss": 2.4857, "step": 3342 }, { "epoch": 2.8330508474576273, "grad_norm": 1.6975308656692505, "learning_rate": 2.7824858757062147e-06, "loss": 2.1746, "step": 3343 }, { "epoch": 2.8338983050847455, "grad_norm": 1.8578035831451416, "learning_rate": 2.768361581920904e-06, "loss": 2.061, "step": 3344 }, { "epoch": 2.834745762711864, "grad_norm": 1.2902839183807373, "learning_rate": 2.7542372881355934e-06, "loss": 2.4812, "step": 3345 }, { "epoch": 2.835593220338983, "grad_norm": 1.8486239910125732, "learning_rate": 2.7401129943502826e-06, "loss": 2.0483, "step": 3346 }, { "epoch": 2.8364406779661016, "grad_norm": 1.5760711431503296, "learning_rate": 2.7259887005649717e-06, "loss": 2.201, "step": 3347 }, { "epoch": 2.8372881355932202, "grad_norm": 1.4381455183029175, "learning_rate": 2.711864406779661e-06, "loss": 2.2938, "step": 3348 }, { "epoch": 2.838135593220339, "grad_norm": 2.1512436866760254, "learning_rate": 2.6977401129943505e-06, "loss": 1.9279, "step": 3349 }, { "epoch": 2.8389830508474576, "grad_norm": 1.9421781301498413, "learning_rate": 2.6836158192090396e-06, "loss": 2.1418, "step": 3350 }, { "epoch": 2.8398305084745763, "grad_norm": 1.4164178371429443, "learning_rate": 2.6694915254237287e-06, "loss": 2.3653, "step": 3351 }, { "epoch": 2.840677966101695, "grad_norm": 1.6631990671157837, "learning_rate": 2.655367231638418e-06, "loss": 2.1167, "step": 3352 }, { "epoch": 2.8415254237288137, "grad_norm": 1.8184740543365479, "learning_rate": 2.6412429378531075e-06, "loss": 2.0879, "step": 3353 }, { "epoch": 2.8423728813559324, "grad_norm": 2.1014161109924316, "learning_rate": 2.6271186440677966e-06, "loss": 1.8246, "step": 3354 }, { "epoch": 2.843220338983051, "grad_norm": 1.7352596521377563, "learning_rate": 2.6129943502824858e-06, "loss": 2.1501, "step": 3355 }, { "epoch": 2.8440677966101697, "grad_norm": 1.4966508150100708, "learning_rate": 2.5988700564971753e-06, "loss": 2.3759, "step": 3356 }, { "epoch": 2.844915254237288, "grad_norm": 1.745072364807129, "learning_rate": 2.5847457627118645e-06, "loss": 2.1665, "step": 3357 }, { "epoch": 2.8457627118644067, "grad_norm": 1.8252897262573242, "learning_rate": 2.5706214689265536e-06, "loss": 2.1921, "step": 3358 }, { "epoch": 2.8466101694915253, "grad_norm": 1.5177760124206543, "learning_rate": 2.556497175141243e-06, "loss": 1.9854, "step": 3359 }, { "epoch": 2.847457627118644, "grad_norm": 1.8378403186798096, "learning_rate": 2.5423728813559323e-06, "loss": 2.0397, "step": 3360 }, { "epoch": 2.8483050847457627, "grad_norm": 1.78495192527771, "learning_rate": 2.528248587570622e-06, "loss": 2.1185, "step": 3361 }, { "epoch": 2.8491525423728814, "grad_norm": 1.9980145692825317, "learning_rate": 2.514124293785311e-06, "loss": 1.9541, "step": 3362 }, { "epoch": 2.85, "grad_norm": 2.014122486114502, "learning_rate": 2.5e-06, "loss": 1.9235, "step": 3363 }, { "epoch": 2.8508474576271188, "grad_norm": 2.524773597717285, "learning_rate": 2.4858757062146894e-06, "loss": 1.8301, "step": 3364 }, { "epoch": 2.851694915254237, "grad_norm": 2.225480556488037, "learning_rate": 2.471751412429379e-06, "loss": 1.8999, "step": 3365 }, { "epoch": 2.8525423728813557, "grad_norm": 1.6353254318237305, "learning_rate": 2.457627118644068e-06, "loss": 2.1385, "step": 3366 }, { "epoch": 2.8533898305084744, "grad_norm": 1.9169930219650269, "learning_rate": 2.4435028248587572e-06, "loss": 1.8262, "step": 3367 }, { "epoch": 2.854237288135593, "grad_norm": 1.9980542659759521, "learning_rate": 2.4293785310734464e-06, "loss": 2.0419, "step": 3368 }, { "epoch": 2.8550847457627118, "grad_norm": 2.3818628787994385, "learning_rate": 2.415254237288136e-06, "loss": 1.9207, "step": 3369 }, { "epoch": 2.8559322033898304, "grad_norm": 1.5910136699676514, "learning_rate": 2.401129943502825e-06, "loss": 2.1945, "step": 3370 }, { "epoch": 2.856779661016949, "grad_norm": 1.8092448711395264, "learning_rate": 2.3870056497175142e-06, "loss": 1.9613, "step": 3371 }, { "epoch": 2.857627118644068, "grad_norm": 1.812135100364685, "learning_rate": 2.3728813559322034e-06, "loss": 1.8699, "step": 3372 }, { "epoch": 2.8584745762711865, "grad_norm": 1.7698718309402466, "learning_rate": 2.358757062146893e-06, "loss": 1.976, "step": 3373 }, { "epoch": 2.859322033898305, "grad_norm": 1.7062208652496338, "learning_rate": 2.344632768361582e-06, "loss": 2.0149, "step": 3374 }, { "epoch": 2.860169491525424, "grad_norm": 1.8272556066513062, "learning_rate": 2.3305084745762712e-06, "loss": 2.0581, "step": 3375 }, { "epoch": 2.8610169491525426, "grad_norm": 1.8334457874298096, "learning_rate": 2.3163841807909604e-06, "loss": 2.0563, "step": 3376 }, { "epoch": 2.8618644067796613, "grad_norm": 2.0398409366607666, "learning_rate": 2.30225988700565e-06, "loss": 1.87, "step": 3377 }, { "epoch": 2.86271186440678, "grad_norm": 1.729124665260315, "learning_rate": 2.288135593220339e-06, "loss": 2.0625, "step": 3378 }, { "epoch": 2.863559322033898, "grad_norm": 1.687659502029419, "learning_rate": 2.2740112994350283e-06, "loss": 2.1105, "step": 3379 }, { "epoch": 2.864406779661017, "grad_norm": 1.8048326969146729, "learning_rate": 2.259887005649718e-06, "loss": 2.1501, "step": 3380 }, { "epoch": 2.8652542372881356, "grad_norm": 1.6243799924850464, "learning_rate": 2.245762711864407e-06, "loss": 2.2105, "step": 3381 }, { "epoch": 2.8661016949152542, "grad_norm": 1.5646708011627197, "learning_rate": 2.231638418079096e-06, "loss": 2.1571, "step": 3382 }, { "epoch": 2.866949152542373, "grad_norm": 1.6267356872558594, "learning_rate": 2.2175141242937853e-06, "loss": 2.2915, "step": 3383 }, { "epoch": 2.8677966101694916, "grad_norm": 1.7456969022750854, "learning_rate": 2.203389830508475e-06, "loss": 1.9753, "step": 3384 }, { "epoch": 2.8686440677966103, "grad_norm": 1.92332923412323, "learning_rate": 2.189265536723164e-06, "loss": 1.8815, "step": 3385 }, { "epoch": 2.8694915254237285, "grad_norm": 1.9790289402008057, "learning_rate": 2.175141242937853e-06, "loss": 2.0115, "step": 3386 }, { "epoch": 2.8703389830508472, "grad_norm": 2.064730405807495, "learning_rate": 2.1610169491525423e-06, "loss": 1.8712, "step": 3387 }, { "epoch": 2.871186440677966, "grad_norm": 2.201131820678711, "learning_rate": 2.146892655367232e-06, "loss": 1.639, "step": 3388 }, { "epoch": 2.8720338983050846, "grad_norm": 1.7507131099700928, "learning_rate": 2.132768361581921e-06, "loss": 2.2192, "step": 3389 }, { "epoch": 2.8728813559322033, "grad_norm": 1.9839781522750854, "learning_rate": 2.11864406779661e-06, "loss": 1.8978, "step": 3390 }, { "epoch": 2.873728813559322, "grad_norm": 2.417419195175171, "learning_rate": 2.1045197740112993e-06, "loss": 1.7828, "step": 3391 }, { "epoch": 2.8745762711864407, "grad_norm": 1.8423184156417847, "learning_rate": 2.090395480225989e-06, "loss": 2.0322, "step": 3392 }, { "epoch": 2.8754237288135593, "grad_norm": 1.6614102125167847, "learning_rate": 2.076271186440678e-06, "loss": 2.2679, "step": 3393 }, { "epoch": 2.876271186440678, "grad_norm": 1.0237455368041992, "learning_rate": 2.062146892655367e-06, "loss": 2.5817, "step": 3394 }, { "epoch": 2.8771186440677967, "grad_norm": 1.5811541080474854, "learning_rate": 2.0480225988700563e-06, "loss": 2.3303, "step": 3395 }, { "epoch": 2.8779661016949154, "grad_norm": 1.284185528755188, "learning_rate": 2.033898305084746e-06, "loss": 2.3703, "step": 3396 }, { "epoch": 2.878813559322034, "grad_norm": 1.6183527708053589, "learning_rate": 2.019774011299435e-06, "loss": 2.1484, "step": 3397 }, { "epoch": 2.8796610169491528, "grad_norm": 1.596364140510559, "learning_rate": 2.005649717514124e-06, "loss": 2.2813, "step": 3398 }, { "epoch": 2.8805084745762715, "grad_norm": 2.3689513206481934, "learning_rate": 1.9915254237288133e-06, "loss": 1.4751, "step": 3399 }, { "epoch": 2.8813559322033897, "grad_norm": 2.1542131900787354, "learning_rate": 1.977401129943503e-06, "loss": 1.779, "step": 3400 }, { "epoch": 2.8822033898305084, "grad_norm": 1.5996615886688232, "learning_rate": 1.9632768361581925e-06, "loss": 2.4124, "step": 3401 }, { "epoch": 2.883050847457627, "grad_norm": 1.4909052848815918, "learning_rate": 1.9491525423728816e-06, "loss": 2.3465, "step": 3402 }, { "epoch": 2.8838983050847458, "grad_norm": 1.9433684349060059, "learning_rate": 1.9350282485875707e-06, "loss": 1.7697, "step": 3403 }, { "epoch": 2.8847457627118644, "grad_norm": 1.9747514724731445, "learning_rate": 1.9209039548022603e-06, "loss": 1.7446, "step": 3404 }, { "epoch": 2.885593220338983, "grad_norm": 1.4945168495178223, "learning_rate": 1.9067796610169495e-06, "loss": 2.222, "step": 3405 }, { "epoch": 2.886440677966102, "grad_norm": 2.045989751815796, "learning_rate": 1.8926553672316386e-06, "loss": 1.8782, "step": 3406 }, { "epoch": 2.88728813559322, "grad_norm": 1.7009637355804443, "learning_rate": 1.878531073446328e-06, "loss": 1.9011, "step": 3407 }, { "epoch": 2.8881355932203387, "grad_norm": 2.1265628337860107, "learning_rate": 1.8644067796610171e-06, "loss": 1.9526, "step": 3408 }, { "epoch": 2.8889830508474574, "grad_norm": 1.696357250213623, "learning_rate": 1.8502824858757065e-06, "loss": 2.2116, "step": 3409 }, { "epoch": 2.889830508474576, "grad_norm": 2.1881537437438965, "learning_rate": 1.8361581920903956e-06, "loss": 1.7084, "step": 3410 }, { "epoch": 2.890677966101695, "grad_norm": 1.791032075881958, "learning_rate": 1.822033898305085e-06, "loss": 2.0799, "step": 3411 }, { "epoch": 2.8915254237288135, "grad_norm": 1.15604829788208, "learning_rate": 1.8079096045197741e-06, "loss": 2.5477, "step": 3412 }, { "epoch": 2.892372881355932, "grad_norm": 1.626738429069519, "learning_rate": 1.7937853107344635e-06, "loss": 2.4368, "step": 3413 }, { "epoch": 2.893220338983051, "grad_norm": 1.5530431270599365, "learning_rate": 1.7796610169491526e-06, "loss": 2.2614, "step": 3414 }, { "epoch": 2.8940677966101696, "grad_norm": 1.7293918132781982, "learning_rate": 1.765536723163842e-06, "loss": 2.1684, "step": 3415 }, { "epoch": 2.8949152542372882, "grad_norm": 2.015207052230835, "learning_rate": 1.7514124293785311e-06, "loss": 1.8092, "step": 3416 }, { "epoch": 2.895762711864407, "grad_norm": 1.7043472528457642, "learning_rate": 1.7372881355932205e-06, "loss": 2.151, "step": 3417 }, { "epoch": 2.8966101694915256, "grad_norm": 1.7575970888137817, "learning_rate": 1.7231638418079096e-06, "loss": 2.0345, "step": 3418 }, { "epoch": 2.8974576271186443, "grad_norm": 1.8352203369140625, "learning_rate": 1.709039548022599e-06, "loss": 2.0827, "step": 3419 }, { "epoch": 2.898305084745763, "grad_norm": 2.0024681091308594, "learning_rate": 1.6949152542372882e-06, "loss": 1.8409, "step": 3420 }, { "epoch": 2.8991525423728812, "grad_norm": 2.071882963180542, "learning_rate": 1.6807909604519775e-06, "loss": 1.6405, "step": 3421 }, { "epoch": 2.9, "grad_norm": 1.923279881477356, "learning_rate": 1.6666666666666667e-06, "loss": 1.9334, "step": 3422 }, { "epoch": 2.9008474576271186, "grad_norm": 1.7485185861587524, "learning_rate": 1.652542372881356e-06, "loss": 2.0783, "step": 3423 }, { "epoch": 2.9016949152542373, "grad_norm": 1.6141704320907593, "learning_rate": 1.6384180790960452e-06, "loss": 2.199, "step": 3424 }, { "epoch": 2.902542372881356, "grad_norm": 1.840831995010376, "learning_rate": 1.6242937853107345e-06, "loss": 1.988, "step": 3425 }, { "epoch": 2.9033898305084747, "grad_norm": 1.810667634010315, "learning_rate": 1.6101694915254237e-06, "loss": 2.0097, "step": 3426 }, { "epoch": 2.9042372881355933, "grad_norm": 1.6117627620697021, "learning_rate": 1.596045197740113e-06, "loss": 2.1645, "step": 3427 }, { "epoch": 2.905084745762712, "grad_norm": 1.802703619003296, "learning_rate": 1.5819209039548022e-06, "loss": 2.1695, "step": 3428 }, { "epoch": 2.9059322033898303, "grad_norm": 1.5631777048110962, "learning_rate": 1.5677966101694915e-06, "loss": 2.1949, "step": 3429 }, { "epoch": 2.906779661016949, "grad_norm": 1.8716254234313965, "learning_rate": 1.553672316384181e-06, "loss": 1.8599, "step": 3430 }, { "epoch": 2.9076271186440676, "grad_norm": 2.2305080890655518, "learning_rate": 1.5395480225988703e-06, "loss": 1.929, "step": 3431 }, { "epoch": 2.9084745762711863, "grad_norm": 1.560698390007019, "learning_rate": 1.5254237288135594e-06, "loss": 2.2672, "step": 3432 }, { "epoch": 2.909322033898305, "grad_norm": 2.0044262409210205, "learning_rate": 1.5112994350282488e-06, "loss": 1.7978, "step": 3433 }, { "epoch": 2.9101694915254237, "grad_norm": 1.956240177154541, "learning_rate": 1.497175141242938e-06, "loss": 1.8694, "step": 3434 }, { "epoch": 2.9110169491525424, "grad_norm": 2.3589026927948, "learning_rate": 1.4830508474576273e-06, "loss": 1.6085, "step": 3435 }, { "epoch": 2.911864406779661, "grad_norm": 1.6329890489578247, "learning_rate": 1.4689265536723164e-06, "loss": 2.2799, "step": 3436 }, { "epoch": 2.9127118644067798, "grad_norm": 1.5386956930160522, "learning_rate": 1.4548022598870058e-06, "loss": 2.1172, "step": 3437 }, { "epoch": 2.9135593220338984, "grad_norm": 1.8822194337844849, "learning_rate": 1.440677966101695e-06, "loss": 1.9766, "step": 3438 }, { "epoch": 2.914406779661017, "grad_norm": 1.6315912008285522, "learning_rate": 1.4265536723163843e-06, "loss": 2.2342, "step": 3439 }, { "epoch": 2.915254237288136, "grad_norm": 1.6176600456237793, "learning_rate": 1.4124293785310734e-06, "loss": 2.0053, "step": 3440 }, { "epoch": 2.9161016949152545, "grad_norm": 1.7778948545455933, "learning_rate": 1.3983050847457628e-06, "loss": 2.0083, "step": 3441 }, { "epoch": 2.9169491525423727, "grad_norm": 2.0902791023254395, "learning_rate": 1.384180790960452e-06, "loss": 1.7752, "step": 3442 }, { "epoch": 2.9177966101694914, "grad_norm": 1.661112666130066, "learning_rate": 1.3700564971751413e-06, "loss": 2.5118, "step": 3443 }, { "epoch": 2.91864406779661, "grad_norm": 1.8177666664123535, "learning_rate": 1.3559322033898304e-06, "loss": 2.035, "step": 3444 }, { "epoch": 2.919491525423729, "grad_norm": 2.120927095413208, "learning_rate": 1.3418079096045198e-06, "loss": 1.7663, "step": 3445 }, { "epoch": 2.9203389830508475, "grad_norm": 1.778826355934143, "learning_rate": 1.327683615819209e-06, "loss": 1.8262, "step": 3446 }, { "epoch": 2.921186440677966, "grad_norm": 1.8968206644058228, "learning_rate": 1.3135593220338983e-06, "loss": 2.0749, "step": 3447 }, { "epoch": 2.922033898305085, "grad_norm": 1.3669257164001465, "learning_rate": 1.2994350282485877e-06, "loss": 2.4951, "step": 3448 }, { "epoch": 2.9228813559322036, "grad_norm": 1.7350246906280518, "learning_rate": 1.2853107344632768e-06, "loss": 2.0928, "step": 3449 }, { "epoch": 2.923728813559322, "grad_norm": 1.2821245193481445, "learning_rate": 1.2711864406779662e-06, "loss": 2.5033, "step": 3450 }, { "epoch": 2.9245762711864405, "grad_norm": 2.0260376930236816, "learning_rate": 1.2570621468926555e-06, "loss": 1.828, "step": 3451 }, { "epoch": 2.925423728813559, "grad_norm": 1.8080556392669678, "learning_rate": 1.2429378531073447e-06, "loss": 1.8051, "step": 3452 }, { "epoch": 2.926271186440678, "grad_norm": 1.7276920080184937, "learning_rate": 1.228813559322034e-06, "loss": 2.0536, "step": 3453 }, { "epoch": 2.9271186440677965, "grad_norm": 2.0028326511383057, "learning_rate": 1.2146892655367232e-06, "loss": 1.8631, "step": 3454 }, { "epoch": 2.9279661016949152, "grad_norm": 2.219172954559326, "learning_rate": 1.2005649717514125e-06, "loss": 1.676, "step": 3455 }, { "epoch": 2.928813559322034, "grad_norm": 1.5696964263916016, "learning_rate": 1.1864406779661017e-06, "loss": 2.2682, "step": 3456 }, { "epoch": 2.9296610169491526, "grad_norm": 1.4314230680465698, "learning_rate": 1.172316384180791e-06, "loss": 2.2949, "step": 3457 }, { "epoch": 2.9305084745762713, "grad_norm": 1.8091028928756714, "learning_rate": 1.1581920903954802e-06, "loss": 2.2635, "step": 3458 }, { "epoch": 2.93135593220339, "grad_norm": 1.8473447561264038, "learning_rate": 1.1440677966101696e-06, "loss": 2.2436, "step": 3459 }, { "epoch": 2.9322033898305087, "grad_norm": 1.2700364589691162, "learning_rate": 1.129943502824859e-06, "loss": 2.318, "step": 3460 }, { "epoch": 2.9330508474576273, "grad_norm": 1.5851829051971436, "learning_rate": 1.115819209039548e-06, "loss": 2.3973, "step": 3461 }, { "epoch": 2.933898305084746, "grad_norm": 1.7963261604309082, "learning_rate": 1.1016949152542374e-06, "loss": 2.0373, "step": 3462 }, { "epoch": 2.9347457627118643, "grad_norm": 1.9901875257492065, "learning_rate": 1.0875706214689266e-06, "loss": 2.0528, "step": 3463 }, { "epoch": 2.935593220338983, "grad_norm": 1.9313286542892456, "learning_rate": 1.073446327683616e-06, "loss": 1.9245, "step": 3464 }, { "epoch": 2.9364406779661016, "grad_norm": 1.8070708513259888, "learning_rate": 1.059322033898305e-06, "loss": 1.998, "step": 3465 }, { "epoch": 2.9372881355932203, "grad_norm": 2.0432677268981934, "learning_rate": 1.0451977401129944e-06, "loss": 1.8153, "step": 3466 }, { "epoch": 2.938135593220339, "grad_norm": 1.7318568229675293, "learning_rate": 1.0310734463276836e-06, "loss": 2.1531, "step": 3467 }, { "epoch": 2.9389830508474577, "grad_norm": 1.4131430387496948, "learning_rate": 1.016949152542373e-06, "loss": 2.1936, "step": 3468 }, { "epoch": 2.9398305084745764, "grad_norm": 2.067652463912964, "learning_rate": 1.002824858757062e-06, "loss": 1.7605, "step": 3469 }, { "epoch": 2.940677966101695, "grad_norm": 1.9416172504425049, "learning_rate": 9.887005649717514e-07, "loss": 1.9297, "step": 3470 }, { "epoch": 2.9415254237288133, "grad_norm": 1.131363868713379, "learning_rate": 9.745762711864408e-07, "loss": 2.5132, "step": 3471 }, { "epoch": 2.942372881355932, "grad_norm": 1.8346997499465942, "learning_rate": 9.604519774011302e-07, "loss": 2.0331, "step": 3472 }, { "epoch": 2.9432203389830507, "grad_norm": 1.6108843088150024, "learning_rate": 9.463276836158193e-07, "loss": 2.3032, "step": 3473 }, { "epoch": 2.9440677966101694, "grad_norm": 1.7487404346466064, "learning_rate": 9.322033898305086e-07, "loss": 2.134, "step": 3474 }, { "epoch": 2.944915254237288, "grad_norm": 1.2577048540115356, "learning_rate": 9.180790960451978e-07, "loss": 2.3789, "step": 3475 }, { "epoch": 2.9457627118644067, "grad_norm": 1.4981341361999512, "learning_rate": 9.039548022598871e-07, "loss": 2.3104, "step": 3476 }, { "epoch": 2.9466101694915254, "grad_norm": 1.8683017492294312, "learning_rate": 8.898305084745763e-07, "loss": 1.9977, "step": 3477 }, { "epoch": 2.947457627118644, "grad_norm": 1.9750769138336182, "learning_rate": 8.757062146892656e-07, "loss": 1.8513, "step": 3478 }, { "epoch": 2.948305084745763, "grad_norm": 1.6585434675216675, "learning_rate": 8.615819209039548e-07, "loss": 2.1858, "step": 3479 }, { "epoch": 2.9491525423728815, "grad_norm": 1.730651617050171, "learning_rate": 8.474576271186441e-07, "loss": 2.0628, "step": 3480 }, { "epoch": 2.95, "grad_norm": 2.089017152786255, "learning_rate": 8.333333333333333e-07, "loss": 1.9277, "step": 3481 }, { "epoch": 2.950847457627119, "grad_norm": 1.5568169355392456, "learning_rate": 8.192090395480226e-07, "loss": 2.1834, "step": 3482 }, { "epoch": 2.9516949152542376, "grad_norm": 1.3532565832138062, "learning_rate": 8.050847457627118e-07, "loss": 2.3501, "step": 3483 }, { "epoch": 2.952542372881356, "grad_norm": 1.320961356163025, "learning_rate": 7.909604519774011e-07, "loss": 2.4191, "step": 3484 }, { "epoch": 2.9533898305084745, "grad_norm": 1.6994813680648804, "learning_rate": 7.768361581920904e-07, "loss": 2.1174, "step": 3485 }, { "epoch": 2.954237288135593, "grad_norm": 2.273296594619751, "learning_rate": 7.627118644067797e-07, "loss": 1.5144, "step": 3486 }, { "epoch": 2.955084745762712, "grad_norm": 1.3097214698791504, "learning_rate": 7.48587570621469e-07, "loss": 2.4397, "step": 3487 }, { "epoch": 2.9559322033898305, "grad_norm": 1.9971638917922974, "learning_rate": 7.344632768361582e-07, "loss": 1.8863, "step": 3488 }, { "epoch": 2.9567796610169492, "grad_norm": 1.8968480825424194, "learning_rate": 7.203389830508475e-07, "loss": 1.8625, "step": 3489 }, { "epoch": 2.957627118644068, "grad_norm": 1.6351325511932373, "learning_rate": 7.062146892655367e-07, "loss": 2.2286, "step": 3490 }, { "epoch": 2.9584745762711866, "grad_norm": 1.5790823698043823, "learning_rate": 6.92090395480226e-07, "loss": 2.2996, "step": 3491 }, { "epoch": 2.959322033898305, "grad_norm": 1.9345752000808716, "learning_rate": 6.779661016949152e-07, "loss": 1.9687, "step": 3492 }, { "epoch": 2.9601694915254235, "grad_norm": 2.0262906551361084, "learning_rate": 6.638418079096045e-07, "loss": 1.9205, "step": 3493 }, { "epoch": 2.961016949152542, "grad_norm": 1.7300212383270264, "learning_rate": 6.497175141242938e-07, "loss": 2.1565, "step": 3494 }, { "epoch": 2.961864406779661, "grad_norm": 1.7060924768447876, "learning_rate": 6.355932203389831e-07, "loss": 2.153, "step": 3495 }, { "epoch": 2.9627118644067796, "grad_norm": 1.7925012111663818, "learning_rate": 6.214689265536723e-07, "loss": 2.0491, "step": 3496 }, { "epoch": 2.9635593220338983, "grad_norm": 1.2668328285217285, "learning_rate": 6.073446327683616e-07, "loss": 2.4809, "step": 3497 }, { "epoch": 2.964406779661017, "grad_norm": 2.553030014038086, "learning_rate": 5.932203389830508e-07, "loss": 1.9113, "step": 3498 }, { "epoch": 2.9652542372881356, "grad_norm": 1.9871301651000977, "learning_rate": 5.790960451977401e-07, "loss": 1.7621, "step": 3499 }, { "epoch": 2.9661016949152543, "grad_norm": 1.802756428718567, "learning_rate": 5.649717514124295e-07, "loss": 1.9916, "step": 3500 }, { "epoch": 2.966949152542373, "grad_norm": 1.663110613822937, "learning_rate": 5.508474576271187e-07, "loss": 2.0954, "step": 3501 }, { "epoch": 2.9677966101694917, "grad_norm": 1.2281522750854492, "learning_rate": 5.36723163841808e-07, "loss": 2.4056, "step": 3502 }, { "epoch": 2.9686440677966104, "grad_norm": 1.5557782649993896, "learning_rate": 5.225988700564972e-07, "loss": 2.2795, "step": 3503 }, { "epoch": 2.969491525423729, "grad_norm": 1.5493392944335938, "learning_rate": 5.084745762711865e-07, "loss": 2.1856, "step": 3504 }, { "epoch": 2.9703389830508473, "grad_norm": 1.5563039779663086, "learning_rate": 4.943502824858757e-07, "loss": 2.155, "step": 3505 }, { "epoch": 2.971186440677966, "grad_norm": 1.797594666481018, "learning_rate": 4.802259887005651e-07, "loss": 1.9369, "step": 3506 }, { "epoch": 2.9720338983050847, "grad_norm": 1.5994278192520142, "learning_rate": 4.661016949152543e-07, "loss": 2.2201, "step": 3507 }, { "epoch": 2.9728813559322034, "grad_norm": 1.510952353477478, "learning_rate": 4.5197740112994353e-07, "loss": 2.1432, "step": 3508 }, { "epoch": 2.973728813559322, "grad_norm": 1.8224469423294067, "learning_rate": 4.378531073446328e-07, "loss": 1.9827, "step": 3509 }, { "epoch": 2.9745762711864407, "grad_norm": 1.4395874738693237, "learning_rate": 4.2372881355932204e-07, "loss": 2.4096, "step": 3510 }, { "epoch": 2.9754237288135594, "grad_norm": 1.8219739198684692, "learning_rate": 4.096045197740113e-07, "loss": 2.1854, "step": 3511 }, { "epoch": 2.976271186440678, "grad_norm": 1.499512791633606, "learning_rate": 3.9548022598870054e-07, "loss": 2.3892, "step": 3512 }, { "epoch": 2.9771186440677964, "grad_norm": 2.0295488834381104, "learning_rate": 3.8135593220338985e-07, "loss": 2.0847, "step": 3513 }, { "epoch": 2.977966101694915, "grad_norm": 1.4956918954849243, "learning_rate": 3.672316384180791e-07, "loss": 2.0691, "step": 3514 }, { "epoch": 2.9788135593220337, "grad_norm": 1.4348477125167847, "learning_rate": 3.5310734463276836e-07, "loss": 2.3293, "step": 3515 }, { "epoch": 2.9796610169491524, "grad_norm": 1.8539484739303589, "learning_rate": 3.389830508474576e-07, "loss": 2.0574, "step": 3516 }, { "epoch": 2.980508474576271, "grad_norm": 1.7122975587844849, "learning_rate": 3.248587570621469e-07, "loss": 2.0708, "step": 3517 }, { "epoch": 2.98135593220339, "grad_norm": 1.9447230100631714, "learning_rate": 3.1073446327683617e-07, "loss": 2.2801, "step": 3518 }, { "epoch": 2.9822033898305085, "grad_norm": 1.7502176761627197, "learning_rate": 2.966101694915254e-07, "loss": 2.0411, "step": 3519 }, { "epoch": 2.983050847457627, "grad_norm": 1.6554374694824219, "learning_rate": 2.8248587570621473e-07, "loss": 2.0652, "step": 3520 }, { "epoch": 2.983898305084746, "grad_norm": 1.910003662109375, "learning_rate": 2.68361581920904e-07, "loss": 1.9196, "step": 3521 }, { "epoch": 2.9847457627118645, "grad_norm": 1.9911201000213623, "learning_rate": 2.5423728813559323e-07, "loss": 1.9887, "step": 3522 }, { "epoch": 2.9855932203389832, "grad_norm": 1.9181311130523682, "learning_rate": 2.4011299435028254e-07, "loss": 2.6016, "step": 3523 }, { "epoch": 2.986440677966102, "grad_norm": 1.4985933303833008, "learning_rate": 2.2598870056497177e-07, "loss": 2.3922, "step": 3524 }, { "epoch": 2.9872881355932206, "grad_norm": 2.1219472885131836, "learning_rate": 2.1186440677966102e-07, "loss": 2.3057, "step": 3525 }, { "epoch": 2.988135593220339, "grad_norm": 1.5086705684661865, "learning_rate": 1.9774011299435027e-07, "loss": 2.345, "step": 3526 }, { "epoch": 2.9889830508474575, "grad_norm": 1.9622693061828613, "learning_rate": 1.8361581920903955e-07, "loss": 1.8735, "step": 3527 }, { "epoch": 2.989830508474576, "grad_norm": 1.5369393825531006, "learning_rate": 1.694915254237288e-07, "loss": 2.4739, "step": 3528 }, { "epoch": 2.990677966101695, "grad_norm": 1.6584341526031494, "learning_rate": 1.5536723163841808e-07, "loss": 2.2392, "step": 3529 }, { "epoch": 2.9915254237288136, "grad_norm": 1.76746666431427, "learning_rate": 1.4124293785310736e-07, "loss": 2.1265, "step": 3530 }, { "epoch": 2.9923728813559323, "grad_norm": 1.4161677360534668, "learning_rate": 1.2711864406779662e-07, "loss": 2.4357, "step": 3531 }, { "epoch": 2.993220338983051, "grad_norm": 1.9487510919570923, "learning_rate": 1.1299435028248588e-07, "loss": 2.1245, "step": 3532 }, { "epoch": 2.9940677966101696, "grad_norm": 1.567739725112915, "learning_rate": 9.887005649717514e-08, "loss": 2.2204, "step": 3533 }, { "epoch": 2.994915254237288, "grad_norm": 1.8734211921691895, "learning_rate": 8.47457627118644e-08, "loss": 2.0297, "step": 3534 }, { "epoch": 2.9957627118644066, "grad_norm": 1.9217379093170166, "learning_rate": 7.062146892655368e-08, "loss": 1.801, "step": 3535 }, { "epoch": 2.9966101694915253, "grad_norm": 1.9299284219741821, "learning_rate": 5.649717514124294e-08, "loss": 2.1399, "step": 3536 }, { "epoch": 2.997457627118644, "grad_norm": 2.1523752212524414, "learning_rate": 4.23728813559322e-08, "loss": 1.5545, "step": 3537 }, { "epoch": 2.9983050847457626, "grad_norm": 2.013671636581421, "learning_rate": 2.824858757062147e-08, "loss": 1.7243, "step": 3538 }, { "epoch": 2.9991525423728813, "grad_norm": 1.8303087949752808, "learning_rate": 1.4124293785310735e-08, "loss": 2.0243, "step": 3539 }, { "epoch": 3.0, "grad_norm": 1.8203142881393433, "learning_rate": 0.0, "loss": 2.1589, "step": 3540 } ], "logging_steps": 1, "max_steps": 3540, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.65500133392384e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }