diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14092 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.99968, + "eval_steps": 500, + "global_step": 1562, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00064, + "grad_norm": 48.05089569091797, + "learning_rate": 0.0, + "loss": 1.4121, + "mean_token_accuracy": 0.6530859991908073, + "num_tokens": 13040.0, + "step": 1 + }, + { + "epoch": 0.00128, + "grad_norm": 26.102807998657227, + "learning_rate": 6.329113924050633e-08, + "loss": 1.5231, + "mean_token_accuracy": 0.6276459023356438, + "num_tokens": 26989.0, + "step": 2 + }, + { + "epoch": 0.00192, + "grad_norm": 95.80143737792969, + "learning_rate": 1.2658227848101266e-07, + "loss": 1.4079, + "mean_token_accuracy": 0.6467841640114784, + "num_tokens": 37679.0, + "step": 3 + }, + { + "epoch": 0.00256, + "grad_norm": 15.393555641174316, + "learning_rate": 1.89873417721519e-07, + "loss": 1.5839, + "mean_token_accuracy": 0.6144984066486359, + "num_tokens": 52503.0, + "step": 4 + }, + { + "epoch": 0.0032, + "grad_norm": 76.43201446533203, + "learning_rate": 2.5316455696202533e-07, + "loss": 1.5376, + "mean_token_accuracy": 0.614102203398943, + "num_tokens": 62165.0, + "step": 5 + }, + { + "epoch": 0.00384, + "grad_norm": 13.124272346496582, + "learning_rate": 3.164556962025317e-07, + "loss": 1.4651, + "mean_token_accuracy": 0.6233282685279846, + "num_tokens": 76003.0, + "step": 6 + }, + { + "epoch": 0.00448, + "grad_norm": 22.9676570892334, + "learning_rate": 3.79746835443038e-07, + "loss": 1.5145, + "mean_token_accuracy": 0.6122585535049438, + "num_tokens": 92265.0, + "step": 7 + }, + { + "epoch": 0.00512, + "grad_norm": 15.335000038146973, + "learning_rate": 4.4303797468354435e-07, + "loss": 1.4871, + "mean_token_accuracy": 0.6254958733916283, + "num_tokens": 106642.0, + "step": 8 + }, + { + "epoch": 0.00576, + "grad_norm": 13.938623428344727, + "learning_rate": 5.063291139240507e-07, + "loss": 1.2911, + "mean_token_accuracy": 0.6838521659374237, + "num_tokens": 118655.0, + "step": 9 + }, + { + "epoch": 0.0064, + "grad_norm": 12.838367462158203, + "learning_rate": 5.69620253164557e-07, + "loss": 1.5914, + "mean_token_accuracy": 0.6144495904445648, + "num_tokens": 133015.0, + "step": 10 + }, + { + "epoch": 0.00704, + "grad_norm": 28.111896514892578, + "learning_rate": 6.329113924050634e-07, + "loss": 1.5112, + "mean_token_accuracy": 0.6265368536114693, + "num_tokens": 147398.0, + "step": 11 + }, + { + "epoch": 0.00768, + "grad_norm": 86.63944244384766, + "learning_rate": 6.962025316455696e-07, + "loss": 1.3589, + "mean_token_accuracy": 0.6383277997374535, + "num_tokens": 158812.0, + "step": 12 + }, + { + "epoch": 0.00832, + "grad_norm": 168.6136016845703, + "learning_rate": 7.59493670886076e-07, + "loss": 1.7851, + "mean_token_accuracy": 0.5883053466677666, + "num_tokens": 170703.0, + "step": 13 + }, + { + "epoch": 0.00896, + "grad_norm": 217.70408630371094, + "learning_rate": 8.227848101265823e-07, + "loss": 1.4027, + "mean_token_accuracy": 0.6471100524067879, + "num_tokens": 184556.0, + "step": 14 + }, + { + "epoch": 0.0096, + "grad_norm": 90.14910125732422, + "learning_rate": 8.860759493670887e-07, + "loss": 1.376, + "mean_token_accuracy": 0.6501478627324104, + "num_tokens": 202523.0, + "step": 15 + }, + { + "epoch": 0.01024, + "grad_norm": 97.01738739013672, + "learning_rate": 9.493670886075951e-07, + "loss": 1.1443, + "mean_token_accuracy": 0.6894906312227249, + "num_tokens": 212189.0, + "step": 16 + }, + { + "epoch": 0.01088, + "grad_norm": 64.89408874511719, + "learning_rate": 1.0126582278481013e-06, + "loss": 1.5423, + "mean_token_accuracy": 0.6109918430447578, + "num_tokens": 226375.0, + "step": 17 + }, + { + "epoch": 0.01152, + "grad_norm": 82.11033630371094, + "learning_rate": 1.0759493670886077e-06, + "loss": 1.6578, + "mean_token_accuracy": 0.6166552416980267, + "num_tokens": 238636.0, + "step": 18 + }, + { + "epoch": 0.01216, + "grad_norm": 102.00513458251953, + "learning_rate": 1.139240506329114e-06, + "loss": 1.3764, + "mean_token_accuracy": 0.6483487188816071, + "num_tokens": 250678.0, + "step": 19 + }, + { + "epoch": 0.0128, + "grad_norm": 69.1679916381836, + "learning_rate": 1.2025316455696204e-06, + "loss": 1.5599, + "mean_token_accuracy": 0.6092639714479446, + "num_tokens": 266795.0, + "step": 20 + }, + { + "epoch": 0.01344, + "grad_norm": 138.91578674316406, + "learning_rate": 1.2658227848101267e-06, + "loss": 1.4293, + "mean_token_accuracy": 0.6301368102431297, + "num_tokens": 280316.0, + "step": 21 + }, + { + "epoch": 0.01408, + "grad_norm": 84.13861083984375, + "learning_rate": 1.3291139240506329e-06, + "loss": 1.5571, + "mean_token_accuracy": 0.6376139968633652, + "num_tokens": 292273.0, + "step": 22 + }, + { + "epoch": 0.01472, + "grad_norm": 75.20580291748047, + "learning_rate": 1.3924050632911392e-06, + "loss": 1.7514, + "mean_token_accuracy": 0.5854216478765011, + "num_tokens": 304665.0, + "step": 23 + }, + { + "epoch": 0.01536, + "grad_norm": 33.301204681396484, + "learning_rate": 1.4556962025316456e-06, + "loss": 1.445, + "mean_token_accuracy": 0.6291614323854446, + "num_tokens": 317582.0, + "step": 24 + }, + { + "epoch": 0.016, + "grad_norm": 49.913997650146484, + "learning_rate": 1.518987341772152e-06, + "loss": 1.1659, + "mean_token_accuracy": 0.6733755245804787, + "num_tokens": 330930.0, + "step": 25 + }, + { + "epoch": 0.01664, + "grad_norm": 16.423551559448242, + "learning_rate": 1.5822784810126585e-06, + "loss": 1.4189, + "mean_token_accuracy": 0.6274379268288612, + "num_tokens": 343897.0, + "step": 26 + }, + { + "epoch": 0.01728, + "grad_norm": 48.93357467651367, + "learning_rate": 1.6455696202531647e-06, + "loss": 1.5449, + "mean_token_accuracy": 0.6013954728841782, + "num_tokens": 358229.0, + "step": 27 + }, + { + "epoch": 0.01792, + "grad_norm": 41.11186981201172, + "learning_rate": 1.708860759493671e-06, + "loss": 1.4404, + "mean_token_accuracy": 0.6342460885643959, + "num_tokens": 369284.0, + "step": 28 + }, + { + "epoch": 0.01856, + "grad_norm": 32.76636505126953, + "learning_rate": 1.7721518987341774e-06, + "loss": 1.5705, + "mean_token_accuracy": 0.6268726661801338, + "num_tokens": 384919.0, + "step": 29 + }, + { + "epoch": 0.0192, + "grad_norm": 65.5134048461914, + "learning_rate": 1.8354430379746838e-06, + "loss": 1.4156, + "mean_token_accuracy": 0.6435664221644402, + "num_tokens": 399789.0, + "step": 30 + }, + { + "epoch": 0.01984, + "grad_norm": 6.874727725982666, + "learning_rate": 1.8987341772151901e-06, + "loss": 1.3794, + "mean_token_accuracy": 0.6731243506073952, + "num_tokens": 410300.0, + "step": 31 + }, + { + "epoch": 0.02048, + "grad_norm": 30.61371421813965, + "learning_rate": 1.9620253164556965e-06, + "loss": 1.4516, + "mean_token_accuracy": 0.6296539008617401, + "num_tokens": 425086.0, + "step": 32 + }, + { + "epoch": 0.02112, + "grad_norm": 53.762176513671875, + "learning_rate": 2.0253164556962026e-06, + "loss": 1.3964, + "mean_token_accuracy": 0.6357481107115746, + "num_tokens": 438978.0, + "step": 33 + }, + { + "epoch": 0.02176, + "grad_norm": 58.90037536621094, + "learning_rate": 2.088607594936709e-06, + "loss": 1.4799, + "mean_token_accuracy": 0.619662769138813, + "num_tokens": 451824.0, + "step": 34 + }, + { + "epoch": 0.0224, + "grad_norm": 8.633367538452148, + "learning_rate": 2.1518987341772153e-06, + "loss": 1.4459, + "mean_token_accuracy": 0.6490786001086235, + "num_tokens": 465569.0, + "step": 35 + }, + { + "epoch": 0.02304, + "grad_norm": 152.17538452148438, + "learning_rate": 2.2151898734177215e-06, + "loss": 1.5855, + "mean_token_accuracy": 0.5879618301987648, + "num_tokens": 476187.0, + "step": 36 + }, + { + "epoch": 0.02368, + "grad_norm": 63.17410659790039, + "learning_rate": 2.278481012658228e-06, + "loss": 1.4941, + "mean_token_accuracy": 0.6467924416065216, + "num_tokens": 490658.0, + "step": 37 + }, + { + "epoch": 0.02432, + "grad_norm": 40.93563461303711, + "learning_rate": 2.341772151898734e-06, + "loss": 1.2798, + "mean_token_accuracy": 0.6463187485933304, + "num_tokens": 500707.0, + "step": 38 + }, + { + "epoch": 0.02496, + "grad_norm": 47.74807357788086, + "learning_rate": 2.4050632911392408e-06, + "loss": 1.682, + "mean_token_accuracy": 0.5752647258341312, + "num_tokens": 514924.0, + "step": 39 + }, + { + "epoch": 0.0256, + "grad_norm": 8.83420181274414, + "learning_rate": 2.4683544303797473e-06, + "loss": 1.374, + "mean_token_accuracy": 0.6248214021325111, + "num_tokens": 529540.0, + "step": 40 + }, + { + "epoch": 0.02624, + "grad_norm": 73.02564239501953, + "learning_rate": 2.5316455696202535e-06, + "loss": 1.4803, + "mean_token_accuracy": 0.63168865442276, + "num_tokens": 542779.0, + "step": 41 + }, + { + "epoch": 0.02688, + "grad_norm": 58.30765914916992, + "learning_rate": 2.5949367088607596e-06, + "loss": 1.2061, + "mean_token_accuracy": 0.6671500578522682, + "num_tokens": 556396.0, + "step": 42 + }, + { + "epoch": 0.02752, + "grad_norm": 33.67079162597656, + "learning_rate": 2.6582278481012658e-06, + "loss": 1.522, + "mean_token_accuracy": 0.5986876226961613, + "num_tokens": 569089.0, + "step": 43 + }, + { + "epoch": 0.02816, + "grad_norm": 31.859474182128906, + "learning_rate": 2.7215189873417724e-06, + "loss": 1.3638, + "mean_token_accuracy": 0.6643245741724968, + "num_tokens": 583380.0, + "step": 44 + }, + { + "epoch": 0.0288, + "grad_norm": 33.59089660644531, + "learning_rate": 2.7848101265822785e-06, + "loss": 1.4956, + "mean_token_accuracy": 0.6372537463903427, + "num_tokens": 596014.0, + "step": 45 + }, + { + "epoch": 0.02944, + "grad_norm": 25.843647003173828, + "learning_rate": 2.848101265822785e-06, + "loss": 1.1846, + "mean_token_accuracy": 0.6587028503417969, + "num_tokens": 608555.0, + "step": 46 + }, + { + "epoch": 0.03008, + "grad_norm": 5.101419925689697, + "learning_rate": 2.9113924050632912e-06, + "loss": 1.3947, + "mean_token_accuracy": 0.6413091421127319, + "num_tokens": 621134.0, + "step": 47 + }, + { + "epoch": 0.03072, + "grad_norm": 40.20245361328125, + "learning_rate": 2.9746835443037974e-06, + "loss": 1.345, + "mean_token_accuracy": 0.6560010835528374, + "num_tokens": 635852.0, + "step": 48 + }, + { + "epoch": 0.03136, + "grad_norm": 94.18912506103516, + "learning_rate": 3.037974683544304e-06, + "loss": 1.3185, + "mean_token_accuracy": 0.6504691988229752, + "num_tokens": 648238.0, + "step": 49 + }, + { + "epoch": 0.032, + "grad_norm": 23.04238510131836, + "learning_rate": 3.10126582278481e-06, + "loss": 1.5373, + "mean_token_accuracy": 0.6217592805624008, + "num_tokens": 662824.0, + "step": 50 + }, + { + "epoch": 0.03264, + "grad_norm": 56.0700569152832, + "learning_rate": 3.164556962025317e-06, + "loss": 1.2922, + "mean_token_accuracy": 0.6528200656175613, + "num_tokens": 675378.0, + "step": 51 + }, + { + "epoch": 0.03328, + "grad_norm": 14.78956127166748, + "learning_rate": 3.2278481012658232e-06, + "loss": 1.5514, + "mean_token_accuracy": 0.6194410622119904, + "num_tokens": 689687.0, + "step": 52 + }, + { + "epoch": 0.03392, + "grad_norm": 52.21746063232422, + "learning_rate": 3.2911392405063294e-06, + "loss": 1.5048, + "mean_token_accuracy": 0.6074254661798477, + "num_tokens": 702196.0, + "step": 53 + }, + { + "epoch": 0.03456, + "grad_norm": 39.832069396972656, + "learning_rate": 3.354430379746836e-06, + "loss": 1.2862, + "mean_token_accuracy": 0.6434847190976143, + "num_tokens": 716114.0, + "step": 54 + }, + { + "epoch": 0.0352, + "grad_norm": 13.459836959838867, + "learning_rate": 3.417721518987342e-06, + "loss": 1.6001, + "mean_token_accuracy": 0.6024078205227852, + "num_tokens": 729970.0, + "step": 55 + }, + { + "epoch": 0.03584, + "grad_norm": 7.002089023590088, + "learning_rate": 3.4810126582278487e-06, + "loss": 1.2902, + "mean_token_accuracy": 0.6684707179665565, + "num_tokens": 742129.0, + "step": 56 + }, + { + "epoch": 0.03648, + "grad_norm": 5.5110907554626465, + "learning_rate": 3.544303797468355e-06, + "loss": 1.287, + "mean_token_accuracy": 0.6572533845901489, + "num_tokens": 756207.0, + "step": 57 + }, + { + "epoch": 0.03712, + "grad_norm": 19.13697052001953, + "learning_rate": 3.607594936708861e-06, + "loss": 1.3056, + "mean_token_accuracy": 0.6451508551836014, + "num_tokens": 768845.0, + "step": 58 + }, + { + "epoch": 0.03776, + "grad_norm": 15.833284378051758, + "learning_rate": 3.6708860759493675e-06, + "loss": 1.738, + "mean_token_accuracy": 0.5944546982645988, + "num_tokens": 780788.0, + "step": 59 + }, + { + "epoch": 0.0384, + "grad_norm": 20.870206832885742, + "learning_rate": 3.7341772151898737e-06, + "loss": 1.4071, + "mean_token_accuracy": 0.6308668106794357, + "num_tokens": 793426.0, + "step": 60 + }, + { + "epoch": 0.03904, + "grad_norm": 16.34093475341797, + "learning_rate": 3.7974683544303802e-06, + "loss": 1.5468, + "mean_token_accuracy": 0.627624161541462, + "num_tokens": 804663.0, + "step": 61 + }, + { + "epoch": 0.03968, + "grad_norm": 17.309402465820312, + "learning_rate": 3.860759493670886e-06, + "loss": 1.505, + "mean_token_accuracy": 0.6210604161024094, + "num_tokens": 818199.0, + "step": 62 + }, + { + "epoch": 0.04032, + "grad_norm": 4.618063449859619, + "learning_rate": 3.924050632911393e-06, + "loss": 1.2564, + "mean_token_accuracy": 0.664100281894207, + "num_tokens": 830056.0, + "step": 63 + }, + { + "epoch": 0.04096, + "grad_norm": 61.135398864746094, + "learning_rate": 3.9873417721518995e-06, + "loss": 1.2191, + "mean_token_accuracy": 0.6882363706827164, + "num_tokens": 844140.0, + "step": 64 + }, + { + "epoch": 0.0416, + "grad_norm": 58.18097686767578, + "learning_rate": 4.050632911392405e-06, + "loss": 1.4123, + "mean_token_accuracy": 0.6319537982344627, + "num_tokens": 856421.0, + "step": 65 + }, + { + "epoch": 0.04224, + "grad_norm": 4.918764591217041, + "learning_rate": 4.113924050632912e-06, + "loss": 1.342, + "mean_token_accuracy": 0.6437618285417557, + "num_tokens": 870219.0, + "step": 66 + }, + { + "epoch": 0.04288, + "grad_norm": 6.8447585105896, + "learning_rate": 4.177215189873418e-06, + "loss": 1.2137, + "mean_token_accuracy": 0.6827035769820213, + "num_tokens": 880524.0, + "step": 67 + }, + { + "epoch": 0.04352, + "grad_norm": 12.660514831542969, + "learning_rate": 4.240506329113924e-06, + "loss": 1.4528, + "mean_token_accuracy": 0.6331579238176346, + "num_tokens": 891958.0, + "step": 68 + }, + { + "epoch": 0.04416, + "grad_norm": 7.19080924987793, + "learning_rate": 4.303797468354431e-06, + "loss": 1.4252, + "mean_token_accuracy": 0.65432970225811, + "num_tokens": 904908.0, + "step": 69 + }, + { + "epoch": 0.0448, + "grad_norm": 3.9678971767425537, + "learning_rate": 4.367088607594937e-06, + "loss": 1.2971, + "mean_token_accuracy": 0.6341002583503723, + "num_tokens": 918681.0, + "step": 70 + }, + { + "epoch": 0.04544, + "grad_norm": 7.076503276824951, + "learning_rate": 4.430379746835443e-06, + "loss": 1.2615, + "mean_token_accuracy": 0.6656483858823776, + "num_tokens": 928006.0, + "step": 71 + }, + { + "epoch": 0.04608, + "grad_norm": 5.339262962341309, + "learning_rate": 4.4936708860759495e-06, + "loss": 1.344, + "mean_token_accuracy": 0.6466359868645668, + "num_tokens": 940426.0, + "step": 72 + }, + { + "epoch": 0.04672, + "grad_norm": 7.0051703453063965, + "learning_rate": 4.556962025316456e-06, + "loss": 1.4216, + "mean_token_accuracy": 0.630195863544941, + "num_tokens": 954324.0, + "step": 73 + }, + { + "epoch": 0.04736, + "grad_norm": 4.788408279418945, + "learning_rate": 4.620253164556963e-06, + "loss": 1.4698, + "mean_token_accuracy": 0.614908404648304, + "num_tokens": 967599.0, + "step": 74 + }, + { + "epoch": 0.048, + "grad_norm": 5.4968366622924805, + "learning_rate": 4.683544303797468e-06, + "loss": 1.3819, + "mean_token_accuracy": 0.6416184306144714, + "num_tokens": 981616.0, + "step": 75 + }, + { + "epoch": 0.04864, + "grad_norm": 4.646515369415283, + "learning_rate": 4.746835443037975e-06, + "loss": 1.4479, + "mean_token_accuracy": 0.6559240221977234, + "num_tokens": 993440.0, + "step": 76 + }, + { + "epoch": 0.04928, + "grad_norm": 5.726164817810059, + "learning_rate": 4.8101265822784815e-06, + "loss": 1.3377, + "mean_token_accuracy": 0.6394649744033813, + "num_tokens": 1005930.0, + "step": 77 + }, + { + "epoch": 0.04992, + "grad_norm": 7.1859588623046875, + "learning_rate": 4.873417721518987e-06, + "loss": 1.5267, + "mean_token_accuracy": 0.6308169737458229, + "num_tokens": 1018113.0, + "step": 78 + }, + { + "epoch": 0.05056, + "grad_norm": 5.919410705566406, + "learning_rate": 4.936708860759495e-06, + "loss": 1.5063, + "mean_token_accuracy": 0.6113156750798225, + "num_tokens": 1030828.0, + "step": 79 + }, + { + "epoch": 0.0512, + "grad_norm": 4.142421722412109, + "learning_rate": 5e-06, + "loss": 1.365, + "mean_token_accuracy": 0.6484999246895313, + "num_tokens": 1045699.0, + "step": 80 + }, + { + "epoch": 0.05184, + "grad_norm": 5.720826625823975, + "learning_rate": 5e-06, + "loss": 1.324, + "mean_token_accuracy": 0.6485482379794121, + "num_tokens": 1060409.0, + "step": 81 + }, + { + "epoch": 0.05248, + "grad_norm": 4.827797889709473, + "learning_rate": 5e-06, + "loss": 1.2039, + "mean_token_accuracy": 0.6779980957508087, + "num_tokens": 1072000.0, + "step": 82 + }, + { + "epoch": 0.05312, + "grad_norm": 4.712104797363281, + "learning_rate": 5e-06, + "loss": 1.5722, + "mean_token_accuracy": 0.6242717280983925, + "num_tokens": 1085195.0, + "step": 83 + }, + { + "epoch": 0.05376, + "grad_norm": 4.540091514587402, + "learning_rate": 5e-06, + "loss": 1.5155, + "mean_token_accuracy": 0.633654311299324, + "num_tokens": 1099571.0, + "step": 84 + }, + { + "epoch": 0.0544, + "grad_norm": 5.3648905754089355, + "learning_rate": 5e-06, + "loss": 1.4137, + "mean_token_accuracy": 0.6268021315336227, + "num_tokens": 1114063.0, + "step": 85 + }, + { + "epoch": 0.05504, + "grad_norm": 4.212844371795654, + "learning_rate": 5e-06, + "loss": 1.3986, + "mean_token_accuracy": 0.638413742184639, + "num_tokens": 1128456.0, + "step": 86 + }, + { + "epoch": 0.05568, + "grad_norm": 4.77896785736084, + "learning_rate": 5e-06, + "loss": 1.3913, + "mean_token_accuracy": 0.6454877629876137, + "num_tokens": 1141977.0, + "step": 87 + }, + { + "epoch": 0.05632, + "grad_norm": 6.540133953094482, + "learning_rate": 5e-06, + "loss": 1.1014, + "mean_token_accuracy": 0.6967510804533958, + "num_tokens": 1154057.0, + "step": 88 + }, + { + "epoch": 0.05696, + "grad_norm": 3.7844600677490234, + "learning_rate": 5e-06, + "loss": 1.4211, + "mean_token_accuracy": 0.6254619807004929, + "num_tokens": 1169061.0, + "step": 89 + }, + { + "epoch": 0.0576, + "grad_norm": 3.695892810821533, + "learning_rate": 5e-06, + "loss": 1.2861, + "mean_token_accuracy": 0.6654118373990059, + "num_tokens": 1182796.0, + "step": 90 + }, + { + "epoch": 0.05824, + "grad_norm": 4.524760723114014, + "learning_rate": 5e-06, + "loss": 1.5883, + "mean_token_accuracy": 0.6046858802437782, + "num_tokens": 1196449.0, + "step": 91 + }, + { + "epoch": 0.05888, + "grad_norm": 5.951873779296875, + "learning_rate": 5e-06, + "loss": 1.3191, + "mean_token_accuracy": 0.6647541224956512, + "num_tokens": 1209472.0, + "step": 92 + }, + { + "epoch": 0.05952, + "grad_norm": 5.607054233551025, + "learning_rate": 5e-06, + "loss": 1.5771, + "mean_token_accuracy": 0.6196302324533463, + "num_tokens": 1222720.0, + "step": 93 + }, + { + "epoch": 0.06016, + "grad_norm": 4.97398567199707, + "learning_rate": 5e-06, + "loss": 1.2972, + "mean_token_accuracy": 0.6596869081258774, + "num_tokens": 1236366.0, + "step": 94 + }, + { + "epoch": 0.0608, + "grad_norm": 5.066143035888672, + "learning_rate": 5e-06, + "loss": 1.6685, + "mean_token_accuracy": 0.6038380563259125, + "num_tokens": 1248615.0, + "step": 95 + }, + { + "epoch": 0.06144, + "grad_norm": 4.967097282409668, + "learning_rate": 5e-06, + "loss": 1.3559, + "mean_token_accuracy": 0.649577222764492, + "num_tokens": 1258892.0, + "step": 96 + }, + { + "epoch": 0.06208, + "grad_norm": 3.9898176193237305, + "learning_rate": 5e-06, + "loss": 1.1218, + "mean_token_accuracy": 0.6794590428471565, + "num_tokens": 1272167.0, + "step": 97 + }, + { + "epoch": 0.06272, + "grad_norm": 4.856038570404053, + "learning_rate": 5e-06, + "loss": 1.4458, + "mean_token_accuracy": 0.6120704114437103, + "num_tokens": 1284435.0, + "step": 98 + }, + { + "epoch": 0.06336, + "grad_norm": 4.787650108337402, + "learning_rate": 5e-06, + "loss": 1.1262, + "mean_token_accuracy": 0.7047765105962753, + "num_tokens": 1295683.0, + "step": 99 + }, + { + "epoch": 0.064, + "grad_norm": 4.880126953125, + "learning_rate": 5e-06, + "loss": 1.359, + "mean_token_accuracy": 0.6566307917237282, + "num_tokens": 1309253.0, + "step": 100 + }, + { + "epoch": 0.06464, + "grad_norm": 4.704743385314941, + "learning_rate": 5e-06, + "loss": 1.3073, + "mean_token_accuracy": 0.6889312416315079, + "num_tokens": 1321838.0, + "step": 101 + }, + { + "epoch": 0.06528, + "grad_norm": 4.521302700042725, + "learning_rate": 5e-06, + "loss": 1.199, + "mean_token_accuracy": 0.6745252087712288, + "num_tokens": 1333677.0, + "step": 102 + }, + { + "epoch": 0.06592, + "grad_norm": 4.4061689376831055, + "learning_rate": 5e-06, + "loss": 1.2863, + "mean_token_accuracy": 0.6601276621222496, + "num_tokens": 1345912.0, + "step": 103 + }, + { + "epoch": 0.06656, + "grad_norm": 4.12923002243042, + "learning_rate": 5e-06, + "loss": 1.3052, + "mean_token_accuracy": 0.6593477055430412, + "num_tokens": 1356535.0, + "step": 104 + }, + { + "epoch": 0.0672, + "grad_norm": 4.265780448913574, + "learning_rate": 5e-06, + "loss": 1.5341, + "mean_token_accuracy": 0.6348154991865158, + "num_tokens": 1368522.0, + "step": 105 + }, + { + "epoch": 0.06784, + "grad_norm": 4.388949394226074, + "learning_rate": 5e-06, + "loss": 1.3616, + "mean_token_accuracy": 0.6796349138021469, + "num_tokens": 1381247.0, + "step": 106 + }, + { + "epoch": 0.06848, + "grad_norm": 4.523592948913574, + "learning_rate": 5e-06, + "loss": 1.4017, + "mean_token_accuracy": 0.638551875948906, + "num_tokens": 1392378.0, + "step": 107 + }, + { + "epoch": 0.06912, + "grad_norm": 4.722465991973877, + "learning_rate": 5e-06, + "loss": 1.1751, + "mean_token_accuracy": 0.6694681495428085, + "num_tokens": 1404081.0, + "step": 108 + }, + { + "epoch": 0.06976, + "grad_norm": 3.7663962841033936, + "learning_rate": 5e-06, + "loss": 1.2044, + "mean_token_accuracy": 0.6716165691614151, + "num_tokens": 1417942.0, + "step": 109 + }, + { + "epoch": 0.0704, + "grad_norm": 3.8090057373046875, + "learning_rate": 5e-06, + "loss": 1.4822, + "mean_token_accuracy": 0.6303943246603012, + "num_tokens": 1429633.0, + "step": 110 + }, + { + "epoch": 0.07104, + "grad_norm": 4.707150936126709, + "learning_rate": 5e-06, + "loss": 1.4048, + "mean_token_accuracy": 0.661470353603363, + "num_tokens": 1439918.0, + "step": 111 + }, + { + "epoch": 0.07168, + "grad_norm": 4.384817600250244, + "learning_rate": 5e-06, + "loss": 1.3427, + "mean_token_accuracy": 0.6572804600000381, + "num_tokens": 1451652.0, + "step": 112 + }, + { + "epoch": 0.07232, + "grad_norm": 3.9072980880737305, + "learning_rate": 5e-06, + "loss": 1.3317, + "mean_token_accuracy": 0.6496234610676765, + "num_tokens": 1466346.0, + "step": 113 + }, + { + "epoch": 0.07296, + "grad_norm": 3.8167197704315186, + "learning_rate": 5e-06, + "loss": 1.5108, + "mean_token_accuracy": 0.619833417236805, + "num_tokens": 1482241.0, + "step": 114 + }, + { + "epoch": 0.0736, + "grad_norm": 3.857537031173706, + "learning_rate": 5e-06, + "loss": 1.3917, + "mean_token_accuracy": 0.6645993143320084, + "num_tokens": 1495196.0, + "step": 115 + }, + { + "epoch": 0.07424, + "grad_norm": 4.024837970733643, + "learning_rate": 5e-06, + "loss": 1.3252, + "mean_token_accuracy": 0.6474068984389305, + "num_tokens": 1508711.0, + "step": 116 + }, + { + "epoch": 0.07488, + "grad_norm": 3.6451432704925537, + "learning_rate": 5e-06, + "loss": 1.3883, + "mean_token_accuracy": 0.658332034945488, + "num_tokens": 1521622.0, + "step": 117 + }, + { + "epoch": 0.07552, + "grad_norm": 3.7489166259765625, + "learning_rate": 5e-06, + "loss": 1.4337, + "mean_token_accuracy": 0.6373646706342697, + "num_tokens": 1536109.0, + "step": 118 + }, + { + "epoch": 0.07616, + "grad_norm": 4.419317245483398, + "learning_rate": 5e-06, + "loss": 1.6063, + "mean_token_accuracy": 0.5973443016409874, + "num_tokens": 1548521.0, + "step": 119 + }, + { + "epoch": 0.0768, + "grad_norm": 3.8151636123657227, + "learning_rate": 5e-06, + "loss": 1.2434, + "mean_token_accuracy": 0.6510246470570564, + "num_tokens": 1562145.0, + "step": 120 + }, + { + "epoch": 0.07744, + "grad_norm": 4.26577091217041, + "learning_rate": 5e-06, + "loss": 1.5936, + "mean_token_accuracy": 0.6028061434626579, + "num_tokens": 1575331.0, + "step": 121 + }, + { + "epoch": 0.07808, + "grad_norm": 4.482457637786865, + "learning_rate": 5e-06, + "loss": 1.3789, + "mean_token_accuracy": 0.6534934043884277, + "num_tokens": 1587643.0, + "step": 122 + }, + { + "epoch": 0.07872, + "grad_norm": 3.56472110748291, + "learning_rate": 5e-06, + "loss": 1.4365, + "mean_token_accuracy": 0.636934220790863, + "num_tokens": 1602307.0, + "step": 123 + }, + { + "epoch": 0.07936, + "grad_norm": 3.643859386444092, + "learning_rate": 5e-06, + "loss": 1.3166, + "mean_token_accuracy": 0.6632036790251732, + "num_tokens": 1616771.0, + "step": 124 + }, + { + "epoch": 0.08, + "grad_norm": 3.907698154449463, + "learning_rate": 5e-06, + "loss": 1.347, + "mean_token_accuracy": 0.6501271575689316, + "num_tokens": 1628788.0, + "step": 125 + }, + { + "epoch": 0.08064, + "grad_norm": 3.952827215194702, + "learning_rate": 5e-06, + "loss": 1.5638, + "mean_token_accuracy": 0.6289772987365723, + "num_tokens": 1643292.0, + "step": 126 + }, + { + "epoch": 0.08128, + "grad_norm": 3.829796314239502, + "learning_rate": 5e-06, + "loss": 1.4867, + "mean_token_accuracy": 0.6409079134464264, + "num_tokens": 1657859.0, + "step": 127 + }, + { + "epoch": 0.08192, + "grad_norm": 3.4832980632781982, + "learning_rate": 5e-06, + "loss": 1.1756, + "mean_token_accuracy": 0.6971960365772247, + "num_tokens": 1672891.0, + "step": 128 + }, + { + "epoch": 0.08256, + "grad_norm": 4.326021671295166, + "learning_rate": 5e-06, + "loss": 1.3608, + "mean_token_accuracy": 0.663967490196228, + "num_tokens": 1685243.0, + "step": 129 + }, + { + "epoch": 0.0832, + "grad_norm": 3.8590521812438965, + "learning_rate": 5e-06, + "loss": 1.3535, + "mean_token_accuracy": 0.6475187167525291, + "num_tokens": 1699220.0, + "step": 130 + }, + { + "epoch": 0.08384, + "grad_norm": 4.005199432373047, + "learning_rate": 5e-06, + "loss": 1.5247, + "mean_token_accuracy": 0.6049772128462791, + "num_tokens": 1711827.0, + "step": 131 + }, + { + "epoch": 0.08448, + "grad_norm": 5.4232378005981445, + "learning_rate": 5e-06, + "loss": 1.4393, + "mean_token_accuracy": 0.6452240273356438, + "num_tokens": 1722307.0, + "step": 132 + }, + { + "epoch": 0.08512, + "grad_norm": 3.7561964988708496, + "learning_rate": 5e-06, + "loss": 1.3973, + "mean_token_accuracy": 0.643314465880394, + "num_tokens": 1735760.0, + "step": 133 + }, + { + "epoch": 0.08576, + "grad_norm": 4.557453155517578, + "learning_rate": 5e-06, + "loss": 1.4625, + "mean_token_accuracy": 0.6588743627071381, + "num_tokens": 1749840.0, + "step": 134 + }, + { + "epoch": 0.0864, + "grad_norm": 4.375631809234619, + "learning_rate": 5e-06, + "loss": 1.3369, + "mean_token_accuracy": 0.6503657773137093, + "num_tokens": 1764508.0, + "step": 135 + }, + { + "epoch": 0.08704, + "grad_norm": 3.6710991859436035, + "learning_rate": 5e-06, + "loss": 1.3455, + "mean_token_accuracy": 0.6592239439487457, + "num_tokens": 1777991.0, + "step": 136 + }, + { + "epoch": 0.08768, + "grad_norm": 4.055100440979004, + "learning_rate": 5e-06, + "loss": 1.2004, + "mean_token_accuracy": 0.6970224753022194, + "num_tokens": 1790908.0, + "step": 137 + }, + { + "epoch": 0.08832, + "grad_norm": 3.4759104251861572, + "learning_rate": 5e-06, + "loss": 1.3996, + "mean_token_accuracy": 0.6468167528510094, + "num_tokens": 1807180.0, + "step": 138 + }, + { + "epoch": 0.08896, + "grad_norm": 4.201884746551514, + "learning_rate": 5e-06, + "loss": 1.36, + "mean_token_accuracy": 0.645078033208847, + "num_tokens": 1819408.0, + "step": 139 + }, + { + "epoch": 0.0896, + "grad_norm": 4.253586769104004, + "learning_rate": 5e-06, + "loss": 1.555, + "mean_token_accuracy": 0.6383631229400635, + "num_tokens": 1832773.0, + "step": 140 + }, + { + "epoch": 0.09024, + "grad_norm": 3.354541063308716, + "learning_rate": 5e-06, + "loss": 1.2203, + "mean_token_accuracy": 0.6814669519662857, + "num_tokens": 1848152.0, + "step": 141 + }, + { + "epoch": 0.09088, + "grad_norm": 3.436411142349243, + "learning_rate": 5e-06, + "loss": 1.4883, + "mean_token_accuracy": 0.6492328196763992, + "num_tokens": 1863701.0, + "step": 142 + }, + { + "epoch": 0.09152, + "grad_norm": 4.413644790649414, + "learning_rate": 5e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.7075678631663322, + "num_tokens": 1872939.0, + "step": 143 + }, + { + "epoch": 0.09216, + "grad_norm": 5.079326152801514, + "learning_rate": 5e-06, + "loss": 1.3153, + "mean_token_accuracy": 0.6815094351768494, + "num_tokens": 1881487.0, + "step": 144 + }, + { + "epoch": 0.0928, + "grad_norm": 4.065243721008301, + "learning_rate": 5e-06, + "loss": 1.3596, + "mean_token_accuracy": 0.6275613754987717, + "num_tokens": 1892857.0, + "step": 145 + }, + { + "epoch": 0.09344, + "grad_norm": 3.9777028560638428, + "learning_rate": 5e-06, + "loss": 1.1708, + "mean_token_accuracy": 0.7008514180779457, + "num_tokens": 1905468.0, + "step": 146 + }, + { + "epoch": 0.09408, + "grad_norm": 3.9590489864349365, + "learning_rate": 5e-06, + "loss": 1.3645, + "mean_token_accuracy": 0.645123079419136, + "num_tokens": 1921339.0, + "step": 147 + }, + { + "epoch": 0.09472, + "grad_norm": 4.232624053955078, + "learning_rate": 5e-06, + "loss": 1.4376, + "mean_token_accuracy": 0.6167034581303596, + "num_tokens": 1933922.0, + "step": 148 + }, + { + "epoch": 0.09536, + "grad_norm": 4.538359642028809, + "learning_rate": 5e-06, + "loss": 1.2695, + "mean_token_accuracy": 0.6443182751536369, + "num_tokens": 1946327.0, + "step": 149 + }, + { + "epoch": 0.096, + "grad_norm": 3.987658977508545, + "learning_rate": 5e-06, + "loss": 1.165, + "mean_token_accuracy": 0.700744241476059, + "num_tokens": 1958145.0, + "step": 150 + }, + { + "epoch": 0.09664, + "grad_norm": 5.451640605926514, + "learning_rate": 5e-06, + "loss": 1.1847, + "mean_token_accuracy": 0.6844369322061539, + "num_tokens": 1968466.0, + "step": 151 + }, + { + "epoch": 0.09728, + "grad_norm": 3.7554731369018555, + "learning_rate": 5e-06, + "loss": 1.3363, + "mean_token_accuracy": 0.6585431769490242, + "num_tokens": 1981386.0, + "step": 152 + }, + { + "epoch": 0.09792, + "grad_norm": 3.601236581802368, + "learning_rate": 5e-06, + "loss": 1.3988, + "mean_token_accuracy": 0.6534986943006516, + "num_tokens": 1995319.0, + "step": 153 + }, + { + "epoch": 0.09856, + "grad_norm": 3.569467306137085, + "learning_rate": 5e-06, + "loss": 1.1972, + "mean_token_accuracy": 0.6820317879319191, + "num_tokens": 2008019.0, + "step": 154 + }, + { + "epoch": 0.0992, + "grad_norm": 3.896125078201294, + "learning_rate": 5e-06, + "loss": 1.3651, + "mean_token_accuracy": 0.6400049701333046, + "num_tokens": 2021300.0, + "step": 155 + }, + { + "epoch": 0.09984, + "grad_norm": 3.486210584640503, + "learning_rate": 5e-06, + "loss": 1.3398, + "mean_token_accuracy": 0.6543328985571861, + "num_tokens": 2033964.0, + "step": 156 + }, + { + "epoch": 0.10048, + "grad_norm": 3.03397274017334, + "learning_rate": 5e-06, + "loss": 1.4379, + "mean_token_accuracy": 0.6368994787335396, + "num_tokens": 2051392.0, + "step": 157 + }, + { + "epoch": 0.10112, + "grad_norm": 3.8133559226989746, + "learning_rate": 5e-06, + "loss": 1.4191, + "mean_token_accuracy": 0.6660285517573357, + "num_tokens": 2063899.0, + "step": 158 + }, + { + "epoch": 0.10176, + "grad_norm": 2.894871234893799, + "learning_rate": 5e-06, + "loss": 1.149, + "mean_token_accuracy": 0.687338799238205, + "num_tokens": 2081505.0, + "step": 159 + }, + { + "epoch": 0.1024, + "grad_norm": 4.369359016418457, + "learning_rate": 5e-06, + "loss": 1.3478, + "mean_token_accuracy": 0.6496308445930481, + "num_tokens": 2092604.0, + "step": 160 + }, + { + "epoch": 0.10304, + "grad_norm": 4.516582489013672, + "learning_rate": 5e-06, + "loss": 1.3535, + "mean_token_accuracy": 0.6436882838606834, + "num_tokens": 2103256.0, + "step": 161 + }, + { + "epoch": 0.10368, + "grad_norm": 3.317488431930542, + "learning_rate": 5e-06, + "loss": 1.3131, + "mean_token_accuracy": 0.6673252284526825, + "num_tokens": 2119060.0, + "step": 162 + }, + { + "epoch": 0.10432, + "grad_norm": 4.195248603820801, + "learning_rate": 5e-06, + "loss": 1.5371, + "mean_token_accuracy": 0.6269242167472839, + "num_tokens": 2131564.0, + "step": 163 + }, + { + "epoch": 0.10496, + "grad_norm": 4.055263042449951, + "learning_rate": 5e-06, + "loss": 1.2917, + "mean_token_accuracy": 0.672488197684288, + "num_tokens": 2144473.0, + "step": 164 + }, + { + "epoch": 0.1056, + "grad_norm": 3.9197511672973633, + "learning_rate": 5e-06, + "loss": 1.3064, + "mean_token_accuracy": 0.664051964879036, + "num_tokens": 2157277.0, + "step": 165 + }, + { + "epoch": 0.10624, + "grad_norm": 4.073387145996094, + "learning_rate": 5e-06, + "loss": 1.3085, + "mean_token_accuracy": 0.6389222107827663, + "num_tokens": 2168765.0, + "step": 166 + }, + { + "epoch": 0.10688, + "grad_norm": 3.508542060852051, + "learning_rate": 5e-06, + "loss": 1.2401, + "mean_token_accuracy": 0.6622222438454628, + "num_tokens": 2182480.0, + "step": 167 + }, + { + "epoch": 0.10752, + "grad_norm": 5.038687229156494, + "learning_rate": 5e-06, + "loss": 1.3458, + "mean_token_accuracy": 0.6548017933964729, + "num_tokens": 2192216.0, + "step": 168 + }, + { + "epoch": 0.10816, + "grad_norm": 3.743532180786133, + "learning_rate": 5e-06, + "loss": 1.3079, + "mean_token_accuracy": 0.675239585340023, + "num_tokens": 2205231.0, + "step": 169 + }, + { + "epoch": 0.1088, + "grad_norm": 3.9550719261169434, + "learning_rate": 5e-06, + "loss": 1.4297, + "mean_token_accuracy": 0.6310381144285202, + "num_tokens": 2219210.0, + "step": 170 + }, + { + "epoch": 0.10944, + "grad_norm": 3.988621950149536, + "learning_rate": 5e-06, + "loss": 1.2593, + "mean_token_accuracy": 0.6804088428616524, + "num_tokens": 2232718.0, + "step": 171 + }, + { + "epoch": 0.11008, + "grad_norm": 4.214746475219727, + "learning_rate": 5e-06, + "loss": 1.1987, + "mean_token_accuracy": 0.6554268151521683, + "num_tokens": 2244509.0, + "step": 172 + }, + { + "epoch": 0.11072, + "grad_norm": 4.047118186950684, + "learning_rate": 5e-06, + "loss": 1.3908, + "mean_token_accuracy": 0.6510942876338959, + "num_tokens": 2256799.0, + "step": 173 + }, + { + "epoch": 0.11136, + "grad_norm": 4.169956207275391, + "learning_rate": 5e-06, + "loss": 1.1637, + "mean_token_accuracy": 0.6976971700787544, + "num_tokens": 2267854.0, + "step": 174 + }, + { + "epoch": 0.112, + "grad_norm": 4.0025434494018555, + "learning_rate": 5e-06, + "loss": 1.2681, + "mean_token_accuracy": 0.6660499349236488, + "num_tokens": 2281207.0, + "step": 175 + }, + { + "epoch": 0.11264, + "grad_norm": 3.6148102283477783, + "learning_rate": 5e-06, + "loss": 1.0893, + "mean_token_accuracy": 0.6946917325258255, + "num_tokens": 2294364.0, + "step": 176 + }, + { + "epoch": 0.11328, + "grad_norm": 4.246650695800781, + "learning_rate": 5e-06, + "loss": 1.3055, + "mean_token_accuracy": 0.6432985439896584, + "num_tokens": 2304580.0, + "step": 177 + }, + { + "epoch": 0.11392, + "grad_norm": 3.6579151153564453, + "learning_rate": 5e-06, + "loss": 1.3814, + "mean_token_accuracy": 0.6313494071364403, + "num_tokens": 2319903.0, + "step": 178 + }, + { + "epoch": 0.11456, + "grad_norm": 3.988365411758423, + "learning_rate": 5e-06, + "loss": 1.1713, + "mean_token_accuracy": 0.6748137697577477, + "num_tokens": 2334193.0, + "step": 179 + }, + { + "epoch": 0.1152, + "grad_norm": 4.839256286621094, + "learning_rate": 5e-06, + "loss": 1.6099, + "mean_token_accuracy": 0.642399325966835, + "num_tokens": 2344137.0, + "step": 180 + }, + { + "epoch": 0.11584, + "grad_norm": 3.8175253868103027, + "learning_rate": 5e-06, + "loss": 1.3848, + "mean_token_accuracy": 0.6434383615851402, + "num_tokens": 2356787.0, + "step": 181 + }, + { + "epoch": 0.11648, + "grad_norm": 4.244999885559082, + "learning_rate": 5e-06, + "loss": 1.3926, + "mean_token_accuracy": 0.6472559943795204, + "num_tokens": 2369124.0, + "step": 182 + }, + { + "epoch": 0.11712, + "grad_norm": 3.850306749343872, + "learning_rate": 5e-06, + "loss": 1.266, + "mean_token_accuracy": 0.6437094509601593, + "num_tokens": 2383314.0, + "step": 183 + }, + { + "epoch": 0.11776, + "grad_norm": 5.292626857757568, + "learning_rate": 5e-06, + "loss": 1.7328, + "mean_token_accuracy": 0.5816368944942951, + "num_tokens": 2392822.0, + "step": 184 + }, + { + "epoch": 0.1184, + "grad_norm": 4.827669620513916, + "learning_rate": 5e-06, + "loss": 1.6344, + "mean_token_accuracy": 0.6154987290501595, + "num_tokens": 2404362.0, + "step": 185 + }, + { + "epoch": 0.11904, + "grad_norm": 4.1474995613098145, + "learning_rate": 5e-06, + "loss": 1.3776, + "mean_token_accuracy": 0.6394111067056656, + "num_tokens": 2415206.0, + "step": 186 + }, + { + "epoch": 0.11968, + "grad_norm": 4.1867995262146, + "learning_rate": 5e-06, + "loss": 1.1555, + "mean_token_accuracy": 0.6839761063456535, + "num_tokens": 2428366.0, + "step": 187 + }, + { + "epoch": 0.12032, + "grad_norm": 3.8448567390441895, + "learning_rate": 5e-06, + "loss": 1.3755, + "mean_token_accuracy": 0.6562356427311897, + "num_tokens": 2440401.0, + "step": 188 + }, + { + "epoch": 0.12096, + "grad_norm": 3.82326078414917, + "learning_rate": 5e-06, + "loss": 1.2447, + "mean_token_accuracy": 0.651421345770359, + "num_tokens": 2453194.0, + "step": 189 + }, + { + "epoch": 0.1216, + "grad_norm": 3.8324315547943115, + "learning_rate": 5e-06, + "loss": 1.4677, + "mean_token_accuracy": 0.6340256333351135, + "num_tokens": 2466328.0, + "step": 190 + }, + { + "epoch": 0.12224, + "grad_norm": 3.4532899856567383, + "learning_rate": 5e-06, + "loss": 1.3832, + "mean_token_accuracy": 0.6311789453029633, + "num_tokens": 2480165.0, + "step": 191 + }, + { + "epoch": 0.12288, + "grad_norm": 6.352081298828125, + "learning_rate": 5e-06, + "loss": 1.5605, + "mean_token_accuracy": 0.6426242366433144, + "num_tokens": 2489563.0, + "step": 192 + }, + { + "epoch": 0.12352, + "grad_norm": 3.9290707111358643, + "learning_rate": 5e-06, + "loss": 1.4923, + "mean_token_accuracy": 0.621891662478447, + "num_tokens": 2503752.0, + "step": 193 + }, + { + "epoch": 0.12416, + "grad_norm": 3.5599541664123535, + "learning_rate": 5e-06, + "loss": 1.658, + "mean_token_accuracy": 0.5923861265182495, + "num_tokens": 2517941.0, + "step": 194 + }, + { + "epoch": 0.1248, + "grad_norm": 4.907262802124023, + "learning_rate": 5e-06, + "loss": 1.3243, + "mean_token_accuracy": 0.684236004948616, + "num_tokens": 2531694.0, + "step": 195 + }, + { + "epoch": 0.12544, + "grad_norm": 3.895585298538208, + "learning_rate": 5e-06, + "loss": 1.2594, + "mean_token_accuracy": 0.6675282418727875, + "num_tokens": 2543318.0, + "step": 196 + }, + { + "epoch": 0.12608, + "grad_norm": 3.7483768463134766, + "learning_rate": 5e-06, + "loss": 1.3555, + "mean_token_accuracy": 0.6702639237046242, + "num_tokens": 2555858.0, + "step": 197 + }, + { + "epoch": 0.12672, + "grad_norm": 3.980715751647949, + "learning_rate": 5e-06, + "loss": 1.3247, + "mean_token_accuracy": 0.654958538711071, + "num_tokens": 2570848.0, + "step": 198 + }, + { + "epoch": 0.12736, + "grad_norm": 3.402679443359375, + "learning_rate": 5e-06, + "loss": 1.4151, + "mean_token_accuracy": 0.639847457408905, + "num_tokens": 2586951.0, + "step": 199 + }, + { + "epoch": 0.128, + "grad_norm": 3.603440284729004, + "learning_rate": 5e-06, + "loss": 1.138, + "mean_token_accuracy": 0.6972187757492065, + "num_tokens": 2600380.0, + "step": 200 + }, + { + "epoch": 0.12864, + "grad_norm": 4.226911544799805, + "learning_rate": 5e-06, + "loss": 1.2305, + "mean_token_accuracy": 0.6777093335986137, + "num_tokens": 2612169.0, + "step": 201 + }, + { + "epoch": 0.12928, + "grad_norm": 4.133816719055176, + "learning_rate": 5e-06, + "loss": 1.5127, + "mean_token_accuracy": 0.645031102001667, + "num_tokens": 2625681.0, + "step": 202 + }, + { + "epoch": 0.12992, + "grad_norm": 4.464379787445068, + "learning_rate": 5e-06, + "loss": 1.3419, + "mean_token_accuracy": 0.655508816242218, + "num_tokens": 2638449.0, + "step": 203 + }, + { + "epoch": 0.13056, + "grad_norm": 3.691314697265625, + "learning_rate": 5e-06, + "loss": 1.4329, + "mean_token_accuracy": 0.6271785870194435, + "num_tokens": 2651404.0, + "step": 204 + }, + { + "epoch": 0.1312, + "grad_norm": 3.735065460205078, + "learning_rate": 5e-06, + "loss": 1.259, + "mean_token_accuracy": 0.6792504116892815, + "num_tokens": 2663577.0, + "step": 205 + }, + { + "epoch": 0.13184, + "grad_norm": 3.8141613006591797, + "learning_rate": 5e-06, + "loss": 1.3812, + "mean_token_accuracy": 0.6525682806968689, + "num_tokens": 2675704.0, + "step": 206 + }, + { + "epoch": 0.13248, + "grad_norm": 4.096824645996094, + "learning_rate": 5e-06, + "loss": 1.4175, + "mean_token_accuracy": 0.6340369358658791, + "num_tokens": 2687284.0, + "step": 207 + }, + { + "epoch": 0.13312, + "grad_norm": 4.180744171142578, + "learning_rate": 5e-06, + "loss": 1.37, + "mean_token_accuracy": 0.6400524824857712, + "num_tokens": 2700343.0, + "step": 208 + }, + { + "epoch": 0.13376, + "grad_norm": 4.275300979614258, + "learning_rate": 5e-06, + "loss": 1.3739, + "mean_token_accuracy": 0.6336183995008469, + "num_tokens": 2713760.0, + "step": 209 + }, + { + "epoch": 0.1344, + "grad_norm": 3.547708511352539, + "learning_rate": 5e-06, + "loss": 1.4059, + "mean_token_accuracy": 0.6354609504342079, + "num_tokens": 2728045.0, + "step": 210 + }, + { + "epoch": 0.13504, + "grad_norm": 4.222541809082031, + "learning_rate": 5e-06, + "loss": 1.3929, + "mean_token_accuracy": 0.6449902206659317, + "num_tokens": 2741308.0, + "step": 211 + }, + { + "epoch": 0.13568, + "grad_norm": 3.930753707885742, + "learning_rate": 5e-06, + "loss": 1.5346, + "mean_token_accuracy": 0.6030523180961609, + "num_tokens": 2754620.0, + "step": 212 + }, + { + "epoch": 0.13632, + "grad_norm": 3.6813647747039795, + "learning_rate": 5e-06, + "loss": 1.3267, + "mean_token_accuracy": 0.6522213146090508, + "num_tokens": 2768180.0, + "step": 213 + }, + { + "epoch": 0.13696, + "grad_norm": 4.064117431640625, + "learning_rate": 5e-06, + "loss": 1.3081, + "mean_token_accuracy": 0.6697950512170792, + "num_tokens": 2781050.0, + "step": 214 + }, + { + "epoch": 0.1376, + "grad_norm": 3.927386522293091, + "learning_rate": 5e-06, + "loss": 1.5213, + "mean_token_accuracy": 0.6275500729680061, + "num_tokens": 2792644.0, + "step": 215 + }, + { + "epoch": 0.13824, + "grad_norm": 3.762558937072754, + "learning_rate": 5e-06, + "loss": 1.339, + "mean_token_accuracy": 0.6525787115097046, + "num_tokens": 2805857.0, + "step": 216 + }, + { + "epoch": 0.13888, + "grad_norm": 3.3911473751068115, + "learning_rate": 5e-06, + "loss": 1.3861, + "mean_token_accuracy": 0.646359771490097, + "num_tokens": 2822403.0, + "step": 217 + }, + { + "epoch": 0.13952, + "grad_norm": 3.3811612129211426, + "learning_rate": 5e-06, + "loss": 1.4414, + "mean_token_accuracy": 0.6381309777498245, + "num_tokens": 2836579.0, + "step": 218 + }, + { + "epoch": 0.14016, + "grad_norm": 3.9682304859161377, + "learning_rate": 5e-06, + "loss": 1.4728, + "mean_token_accuracy": 0.611208513379097, + "num_tokens": 2848833.0, + "step": 219 + }, + { + "epoch": 0.1408, + "grad_norm": 4.066648483276367, + "learning_rate": 5e-06, + "loss": 1.3081, + "mean_token_accuracy": 0.6451118811964989, + "num_tokens": 2859672.0, + "step": 220 + }, + { + "epoch": 0.14144, + "grad_norm": 3.577544927597046, + "learning_rate": 5e-06, + "loss": 1.4143, + "mean_token_accuracy": 0.6415835171937943, + "num_tokens": 2875418.0, + "step": 221 + }, + { + "epoch": 0.14208, + "grad_norm": 3.8373844623565674, + "learning_rate": 5e-06, + "loss": 1.1867, + "mean_token_accuracy": 0.6628148853778839, + "num_tokens": 2886843.0, + "step": 222 + }, + { + "epoch": 0.14272, + "grad_norm": 3.243741273880005, + "learning_rate": 5e-06, + "loss": 1.3639, + "mean_token_accuracy": 0.6402468308806419, + "num_tokens": 2903000.0, + "step": 223 + }, + { + "epoch": 0.14336, + "grad_norm": 3.6917643547058105, + "learning_rate": 5e-06, + "loss": 1.4826, + "mean_token_accuracy": 0.6137516796588898, + "num_tokens": 2916086.0, + "step": 224 + }, + { + "epoch": 0.144, + "grad_norm": 3.6961069107055664, + "learning_rate": 5e-06, + "loss": 1.3914, + "mean_token_accuracy": 0.6221867948770523, + "num_tokens": 2928082.0, + "step": 225 + }, + { + "epoch": 0.14464, + "grad_norm": 3.3489155769348145, + "learning_rate": 5e-06, + "loss": 1.2829, + "mean_token_accuracy": 0.6520458236336708, + "num_tokens": 2941972.0, + "step": 226 + }, + { + "epoch": 0.14528, + "grad_norm": 3.9291248321533203, + "learning_rate": 5e-06, + "loss": 1.4375, + "mean_token_accuracy": 0.6534432545304298, + "num_tokens": 2954326.0, + "step": 227 + }, + { + "epoch": 0.14592, + "grad_norm": 4.408154487609863, + "learning_rate": 5e-06, + "loss": 1.37, + "mean_token_accuracy": 0.6362905651330948, + "num_tokens": 2964773.0, + "step": 228 + }, + { + "epoch": 0.14656, + "grad_norm": 3.3480911254882812, + "learning_rate": 5e-06, + "loss": 1.2627, + "mean_token_accuracy": 0.664552852511406, + "num_tokens": 2979164.0, + "step": 229 + }, + { + "epoch": 0.1472, + "grad_norm": 3.5520999431610107, + "learning_rate": 5e-06, + "loss": 1.2172, + "mean_token_accuracy": 0.677513062953949, + "num_tokens": 2993457.0, + "step": 230 + }, + { + "epoch": 0.14784, + "grad_norm": 3.3027398586273193, + "learning_rate": 5e-06, + "loss": 1.4642, + "mean_token_accuracy": 0.6177601739764214, + "num_tokens": 3007665.0, + "step": 231 + }, + { + "epoch": 0.14848, + "grad_norm": 3.64074444770813, + "learning_rate": 5e-06, + "loss": 1.2464, + "mean_token_accuracy": 0.662057913839817, + "num_tokens": 3020550.0, + "step": 232 + }, + { + "epoch": 0.14912, + "grad_norm": 3.9199254512786865, + "learning_rate": 5e-06, + "loss": 1.3942, + "mean_token_accuracy": 0.6174388378858566, + "num_tokens": 3034384.0, + "step": 233 + }, + { + "epoch": 0.14976, + "grad_norm": 4.028416633605957, + "learning_rate": 5e-06, + "loss": 1.2443, + "mean_token_accuracy": 0.6706736162304878, + "num_tokens": 3046491.0, + "step": 234 + }, + { + "epoch": 0.1504, + "grad_norm": 3.4330265522003174, + "learning_rate": 5e-06, + "loss": 1.4317, + "mean_token_accuracy": 0.6397150233387947, + "num_tokens": 3061547.0, + "step": 235 + }, + { + "epoch": 0.15104, + "grad_norm": 4.62261438369751, + "learning_rate": 5e-06, + "loss": 1.4654, + "mean_token_accuracy": 0.6216901019215584, + "num_tokens": 3073316.0, + "step": 236 + }, + { + "epoch": 0.15168, + "grad_norm": 3.8148386478424072, + "learning_rate": 5e-06, + "loss": 1.3572, + "mean_token_accuracy": 0.6381306573748589, + "num_tokens": 3086074.0, + "step": 237 + }, + { + "epoch": 0.15232, + "grad_norm": 3.6774654388427734, + "learning_rate": 5e-06, + "loss": 1.2743, + "mean_token_accuracy": 0.6746890023350716, + "num_tokens": 3099218.0, + "step": 238 + }, + { + "epoch": 0.15296, + "grad_norm": 3.8915648460388184, + "learning_rate": 5e-06, + "loss": 1.3005, + "mean_token_accuracy": 0.6652230620384216, + "num_tokens": 3113283.0, + "step": 239 + }, + { + "epoch": 0.1536, + "grad_norm": 3.641663074493408, + "learning_rate": 5e-06, + "loss": 1.4299, + "mean_token_accuracy": 0.6305139660835266, + "num_tokens": 3127092.0, + "step": 240 + }, + { + "epoch": 0.15424, + "grad_norm": 3.9802157878875732, + "learning_rate": 5e-06, + "loss": 1.3628, + "mean_token_accuracy": 0.6499741598963737, + "num_tokens": 3137977.0, + "step": 241 + }, + { + "epoch": 0.15488, + "grad_norm": 3.3519856929779053, + "learning_rate": 5e-06, + "loss": 1.3649, + "mean_token_accuracy": 0.6538999378681183, + "num_tokens": 3153296.0, + "step": 242 + }, + { + "epoch": 0.15552, + "grad_norm": 3.9312145709991455, + "learning_rate": 5e-06, + "loss": 1.1886, + "mean_token_accuracy": 0.687839575111866, + "num_tokens": 3165430.0, + "step": 243 + }, + { + "epoch": 0.15616, + "grad_norm": 3.9684488773345947, + "learning_rate": 5e-06, + "loss": 1.392, + "mean_token_accuracy": 0.629355788230896, + "num_tokens": 3176799.0, + "step": 244 + }, + { + "epoch": 0.1568, + "grad_norm": 3.610091209411621, + "learning_rate": 5e-06, + "loss": 1.3166, + "mean_token_accuracy": 0.6479083597660065, + "num_tokens": 3190258.0, + "step": 245 + }, + { + "epoch": 0.15744, + "grad_norm": 3.921807289123535, + "learning_rate": 5e-06, + "loss": 1.1064, + "mean_token_accuracy": 0.7143979370594025, + "num_tokens": 3201789.0, + "step": 246 + }, + { + "epoch": 0.15808, + "grad_norm": 3.4888627529144287, + "learning_rate": 5e-06, + "loss": 1.2273, + "mean_token_accuracy": 0.6818583980202675, + "num_tokens": 3214773.0, + "step": 247 + }, + { + "epoch": 0.15872, + "grad_norm": 3.9141690731048584, + "learning_rate": 5e-06, + "loss": 1.2463, + "mean_token_accuracy": 0.6758697032928467, + "num_tokens": 3226302.0, + "step": 248 + }, + { + "epoch": 0.15936, + "grad_norm": 3.585526943206787, + "learning_rate": 5e-06, + "loss": 1.29, + "mean_token_accuracy": 0.6522084772586823, + "num_tokens": 3239487.0, + "step": 249 + }, + { + "epoch": 0.16, + "grad_norm": 2.9985756874084473, + "learning_rate": 5e-06, + "loss": 1.2875, + "mean_token_accuracy": 0.665367841720581, + "num_tokens": 3254553.0, + "step": 250 + }, + { + "epoch": 0.16064, + "grad_norm": 4.460598945617676, + "learning_rate": 5e-06, + "loss": 1.413, + "mean_token_accuracy": 0.6606506556272507, + "num_tokens": 3266374.0, + "step": 251 + }, + { + "epoch": 0.16128, + "grad_norm": 3.867008686065674, + "learning_rate": 5e-06, + "loss": 1.4733, + "mean_token_accuracy": 0.6357235088944435, + "num_tokens": 3278642.0, + "step": 252 + }, + { + "epoch": 0.16192, + "grad_norm": 3.6840028762817383, + "learning_rate": 5e-06, + "loss": 1.3735, + "mean_token_accuracy": 0.6500705629587173, + "num_tokens": 3292643.0, + "step": 253 + }, + { + "epoch": 0.16256, + "grad_norm": 3.631727933883667, + "learning_rate": 5e-06, + "loss": 1.3561, + "mean_token_accuracy": 0.6603741720318794, + "num_tokens": 3308572.0, + "step": 254 + }, + { + "epoch": 0.1632, + "grad_norm": 3.8139543533325195, + "learning_rate": 5e-06, + "loss": 1.4079, + "mean_token_accuracy": 0.6566642299294472, + "num_tokens": 3321852.0, + "step": 255 + }, + { + "epoch": 0.16384, + "grad_norm": 4.278744697570801, + "learning_rate": 5e-06, + "loss": 1.3128, + "mean_token_accuracy": 0.6340290307998657, + "num_tokens": 3333364.0, + "step": 256 + }, + { + "epoch": 0.16448, + "grad_norm": 3.855288505554199, + "learning_rate": 5e-06, + "loss": 1.2726, + "mean_token_accuracy": 0.6573414877057076, + "num_tokens": 3346153.0, + "step": 257 + }, + { + "epoch": 0.16512, + "grad_norm": 3.894836187362671, + "learning_rate": 5e-06, + "loss": 1.5052, + "mean_token_accuracy": 0.6395176202058792, + "num_tokens": 3357803.0, + "step": 258 + }, + { + "epoch": 0.16576, + "grad_norm": 3.7376608848571777, + "learning_rate": 5e-06, + "loss": 1.3856, + "mean_token_accuracy": 0.6377875059843063, + "num_tokens": 3370640.0, + "step": 259 + }, + { + "epoch": 0.1664, + "grad_norm": 3.66434907913208, + "learning_rate": 5e-06, + "loss": 1.2933, + "mean_token_accuracy": 0.6526513993740082, + "num_tokens": 3384626.0, + "step": 260 + }, + { + "epoch": 0.16704, + "grad_norm": 4.31889533996582, + "learning_rate": 5e-06, + "loss": 1.4037, + "mean_token_accuracy": 0.6519733518362045, + "num_tokens": 3396351.0, + "step": 261 + }, + { + "epoch": 0.16768, + "grad_norm": 4.194382667541504, + "learning_rate": 5e-06, + "loss": 1.2248, + "mean_token_accuracy": 0.6719919368624687, + "num_tokens": 3410809.0, + "step": 262 + }, + { + "epoch": 0.16832, + "grad_norm": 5.298657417297363, + "learning_rate": 5e-06, + "loss": 1.2344, + "mean_token_accuracy": 0.6541409119963646, + "num_tokens": 3421666.0, + "step": 263 + }, + { + "epoch": 0.16896, + "grad_norm": 3.7578792572021484, + "learning_rate": 5e-06, + "loss": 1.4221, + "mean_token_accuracy": 0.6374265551567078, + "num_tokens": 3435240.0, + "step": 264 + }, + { + "epoch": 0.1696, + "grad_norm": 4.36591100692749, + "learning_rate": 5e-06, + "loss": 1.3996, + "mean_token_accuracy": 0.6582349985837936, + "num_tokens": 3447417.0, + "step": 265 + }, + { + "epoch": 0.17024, + "grad_norm": 4.242166042327881, + "learning_rate": 5e-06, + "loss": 1.2213, + "mean_token_accuracy": 0.6886605694890022, + "num_tokens": 3457202.0, + "step": 266 + }, + { + "epoch": 0.17088, + "grad_norm": 4.421549320220947, + "learning_rate": 5e-06, + "loss": 1.4154, + "mean_token_accuracy": 0.6361653730273247, + "num_tokens": 3470888.0, + "step": 267 + }, + { + "epoch": 0.17152, + "grad_norm": 3.4272501468658447, + "learning_rate": 5e-06, + "loss": 1.4722, + "mean_token_accuracy": 0.617170162498951, + "num_tokens": 3483711.0, + "step": 268 + }, + { + "epoch": 0.17216, + "grad_norm": 4.099259853363037, + "learning_rate": 5e-06, + "loss": 1.3181, + "mean_token_accuracy": 0.6635381802916527, + "num_tokens": 3494261.0, + "step": 269 + }, + { + "epoch": 0.1728, + "grad_norm": 3.460908889770508, + "learning_rate": 5e-06, + "loss": 1.2027, + "mean_token_accuracy": 0.6816031113266945, + "num_tokens": 3508416.0, + "step": 270 + }, + { + "epoch": 0.17344, + "grad_norm": 4.011609077453613, + "learning_rate": 5e-06, + "loss": 1.2527, + "mean_token_accuracy": 0.6691607385873795, + "num_tokens": 3521566.0, + "step": 271 + }, + { + "epoch": 0.17408, + "grad_norm": 4.310615062713623, + "learning_rate": 5e-06, + "loss": 1.5243, + "mean_token_accuracy": 0.606864832341671, + "num_tokens": 3532437.0, + "step": 272 + }, + { + "epoch": 0.17472, + "grad_norm": 3.865201950073242, + "learning_rate": 5e-06, + "loss": 1.3655, + "mean_token_accuracy": 0.6517080217599869, + "num_tokens": 3544654.0, + "step": 273 + }, + { + "epoch": 0.17536, + "grad_norm": 3.779001235961914, + "learning_rate": 5e-06, + "loss": 1.5361, + "mean_token_accuracy": 0.6139826104044914, + "num_tokens": 3560143.0, + "step": 274 + }, + { + "epoch": 0.176, + "grad_norm": 3.909745454788208, + "learning_rate": 5e-06, + "loss": 1.2, + "mean_token_accuracy": 0.6911701187491417, + "num_tokens": 3571846.0, + "step": 275 + }, + { + "epoch": 0.17664, + "grad_norm": 4.487984657287598, + "learning_rate": 5e-06, + "loss": 1.307, + "mean_token_accuracy": 0.6519964337348938, + "num_tokens": 3583280.0, + "step": 276 + }, + { + "epoch": 0.17728, + "grad_norm": 4.58504056930542, + "learning_rate": 5e-06, + "loss": 1.4673, + "mean_token_accuracy": 0.6244921982288361, + "num_tokens": 3593797.0, + "step": 277 + }, + { + "epoch": 0.17792, + "grad_norm": 3.6989223957061768, + "learning_rate": 5e-06, + "loss": 1.4841, + "mean_token_accuracy": 0.6436078920960426, + "num_tokens": 3606077.0, + "step": 278 + }, + { + "epoch": 0.17856, + "grad_norm": 3.5363776683807373, + "learning_rate": 5e-06, + "loss": 1.3562, + "mean_token_accuracy": 0.6404093876481056, + "num_tokens": 3619274.0, + "step": 279 + }, + { + "epoch": 0.1792, + "grad_norm": 3.5803604125976562, + "learning_rate": 5e-06, + "loss": 1.2417, + "mean_token_accuracy": 0.6808914020657539, + "num_tokens": 3631353.0, + "step": 280 + }, + { + "epoch": 0.17984, + "grad_norm": 3.8783459663391113, + "learning_rate": 5e-06, + "loss": 1.3802, + "mean_token_accuracy": 0.6372303292155266, + "num_tokens": 3645098.0, + "step": 281 + }, + { + "epoch": 0.18048, + "grad_norm": 4.057406425476074, + "learning_rate": 5e-06, + "loss": 1.2089, + "mean_token_accuracy": 0.6749606877565384, + "num_tokens": 3657936.0, + "step": 282 + }, + { + "epoch": 0.18112, + "grad_norm": 3.0335772037506104, + "learning_rate": 5e-06, + "loss": 1.1465, + "mean_token_accuracy": 0.6885220557451248, + "num_tokens": 3672249.0, + "step": 283 + }, + { + "epoch": 0.18176, + "grad_norm": 3.654318332672119, + "learning_rate": 5e-06, + "loss": 1.2322, + "mean_token_accuracy": 0.6707694306969643, + "num_tokens": 3685850.0, + "step": 284 + }, + { + "epoch": 0.1824, + "grad_norm": 3.4704298973083496, + "learning_rate": 5e-06, + "loss": 1.1906, + "mean_token_accuracy": 0.6730613932013512, + "num_tokens": 3699407.0, + "step": 285 + }, + { + "epoch": 0.18304, + "grad_norm": 4.028052806854248, + "learning_rate": 5e-06, + "loss": 1.511, + "mean_token_accuracy": 0.6216867938637733, + "num_tokens": 3711240.0, + "step": 286 + }, + { + "epoch": 0.18368, + "grad_norm": 3.9164350032806396, + "learning_rate": 5e-06, + "loss": 1.2674, + "mean_token_accuracy": 0.6724821552634239, + "num_tokens": 3723872.0, + "step": 287 + }, + { + "epoch": 0.18432, + "grad_norm": 4.470592498779297, + "learning_rate": 5e-06, + "loss": 1.3975, + "mean_token_accuracy": 0.6604571491479874, + "num_tokens": 3737013.0, + "step": 288 + }, + { + "epoch": 0.18496, + "grad_norm": 3.5540971755981445, + "learning_rate": 5e-06, + "loss": 1.1055, + "mean_token_accuracy": 0.683054082095623, + "num_tokens": 3750893.0, + "step": 289 + }, + { + "epoch": 0.1856, + "grad_norm": 3.6694583892822266, + "learning_rate": 5e-06, + "loss": 1.4439, + "mean_token_accuracy": 0.6296076104044914, + "num_tokens": 3763965.0, + "step": 290 + }, + { + "epoch": 0.18624, + "grad_norm": 4.5381059646606445, + "learning_rate": 5e-06, + "loss": 1.4017, + "mean_token_accuracy": 0.646364264190197, + "num_tokens": 3774483.0, + "step": 291 + }, + { + "epoch": 0.18688, + "grad_norm": 3.607478141784668, + "learning_rate": 5e-06, + "loss": 1.5724, + "mean_token_accuracy": 0.6356127932667732, + "num_tokens": 3791198.0, + "step": 292 + }, + { + "epoch": 0.18752, + "grad_norm": 3.7672901153564453, + "learning_rate": 5e-06, + "loss": 1.5793, + "mean_token_accuracy": 0.6113990694284439, + "num_tokens": 3805077.0, + "step": 293 + }, + { + "epoch": 0.18816, + "grad_norm": 3.517371892929077, + "learning_rate": 5e-06, + "loss": 1.3182, + "mean_token_accuracy": 0.6544737070798874, + "num_tokens": 3819471.0, + "step": 294 + }, + { + "epoch": 0.1888, + "grad_norm": 3.6588094234466553, + "learning_rate": 5e-06, + "loss": 1.1415, + "mean_token_accuracy": 0.6868576034903526, + "num_tokens": 3833299.0, + "step": 295 + }, + { + "epoch": 0.18944, + "grad_norm": 4.042988300323486, + "learning_rate": 5e-06, + "loss": 1.331, + "mean_token_accuracy": 0.6815094500780106, + "num_tokens": 3845749.0, + "step": 296 + }, + { + "epoch": 0.19008, + "grad_norm": 3.829592227935791, + "learning_rate": 5e-06, + "loss": 1.5645, + "mean_token_accuracy": 0.6153044253587723, + "num_tokens": 3858961.0, + "step": 297 + }, + { + "epoch": 0.19072, + "grad_norm": 4.074889659881592, + "learning_rate": 5e-06, + "loss": 1.4884, + "mean_token_accuracy": 0.6340715438127518, + "num_tokens": 3870935.0, + "step": 298 + }, + { + "epoch": 0.19136, + "grad_norm": 3.7292230129241943, + "learning_rate": 5e-06, + "loss": 1.461, + "mean_token_accuracy": 0.6340260431170464, + "num_tokens": 3883149.0, + "step": 299 + }, + { + "epoch": 0.192, + "grad_norm": 3.7191953659057617, + "learning_rate": 5e-06, + "loss": 1.2, + "mean_token_accuracy": 0.6816589832305908, + "num_tokens": 3896395.0, + "step": 300 + }, + { + "epoch": 0.19264, + "grad_norm": 3.5360212326049805, + "learning_rate": 5e-06, + "loss": 1.0832, + "mean_token_accuracy": 0.6812401190400124, + "num_tokens": 3912111.0, + "step": 301 + }, + { + "epoch": 0.19328, + "grad_norm": 4.3103132247924805, + "learning_rate": 5e-06, + "loss": 1.1345, + "mean_token_accuracy": 0.6864209771156311, + "num_tokens": 3922023.0, + "step": 302 + }, + { + "epoch": 0.19392, + "grad_norm": 4.525723457336426, + "learning_rate": 5e-06, + "loss": 1.1642, + "mean_token_accuracy": 0.6812352165579796, + "num_tokens": 3933051.0, + "step": 303 + }, + { + "epoch": 0.19456, + "grad_norm": 4.2806172370910645, + "learning_rate": 5e-06, + "loss": 1.4921, + "mean_token_accuracy": 0.6231048293411732, + "num_tokens": 3943203.0, + "step": 304 + }, + { + "epoch": 0.1952, + "grad_norm": 3.759788751602173, + "learning_rate": 5e-06, + "loss": 1.4501, + "mean_token_accuracy": 0.6561701893806458, + "num_tokens": 3956583.0, + "step": 305 + }, + { + "epoch": 0.19584, + "grad_norm": 3.7161481380462646, + "learning_rate": 5e-06, + "loss": 1.4199, + "mean_token_accuracy": 0.6432743892073631, + "num_tokens": 3968468.0, + "step": 306 + }, + { + "epoch": 0.19648, + "grad_norm": 3.6811437606811523, + "learning_rate": 5e-06, + "loss": 1.3131, + "mean_token_accuracy": 0.6686923652887344, + "num_tokens": 3980727.0, + "step": 307 + }, + { + "epoch": 0.19712, + "grad_norm": 4.159343242645264, + "learning_rate": 5e-06, + "loss": 1.4896, + "mean_token_accuracy": 0.6509639658033848, + "num_tokens": 3993831.0, + "step": 308 + }, + { + "epoch": 0.19776, + "grad_norm": 3.5082013607025146, + "learning_rate": 5e-06, + "loss": 1.1129, + "mean_token_accuracy": 0.7138783186674118, + "num_tokens": 4006704.0, + "step": 309 + }, + { + "epoch": 0.1984, + "grad_norm": 4.171331882476807, + "learning_rate": 5e-06, + "loss": 1.2373, + "mean_token_accuracy": 0.6580014526844025, + "num_tokens": 4023069.0, + "step": 310 + }, + { + "epoch": 0.19904, + "grad_norm": 3.516143321990967, + "learning_rate": 5e-06, + "loss": 1.4212, + "mean_token_accuracy": 0.6425874978303909, + "num_tokens": 4036994.0, + "step": 311 + }, + { + "epoch": 0.19968, + "grad_norm": 3.506361484527588, + "learning_rate": 5e-06, + "loss": 1.5113, + "mean_token_accuracy": 0.6125459745526314, + "num_tokens": 4050240.0, + "step": 312 + }, + { + "epoch": 0.20032, + "grad_norm": 4.198498725891113, + "learning_rate": 5e-06, + "loss": 1.4596, + "mean_token_accuracy": 0.6291738748550415, + "num_tokens": 4061778.0, + "step": 313 + }, + { + "epoch": 0.20096, + "grad_norm": 3.3201327323913574, + "learning_rate": 5e-06, + "loss": 1.4545, + "mean_token_accuracy": 0.6387949883937836, + "num_tokens": 4076918.0, + "step": 314 + }, + { + "epoch": 0.2016, + "grad_norm": 3.174764394760132, + "learning_rate": 5e-06, + "loss": 1.2661, + "mean_token_accuracy": 0.6788045838475227, + "num_tokens": 4091861.0, + "step": 315 + }, + { + "epoch": 0.20224, + "grad_norm": 3.773123264312744, + "learning_rate": 5e-06, + "loss": 1.0687, + "mean_token_accuracy": 0.7004147991538048, + "num_tokens": 4103615.0, + "step": 316 + }, + { + "epoch": 0.20288, + "grad_norm": 3.759938955307007, + "learning_rate": 5e-06, + "loss": 1.3967, + "mean_token_accuracy": 0.6160966157913208, + "num_tokens": 4116084.0, + "step": 317 + }, + { + "epoch": 0.20352, + "grad_norm": 3.3908169269561768, + "learning_rate": 5e-06, + "loss": 1.3196, + "mean_token_accuracy": 0.6666592955589294, + "num_tokens": 4130227.0, + "step": 318 + }, + { + "epoch": 0.20416, + "grad_norm": 3.709275007247925, + "learning_rate": 5e-06, + "loss": 1.1854, + "mean_token_accuracy": 0.690848097205162, + "num_tokens": 4144753.0, + "step": 319 + }, + { + "epoch": 0.2048, + "grad_norm": 4.040079116821289, + "learning_rate": 5e-06, + "loss": 1.271, + "mean_token_accuracy": 0.6566968783736229, + "num_tokens": 4157685.0, + "step": 320 + }, + { + "epoch": 0.20544, + "grad_norm": 3.6473450660705566, + "learning_rate": 5e-06, + "loss": 1.2116, + "mean_token_accuracy": 0.6666957810521126, + "num_tokens": 4171592.0, + "step": 321 + }, + { + "epoch": 0.20608, + "grad_norm": 4.44047212600708, + "learning_rate": 5e-06, + "loss": 1.3629, + "mean_token_accuracy": 0.6269867643713951, + "num_tokens": 4182621.0, + "step": 322 + }, + { + "epoch": 0.20672, + "grad_norm": 4.875802993774414, + "learning_rate": 5e-06, + "loss": 1.251, + "mean_token_accuracy": 0.671268492937088, + "num_tokens": 4191893.0, + "step": 323 + }, + { + "epoch": 0.20736, + "grad_norm": 3.2327218055725098, + "learning_rate": 5e-06, + "loss": 1.2432, + "mean_token_accuracy": 0.6710969433188438, + "num_tokens": 4207608.0, + "step": 324 + }, + { + "epoch": 0.208, + "grad_norm": 3.433987617492676, + "learning_rate": 5e-06, + "loss": 1.4811, + "mean_token_accuracy": 0.641696572303772, + "num_tokens": 4222070.0, + "step": 325 + }, + { + "epoch": 0.20864, + "grad_norm": 3.3024795055389404, + "learning_rate": 5e-06, + "loss": 1.4027, + "mean_token_accuracy": 0.6242343187332153, + "num_tokens": 4237064.0, + "step": 326 + }, + { + "epoch": 0.20928, + "grad_norm": 3.8479273319244385, + "learning_rate": 5e-06, + "loss": 1.1806, + "mean_token_accuracy": 0.6728235110640526, + "num_tokens": 4251385.0, + "step": 327 + }, + { + "epoch": 0.20992, + "grad_norm": 3.911982774734497, + "learning_rate": 5e-06, + "loss": 1.3907, + "mean_token_accuracy": 0.633483037352562, + "num_tokens": 4264013.0, + "step": 328 + }, + { + "epoch": 0.21056, + "grad_norm": 3.055570125579834, + "learning_rate": 5e-06, + "loss": 1.5427, + "mean_token_accuracy": 0.6328605860471725, + "num_tokens": 4280497.0, + "step": 329 + }, + { + "epoch": 0.2112, + "grad_norm": 3.9111008644104004, + "learning_rate": 5e-06, + "loss": 1.3318, + "mean_token_accuracy": 0.6829836070537567, + "num_tokens": 4293657.0, + "step": 330 + }, + { + "epoch": 0.21184, + "grad_norm": 3.3383522033691406, + "learning_rate": 5e-06, + "loss": 1.3847, + "mean_token_accuracy": 0.6295205429196358, + "num_tokens": 4309618.0, + "step": 331 + }, + { + "epoch": 0.21248, + "grad_norm": 3.3280251026153564, + "learning_rate": 5e-06, + "loss": 1.5237, + "mean_token_accuracy": 0.6586425974965096, + "num_tokens": 4326147.0, + "step": 332 + }, + { + "epoch": 0.21312, + "grad_norm": 4.489631175994873, + "learning_rate": 5e-06, + "loss": 1.2546, + "mean_token_accuracy": 0.653937578201294, + "num_tokens": 4336964.0, + "step": 333 + }, + { + "epoch": 0.21376, + "grad_norm": 3.654022693634033, + "learning_rate": 5e-06, + "loss": 1.3246, + "mean_token_accuracy": 0.6411551535129547, + "num_tokens": 4349801.0, + "step": 334 + }, + { + "epoch": 0.2144, + "grad_norm": 3.9658567905426025, + "learning_rate": 5e-06, + "loss": 1.2223, + "mean_token_accuracy": 0.6976972743868828, + "num_tokens": 4362565.0, + "step": 335 + }, + { + "epoch": 0.21504, + "grad_norm": 4.284513473510742, + "learning_rate": 5e-06, + "loss": 1.2982, + "mean_token_accuracy": 0.6423984244465828, + "num_tokens": 4373113.0, + "step": 336 + }, + { + "epoch": 0.21568, + "grad_norm": 3.3546524047851562, + "learning_rate": 5e-06, + "loss": 1.5144, + "mean_token_accuracy": 0.6321973502635956, + "num_tokens": 4388315.0, + "step": 337 + }, + { + "epoch": 0.21632, + "grad_norm": 3.7386813163757324, + "learning_rate": 5e-06, + "loss": 1.2948, + "mean_token_accuracy": 0.6624687612056732, + "num_tokens": 4400428.0, + "step": 338 + }, + { + "epoch": 0.21696, + "grad_norm": 4.466668128967285, + "learning_rate": 5e-06, + "loss": 1.7564, + "mean_token_accuracy": 0.6115086637437344, + "num_tokens": 4412812.0, + "step": 339 + }, + { + "epoch": 0.2176, + "grad_norm": 3.6271438598632812, + "learning_rate": 5e-06, + "loss": 1.2008, + "mean_token_accuracy": 0.6809025183320045, + "num_tokens": 4427547.0, + "step": 340 + }, + { + "epoch": 0.21824, + "grad_norm": 4.270169258117676, + "learning_rate": 5e-06, + "loss": 1.4229, + "mean_token_accuracy": 0.6368228495121002, + "num_tokens": 4440979.0, + "step": 341 + }, + { + "epoch": 0.21888, + "grad_norm": 4.036962509155273, + "learning_rate": 5e-06, + "loss": 1.4317, + "mean_token_accuracy": 0.6311650201678276, + "num_tokens": 4452973.0, + "step": 342 + }, + { + "epoch": 0.21952, + "grad_norm": 3.645164728164673, + "learning_rate": 5e-06, + "loss": 1.2559, + "mean_token_accuracy": 0.6653162762522697, + "num_tokens": 4465907.0, + "step": 343 + }, + { + "epoch": 0.22016, + "grad_norm": 4.088701248168945, + "learning_rate": 5e-06, + "loss": 1.2515, + "mean_token_accuracy": 0.6554296687245369, + "num_tokens": 4477731.0, + "step": 344 + }, + { + "epoch": 0.2208, + "grad_norm": 3.935673713684082, + "learning_rate": 5e-06, + "loss": 1.23, + "mean_token_accuracy": 0.6872739866375923, + "num_tokens": 4490092.0, + "step": 345 + }, + { + "epoch": 0.22144, + "grad_norm": 3.8297736644744873, + "learning_rate": 5e-06, + "loss": 1.3338, + "mean_token_accuracy": 0.6665596142411232, + "num_tokens": 4502310.0, + "step": 346 + }, + { + "epoch": 0.22208, + "grad_norm": 3.4555552005767822, + "learning_rate": 5e-06, + "loss": 1.3386, + "mean_token_accuracy": 0.645504966378212, + "num_tokens": 4517152.0, + "step": 347 + }, + { + "epoch": 0.22272, + "grad_norm": 3.445380926132202, + "learning_rate": 5e-06, + "loss": 1.3176, + "mean_token_accuracy": 0.656374916434288, + "num_tokens": 4531588.0, + "step": 348 + }, + { + "epoch": 0.22336, + "grad_norm": 3.376492500305176, + "learning_rate": 5e-06, + "loss": 1.4416, + "mean_token_accuracy": 0.6500495374202728, + "num_tokens": 4548945.0, + "step": 349 + }, + { + "epoch": 0.224, + "grad_norm": 3.7682902812957764, + "learning_rate": 5e-06, + "loss": 1.1904, + "mean_token_accuracy": 0.7001358345150948, + "num_tokens": 4561085.0, + "step": 350 + }, + { + "epoch": 0.22464, + "grad_norm": 3.9040138721466064, + "learning_rate": 5e-06, + "loss": 1.3457, + "mean_token_accuracy": 0.6525379121303558, + "num_tokens": 4574945.0, + "step": 351 + }, + { + "epoch": 0.22528, + "grad_norm": 3.5685391426086426, + "learning_rate": 5e-06, + "loss": 1.3322, + "mean_token_accuracy": 0.6565421000123024, + "num_tokens": 4588253.0, + "step": 352 + }, + { + "epoch": 0.22592, + "grad_norm": 3.4802379608154297, + "learning_rate": 5e-06, + "loss": 1.2408, + "mean_token_accuracy": 0.6631387919187546, + "num_tokens": 4603347.0, + "step": 353 + }, + { + "epoch": 0.22656, + "grad_norm": 4.1048126220703125, + "learning_rate": 5e-06, + "loss": 1.2342, + "mean_token_accuracy": 0.7041416242718697, + "num_tokens": 4616180.0, + "step": 354 + }, + { + "epoch": 0.2272, + "grad_norm": 3.617142677307129, + "learning_rate": 5e-06, + "loss": 1.193, + "mean_token_accuracy": 0.68916055560112, + "num_tokens": 4628116.0, + "step": 355 + }, + { + "epoch": 0.22784, + "grad_norm": 3.48990797996521, + "learning_rate": 5e-06, + "loss": 1.3371, + "mean_token_accuracy": 0.6546562537550926, + "num_tokens": 4644302.0, + "step": 356 + }, + { + "epoch": 0.22848, + "grad_norm": 4.8016180992126465, + "learning_rate": 5e-06, + "loss": 1.3828, + "mean_token_accuracy": 0.6490079835057259, + "num_tokens": 4654201.0, + "step": 357 + }, + { + "epoch": 0.22912, + "grad_norm": 3.589632749557495, + "learning_rate": 5e-06, + "loss": 1.303, + "mean_token_accuracy": 0.6622688621282578, + "num_tokens": 4666579.0, + "step": 358 + }, + { + "epoch": 0.22976, + "grad_norm": 3.8532536029815674, + "learning_rate": 5e-06, + "loss": 1.2905, + "mean_token_accuracy": 0.6697241440415382, + "num_tokens": 4678614.0, + "step": 359 + }, + { + "epoch": 0.2304, + "grad_norm": 3.768440008163452, + "learning_rate": 5e-06, + "loss": 1.3384, + "mean_token_accuracy": 0.6274634152650833, + "num_tokens": 4690259.0, + "step": 360 + }, + { + "epoch": 0.23104, + "grad_norm": 4.048650741577148, + "learning_rate": 5e-06, + "loss": 1.27, + "mean_token_accuracy": 0.6553780138492584, + "num_tokens": 4702794.0, + "step": 361 + }, + { + "epoch": 0.23168, + "grad_norm": 3.264341354370117, + "learning_rate": 5e-06, + "loss": 1.2521, + "mean_token_accuracy": 0.7201630547642708, + "num_tokens": 4718863.0, + "step": 362 + }, + { + "epoch": 0.23232, + "grad_norm": 3.293111562728882, + "learning_rate": 5e-06, + "loss": 1.206, + "mean_token_accuracy": 0.691804438829422, + "num_tokens": 4731459.0, + "step": 363 + }, + { + "epoch": 0.23296, + "grad_norm": 3.562152862548828, + "learning_rate": 5e-06, + "loss": 1.5825, + "mean_token_accuracy": 0.6220528446137905, + "num_tokens": 4744183.0, + "step": 364 + }, + { + "epoch": 0.2336, + "grad_norm": 3.858302116394043, + "learning_rate": 5e-06, + "loss": 1.2556, + "mean_token_accuracy": 0.6687511652708054, + "num_tokens": 4755339.0, + "step": 365 + }, + { + "epoch": 0.23424, + "grad_norm": 3.6017565727233887, + "learning_rate": 5e-06, + "loss": 1.4059, + "mean_token_accuracy": 0.6345420032739639, + "num_tokens": 4767629.0, + "step": 366 + }, + { + "epoch": 0.23488, + "grad_norm": 3.706761598587036, + "learning_rate": 5e-06, + "loss": 1.1984, + "mean_token_accuracy": 0.6690258160233498, + "num_tokens": 4778905.0, + "step": 367 + }, + { + "epoch": 0.23552, + "grad_norm": 3.1312525272369385, + "learning_rate": 5e-06, + "loss": 1.3188, + "mean_token_accuracy": 0.6492372825741768, + "num_tokens": 4794948.0, + "step": 368 + }, + { + "epoch": 0.23616, + "grad_norm": 4.282083034515381, + "learning_rate": 5e-06, + "loss": 1.4944, + "mean_token_accuracy": 0.6254525110125542, + "num_tokens": 4807887.0, + "step": 369 + }, + { + "epoch": 0.2368, + "grad_norm": 3.156104564666748, + "learning_rate": 5e-06, + "loss": 1.4343, + "mean_token_accuracy": 0.6384943351149559, + "num_tokens": 4823135.0, + "step": 370 + }, + { + "epoch": 0.23744, + "grad_norm": 3.9901719093322754, + "learning_rate": 5e-06, + "loss": 1.2035, + "mean_token_accuracy": 0.7160904258489609, + "num_tokens": 4835841.0, + "step": 371 + }, + { + "epoch": 0.23808, + "grad_norm": 3.2367820739746094, + "learning_rate": 5e-06, + "loss": 1.276, + "mean_token_accuracy": 0.659798189997673, + "num_tokens": 4852490.0, + "step": 372 + }, + { + "epoch": 0.23872, + "grad_norm": 3.741534948348999, + "learning_rate": 5e-06, + "loss": 1.3245, + "mean_token_accuracy": 0.6480759754776955, + "num_tokens": 4864536.0, + "step": 373 + }, + { + "epoch": 0.23936, + "grad_norm": 5.090270042419434, + "learning_rate": 5e-06, + "loss": 1.3845, + "mean_token_accuracy": 0.6432400941848755, + "num_tokens": 4873861.0, + "step": 374 + }, + { + "epoch": 0.24, + "grad_norm": 3.550171136856079, + "learning_rate": 5e-06, + "loss": 1.305, + "mean_token_accuracy": 0.6453453898429871, + "num_tokens": 4888154.0, + "step": 375 + }, + { + "epoch": 0.24064, + "grad_norm": 4.662119388580322, + "learning_rate": 5e-06, + "loss": 1.3197, + "mean_token_accuracy": 0.6590218544006348, + "num_tokens": 4898468.0, + "step": 376 + }, + { + "epoch": 0.24128, + "grad_norm": 5.356217861175537, + "learning_rate": 5e-06, + "loss": 1.3352, + "mean_token_accuracy": 0.6680933758616447, + "num_tokens": 4910094.0, + "step": 377 + }, + { + "epoch": 0.24192, + "grad_norm": 3.6107497215270996, + "learning_rate": 5e-06, + "loss": 1.1964, + "mean_token_accuracy": 0.6513196639716625, + "num_tokens": 4924004.0, + "step": 378 + }, + { + "epoch": 0.24256, + "grad_norm": 3.8560822010040283, + "learning_rate": 5e-06, + "loss": 1.4503, + "mean_token_accuracy": 0.6257938891649246, + "num_tokens": 4937000.0, + "step": 379 + }, + { + "epoch": 0.2432, + "grad_norm": 3.5278120040893555, + "learning_rate": 5e-06, + "loss": 1.5268, + "mean_token_accuracy": 0.6288462430238724, + "num_tokens": 4951330.0, + "step": 380 + }, + { + "epoch": 0.24384, + "grad_norm": 3.4525208473205566, + "learning_rate": 5e-06, + "loss": 1.2486, + "mean_token_accuracy": 0.6658232286572456, + "num_tokens": 4966074.0, + "step": 381 + }, + { + "epoch": 0.24448, + "grad_norm": 3.9059042930603027, + "learning_rate": 5e-06, + "loss": 1.3658, + "mean_token_accuracy": 0.6580025032162666, + "num_tokens": 4977899.0, + "step": 382 + }, + { + "epoch": 0.24512, + "grad_norm": 3.895254135131836, + "learning_rate": 5e-06, + "loss": 1.2906, + "mean_token_accuracy": 0.659791849553585, + "num_tokens": 4990481.0, + "step": 383 + }, + { + "epoch": 0.24576, + "grad_norm": 3.6709907054901123, + "learning_rate": 5e-06, + "loss": 1.243, + "mean_token_accuracy": 0.6784983575344086, + "num_tokens": 5003987.0, + "step": 384 + }, + { + "epoch": 0.2464, + "grad_norm": 3.8411707878112793, + "learning_rate": 5e-06, + "loss": 1.3464, + "mean_token_accuracy": 0.6375136002898216, + "num_tokens": 5015850.0, + "step": 385 + }, + { + "epoch": 0.24704, + "grad_norm": 4.552581787109375, + "learning_rate": 5e-06, + "loss": 1.5245, + "mean_token_accuracy": 0.6128373965620995, + "num_tokens": 5026044.0, + "step": 386 + }, + { + "epoch": 0.24768, + "grad_norm": 3.8649439811706543, + "learning_rate": 5e-06, + "loss": 1.4339, + "mean_token_accuracy": 0.6444417163729668, + "num_tokens": 5039126.0, + "step": 387 + }, + { + "epoch": 0.24832, + "grad_norm": 4.057676315307617, + "learning_rate": 5e-06, + "loss": 1.352, + "mean_token_accuracy": 0.6415472850203514, + "num_tokens": 5051247.0, + "step": 388 + }, + { + "epoch": 0.24896, + "grad_norm": 4.093824863433838, + "learning_rate": 5e-06, + "loss": 1.3294, + "mean_token_accuracy": 0.6551511734724045, + "num_tokens": 5066290.0, + "step": 389 + }, + { + "epoch": 0.2496, + "grad_norm": 3.478832244873047, + "learning_rate": 5e-06, + "loss": 1.2387, + "mean_token_accuracy": 0.6599762067198753, + "num_tokens": 5079904.0, + "step": 390 + }, + { + "epoch": 0.25024, + "grad_norm": 3.4885847568511963, + "learning_rate": 5e-06, + "loss": 1.2518, + "mean_token_accuracy": 0.6731147542595863, + "num_tokens": 5093663.0, + "step": 391 + }, + { + "epoch": 0.25088, + "grad_norm": 3.4742021560668945, + "learning_rate": 5e-06, + "loss": 1.1443, + "mean_token_accuracy": 0.7067101299762726, + "num_tokens": 5108375.0, + "step": 392 + }, + { + "epoch": 0.25152, + "grad_norm": 3.459711790084839, + "learning_rate": 5e-06, + "loss": 1.2421, + "mean_token_accuracy": 0.677531287074089, + "num_tokens": 5121820.0, + "step": 393 + }, + { + "epoch": 0.25216, + "grad_norm": 3.607994794845581, + "learning_rate": 5e-06, + "loss": 1.7177, + "mean_token_accuracy": 0.5983672738075256, + "num_tokens": 5136753.0, + "step": 394 + }, + { + "epoch": 0.2528, + "grad_norm": 3.9843177795410156, + "learning_rate": 5e-06, + "loss": 1.2271, + "mean_token_accuracy": 0.6913007572293282, + "num_tokens": 5148997.0, + "step": 395 + }, + { + "epoch": 0.25344, + "grad_norm": 3.323129177093506, + "learning_rate": 5e-06, + "loss": 1.4278, + "mean_token_accuracy": 0.6322130486369133, + "num_tokens": 5163285.0, + "step": 396 + }, + { + "epoch": 0.25408, + "grad_norm": 4.542083740234375, + "learning_rate": 5e-06, + "loss": 1.3214, + "mean_token_accuracy": 0.676998108625412, + "num_tokens": 5174430.0, + "step": 397 + }, + { + "epoch": 0.25472, + "grad_norm": 3.523313045501709, + "learning_rate": 5e-06, + "loss": 1.5198, + "mean_token_accuracy": 0.6127360239624977, + "num_tokens": 5188411.0, + "step": 398 + }, + { + "epoch": 0.25536, + "grad_norm": 3.990492820739746, + "learning_rate": 5e-06, + "loss": 1.4177, + "mean_token_accuracy": 0.6671818047761917, + "num_tokens": 5199067.0, + "step": 399 + }, + { + "epoch": 0.256, + "grad_norm": 3.5755157470703125, + "learning_rate": 5e-06, + "loss": 1.3593, + "mean_token_accuracy": 0.6605222076177597, + "num_tokens": 5212285.0, + "step": 400 + }, + { + "epoch": 0.25664, + "grad_norm": 3.8733558654785156, + "learning_rate": 5e-06, + "loss": 1.2535, + "mean_token_accuracy": 0.6705236658453941, + "num_tokens": 5224693.0, + "step": 401 + }, + { + "epoch": 0.25728, + "grad_norm": 3.86195707321167, + "learning_rate": 5e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7054353207349777, + "num_tokens": 5235503.0, + "step": 402 + }, + { + "epoch": 0.25792, + "grad_norm": 4.819467067718506, + "learning_rate": 5e-06, + "loss": 1.2544, + "mean_token_accuracy": 0.6714291796088219, + "num_tokens": 5244676.0, + "step": 403 + }, + { + "epoch": 0.25856, + "grad_norm": 4.117583274841309, + "learning_rate": 5e-06, + "loss": 1.4205, + "mean_token_accuracy": 0.6371640935540199, + "num_tokens": 5259342.0, + "step": 404 + }, + { + "epoch": 0.2592, + "grad_norm": 3.8214738368988037, + "learning_rate": 5e-06, + "loss": 1.2089, + "mean_token_accuracy": 0.682219110429287, + "num_tokens": 5271812.0, + "step": 405 + }, + { + "epoch": 0.25984, + "grad_norm": 4.264610290527344, + "learning_rate": 5e-06, + "loss": 1.2525, + "mean_token_accuracy": 0.6648172214627266, + "num_tokens": 5285329.0, + "step": 406 + }, + { + "epoch": 0.26048, + "grad_norm": 3.759557008743286, + "learning_rate": 5e-06, + "loss": 1.2922, + "mean_token_accuracy": 0.6575791016221046, + "num_tokens": 5298290.0, + "step": 407 + }, + { + "epoch": 0.26112, + "grad_norm": 5.103738784790039, + "learning_rate": 5e-06, + "loss": 1.3045, + "mean_token_accuracy": 0.6531935781240463, + "num_tokens": 5313458.0, + "step": 408 + }, + { + "epoch": 0.26176, + "grad_norm": 4.379658222198486, + "learning_rate": 5e-06, + "loss": 1.2592, + "mean_token_accuracy": 0.6718562245368958, + "num_tokens": 5324820.0, + "step": 409 + }, + { + "epoch": 0.2624, + "grad_norm": 3.613741636276245, + "learning_rate": 5e-06, + "loss": 1.3052, + "mean_token_accuracy": 0.6661521196365356, + "num_tokens": 5340445.0, + "step": 410 + }, + { + "epoch": 0.26304, + "grad_norm": 3.643263578414917, + "learning_rate": 5e-06, + "loss": 1.4656, + "mean_token_accuracy": 0.6501626446843147, + "num_tokens": 5353074.0, + "step": 411 + }, + { + "epoch": 0.26368, + "grad_norm": 3.359731912612915, + "learning_rate": 5e-06, + "loss": 1.1761, + "mean_token_accuracy": 0.6895303055644035, + "num_tokens": 5367294.0, + "step": 412 + }, + { + "epoch": 0.26432, + "grad_norm": 4.145616054534912, + "learning_rate": 5e-06, + "loss": 1.3095, + "mean_token_accuracy": 0.6614864692091942, + "num_tokens": 5378260.0, + "step": 413 + }, + { + "epoch": 0.26496, + "grad_norm": 4.191911697387695, + "learning_rate": 5e-06, + "loss": 1.4995, + "mean_token_accuracy": 0.6586913987994194, + "num_tokens": 5390393.0, + "step": 414 + }, + { + "epoch": 0.2656, + "grad_norm": 3.9197440147399902, + "learning_rate": 5e-06, + "loss": 1.2837, + "mean_token_accuracy": 0.649936854839325, + "num_tokens": 5404355.0, + "step": 415 + }, + { + "epoch": 0.26624, + "grad_norm": 3.791869640350342, + "learning_rate": 5e-06, + "loss": 1.3852, + "mean_token_accuracy": 0.6447301283478737, + "num_tokens": 5418228.0, + "step": 416 + }, + { + "epoch": 0.26688, + "grad_norm": 3.4961142539978027, + "learning_rate": 5e-06, + "loss": 1.2148, + "mean_token_accuracy": 0.6757354438304901, + "num_tokens": 5431981.0, + "step": 417 + }, + { + "epoch": 0.26752, + "grad_norm": 3.777859687805176, + "learning_rate": 5e-06, + "loss": 1.2461, + "mean_token_accuracy": 0.6744889244437218, + "num_tokens": 5446699.0, + "step": 418 + }, + { + "epoch": 0.26816, + "grad_norm": 4.008702754974365, + "learning_rate": 5e-06, + "loss": 1.4865, + "mean_token_accuracy": 0.628353901207447, + "num_tokens": 5459373.0, + "step": 419 + }, + { + "epoch": 0.2688, + "grad_norm": 3.69231915473938, + "learning_rate": 5e-06, + "loss": 1.418, + "mean_token_accuracy": 0.6555268168449402, + "num_tokens": 5473223.0, + "step": 420 + }, + { + "epoch": 0.26944, + "grad_norm": 3.597212314605713, + "learning_rate": 5e-06, + "loss": 1.4661, + "mean_token_accuracy": 0.6289801895618439, + "num_tokens": 5487849.0, + "step": 421 + }, + { + "epoch": 0.27008, + "grad_norm": 3.8283562660217285, + "learning_rate": 5e-06, + "loss": 1.2745, + "mean_token_accuracy": 0.6649068146944046, + "num_tokens": 5499444.0, + "step": 422 + }, + { + "epoch": 0.27072, + "grad_norm": 3.896993398666382, + "learning_rate": 5e-06, + "loss": 1.3163, + "mean_token_accuracy": 0.6707305237650871, + "num_tokens": 5512030.0, + "step": 423 + }, + { + "epoch": 0.27136, + "grad_norm": 3.609224557876587, + "learning_rate": 5e-06, + "loss": 1.2482, + "mean_token_accuracy": 0.6678152307868004, + "num_tokens": 5525475.0, + "step": 424 + }, + { + "epoch": 0.272, + "grad_norm": 3.715836763381958, + "learning_rate": 5e-06, + "loss": 1.1806, + "mean_token_accuracy": 0.6893536150455475, + "num_tokens": 5536706.0, + "step": 425 + }, + { + "epoch": 0.27264, + "grad_norm": 4.006832599639893, + "learning_rate": 5e-06, + "loss": 1.3409, + "mean_token_accuracy": 0.6921984776854515, + "num_tokens": 5551024.0, + "step": 426 + }, + { + "epoch": 0.27328, + "grad_norm": 3.5625905990600586, + "learning_rate": 5e-06, + "loss": 1.3058, + "mean_token_accuracy": 0.6508674696087837, + "num_tokens": 5566008.0, + "step": 427 + }, + { + "epoch": 0.27392, + "grad_norm": 3.7165002822875977, + "learning_rate": 5e-06, + "loss": 1.4402, + "mean_token_accuracy": 0.6360224187374115, + "num_tokens": 5579661.0, + "step": 428 + }, + { + "epoch": 0.27456, + "grad_norm": 3.702185869216919, + "learning_rate": 5e-06, + "loss": 1.3791, + "mean_token_accuracy": 0.6388497278094292, + "num_tokens": 5593091.0, + "step": 429 + }, + { + "epoch": 0.2752, + "grad_norm": 3.397646188735962, + "learning_rate": 5e-06, + "loss": 1.4501, + "mean_token_accuracy": 0.6443121284246445, + "num_tokens": 5607808.0, + "step": 430 + }, + { + "epoch": 0.27584, + "grad_norm": 4.425196170806885, + "learning_rate": 5e-06, + "loss": 1.2816, + "mean_token_accuracy": 0.646281823515892, + "num_tokens": 5619010.0, + "step": 431 + }, + { + "epoch": 0.27648, + "grad_norm": 3.7968697547912598, + "learning_rate": 5e-06, + "loss": 1.4615, + "mean_token_accuracy": 0.6492092609405518, + "num_tokens": 5634182.0, + "step": 432 + }, + { + "epoch": 0.27712, + "grad_norm": 3.3441648483276367, + "learning_rate": 5e-06, + "loss": 1.4186, + "mean_token_accuracy": 0.6269052773714066, + "num_tokens": 5647759.0, + "step": 433 + }, + { + "epoch": 0.27776, + "grad_norm": 3.4352946281433105, + "learning_rate": 5e-06, + "loss": 1.3009, + "mean_token_accuracy": 0.666948527097702, + "num_tokens": 5662089.0, + "step": 434 + }, + { + "epoch": 0.2784, + "grad_norm": 3.8102269172668457, + "learning_rate": 5e-06, + "loss": 1.3815, + "mean_token_accuracy": 0.6399514004588127, + "num_tokens": 5674982.0, + "step": 435 + }, + { + "epoch": 0.27904, + "grad_norm": 3.747889995574951, + "learning_rate": 5e-06, + "loss": 1.5238, + "mean_token_accuracy": 0.6195821687579155, + "num_tokens": 5687944.0, + "step": 436 + }, + { + "epoch": 0.27968, + "grad_norm": 3.963461399078369, + "learning_rate": 5e-06, + "loss": 1.2936, + "mean_token_accuracy": 0.6649496257305145, + "num_tokens": 5699002.0, + "step": 437 + }, + { + "epoch": 0.28032, + "grad_norm": 3.3493547439575195, + "learning_rate": 5e-06, + "loss": 1.346, + "mean_token_accuracy": 0.6441225036978722, + "num_tokens": 5712275.0, + "step": 438 + }, + { + "epoch": 0.28096, + "grad_norm": 3.779747247695923, + "learning_rate": 5e-06, + "loss": 1.5187, + "mean_token_accuracy": 0.6183040626347065, + "num_tokens": 5726089.0, + "step": 439 + }, + { + "epoch": 0.2816, + "grad_norm": 3.8327977657318115, + "learning_rate": 5e-06, + "loss": 1.301, + "mean_token_accuracy": 0.6831925585865974, + "num_tokens": 5737061.0, + "step": 440 + }, + { + "epoch": 0.28224, + "grad_norm": 3.559340476989746, + "learning_rate": 5e-06, + "loss": 1.286, + "mean_token_accuracy": 0.667030468583107, + "num_tokens": 5751187.0, + "step": 441 + }, + { + "epoch": 0.28288, + "grad_norm": 3.395509719848633, + "learning_rate": 5e-06, + "loss": 1.1954, + "mean_token_accuracy": 0.6770320907235146, + "num_tokens": 5765243.0, + "step": 442 + }, + { + "epoch": 0.28352, + "grad_norm": 4.320680618286133, + "learning_rate": 5e-06, + "loss": 1.4825, + "mean_token_accuracy": 0.6353371068835258, + "num_tokens": 5776809.0, + "step": 443 + }, + { + "epoch": 0.28416, + "grad_norm": 4.229187488555908, + "learning_rate": 5e-06, + "loss": 1.3178, + "mean_token_accuracy": 0.6752159968018532, + "num_tokens": 5788234.0, + "step": 444 + }, + { + "epoch": 0.2848, + "grad_norm": 3.9184088706970215, + "learning_rate": 5e-06, + "loss": 1.2316, + "mean_token_accuracy": 0.6824081540107727, + "num_tokens": 5799793.0, + "step": 445 + }, + { + "epoch": 0.28544, + "grad_norm": 4.083866596221924, + "learning_rate": 5e-06, + "loss": 1.4558, + "mean_token_accuracy": 0.6533151641488075, + "num_tokens": 5812228.0, + "step": 446 + }, + { + "epoch": 0.28608, + "grad_norm": 4.136886119842529, + "learning_rate": 5e-06, + "loss": 1.2802, + "mean_token_accuracy": 0.6800569593906403, + "num_tokens": 5822937.0, + "step": 447 + }, + { + "epoch": 0.28672, + "grad_norm": 3.92091965675354, + "learning_rate": 5e-06, + "loss": 1.253, + "mean_token_accuracy": 0.6916609779000282, + "num_tokens": 5835964.0, + "step": 448 + }, + { + "epoch": 0.28736, + "grad_norm": 3.784158706665039, + "learning_rate": 5e-06, + "loss": 1.2972, + "mean_token_accuracy": 0.674082837998867, + "num_tokens": 5847766.0, + "step": 449 + }, + { + "epoch": 0.288, + "grad_norm": 4.055779933929443, + "learning_rate": 5e-06, + "loss": 1.5458, + "mean_token_accuracy": 0.6409785822033882, + "num_tokens": 5860297.0, + "step": 450 + }, + { + "epoch": 0.28864, + "grad_norm": 4.014561176300049, + "learning_rate": 5e-06, + "loss": 1.239, + "mean_token_accuracy": 0.6951504573225975, + "num_tokens": 5871881.0, + "step": 451 + }, + { + "epoch": 0.28928, + "grad_norm": 3.908066987991333, + "learning_rate": 5e-06, + "loss": 1.1405, + "mean_token_accuracy": 0.6899219900369644, + "num_tokens": 5883233.0, + "step": 452 + }, + { + "epoch": 0.28992, + "grad_norm": 3.5451455116271973, + "learning_rate": 5e-06, + "loss": 1.4424, + "mean_token_accuracy": 0.6282008588314056, + "num_tokens": 5897342.0, + "step": 453 + }, + { + "epoch": 0.29056, + "grad_norm": 3.9957897663116455, + "learning_rate": 5e-06, + "loss": 1.2495, + "mean_token_accuracy": 0.6782330796122551, + "num_tokens": 5909429.0, + "step": 454 + }, + { + "epoch": 0.2912, + "grad_norm": 3.5935301780700684, + "learning_rate": 5e-06, + "loss": 1.292, + "mean_token_accuracy": 0.6655653864145279, + "num_tokens": 5923163.0, + "step": 455 + }, + { + "epoch": 0.29184, + "grad_norm": 3.677741765975952, + "learning_rate": 5e-06, + "loss": 1.3873, + "mean_token_accuracy": 0.6691123694181442, + "num_tokens": 5938353.0, + "step": 456 + }, + { + "epoch": 0.29248, + "grad_norm": 3.7560808658599854, + "learning_rate": 5e-06, + "loss": 1.2574, + "mean_token_accuracy": 0.6840595826506615, + "num_tokens": 5950566.0, + "step": 457 + }, + { + "epoch": 0.29312, + "grad_norm": 4.219088077545166, + "learning_rate": 5e-06, + "loss": 1.4552, + "mean_token_accuracy": 0.6695370376110077, + "num_tokens": 5963475.0, + "step": 458 + }, + { + "epoch": 0.29376, + "grad_norm": 4.02653169631958, + "learning_rate": 5e-06, + "loss": 1.3478, + "mean_token_accuracy": 0.6490977182984352, + "num_tokens": 5974934.0, + "step": 459 + }, + { + "epoch": 0.2944, + "grad_norm": 3.8300678730010986, + "learning_rate": 5e-06, + "loss": 1.2826, + "mean_token_accuracy": 0.64987413585186, + "num_tokens": 5987250.0, + "step": 460 + }, + { + "epoch": 0.29504, + "grad_norm": 3.818307876586914, + "learning_rate": 5e-06, + "loss": 1.1296, + "mean_token_accuracy": 0.6943321749567986, + "num_tokens": 5999834.0, + "step": 461 + }, + { + "epoch": 0.29568, + "grad_norm": 3.6047048568725586, + "learning_rate": 5e-06, + "loss": 1.168, + "mean_token_accuracy": 0.6777333468198776, + "num_tokens": 6012454.0, + "step": 462 + }, + { + "epoch": 0.29632, + "grad_norm": 3.4579696655273438, + "learning_rate": 5e-06, + "loss": 1.4147, + "mean_token_accuracy": 0.6355468481779099, + "num_tokens": 6026563.0, + "step": 463 + }, + { + "epoch": 0.29696, + "grad_norm": 4.736328125, + "learning_rate": 5e-06, + "loss": 1.3852, + "mean_token_accuracy": 0.6531487628817558, + "num_tokens": 6037687.0, + "step": 464 + }, + { + "epoch": 0.2976, + "grad_norm": 5.537712574005127, + "learning_rate": 5e-06, + "loss": 1.3899, + "mean_token_accuracy": 0.6446737200021744, + "num_tokens": 6052621.0, + "step": 465 + }, + { + "epoch": 0.29824, + "grad_norm": 4.118095397949219, + "learning_rate": 5e-06, + "loss": 1.4534, + "mean_token_accuracy": 0.6429826766252518, + "num_tokens": 6065725.0, + "step": 466 + }, + { + "epoch": 0.29888, + "grad_norm": 3.415851354598999, + "learning_rate": 5e-06, + "loss": 1.1162, + "mean_token_accuracy": 0.6864155679941177, + "num_tokens": 6080168.0, + "step": 467 + }, + { + "epoch": 0.29952, + "grad_norm": 3.098151922225952, + "learning_rate": 5e-06, + "loss": 1.3817, + "mean_token_accuracy": 0.6485566720366478, + "num_tokens": 6096552.0, + "step": 468 + }, + { + "epoch": 0.30016, + "grad_norm": 4.419194221496582, + "learning_rate": 5e-06, + "loss": 1.2934, + "mean_token_accuracy": 0.6655605882406235, + "num_tokens": 6107311.0, + "step": 469 + }, + { + "epoch": 0.3008, + "grad_norm": 2.9706687927246094, + "learning_rate": 5e-06, + "loss": 1.4043, + "mean_token_accuracy": 0.6342752501368523, + "num_tokens": 6123233.0, + "step": 470 + }, + { + "epoch": 0.30144, + "grad_norm": 4.0415940284729, + "learning_rate": 5e-06, + "loss": 1.4607, + "mean_token_accuracy": 0.6415122263133526, + "num_tokens": 6133347.0, + "step": 471 + }, + { + "epoch": 0.30208, + "grad_norm": 3.6789848804473877, + "learning_rate": 5e-06, + "loss": 1.359, + "mean_token_accuracy": 0.6488511562347412, + "num_tokens": 6147619.0, + "step": 472 + }, + { + "epoch": 0.30272, + "grad_norm": 3.8090357780456543, + "learning_rate": 5e-06, + "loss": 1.3368, + "mean_token_accuracy": 0.638521321117878, + "num_tokens": 6159564.0, + "step": 473 + }, + { + "epoch": 0.30336, + "grad_norm": 3.4183847904205322, + "learning_rate": 5e-06, + "loss": 1.428, + "mean_token_accuracy": 0.6213861741125584, + "num_tokens": 6173025.0, + "step": 474 + }, + { + "epoch": 0.304, + "grad_norm": 3.822892427444458, + "learning_rate": 5e-06, + "loss": 1.0865, + "mean_token_accuracy": 0.6979233846068382, + "num_tokens": 6185966.0, + "step": 475 + }, + { + "epoch": 0.30464, + "grad_norm": 3.686979293823242, + "learning_rate": 5e-06, + "loss": 1.2195, + "mean_token_accuracy": 0.6794964447617531, + "num_tokens": 6198680.0, + "step": 476 + }, + { + "epoch": 0.30528, + "grad_norm": 3.797368049621582, + "learning_rate": 5e-06, + "loss": 1.3273, + "mean_token_accuracy": 0.6465971991419792, + "num_tokens": 6212513.0, + "step": 477 + }, + { + "epoch": 0.30592, + "grad_norm": 3.9698474407196045, + "learning_rate": 5e-06, + "loss": 1.3636, + "mean_token_accuracy": 0.6398535817861557, + "num_tokens": 6224341.0, + "step": 478 + }, + { + "epoch": 0.30656, + "grad_norm": 3.755352258682251, + "learning_rate": 5e-06, + "loss": 1.2421, + "mean_token_accuracy": 0.6844679713249207, + "num_tokens": 6236749.0, + "step": 479 + }, + { + "epoch": 0.3072, + "grad_norm": 3.6229302883148193, + "learning_rate": 5e-06, + "loss": 1.1653, + "mean_token_accuracy": 0.7136622071266174, + "num_tokens": 6250516.0, + "step": 480 + }, + { + "epoch": 0.30784, + "grad_norm": 4.006715774536133, + "learning_rate": 5e-06, + "loss": 1.0948, + "mean_token_accuracy": 0.6872854009270668, + "num_tokens": 6264775.0, + "step": 481 + }, + { + "epoch": 0.30848, + "grad_norm": 3.036703586578369, + "learning_rate": 5e-06, + "loss": 1.4079, + "mean_token_accuracy": 0.6424620673060417, + "num_tokens": 6280727.0, + "step": 482 + }, + { + "epoch": 0.30912, + "grad_norm": 3.861215114593506, + "learning_rate": 5e-06, + "loss": 1.5746, + "mean_token_accuracy": 0.6214606538414955, + "num_tokens": 6294874.0, + "step": 483 + }, + { + "epoch": 0.30976, + "grad_norm": 3.6067492961883545, + "learning_rate": 5e-06, + "loss": 1.2666, + "mean_token_accuracy": 0.6533055976033211, + "num_tokens": 6308075.0, + "step": 484 + }, + { + "epoch": 0.3104, + "grad_norm": 3.8777058124542236, + "learning_rate": 5e-06, + "loss": 1.4112, + "mean_token_accuracy": 0.6393994837999344, + "num_tokens": 6319998.0, + "step": 485 + }, + { + "epoch": 0.31104, + "grad_norm": 3.640782594680786, + "learning_rate": 5e-06, + "loss": 1.3312, + "mean_token_accuracy": 0.6485870778560638, + "num_tokens": 6332589.0, + "step": 486 + }, + { + "epoch": 0.31168, + "grad_norm": 3.792318344116211, + "learning_rate": 5e-06, + "loss": 1.4024, + "mean_token_accuracy": 0.6588046550750732, + "num_tokens": 6345037.0, + "step": 487 + }, + { + "epoch": 0.31232, + "grad_norm": 3.5393240451812744, + "learning_rate": 5e-06, + "loss": 1.3419, + "mean_token_accuracy": 0.6457289680838585, + "num_tokens": 6359457.0, + "step": 488 + }, + { + "epoch": 0.31296, + "grad_norm": 3.974876642227173, + "learning_rate": 5e-06, + "loss": 1.1741, + "mean_token_accuracy": 0.6610330641269684, + "num_tokens": 6370983.0, + "step": 489 + }, + { + "epoch": 0.3136, + "grad_norm": 3.6941604614257812, + "learning_rate": 5e-06, + "loss": 1.2241, + "mean_token_accuracy": 0.686374232172966, + "num_tokens": 6384168.0, + "step": 490 + }, + { + "epoch": 0.31424, + "grad_norm": 4.212184906005859, + "learning_rate": 5e-06, + "loss": 1.4216, + "mean_token_accuracy": 0.6811521798372269, + "num_tokens": 6395881.0, + "step": 491 + }, + { + "epoch": 0.31488, + "grad_norm": 3.925226926803589, + "learning_rate": 5e-06, + "loss": 1.3582, + "mean_token_accuracy": 0.6342300400137901, + "num_tokens": 6409919.0, + "step": 492 + }, + { + "epoch": 0.31552, + "grad_norm": 3.9599673748016357, + "learning_rate": 5e-06, + "loss": 1.3918, + "mean_token_accuracy": 0.6489474773406982, + "num_tokens": 6422097.0, + "step": 493 + }, + { + "epoch": 0.31616, + "grad_norm": 3.42258358001709, + "learning_rate": 5e-06, + "loss": 1.1146, + "mean_token_accuracy": 0.6913427859544754, + "num_tokens": 6435173.0, + "step": 494 + }, + { + "epoch": 0.3168, + "grad_norm": 4.284220218658447, + "learning_rate": 5e-06, + "loss": 1.4547, + "mean_token_accuracy": 0.6440516263246536, + "num_tokens": 6447321.0, + "step": 495 + }, + { + "epoch": 0.31744, + "grad_norm": 3.7218246459960938, + "learning_rate": 5e-06, + "loss": 1.3806, + "mean_token_accuracy": 0.629929706454277, + "num_tokens": 6460270.0, + "step": 496 + }, + { + "epoch": 0.31808, + "grad_norm": 3.406933546066284, + "learning_rate": 5e-06, + "loss": 1.1401, + "mean_token_accuracy": 0.67889504134655, + "num_tokens": 6473352.0, + "step": 497 + }, + { + "epoch": 0.31872, + "grad_norm": 3.2584404945373535, + "learning_rate": 5e-06, + "loss": 1.2213, + "mean_token_accuracy": 0.6629000529646873, + "num_tokens": 6487623.0, + "step": 498 + }, + { + "epoch": 0.31936, + "grad_norm": 4.134445667266846, + "learning_rate": 5e-06, + "loss": 1.1397, + "mean_token_accuracy": 0.6819293051958084, + "num_tokens": 6499986.0, + "step": 499 + }, + { + "epoch": 0.32, + "grad_norm": 4.104599475860596, + "learning_rate": 5e-06, + "loss": 1.4537, + "mean_token_accuracy": 0.6394720375537872, + "num_tokens": 6512724.0, + "step": 500 + }, + { + "epoch": 0.32064, + "grad_norm": 3.4379241466522217, + "learning_rate": 5e-06, + "loss": 1.3714, + "mean_token_accuracy": 0.6542030349373817, + "num_tokens": 6526727.0, + "step": 501 + }, + { + "epoch": 0.32128, + "grad_norm": 3.4537572860717773, + "learning_rate": 5e-06, + "loss": 1.3053, + "mean_token_accuracy": 0.6661063358187675, + "num_tokens": 6542663.0, + "step": 502 + }, + { + "epoch": 0.32192, + "grad_norm": 3.5106639862060547, + "learning_rate": 5e-06, + "loss": 1.3694, + "mean_token_accuracy": 0.6464217305183411, + "num_tokens": 6557847.0, + "step": 503 + }, + { + "epoch": 0.32256, + "grad_norm": 3.924419641494751, + "learning_rate": 5e-06, + "loss": 1.2544, + "mean_token_accuracy": 0.6603437811136246, + "num_tokens": 6570515.0, + "step": 504 + }, + { + "epoch": 0.3232, + "grad_norm": 3.385101318359375, + "learning_rate": 5e-06, + "loss": 1.4872, + "mean_token_accuracy": 0.6421084851026535, + "num_tokens": 6584786.0, + "step": 505 + }, + { + "epoch": 0.32384, + "grad_norm": 3.378535032272339, + "learning_rate": 5e-06, + "loss": 1.1475, + "mean_token_accuracy": 0.6894903257489204, + "num_tokens": 6598436.0, + "step": 506 + }, + { + "epoch": 0.32448, + "grad_norm": 4.74169397354126, + "learning_rate": 5e-06, + "loss": 1.4346, + "mean_token_accuracy": 0.6547307670116425, + "num_tokens": 6610436.0, + "step": 507 + }, + { + "epoch": 0.32512, + "grad_norm": 3.473893165588379, + "learning_rate": 5e-06, + "loss": 1.3827, + "mean_token_accuracy": 0.6525059714913368, + "num_tokens": 6626032.0, + "step": 508 + }, + { + "epoch": 0.32576, + "grad_norm": 4.2575273513793945, + "learning_rate": 5e-06, + "loss": 1.4238, + "mean_token_accuracy": 0.6410808116197586, + "num_tokens": 6637760.0, + "step": 509 + }, + { + "epoch": 0.3264, + "grad_norm": 3.5705769062042236, + "learning_rate": 5e-06, + "loss": 1.2437, + "mean_token_accuracy": 0.6628687754273415, + "num_tokens": 6652912.0, + "step": 510 + }, + { + "epoch": 0.32704, + "grad_norm": 4.5391011238098145, + "learning_rate": 5e-06, + "loss": 1.292, + "mean_token_accuracy": 0.6497529372572899, + "num_tokens": 6664537.0, + "step": 511 + }, + { + "epoch": 0.32768, + "grad_norm": 4.8541789054870605, + "learning_rate": 5e-06, + "loss": 1.3102, + "mean_token_accuracy": 0.6663089245557785, + "num_tokens": 6674380.0, + "step": 512 + }, + { + "epoch": 0.32832, + "grad_norm": 4.8184332847595215, + "learning_rate": 5e-06, + "loss": 1.3336, + "mean_token_accuracy": 0.6582650914788246, + "num_tokens": 6686208.0, + "step": 513 + }, + { + "epoch": 0.32896, + "grad_norm": 3.9492416381835938, + "learning_rate": 5e-06, + "loss": 1.1609, + "mean_token_accuracy": 0.6893665343523026, + "num_tokens": 6698802.0, + "step": 514 + }, + { + "epoch": 0.3296, + "grad_norm": 3.2947769165039062, + "learning_rate": 5e-06, + "loss": 1.3108, + "mean_token_accuracy": 0.6393668726086617, + "num_tokens": 6714353.0, + "step": 515 + }, + { + "epoch": 0.33024, + "grad_norm": 3.5826685428619385, + "learning_rate": 5e-06, + "loss": 1.2592, + "mean_token_accuracy": 0.663002572953701, + "num_tokens": 6728955.0, + "step": 516 + }, + { + "epoch": 0.33088, + "grad_norm": 6.832690238952637, + "learning_rate": 5e-06, + "loss": 1.2339, + "mean_token_accuracy": 0.6615518927574158, + "num_tokens": 6742119.0, + "step": 517 + }, + { + "epoch": 0.33152, + "grad_norm": 3.935009479522705, + "learning_rate": 5e-06, + "loss": 1.2767, + "mean_token_accuracy": 0.6649063900113106, + "num_tokens": 6754185.0, + "step": 518 + }, + { + "epoch": 0.33216, + "grad_norm": 4.145579814910889, + "learning_rate": 5e-06, + "loss": 1.5135, + "mean_token_accuracy": 0.6258162558078766, + "num_tokens": 6765367.0, + "step": 519 + }, + { + "epoch": 0.3328, + "grad_norm": 3.592618227005005, + "learning_rate": 5e-06, + "loss": 1.4578, + "mean_token_accuracy": 0.623950220644474, + "num_tokens": 6778122.0, + "step": 520 + }, + { + "epoch": 0.33344, + "grad_norm": 5.795764923095703, + "learning_rate": 5e-06, + "loss": 1.4626, + "mean_token_accuracy": 0.6486967876553535, + "num_tokens": 6790660.0, + "step": 521 + }, + { + "epoch": 0.33408, + "grad_norm": 4.278341293334961, + "learning_rate": 5e-06, + "loss": 1.1897, + "mean_token_accuracy": 0.6738953441381454, + "num_tokens": 6802594.0, + "step": 522 + }, + { + "epoch": 0.33472, + "grad_norm": 4.899449825286865, + "learning_rate": 5e-06, + "loss": 1.3754, + "mean_token_accuracy": 0.6378564760088921, + "num_tokens": 6818415.0, + "step": 523 + }, + { + "epoch": 0.33536, + "grad_norm": 4.733186721801758, + "learning_rate": 5e-06, + "loss": 1.36, + "mean_token_accuracy": 0.6564139500260353, + "num_tokens": 6831504.0, + "step": 524 + }, + { + "epoch": 0.336, + "grad_norm": 3.7966043949127197, + "learning_rate": 5e-06, + "loss": 1.2975, + "mean_token_accuracy": 0.6624530181288719, + "num_tokens": 6843279.0, + "step": 525 + }, + { + "epoch": 0.33664, + "grad_norm": 5.124260425567627, + "learning_rate": 5e-06, + "loss": 1.5377, + "mean_token_accuracy": 0.6152323558926582, + "num_tokens": 6854556.0, + "step": 526 + }, + { + "epoch": 0.33728, + "grad_norm": 4.210925579071045, + "learning_rate": 5e-06, + "loss": 1.3342, + "mean_token_accuracy": 0.6440554708242416, + "num_tokens": 6867858.0, + "step": 527 + }, + { + "epoch": 0.33792, + "grad_norm": 3.751556873321533, + "learning_rate": 5e-06, + "loss": 1.4675, + "mean_token_accuracy": 0.6253782510757446, + "num_tokens": 6881065.0, + "step": 528 + }, + { + "epoch": 0.33856, + "grad_norm": 6.117438793182373, + "learning_rate": 5e-06, + "loss": 1.356, + "mean_token_accuracy": 0.6668061912059784, + "num_tokens": 6894733.0, + "step": 529 + }, + { + "epoch": 0.3392, + "grad_norm": 3.5207901000976562, + "learning_rate": 5e-06, + "loss": 1.2676, + "mean_token_accuracy": 0.6576649472117424, + "num_tokens": 6907851.0, + "step": 530 + }, + { + "epoch": 0.33984, + "grad_norm": 3.6760780811309814, + "learning_rate": 5e-06, + "loss": 1.1176, + "mean_token_accuracy": 0.6879062727093697, + "num_tokens": 6921679.0, + "step": 531 + }, + { + "epoch": 0.34048, + "grad_norm": 4.656152725219727, + "learning_rate": 5e-06, + "loss": 1.42, + "mean_token_accuracy": 0.642581582069397, + "num_tokens": 6935677.0, + "step": 532 + }, + { + "epoch": 0.34112, + "grad_norm": 5.187691688537598, + "learning_rate": 5e-06, + "loss": 1.4074, + "mean_token_accuracy": 0.668558657169342, + "num_tokens": 6945440.0, + "step": 533 + }, + { + "epoch": 0.34176, + "grad_norm": 7.5727949142456055, + "learning_rate": 5e-06, + "loss": 1.4775, + "mean_token_accuracy": 0.6168788969516754, + "num_tokens": 6959496.0, + "step": 534 + }, + { + "epoch": 0.3424, + "grad_norm": 3.821122646331787, + "learning_rate": 5e-06, + "loss": 1.1118, + "mean_token_accuracy": 0.6967110335826874, + "num_tokens": 6970897.0, + "step": 535 + }, + { + "epoch": 0.34304, + "grad_norm": 3.28977108001709, + "learning_rate": 5e-06, + "loss": 1.2668, + "mean_token_accuracy": 0.6639266163110733, + "num_tokens": 6986271.0, + "step": 536 + }, + { + "epoch": 0.34368, + "grad_norm": 4.031164646148682, + "learning_rate": 5e-06, + "loss": 1.2953, + "mean_token_accuracy": 0.6541831567883492, + "num_tokens": 6998841.0, + "step": 537 + }, + { + "epoch": 0.34432, + "grad_norm": 5.315206527709961, + "learning_rate": 5e-06, + "loss": 1.2881, + "mean_token_accuracy": 0.6494873613119125, + "num_tokens": 7008652.0, + "step": 538 + }, + { + "epoch": 0.34496, + "grad_norm": 5.740390777587891, + "learning_rate": 5e-06, + "loss": 1.3778, + "mean_token_accuracy": 0.6532981097698212, + "num_tokens": 7020915.0, + "step": 539 + }, + { + "epoch": 0.3456, + "grad_norm": 5.474863529205322, + "learning_rate": 5e-06, + "loss": 1.1128, + "mean_token_accuracy": 0.7052044421434402, + "num_tokens": 7032119.0, + "step": 540 + }, + { + "epoch": 0.34624, + "grad_norm": 4.56429386138916, + "learning_rate": 5e-06, + "loss": 1.2831, + "mean_token_accuracy": 0.6683964505791664, + "num_tokens": 7045586.0, + "step": 541 + }, + { + "epoch": 0.34688, + "grad_norm": 3.815187454223633, + "learning_rate": 5e-06, + "loss": 1.1035, + "mean_token_accuracy": 0.7000684291124344, + "num_tokens": 7057244.0, + "step": 542 + }, + { + "epoch": 0.34752, + "grad_norm": 6.026943683624268, + "learning_rate": 5e-06, + "loss": 1.2064, + "mean_token_accuracy": 0.6807686313986778, + "num_tokens": 7068752.0, + "step": 543 + }, + { + "epoch": 0.34816, + "grad_norm": 4.224482536315918, + "learning_rate": 5e-06, + "loss": 1.2508, + "mean_token_accuracy": 0.6724436059594154, + "num_tokens": 7082898.0, + "step": 544 + }, + { + "epoch": 0.3488, + "grad_norm": 7.96382474899292, + "learning_rate": 5e-06, + "loss": 1.1555, + "mean_token_accuracy": 0.6773002594709396, + "num_tokens": 7095886.0, + "step": 545 + }, + { + "epoch": 0.34944, + "grad_norm": 4.775862693786621, + "learning_rate": 5e-06, + "loss": 1.2992, + "mean_token_accuracy": 0.6667480766773224, + "num_tokens": 7107762.0, + "step": 546 + }, + { + "epoch": 0.35008, + "grad_norm": 3.49412202835083, + "learning_rate": 5e-06, + "loss": 1.1785, + "mean_token_accuracy": 0.7002041935920715, + "num_tokens": 7121108.0, + "step": 547 + }, + { + "epoch": 0.35072, + "grad_norm": 4.250086784362793, + "learning_rate": 5e-06, + "loss": 1.1921, + "mean_token_accuracy": 0.7101981267333031, + "num_tokens": 7132883.0, + "step": 548 + }, + { + "epoch": 0.35136, + "grad_norm": 3.9039688110351562, + "learning_rate": 5e-06, + "loss": 1.2883, + "mean_token_accuracy": 0.6617401614785194, + "num_tokens": 7146456.0, + "step": 549 + }, + { + "epoch": 0.352, + "grad_norm": 3.8325276374816895, + "learning_rate": 5e-06, + "loss": 1.315, + "mean_token_accuracy": 0.6657034084200859, + "num_tokens": 7160223.0, + "step": 550 + }, + { + "epoch": 0.35264, + "grad_norm": 5.472667217254639, + "learning_rate": 5e-06, + "loss": 1.4434, + "mean_token_accuracy": 0.635408416390419, + "num_tokens": 7175945.0, + "step": 551 + }, + { + "epoch": 0.35328, + "grad_norm": 4.009690761566162, + "learning_rate": 5e-06, + "loss": 1.3101, + "mean_token_accuracy": 0.6696026399731636, + "num_tokens": 7189467.0, + "step": 552 + }, + { + "epoch": 0.35392, + "grad_norm": 4.114287853240967, + "learning_rate": 5e-06, + "loss": 1.2527, + "mean_token_accuracy": 0.669425942003727, + "num_tokens": 7201319.0, + "step": 553 + }, + { + "epoch": 0.35456, + "grad_norm": 4.302579402923584, + "learning_rate": 5e-06, + "loss": 1.3494, + "mean_token_accuracy": 0.63911372423172, + "num_tokens": 7213327.0, + "step": 554 + }, + { + "epoch": 0.3552, + "grad_norm": 3.737901210784912, + "learning_rate": 5e-06, + "loss": 1.3994, + "mean_token_accuracy": 0.6371675282716751, + "num_tokens": 7226217.0, + "step": 555 + }, + { + "epoch": 0.35584, + "grad_norm": 3.517141103744507, + "learning_rate": 5e-06, + "loss": 1.5393, + "mean_token_accuracy": 0.615730918943882, + "num_tokens": 7240381.0, + "step": 556 + }, + { + "epoch": 0.35648, + "grad_norm": 4.263305187225342, + "learning_rate": 5e-06, + "loss": 1.1575, + "mean_token_accuracy": 0.6844438910484314, + "num_tokens": 7251375.0, + "step": 557 + }, + { + "epoch": 0.35712, + "grad_norm": 4.197317600250244, + "learning_rate": 5e-06, + "loss": 1.3062, + "mean_token_accuracy": 0.642548106610775, + "num_tokens": 7265647.0, + "step": 558 + }, + { + "epoch": 0.35776, + "grad_norm": 4.2730560302734375, + "learning_rate": 5e-06, + "loss": 1.2598, + "mean_token_accuracy": 0.6705774366855621, + "num_tokens": 7277240.0, + "step": 559 + }, + { + "epoch": 0.3584, + "grad_norm": 5.627854347229004, + "learning_rate": 5e-06, + "loss": 1.133, + "mean_token_accuracy": 0.6944706663489342, + "num_tokens": 7293050.0, + "step": 560 + }, + { + "epoch": 0.35904, + "grad_norm": 5.039371013641357, + "learning_rate": 5e-06, + "loss": 1.3061, + "mean_token_accuracy": 0.652328722178936, + "num_tokens": 7305212.0, + "step": 561 + }, + { + "epoch": 0.35968, + "grad_norm": 4.255235195159912, + "learning_rate": 5e-06, + "loss": 1.2425, + "mean_token_accuracy": 0.6588255614042282, + "num_tokens": 7318058.0, + "step": 562 + }, + { + "epoch": 0.36032, + "grad_norm": 3.5205321311950684, + "learning_rate": 5e-06, + "loss": 1.3016, + "mean_token_accuracy": 0.660710796713829, + "num_tokens": 7330484.0, + "step": 563 + }, + { + "epoch": 0.36096, + "grad_norm": 4.263877868652344, + "learning_rate": 5e-06, + "loss": 1.2689, + "mean_token_accuracy": 0.6572084054350853, + "num_tokens": 7342403.0, + "step": 564 + }, + { + "epoch": 0.3616, + "grad_norm": 3.9740233421325684, + "learning_rate": 5e-06, + "loss": 1.1917, + "mean_token_accuracy": 0.6762436851859093, + "num_tokens": 7353974.0, + "step": 565 + }, + { + "epoch": 0.36224, + "grad_norm": 3.4019787311553955, + "learning_rate": 5e-06, + "loss": 1.4559, + "mean_token_accuracy": 0.6335306763648987, + "num_tokens": 7371145.0, + "step": 566 + }, + { + "epoch": 0.36288, + "grad_norm": 3.6773386001586914, + "learning_rate": 5e-06, + "loss": 1.3727, + "mean_token_accuracy": 0.6477261707186699, + "num_tokens": 7385503.0, + "step": 567 + }, + { + "epoch": 0.36352, + "grad_norm": 3.533553123474121, + "learning_rate": 5e-06, + "loss": 1.3915, + "mean_token_accuracy": 0.6359102874994278, + "num_tokens": 7397974.0, + "step": 568 + }, + { + "epoch": 0.36416, + "grad_norm": 4.083873271942139, + "learning_rate": 5e-06, + "loss": 1.2959, + "mean_token_accuracy": 0.6282073631882668, + "num_tokens": 7409071.0, + "step": 569 + }, + { + "epoch": 0.3648, + "grad_norm": 3.371812582015991, + "learning_rate": 5e-06, + "loss": 1.3724, + "mean_token_accuracy": 0.6561341881752014, + "num_tokens": 7425801.0, + "step": 570 + }, + { + "epoch": 0.36544, + "grad_norm": 5.2290425300598145, + "learning_rate": 5e-06, + "loss": 1.3808, + "mean_token_accuracy": 0.66990677267313, + "num_tokens": 7435083.0, + "step": 571 + }, + { + "epoch": 0.36608, + "grad_norm": 3.8227179050445557, + "learning_rate": 5e-06, + "loss": 1.2523, + "mean_token_accuracy": 0.6640745401382446, + "num_tokens": 7445874.0, + "step": 572 + }, + { + "epoch": 0.36672, + "grad_norm": 3.826213836669922, + "learning_rate": 5e-06, + "loss": 1.2955, + "mean_token_accuracy": 0.6566179618239403, + "num_tokens": 7458792.0, + "step": 573 + }, + { + "epoch": 0.36736, + "grad_norm": 3.166212558746338, + "learning_rate": 5e-06, + "loss": 1.3854, + "mean_token_accuracy": 0.6362887248396873, + "num_tokens": 7474364.0, + "step": 574 + }, + { + "epoch": 0.368, + "grad_norm": 3.225037097930908, + "learning_rate": 5e-06, + "loss": 1.4097, + "mean_token_accuracy": 0.6328474953770638, + "num_tokens": 7489627.0, + "step": 575 + }, + { + "epoch": 0.36864, + "grad_norm": 4.110698699951172, + "learning_rate": 5e-06, + "loss": 1.2922, + "mean_token_accuracy": 0.6546645760536194, + "num_tokens": 7501958.0, + "step": 576 + }, + { + "epoch": 0.36928, + "grad_norm": 3.426607608795166, + "learning_rate": 5e-06, + "loss": 1.4665, + "mean_token_accuracy": 0.6124880164861679, + "num_tokens": 7515385.0, + "step": 577 + }, + { + "epoch": 0.36992, + "grad_norm": 3.6768105030059814, + "learning_rate": 5e-06, + "loss": 1.2686, + "mean_token_accuracy": 0.6734522432088852, + "num_tokens": 7528618.0, + "step": 578 + }, + { + "epoch": 0.37056, + "grad_norm": 3.3351573944091797, + "learning_rate": 5e-06, + "loss": 1.351, + "mean_token_accuracy": 0.6506511121988297, + "num_tokens": 7543651.0, + "step": 579 + }, + { + "epoch": 0.3712, + "grad_norm": 4.15482759475708, + "learning_rate": 5e-06, + "loss": 1.3044, + "mean_token_accuracy": 0.6768280491232872, + "num_tokens": 7554851.0, + "step": 580 + }, + { + "epoch": 0.37184, + "grad_norm": 4.212845802307129, + "learning_rate": 5e-06, + "loss": 1.3038, + "mean_token_accuracy": 0.6363187730312347, + "num_tokens": 7567001.0, + "step": 581 + }, + { + "epoch": 0.37248, + "grad_norm": 4.185598850250244, + "learning_rate": 5e-06, + "loss": 1.3491, + "mean_token_accuracy": 0.6468858942389488, + "num_tokens": 7579432.0, + "step": 582 + }, + { + "epoch": 0.37312, + "grad_norm": 3.4942967891693115, + "learning_rate": 5e-06, + "loss": 1.364, + "mean_token_accuracy": 0.650618351995945, + "num_tokens": 7593471.0, + "step": 583 + }, + { + "epoch": 0.37376, + "grad_norm": 3.4861021041870117, + "learning_rate": 5e-06, + "loss": 1.1446, + "mean_token_accuracy": 0.6844891607761383, + "num_tokens": 7608553.0, + "step": 584 + }, + { + "epoch": 0.3744, + "grad_norm": 3.893850803375244, + "learning_rate": 5e-06, + "loss": 1.458, + "mean_token_accuracy": 0.621130146086216, + "num_tokens": 7621367.0, + "step": 585 + }, + { + "epoch": 0.37504, + "grad_norm": 3.653973340988159, + "learning_rate": 5e-06, + "loss": 1.3408, + "mean_token_accuracy": 0.6439371258020401, + "num_tokens": 7634694.0, + "step": 586 + }, + { + "epoch": 0.37568, + "grad_norm": 3.94148850440979, + "learning_rate": 5e-06, + "loss": 1.4367, + "mean_token_accuracy": 0.6162022799253464, + "num_tokens": 7646183.0, + "step": 587 + }, + { + "epoch": 0.37632, + "grad_norm": 3.2505555152893066, + "learning_rate": 5e-06, + "loss": 1.4763, + "mean_token_accuracy": 0.6298285201191902, + "num_tokens": 7661964.0, + "step": 588 + }, + { + "epoch": 0.37696, + "grad_norm": 3.1683783531188965, + "learning_rate": 5e-06, + "loss": 1.18, + "mean_token_accuracy": 0.6652778312563896, + "num_tokens": 7676405.0, + "step": 589 + }, + { + "epoch": 0.3776, + "grad_norm": 4.221475601196289, + "learning_rate": 5e-06, + "loss": 1.2668, + "mean_token_accuracy": 0.6732780113816261, + "num_tokens": 7690168.0, + "step": 590 + }, + { + "epoch": 0.37824, + "grad_norm": 4.262617111206055, + "learning_rate": 5e-06, + "loss": 1.2225, + "mean_token_accuracy": 0.6608899086713791, + "num_tokens": 7700128.0, + "step": 591 + }, + { + "epoch": 0.37888, + "grad_norm": 3.564286231994629, + "learning_rate": 5e-06, + "loss": 1.4015, + "mean_token_accuracy": 0.6410685330629349, + "num_tokens": 7711700.0, + "step": 592 + }, + { + "epoch": 0.37952, + "grad_norm": 3.5013587474823, + "learning_rate": 5e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.7084442153573036, + "num_tokens": 7722670.0, + "step": 593 + }, + { + "epoch": 0.38016, + "grad_norm": 4.010073661804199, + "learning_rate": 5e-06, + "loss": 1.2001, + "mean_token_accuracy": 0.6808509230613708, + "num_tokens": 7734476.0, + "step": 594 + }, + { + "epoch": 0.3808, + "grad_norm": 3.8407106399536133, + "learning_rate": 5e-06, + "loss": 1.2772, + "mean_token_accuracy": 0.661079652607441, + "num_tokens": 7747191.0, + "step": 595 + }, + { + "epoch": 0.38144, + "grad_norm": 3.9195191860198975, + "learning_rate": 5e-06, + "loss": 1.2063, + "mean_token_accuracy": 0.6758553683757782, + "num_tokens": 7757542.0, + "step": 596 + }, + { + "epoch": 0.38208, + "grad_norm": 5.459002494812012, + "learning_rate": 5e-06, + "loss": 1.4173, + "mean_token_accuracy": 0.6381874680519104, + "num_tokens": 7770427.0, + "step": 597 + }, + { + "epoch": 0.38272, + "grad_norm": 4.335068702697754, + "learning_rate": 5e-06, + "loss": 1.2628, + "mean_token_accuracy": 0.6815316006541252, + "num_tokens": 7781771.0, + "step": 598 + }, + { + "epoch": 0.38336, + "grad_norm": 3.5294859409332275, + "learning_rate": 5e-06, + "loss": 1.1975, + "mean_token_accuracy": 0.6658317893743515, + "num_tokens": 7796104.0, + "step": 599 + }, + { + "epoch": 0.384, + "grad_norm": 5.696824550628662, + "learning_rate": 5e-06, + "loss": 1.3531, + "mean_token_accuracy": 0.6557957530021667, + "num_tokens": 7808117.0, + "step": 600 + }, + { + "epoch": 0.38464, + "grad_norm": 3.5926239490509033, + "learning_rate": 5e-06, + "loss": 1.276, + "mean_token_accuracy": 0.6641542464494705, + "num_tokens": 7819726.0, + "step": 601 + }, + { + "epoch": 0.38528, + "grad_norm": 3.8258309364318848, + "learning_rate": 5e-06, + "loss": 1.3628, + "mean_token_accuracy": 0.6388561427593231, + "num_tokens": 7833205.0, + "step": 602 + }, + { + "epoch": 0.38592, + "grad_norm": 8.0659761428833, + "learning_rate": 5e-06, + "loss": 1.2999, + "mean_token_accuracy": 0.6774822995066643, + "num_tokens": 7847836.0, + "step": 603 + }, + { + "epoch": 0.38656, + "grad_norm": 3.9128899574279785, + "learning_rate": 5e-06, + "loss": 1.1609, + "mean_token_accuracy": 0.6698524802923203, + "num_tokens": 7860682.0, + "step": 604 + }, + { + "epoch": 0.3872, + "grad_norm": 4.181707382202148, + "learning_rate": 5e-06, + "loss": 1.5005, + "mean_token_accuracy": 0.6392693892121315, + "num_tokens": 7872637.0, + "step": 605 + }, + { + "epoch": 0.38784, + "grad_norm": 4.092964172363281, + "learning_rate": 5e-06, + "loss": 1.4009, + "mean_token_accuracy": 0.6333677843213081, + "num_tokens": 7884650.0, + "step": 606 + }, + { + "epoch": 0.38848, + "grad_norm": 4.87518835067749, + "learning_rate": 5e-06, + "loss": 1.3242, + "mean_token_accuracy": 0.6464787498116493, + "num_tokens": 7898052.0, + "step": 607 + }, + { + "epoch": 0.38912, + "grad_norm": 4.693853855133057, + "learning_rate": 5e-06, + "loss": 1.3407, + "mean_token_accuracy": 0.6689095348119736, + "num_tokens": 7912216.0, + "step": 608 + }, + { + "epoch": 0.38976, + "grad_norm": 4.081251621246338, + "learning_rate": 5e-06, + "loss": 1.3657, + "mean_token_accuracy": 0.6710544601082802, + "num_tokens": 7924053.0, + "step": 609 + }, + { + "epoch": 0.3904, + "grad_norm": 25.89602279663086, + "learning_rate": 5e-06, + "loss": 1.3437, + "mean_token_accuracy": 0.6381836906075478, + "num_tokens": 7936238.0, + "step": 610 + }, + { + "epoch": 0.39104, + "grad_norm": 5.2313055992126465, + "learning_rate": 5e-06, + "loss": 1.341, + "mean_token_accuracy": 0.6564094573259354, + "num_tokens": 7948038.0, + "step": 611 + }, + { + "epoch": 0.39168, + "grad_norm": 3.936708927154541, + "learning_rate": 5e-06, + "loss": 1.3564, + "mean_token_accuracy": 0.6562279239296913, + "num_tokens": 7960193.0, + "step": 612 + }, + { + "epoch": 0.39232, + "grad_norm": 5.367516994476318, + "learning_rate": 5e-06, + "loss": 1.2065, + "mean_token_accuracy": 0.6724315732717514, + "num_tokens": 7974200.0, + "step": 613 + }, + { + "epoch": 0.39296, + "grad_norm": 4.628478527069092, + "learning_rate": 5e-06, + "loss": 1.4374, + "mean_token_accuracy": 0.6563806012272835, + "num_tokens": 7988501.0, + "step": 614 + }, + { + "epoch": 0.3936, + "grad_norm": 5.892339706420898, + "learning_rate": 5e-06, + "loss": 1.2956, + "mean_token_accuracy": 0.6271971762180328, + "num_tokens": 8000663.0, + "step": 615 + }, + { + "epoch": 0.39424, + "grad_norm": 7.812566757202148, + "learning_rate": 5e-06, + "loss": 1.294, + "mean_token_accuracy": 0.6547481939196587, + "num_tokens": 8014228.0, + "step": 616 + }, + { + "epoch": 0.39488, + "grad_norm": 3.2736427783966064, + "learning_rate": 5e-06, + "loss": 1.0853, + "mean_token_accuracy": 0.7260592132806778, + "num_tokens": 8028431.0, + "step": 617 + }, + { + "epoch": 0.39552, + "grad_norm": 4.344757080078125, + "learning_rate": 5e-06, + "loss": 1.1555, + "mean_token_accuracy": 0.6958058997988701, + "num_tokens": 8038986.0, + "step": 618 + }, + { + "epoch": 0.39616, + "grad_norm": 4.368517875671387, + "learning_rate": 5e-06, + "loss": 1.2744, + "mean_token_accuracy": 0.6637471318244934, + "num_tokens": 8050087.0, + "step": 619 + }, + { + "epoch": 0.3968, + "grad_norm": 3.57128643989563, + "learning_rate": 5e-06, + "loss": 1.4689, + "mean_token_accuracy": 0.6049885600805283, + "num_tokens": 8063113.0, + "step": 620 + }, + { + "epoch": 0.39744, + "grad_norm": 3.331205368041992, + "learning_rate": 5e-06, + "loss": 1.4329, + "mean_token_accuracy": 0.6517436727881432, + "num_tokens": 8079228.0, + "step": 621 + }, + { + "epoch": 0.39808, + "grad_norm": 4.018087863922119, + "learning_rate": 5e-06, + "loss": 1.0947, + "mean_token_accuracy": 0.696721188724041, + "num_tokens": 8092023.0, + "step": 622 + }, + { + "epoch": 0.39872, + "grad_norm": 3.527395486831665, + "learning_rate": 5e-06, + "loss": 1.3711, + "mean_token_accuracy": 0.6707320511341095, + "num_tokens": 8107080.0, + "step": 623 + }, + { + "epoch": 0.39936, + "grad_norm": 3.9434077739715576, + "learning_rate": 5e-06, + "loss": 1.4069, + "mean_token_accuracy": 0.6350800693035126, + "num_tokens": 8120374.0, + "step": 624 + }, + { + "epoch": 0.4, + "grad_norm": 9.819520950317383, + "learning_rate": 5e-06, + "loss": 1.3298, + "mean_token_accuracy": 0.6648931205272675, + "num_tokens": 8131526.0, + "step": 625 + }, + { + "epoch": 0.40064, + "grad_norm": 3.7477002143859863, + "learning_rate": 5e-06, + "loss": 1.3742, + "mean_token_accuracy": 0.6332258731126785, + "num_tokens": 8144842.0, + "step": 626 + }, + { + "epoch": 0.40128, + "grad_norm": 3.494314432144165, + "learning_rate": 5e-06, + "loss": 1.4336, + "mean_token_accuracy": 0.6268866658210754, + "num_tokens": 8157481.0, + "step": 627 + }, + { + "epoch": 0.40192, + "grad_norm": 4.175013542175293, + "learning_rate": 5e-06, + "loss": 1.1405, + "mean_token_accuracy": 0.6741645857691765, + "num_tokens": 8168266.0, + "step": 628 + }, + { + "epoch": 0.40256, + "grad_norm": 3.4416167736053467, + "learning_rate": 5e-06, + "loss": 1.3, + "mean_token_accuracy": 0.6660801768302917, + "num_tokens": 8182677.0, + "step": 629 + }, + { + "epoch": 0.4032, + "grad_norm": 4.334346771240234, + "learning_rate": 5e-06, + "loss": 1.2434, + "mean_token_accuracy": 0.6613388434052467, + "num_tokens": 8195270.0, + "step": 630 + }, + { + "epoch": 0.40384, + "grad_norm": 3.4856371879577637, + "learning_rate": 5e-06, + "loss": 1.3815, + "mean_token_accuracy": 0.6581474095582962, + "num_tokens": 8209764.0, + "step": 631 + }, + { + "epoch": 0.40448, + "grad_norm": 4.153162002563477, + "learning_rate": 5e-06, + "loss": 1.3904, + "mean_token_accuracy": 0.6714613437652588, + "num_tokens": 8220815.0, + "step": 632 + }, + { + "epoch": 0.40512, + "grad_norm": 4.055039405822754, + "learning_rate": 5e-06, + "loss": 1.3551, + "mean_token_accuracy": 0.6623844504356384, + "num_tokens": 8234827.0, + "step": 633 + }, + { + "epoch": 0.40576, + "grad_norm": 3.558887481689453, + "learning_rate": 5e-06, + "loss": 1.3641, + "mean_token_accuracy": 0.6388072147965431, + "num_tokens": 8246889.0, + "step": 634 + }, + { + "epoch": 0.4064, + "grad_norm": 4.767853736877441, + "learning_rate": 5e-06, + "loss": 1.6547, + "mean_token_accuracy": 0.5957511439919472, + "num_tokens": 8259650.0, + "step": 635 + }, + { + "epoch": 0.40704, + "grad_norm": 3.799283504486084, + "learning_rate": 5e-06, + "loss": 1.2804, + "mean_token_accuracy": 0.676610916852951, + "num_tokens": 8271638.0, + "step": 636 + }, + { + "epoch": 0.40768, + "grad_norm": 3.697746753692627, + "learning_rate": 5e-06, + "loss": 1.3211, + "mean_token_accuracy": 0.6482478119432926, + "num_tokens": 8285204.0, + "step": 637 + }, + { + "epoch": 0.40832, + "grad_norm": 3.6978259086608887, + "learning_rate": 5e-06, + "loss": 1.2889, + "mean_token_accuracy": 0.6479171589016914, + "num_tokens": 8297791.0, + "step": 638 + }, + { + "epoch": 0.40896, + "grad_norm": 4.445859432220459, + "learning_rate": 5e-06, + "loss": 1.4383, + "mean_token_accuracy": 0.6379449293017387, + "num_tokens": 8308088.0, + "step": 639 + }, + { + "epoch": 0.4096, + "grad_norm": 3.462293863296509, + "learning_rate": 5e-06, + "loss": 1.1934, + "mean_token_accuracy": 0.6681175008416176, + "num_tokens": 8322994.0, + "step": 640 + }, + { + "epoch": 0.41024, + "grad_norm": 3.471963405609131, + "learning_rate": 5e-06, + "loss": 1.4795, + "mean_token_accuracy": 0.6364353597164154, + "num_tokens": 8336239.0, + "step": 641 + }, + { + "epoch": 0.41088, + "grad_norm": 4.054087162017822, + "learning_rate": 5e-06, + "loss": 1.4736, + "mean_token_accuracy": 0.6363670602440834, + "num_tokens": 8347731.0, + "step": 642 + }, + { + "epoch": 0.41152, + "grad_norm": 3.717003583908081, + "learning_rate": 5e-06, + "loss": 1.4482, + "mean_token_accuracy": 0.6272126361727715, + "num_tokens": 8359999.0, + "step": 643 + }, + { + "epoch": 0.41216, + "grad_norm": 3.302205801010132, + "learning_rate": 5e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6774929463863373, + "num_tokens": 8373684.0, + "step": 644 + }, + { + "epoch": 0.4128, + "grad_norm": 3.4035229682922363, + "learning_rate": 5e-06, + "loss": 1.2946, + "mean_token_accuracy": 0.6636142283678055, + "num_tokens": 8387313.0, + "step": 645 + }, + { + "epoch": 0.41344, + "grad_norm": 3.3320178985595703, + "learning_rate": 5e-06, + "loss": 1.2419, + "mean_token_accuracy": 0.6601713374257088, + "num_tokens": 8401909.0, + "step": 646 + }, + { + "epoch": 0.41408, + "grad_norm": 4.073376178741455, + "learning_rate": 5e-06, + "loss": 1.392, + "mean_token_accuracy": 0.6541470885276794, + "num_tokens": 8412682.0, + "step": 647 + }, + { + "epoch": 0.41472, + "grad_norm": 3.4275381565093994, + "learning_rate": 5e-06, + "loss": 1.2128, + "mean_token_accuracy": 0.7010362893342972, + "num_tokens": 8425791.0, + "step": 648 + }, + { + "epoch": 0.41536, + "grad_norm": 3.013326644897461, + "learning_rate": 5e-06, + "loss": 1.2823, + "mean_token_accuracy": 0.6790317669510841, + "num_tokens": 8441869.0, + "step": 649 + }, + { + "epoch": 0.416, + "grad_norm": 3.8601441383361816, + "learning_rate": 5e-06, + "loss": 1.1898, + "mean_token_accuracy": 0.6818736344575882, + "num_tokens": 8453324.0, + "step": 650 + }, + { + "epoch": 0.41664, + "grad_norm": 4.1017537117004395, + "learning_rate": 5e-06, + "loss": 1.2861, + "mean_token_accuracy": 0.6557259410619736, + "num_tokens": 8465258.0, + "step": 651 + }, + { + "epoch": 0.41728, + "grad_norm": 4.002110481262207, + "learning_rate": 5e-06, + "loss": 1.3043, + "mean_token_accuracy": 0.6684800609946251, + "num_tokens": 8477574.0, + "step": 652 + }, + { + "epoch": 0.41792, + "grad_norm": 3.2750160694122314, + "learning_rate": 5e-06, + "loss": 1.137, + "mean_token_accuracy": 0.6889987885951996, + "num_tokens": 8493134.0, + "step": 653 + }, + { + "epoch": 0.41856, + "grad_norm": 4.388451099395752, + "learning_rate": 5e-06, + "loss": 1.1421, + "mean_token_accuracy": 0.6810869425535202, + "num_tokens": 8504478.0, + "step": 654 + }, + { + "epoch": 0.4192, + "grad_norm": 4.337303161621094, + "learning_rate": 5e-06, + "loss": 1.2395, + "mean_token_accuracy": 0.6621519103646278, + "num_tokens": 8515776.0, + "step": 655 + }, + { + "epoch": 0.41984, + "grad_norm": 3.3329954147338867, + "learning_rate": 5e-06, + "loss": 1.2419, + "mean_token_accuracy": 0.6687774360179901, + "num_tokens": 8530155.0, + "step": 656 + }, + { + "epoch": 0.42048, + "grad_norm": 4.358274459838867, + "learning_rate": 5e-06, + "loss": 1.4287, + "mean_token_accuracy": 0.6304730176925659, + "num_tokens": 8541253.0, + "step": 657 + }, + { + "epoch": 0.42112, + "grad_norm": 3.5478384494781494, + "learning_rate": 5e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.7162381857633591, + "num_tokens": 8555920.0, + "step": 658 + }, + { + "epoch": 0.42176, + "grad_norm": 3.6502294540405273, + "learning_rate": 5e-06, + "loss": 1.5178, + "mean_token_accuracy": 0.6263556554913521, + "num_tokens": 8568098.0, + "step": 659 + }, + { + "epoch": 0.4224, + "grad_norm": 3.196720838546753, + "learning_rate": 5e-06, + "loss": 1.3436, + "mean_token_accuracy": 0.6646198481321335, + "num_tokens": 8583176.0, + "step": 660 + }, + { + "epoch": 0.42304, + "grad_norm": 4.109900951385498, + "learning_rate": 5e-06, + "loss": 1.2928, + "mean_token_accuracy": 0.6598446816205978, + "num_tokens": 8593933.0, + "step": 661 + }, + { + "epoch": 0.42368, + "grad_norm": 3.258894205093384, + "learning_rate": 5e-06, + "loss": 1.1133, + "mean_token_accuracy": 0.6876930743455887, + "num_tokens": 8607476.0, + "step": 662 + }, + { + "epoch": 0.42432, + "grad_norm": 3.369394063949585, + "learning_rate": 5e-06, + "loss": 1.4229, + "mean_token_accuracy": 0.6260672360658646, + "num_tokens": 8620304.0, + "step": 663 + }, + { + "epoch": 0.42496, + "grad_norm": 2.924621820449829, + "learning_rate": 5e-06, + "loss": 1.3395, + "mean_token_accuracy": 0.6627652049064636, + "num_tokens": 8637128.0, + "step": 664 + }, + { + "epoch": 0.4256, + "grad_norm": 3.057220458984375, + "learning_rate": 5e-06, + "loss": 1.3794, + "mean_token_accuracy": 0.6343020871281624, + "num_tokens": 8653441.0, + "step": 665 + }, + { + "epoch": 0.42624, + "grad_norm": 3.4192020893096924, + "learning_rate": 5e-06, + "loss": 1.2575, + "mean_token_accuracy": 0.6774614155292511, + "num_tokens": 8667588.0, + "step": 666 + }, + { + "epoch": 0.42688, + "grad_norm": 3.5300302505493164, + "learning_rate": 5e-06, + "loss": 1.4013, + "mean_token_accuracy": 0.6498018577694893, + "num_tokens": 8680263.0, + "step": 667 + }, + { + "epoch": 0.42752, + "grad_norm": 4.497870922088623, + "learning_rate": 5e-06, + "loss": 1.3489, + "mean_token_accuracy": 0.6535830795764923, + "num_tokens": 8691240.0, + "step": 668 + }, + { + "epoch": 0.42816, + "grad_norm": 4.537415504455566, + "learning_rate": 5e-06, + "loss": 1.1375, + "mean_token_accuracy": 0.6975407898426056, + "num_tokens": 8703412.0, + "step": 669 + }, + { + "epoch": 0.4288, + "grad_norm": 4.318458080291748, + "learning_rate": 5e-06, + "loss": 1.3989, + "mean_token_accuracy": 0.6657192297279835, + "num_tokens": 8715529.0, + "step": 670 + }, + { + "epoch": 0.42944, + "grad_norm": 3.754676580429077, + "learning_rate": 5e-06, + "loss": 1.3016, + "mean_token_accuracy": 0.6766445562243462, + "num_tokens": 8727543.0, + "step": 671 + }, + { + "epoch": 0.43008, + "grad_norm": 3.548112630844116, + "learning_rate": 5e-06, + "loss": 1.2104, + "mean_token_accuracy": 0.672496572136879, + "num_tokens": 8741690.0, + "step": 672 + }, + { + "epoch": 0.43072, + "grad_norm": 3.170255422592163, + "learning_rate": 5e-06, + "loss": 1.2556, + "mean_token_accuracy": 0.6616998463869095, + "num_tokens": 8756679.0, + "step": 673 + }, + { + "epoch": 0.43136, + "grad_norm": 4.215174674987793, + "learning_rate": 5e-06, + "loss": 1.3897, + "mean_token_accuracy": 0.6362641379237175, + "num_tokens": 8766546.0, + "step": 674 + }, + { + "epoch": 0.432, + "grad_norm": 3.93945050239563, + "learning_rate": 5e-06, + "loss": 1.5469, + "mean_token_accuracy": 0.6284241452813148, + "num_tokens": 8777535.0, + "step": 675 + }, + { + "epoch": 0.43264, + "grad_norm": 3.380723237991333, + "learning_rate": 5e-06, + "loss": 1.2613, + "mean_token_accuracy": 0.6732500046491623, + "num_tokens": 8790959.0, + "step": 676 + }, + { + "epoch": 0.43328, + "grad_norm": 3.63143253326416, + "learning_rate": 5e-06, + "loss": 1.3131, + "mean_token_accuracy": 0.6863637119531631, + "num_tokens": 8804351.0, + "step": 677 + }, + { + "epoch": 0.43392, + "grad_norm": 3.2990407943725586, + "learning_rate": 5e-06, + "loss": 1.3531, + "mean_token_accuracy": 0.6465996205806732, + "num_tokens": 8819041.0, + "step": 678 + }, + { + "epoch": 0.43456, + "grad_norm": 3.4251043796539307, + "learning_rate": 5e-06, + "loss": 1.5304, + "mean_token_accuracy": 0.6462676748633385, + "num_tokens": 8833926.0, + "step": 679 + }, + { + "epoch": 0.4352, + "grad_norm": 3.7104909420013428, + "learning_rate": 5e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.695975661277771, + "num_tokens": 8845597.0, + "step": 680 + }, + { + "epoch": 0.43584, + "grad_norm": 3.3179309368133545, + "learning_rate": 5e-06, + "loss": 1.3163, + "mean_token_accuracy": 0.6567405387759209, + "num_tokens": 8860280.0, + "step": 681 + }, + { + "epoch": 0.43648, + "grad_norm": 3.607069730758667, + "learning_rate": 5e-06, + "loss": 1.3554, + "mean_token_accuracy": 0.6537708342075348, + "num_tokens": 8873793.0, + "step": 682 + }, + { + "epoch": 0.43712, + "grad_norm": 3.275057554244995, + "learning_rate": 5e-06, + "loss": 1.3613, + "mean_token_accuracy": 0.6499952375888824, + "num_tokens": 8886476.0, + "step": 683 + }, + { + "epoch": 0.43776, + "grad_norm": 3.3160624504089355, + "learning_rate": 5e-06, + "loss": 1.2898, + "mean_token_accuracy": 0.6765732616186142, + "num_tokens": 8900749.0, + "step": 684 + }, + { + "epoch": 0.4384, + "grad_norm": 3.347907543182373, + "learning_rate": 5e-06, + "loss": 1.2472, + "mean_token_accuracy": 0.6879568248987198, + "num_tokens": 8913760.0, + "step": 685 + }, + { + "epoch": 0.43904, + "grad_norm": 3.862211227416992, + "learning_rate": 5e-06, + "loss": 1.424, + "mean_token_accuracy": 0.6521790996193886, + "num_tokens": 8926061.0, + "step": 686 + }, + { + "epoch": 0.43968, + "grad_norm": 3.4736506938934326, + "learning_rate": 5e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.7187496647238731, + "num_tokens": 8939507.0, + "step": 687 + }, + { + "epoch": 0.44032, + "grad_norm": 3.3794503211975098, + "learning_rate": 5e-06, + "loss": 1.22, + "mean_token_accuracy": 0.6592478863894939, + "num_tokens": 8952411.0, + "step": 688 + }, + { + "epoch": 0.44096, + "grad_norm": 3.6152052879333496, + "learning_rate": 5e-06, + "loss": 1.1974, + "mean_token_accuracy": 0.6630196422338486, + "num_tokens": 8966507.0, + "step": 689 + }, + { + "epoch": 0.4416, + "grad_norm": 4.036067485809326, + "learning_rate": 5e-06, + "loss": 1.3094, + "mean_token_accuracy": 0.6584384590387344, + "num_tokens": 8978156.0, + "step": 690 + }, + { + "epoch": 0.44224, + "grad_norm": 3.740229845046997, + "learning_rate": 5e-06, + "loss": 1.3378, + "mean_token_accuracy": 0.6580435633659363, + "num_tokens": 8990929.0, + "step": 691 + }, + { + "epoch": 0.44288, + "grad_norm": 3.417703866958618, + "learning_rate": 5e-06, + "loss": 1.4495, + "mean_token_accuracy": 0.6308450028300285, + "num_tokens": 9004923.0, + "step": 692 + }, + { + "epoch": 0.44352, + "grad_norm": 4.16903829574585, + "learning_rate": 5e-06, + "loss": 1.4896, + "mean_token_accuracy": 0.6304129362106323, + "num_tokens": 9018493.0, + "step": 693 + }, + { + "epoch": 0.44416, + "grad_norm": 3.383941411972046, + "learning_rate": 5e-06, + "loss": 1.4175, + "mean_token_accuracy": 0.6552764996886253, + "num_tokens": 9032465.0, + "step": 694 + }, + { + "epoch": 0.4448, + "grad_norm": 3.398747205734253, + "learning_rate": 5e-06, + "loss": 1.2548, + "mean_token_accuracy": 0.6547529026865959, + "num_tokens": 9045706.0, + "step": 695 + }, + { + "epoch": 0.44544, + "grad_norm": 3.575016975402832, + "learning_rate": 5e-06, + "loss": 1.3807, + "mean_token_accuracy": 0.6451460421085358, + "num_tokens": 9059033.0, + "step": 696 + }, + { + "epoch": 0.44608, + "grad_norm": 3.3936767578125, + "learning_rate": 5e-06, + "loss": 1.2398, + "mean_token_accuracy": 0.6596207022666931, + "num_tokens": 9072068.0, + "step": 697 + }, + { + "epoch": 0.44672, + "grad_norm": 3.675055980682373, + "learning_rate": 5e-06, + "loss": 1.299, + "mean_token_accuracy": 0.6535976231098175, + "num_tokens": 9084713.0, + "step": 698 + }, + { + "epoch": 0.44736, + "grad_norm": 3.564359426498413, + "learning_rate": 5e-06, + "loss": 1.3035, + "mean_token_accuracy": 0.70219536870718, + "num_tokens": 9097477.0, + "step": 699 + }, + { + "epoch": 0.448, + "grad_norm": 3.400031566619873, + "learning_rate": 5e-06, + "loss": 1.4423, + "mean_token_accuracy": 0.6343613564968109, + "num_tokens": 9112699.0, + "step": 700 + }, + { + "epoch": 0.44864, + "grad_norm": 3.9619691371917725, + "learning_rate": 5e-06, + "loss": 1.3641, + "mean_token_accuracy": 0.6419604942202568, + "num_tokens": 9124943.0, + "step": 701 + }, + { + "epoch": 0.44928, + "grad_norm": 3.6950299739837646, + "learning_rate": 5e-06, + "loss": 1.1939, + "mean_token_accuracy": 0.6645899340510368, + "num_tokens": 9136459.0, + "step": 702 + }, + { + "epoch": 0.44992, + "grad_norm": 2.9667203426361084, + "learning_rate": 5e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.7160019502043724, + "num_tokens": 9153604.0, + "step": 703 + }, + { + "epoch": 0.45056, + "grad_norm": 3.338284969329834, + "learning_rate": 5e-06, + "loss": 1.3143, + "mean_token_accuracy": 0.6503826230764389, + "num_tokens": 9168475.0, + "step": 704 + }, + { + "epoch": 0.4512, + "grad_norm": 3.5206825733184814, + "learning_rate": 5e-06, + "loss": 1.3616, + "mean_token_accuracy": 0.6541919782757759, + "num_tokens": 9180057.0, + "step": 705 + }, + { + "epoch": 0.45184, + "grad_norm": 3.8366057872772217, + "learning_rate": 5e-06, + "loss": 1.0917, + "mean_token_accuracy": 0.701392412185669, + "num_tokens": 9191539.0, + "step": 706 + }, + { + "epoch": 0.45248, + "grad_norm": 3.7983529567718506, + "learning_rate": 5e-06, + "loss": 1.3878, + "mean_token_accuracy": 0.6564305797219276, + "num_tokens": 9201977.0, + "step": 707 + }, + { + "epoch": 0.45312, + "grad_norm": 4.199508190155029, + "learning_rate": 5e-06, + "loss": 1.1504, + "mean_token_accuracy": 0.6836559697985649, + "num_tokens": 9212342.0, + "step": 708 + }, + { + "epoch": 0.45376, + "grad_norm": 3.685267686843872, + "learning_rate": 5e-06, + "loss": 1.3698, + "mean_token_accuracy": 0.6535738334059715, + "num_tokens": 9225755.0, + "step": 709 + }, + { + "epoch": 0.4544, + "grad_norm": 3.736710786819458, + "learning_rate": 5e-06, + "loss": 1.2562, + "mean_token_accuracy": 0.6680023595690727, + "num_tokens": 9237738.0, + "step": 710 + }, + { + "epoch": 0.45504, + "grad_norm": 3.5703136920928955, + "learning_rate": 5e-06, + "loss": 1.5439, + "mean_token_accuracy": 0.6260874792933464, + "num_tokens": 9250443.0, + "step": 711 + }, + { + "epoch": 0.45568, + "grad_norm": 3.6314592361450195, + "learning_rate": 5e-06, + "loss": 1.2796, + "mean_token_accuracy": 0.6624791696667671, + "num_tokens": 9262486.0, + "step": 712 + }, + { + "epoch": 0.45632, + "grad_norm": 3.988708019256592, + "learning_rate": 5e-06, + "loss": 1.3134, + "mean_token_accuracy": 0.6787229478359222, + "num_tokens": 9274110.0, + "step": 713 + }, + { + "epoch": 0.45696, + "grad_norm": 4.141347885131836, + "learning_rate": 5e-06, + "loss": 1.2379, + "mean_token_accuracy": 0.6678915992379189, + "num_tokens": 9285461.0, + "step": 714 + }, + { + "epoch": 0.4576, + "grad_norm": 4.030619144439697, + "learning_rate": 5e-06, + "loss": 1.4524, + "mean_token_accuracy": 0.6373696550726891, + "num_tokens": 9297793.0, + "step": 715 + }, + { + "epoch": 0.45824, + "grad_norm": 3.8397583961486816, + "learning_rate": 5e-06, + "loss": 1.3026, + "mean_token_accuracy": 0.6478614434599876, + "num_tokens": 9309837.0, + "step": 716 + }, + { + "epoch": 0.45888, + "grad_norm": 3.211944580078125, + "learning_rate": 5e-06, + "loss": 1.3505, + "mean_token_accuracy": 0.6571612730622292, + "num_tokens": 9325986.0, + "step": 717 + }, + { + "epoch": 0.45952, + "grad_norm": 3.775752305984497, + "learning_rate": 5e-06, + "loss": 1.4041, + "mean_token_accuracy": 0.6476349085569382, + "num_tokens": 9339126.0, + "step": 718 + }, + { + "epoch": 0.46016, + "grad_norm": 3.311610221862793, + "learning_rate": 5e-06, + "loss": 1.2113, + "mean_token_accuracy": 0.6731600984930992, + "num_tokens": 9354016.0, + "step": 719 + }, + { + "epoch": 0.4608, + "grad_norm": 3.6527278423309326, + "learning_rate": 5e-06, + "loss": 1.4461, + "mean_token_accuracy": 0.6173442825675011, + "num_tokens": 9366261.0, + "step": 720 + }, + { + "epoch": 0.46144, + "grad_norm": 3.3843095302581787, + "learning_rate": 5e-06, + "loss": 1.1579, + "mean_token_accuracy": 0.6829282343387604, + "num_tokens": 9381276.0, + "step": 721 + }, + { + "epoch": 0.46208, + "grad_norm": 3.229539394378662, + "learning_rate": 5e-06, + "loss": 1.2984, + "mean_token_accuracy": 0.6582097262144089, + "num_tokens": 9395271.0, + "step": 722 + }, + { + "epoch": 0.46272, + "grad_norm": 3.170426607131958, + "learning_rate": 5e-06, + "loss": 1.2019, + "mean_token_accuracy": 0.6727022156119347, + "num_tokens": 9409204.0, + "step": 723 + }, + { + "epoch": 0.46336, + "grad_norm": 3.8384881019592285, + "learning_rate": 5e-06, + "loss": 1.3632, + "mean_token_accuracy": 0.6532674580812454, + "num_tokens": 9423838.0, + "step": 724 + }, + { + "epoch": 0.464, + "grad_norm": 4.176010608673096, + "learning_rate": 5e-06, + "loss": 1.3754, + "mean_token_accuracy": 0.6710385903716087, + "num_tokens": 9434639.0, + "step": 725 + }, + { + "epoch": 0.46464, + "grad_norm": 3.5365447998046875, + "learning_rate": 5e-06, + "loss": 1.2791, + "mean_token_accuracy": 0.6657568737864494, + "num_tokens": 9446770.0, + "step": 726 + }, + { + "epoch": 0.46528, + "grad_norm": 3.4129528999328613, + "learning_rate": 5e-06, + "loss": 1.3072, + "mean_token_accuracy": 0.6470441669225693, + "num_tokens": 9460600.0, + "step": 727 + }, + { + "epoch": 0.46592, + "grad_norm": 4.013781547546387, + "learning_rate": 5e-06, + "loss": 1.3892, + "mean_token_accuracy": 0.656228207051754, + "num_tokens": 9472044.0, + "step": 728 + }, + { + "epoch": 0.46656, + "grad_norm": 3.449136734008789, + "learning_rate": 5e-06, + "loss": 1.372, + "mean_token_accuracy": 0.6523317843675613, + "num_tokens": 9484363.0, + "step": 729 + }, + { + "epoch": 0.4672, + "grad_norm": 3.7383124828338623, + "learning_rate": 5e-06, + "loss": 1.3216, + "mean_token_accuracy": 0.6544977352023125, + "num_tokens": 9496620.0, + "step": 730 + }, + { + "epoch": 0.46784, + "grad_norm": 3.362048864364624, + "learning_rate": 5e-06, + "loss": 1.3853, + "mean_token_accuracy": 0.6726252436637878, + "num_tokens": 9510670.0, + "step": 731 + }, + { + "epoch": 0.46848, + "grad_norm": 3.314443826675415, + "learning_rate": 5e-06, + "loss": 1.4104, + "mean_token_accuracy": 0.6626652106642723, + "num_tokens": 9525327.0, + "step": 732 + }, + { + "epoch": 0.46912, + "grad_norm": 3.8517005443573, + "learning_rate": 5e-06, + "loss": 1.3099, + "mean_token_accuracy": 0.650071769952774, + "num_tokens": 9537583.0, + "step": 733 + }, + { + "epoch": 0.46976, + "grad_norm": 3.4071006774902344, + "learning_rate": 5e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6916024461388588, + "num_tokens": 9550411.0, + "step": 734 + }, + { + "epoch": 0.4704, + "grad_norm": 4.703375816345215, + "learning_rate": 5e-06, + "loss": 1.2984, + "mean_token_accuracy": 0.6852922365069389, + "num_tokens": 9561144.0, + "step": 735 + }, + { + "epoch": 0.47104, + "grad_norm": 3.5826289653778076, + "learning_rate": 5e-06, + "loss": 1.5058, + "mean_token_accuracy": 0.6203412935137749, + "num_tokens": 9575541.0, + "step": 736 + }, + { + "epoch": 0.47168, + "grad_norm": 3.2071099281311035, + "learning_rate": 5e-06, + "loss": 1.2267, + "mean_token_accuracy": 0.6681589409708977, + "num_tokens": 9591163.0, + "step": 737 + }, + { + "epoch": 0.47232, + "grad_norm": 3.8028645515441895, + "learning_rate": 5e-06, + "loss": 1.4041, + "mean_token_accuracy": 0.6411704197525978, + "num_tokens": 9604337.0, + "step": 738 + }, + { + "epoch": 0.47296, + "grad_norm": 3.5578410625457764, + "learning_rate": 5e-06, + "loss": 1.4089, + "mean_token_accuracy": 0.6202419102191925, + "num_tokens": 9618994.0, + "step": 739 + }, + { + "epoch": 0.4736, + "grad_norm": 4.015564441680908, + "learning_rate": 5e-06, + "loss": 1.3252, + "mean_token_accuracy": 0.6425078436732292, + "num_tokens": 9629590.0, + "step": 740 + }, + { + "epoch": 0.47424, + "grad_norm": 3.3953940868377686, + "learning_rate": 5e-06, + "loss": 1.2248, + "mean_token_accuracy": 0.6555972173810005, + "num_tokens": 9643383.0, + "step": 741 + }, + { + "epoch": 0.47488, + "grad_norm": 3.509755849838257, + "learning_rate": 5e-06, + "loss": 1.2274, + "mean_token_accuracy": 0.6479950994253159, + "num_tokens": 9657137.0, + "step": 742 + }, + { + "epoch": 0.47552, + "grad_norm": 3.403864622116089, + "learning_rate": 5e-06, + "loss": 1.3223, + "mean_token_accuracy": 0.6538697630167007, + "num_tokens": 9670515.0, + "step": 743 + }, + { + "epoch": 0.47616, + "grad_norm": 3.5815911293029785, + "learning_rate": 5e-06, + "loss": 1.3843, + "mean_token_accuracy": 0.6228384971618652, + "num_tokens": 9682689.0, + "step": 744 + }, + { + "epoch": 0.4768, + "grad_norm": 4.452811241149902, + "learning_rate": 5e-06, + "loss": 1.3518, + "mean_token_accuracy": 0.6771413907408714, + "num_tokens": 9693372.0, + "step": 745 + }, + { + "epoch": 0.47744, + "grad_norm": 4.269803524017334, + "learning_rate": 5e-06, + "loss": 1.212, + "mean_token_accuracy": 0.672097809612751, + "num_tokens": 9702991.0, + "step": 746 + }, + { + "epoch": 0.47808, + "grad_norm": 3.6928703784942627, + "learning_rate": 5e-06, + "loss": 1.3458, + "mean_token_accuracy": 0.6829958707094193, + "num_tokens": 9716472.0, + "step": 747 + }, + { + "epoch": 0.47872, + "grad_norm": 3.9352831840515137, + "learning_rate": 5e-06, + "loss": 1.4422, + "mean_token_accuracy": 0.6457963958382607, + "num_tokens": 9730346.0, + "step": 748 + }, + { + "epoch": 0.47936, + "grad_norm": 4.322943687438965, + "learning_rate": 5e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.6857285089790821, + "num_tokens": 9742613.0, + "step": 749 + }, + { + "epoch": 0.48, + "grad_norm": 3.4020259380340576, + "learning_rate": 5e-06, + "loss": 1.3059, + "mean_token_accuracy": 0.6406174898147583, + "num_tokens": 9754833.0, + "step": 750 + }, + { + "epoch": 0.48064, + "grad_norm": 3.288209915161133, + "learning_rate": 5e-06, + "loss": 1.3021, + "mean_token_accuracy": 0.6589253880083561, + "num_tokens": 9769316.0, + "step": 751 + }, + { + "epoch": 0.48128, + "grad_norm": 3.2498161792755127, + "learning_rate": 5e-06, + "loss": 1.4946, + "mean_token_accuracy": 0.6402696147561073, + "num_tokens": 9783768.0, + "step": 752 + }, + { + "epoch": 0.48192, + "grad_norm": 3.8162779808044434, + "learning_rate": 5e-06, + "loss": 1.3911, + "mean_token_accuracy": 0.6404719427227974, + "num_tokens": 9797279.0, + "step": 753 + }, + { + "epoch": 0.48256, + "grad_norm": 4.253142833709717, + "learning_rate": 5e-06, + "loss": 1.4797, + "mean_token_accuracy": 0.6313204690814018, + "num_tokens": 9808629.0, + "step": 754 + }, + { + "epoch": 0.4832, + "grad_norm": 3.716420888900757, + "learning_rate": 5e-06, + "loss": 1.3401, + "mean_token_accuracy": 0.6325643435120583, + "num_tokens": 9821968.0, + "step": 755 + }, + { + "epoch": 0.48384, + "grad_norm": 3.5335354804992676, + "learning_rate": 5e-06, + "loss": 1.3272, + "mean_token_accuracy": 0.6529423892498016, + "num_tokens": 9835554.0, + "step": 756 + }, + { + "epoch": 0.48448, + "grad_norm": 3.359344005584717, + "learning_rate": 5e-06, + "loss": 1.3782, + "mean_token_accuracy": 0.6145281083881855, + "num_tokens": 9849101.0, + "step": 757 + }, + { + "epoch": 0.48512, + "grad_norm": 3.4545371532440186, + "learning_rate": 5e-06, + "loss": 1.5942, + "mean_token_accuracy": 0.6196755021810532, + "num_tokens": 9862033.0, + "step": 758 + }, + { + "epoch": 0.48576, + "grad_norm": 3.6362133026123047, + "learning_rate": 5e-06, + "loss": 1.3524, + "mean_token_accuracy": 0.6347524076700211, + "num_tokens": 9873692.0, + "step": 759 + }, + { + "epoch": 0.4864, + "grad_norm": 3.699906826019287, + "learning_rate": 5e-06, + "loss": 1.2751, + "mean_token_accuracy": 0.656343087553978, + "num_tokens": 9885375.0, + "step": 760 + }, + { + "epoch": 0.48704, + "grad_norm": 3.8104074001312256, + "learning_rate": 5e-06, + "loss": 1.4196, + "mean_token_accuracy": 0.630670964717865, + "num_tokens": 9897179.0, + "step": 761 + }, + { + "epoch": 0.48768, + "grad_norm": 3.5518436431884766, + "learning_rate": 5e-06, + "loss": 1.1912, + "mean_token_accuracy": 0.6667153909802437, + "num_tokens": 9909370.0, + "step": 762 + }, + { + "epoch": 0.48832, + "grad_norm": 3.4174013137817383, + "learning_rate": 5e-06, + "loss": 1.4634, + "mean_token_accuracy": 0.6383125334978104, + "num_tokens": 9924593.0, + "step": 763 + }, + { + "epoch": 0.48896, + "grad_norm": 3.690223217010498, + "learning_rate": 5e-06, + "loss": 1.2632, + "mean_token_accuracy": 0.6747411042451859, + "num_tokens": 9938123.0, + "step": 764 + }, + { + "epoch": 0.4896, + "grad_norm": 3.189453125, + "learning_rate": 5e-06, + "loss": 1.2344, + "mean_token_accuracy": 0.6770232170820236, + "num_tokens": 9952420.0, + "step": 765 + }, + { + "epoch": 0.49024, + "grad_norm": 4.607802867889404, + "learning_rate": 5e-06, + "loss": 1.2471, + "mean_token_accuracy": 0.6755125150084496, + "num_tokens": 9962228.0, + "step": 766 + }, + { + "epoch": 0.49088, + "grad_norm": 3.5634379386901855, + "learning_rate": 5e-06, + "loss": 1.1842, + "mean_token_accuracy": 0.6926667168736458, + "num_tokens": 9974613.0, + "step": 767 + }, + { + "epoch": 0.49152, + "grad_norm": 3.5588109493255615, + "learning_rate": 5e-06, + "loss": 1.3507, + "mean_token_accuracy": 0.6611402109265327, + "num_tokens": 9988413.0, + "step": 768 + }, + { + "epoch": 0.49216, + "grad_norm": 3.356700897216797, + "learning_rate": 5e-06, + "loss": 1.6306, + "mean_token_accuracy": 0.6335580386221409, + "num_tokens": 10003108.0, + "step": 769 + }, + { + "epoch": 0.4928, + "grad_norm": 4.425334453582764, + "learning_rate": 5e-06, + "loss": 1.379, + "mean_token_accuracy": 0.6304365694522858, + "num_tokens": 10015961.0, + "step": 770 + }, + { + "epoch": 0.49344, + "grad_norm": 3.2346768379211426, + "learning_rate": 5e-06, + "loss": 1.194, + "mean_token_accuracy": 0.6939344108104706, + "num_tokens": 10028885.0, + "step": 771 + }, + { + "epoch": 0.49408, + "grad_norm": 2.969572067260742, + "learning_rate": 5e-06, + "loss": 1.1707, + "mean_token_accuracy": 0.6672687157988548, + "num_tokens": 10044177.0, + "step": 772 + }, + { + "epoch": 0.49472, + "grad_norm": 3.9597513675689697, + "learning_rate": 5e-06, + "loss": 1.2808, + "mean_token_accuracy": 0.6557754501700401, + "num_tokens": 10055808.0, + "step": 773 + }, + { + "epoch": 0.49536, + "grad_norm": 3.450819730758667, + "learning_rate": 5e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.6846578419208527, + "num_tokens": 10071283.0, + "step": 774 + }, + { + "epoch": 0.496, + "grad_norm": 3.999828338623047, + "learning_rate": 5e-06, + "loss": 1.4067, + "mean_token_accuracy": 0.6274128258228302, + "num_tokens": 10083352.0, + "step": 775 + }, + { + "epoch": 0.49664, + "grad_norm": 4.048245429992676, + "learning_rate": 5e-06, + "loss": 1.2898, + "mean_token_accuracy": 0.6655979752540588, + "num_tokens": 10095482.0, + "step": 776 + }, + { + "epoch": 0.49728, + "grad_norm": 3.832430124282837, + "learning_rate": 5e-06, + "loss": 1.3077, + "mean_token_accuracy": 0.6555028632283211, + "num_tokens": 10108448.0, + "step": 777 + }, + { + "epoch": 0.49792, + "grad_norm": 3.215700626373291, + "learning_rate": 5e-06, + "loss": 1.3706, + "mean_token_accuracy": 0.653311513364315, + "num_tokens": 10122582.0, + "step": 778 + }, + { + "epoch": 0.49856, + "grad_norm": 3.9083938598632812, + "learning_rate": 5e-06, + "loss": 1.4162, + "mean_token_accuracy": 0.666377916932106, + "num_tokens": 10133857.0, + "step": 779 + }, + { + "epoch": 0.4992, + "grad_norm": 3.3011085987091064, + "learning_rate": 5e-06, + "loss": 1.4602, + "mean_token_accuracy": 0.6177773475646973, + "num_tokens": 10149390.0, + "step": 780 + }, + { + "epoch": 0.49984, + "grad_norm": 4.202136516571045, + "learning_rate": 5e-06, + "loss": 1.348, + "mean_token_accuracy": 0.6335294619202614, + "num_tokens": 10160191.0, + "step": 781 + }, + { + "epoch": 0.50048, + "grad_norm": 3.8100340366363525, + "learning_rate": 5e-06, + "loss": 1.286, + "mean_token_accuracy": 0.664748452603817, + "num_tokens": 10174169.0, + "step": 782 + }, + { + "epoch": 0.50112, + "grad_norm": 3.2231757640838623, + "learning_rate": 5e-06, + "loss": 1.5273, + "mean_token_accuracy": 0.6323799937963486, + "num_tokens": 10189013.0, + "step": 783 + }, + { + "epoch": 0.50176, + "grad_norm": 3.380337953567505, + "learning_rate": 5e-06, + "loss": 1.2976, + "mean_token_accuracy": 0.6576580554246902, + "num_tokens": 10205197.0, + "step": 784 + }, + { + "epoch": 0.5024, + "grad_norm": 3.5312960147857666, + "learning_rate": 5e-06, + "loss": 1.4795, + "mean_token_accuracy": 0.6108041927218437, + "num_tokens": 10218782.0, + "step": 785 + }, + { + "epoch": 0.50304, + "grad_norm": 3.7805802822113037, + "learning_rate": 5e-06, + "loss": 1.1567, + "mean_token_accuracy": 0.7012158781290054, + "num_tokens": 10229754.0, + "step": 786 + }, + { + "epoch": 0.50368, + "grad_norm": 3.575208902359009, + "learning_rate": 5e-06, + "loss": 1.2914, + "mean_token_accuracy": 0.6698039025068283, + "num_tokens": 10243519.0, + "step": 787 + }, + { + "epoch": 0.50432, + "grad_norm": 4.018414497375488, + "learning_rate": 5e-06, + "loss": 1.2843, + "mean_token_accuracy": 0.6500632241368294, + "num_tokens": 10255436.0, + "step": 788 + }, + { + "epoch": 0.50496, + "grad_norm": 3.3472957611083984, + "learning_rate": 5e-06, + "loss": 1.409, + "mean_token_accuracy": 0.6223405599594116, + "num_tokens": 10270218.0, + "step": 789 + }, + { + "epoch": 0.5056, + "grad_norm": 3.555922031402588, + "learning_rate": 5e-06, + "loss": 1.4861, + "mean_token_accuracy": 0.6319947242736816, + "num_tokens": 10283818.0, + "step": 790 + }, + { + "epoch": 0.50624, + "grad_norm": 3.2534327507019043, + "learning_rate": 5e-06, + "loss": 1.505, + "mean_token_accuracy": 0.6172455549240112, + "num_tokens": 10299119.0, + "step": 791 + }, + { + "epoch": 0.50688, + "grad_norm": 3.78558087348938, + "learning_rate": 5e-06, + "loss": 1.3765, + "mean_token_accuracy": 0.6513939723372459, + "num_tokens": 10311103.0, + "step": 792 + }, + { + "epoch": 0.50752, + "grad_norm": 3.426884412765503, + "learning_rate": 5e-06, + "loss": 1.3942, + "mean_token_accuracy": 0.6268276050686836, + "num_tokens": 10324256.0, + "step": 793 + }, + { + "epoch": 0.50816, + "grad_norm": 3.586442470550537, + "learning_rate": 5e-06, + "loss": 1.291, + "mean_token_accuracy": 0.6560362279415131, + "num_tokens": 10336275.0, + "step": 794 + }, + { + "epoch": 0.5088, + "grad_norm": 3.6246700286865234, + "learning_rate": 5e-06, + "loss": 1.4371, + "mean_token_accuracy": 0.64004335552454, + "num_tokens": 10348939.0, + "step": 795 + }, + { + "epoch": 0.50944, + "grad_norm": 3.8206660747528076, + "learning_rate": 5e-06, + "loss": 1.3641, + "mean_token_accuracy": 0.6557733938097954, + "num_tokens": 10360108.0, + "step": 796 + }, + { + "epoch": 0.51008, + "grad_norm": 4.05738639831543, + "learning_rate": 5e-06, + "loss": 1.3265, + "mean_token_accuracy": 0.6359497681260109, + "num_tokens": 10371144.0, + "step": 797 + }, + { + "epoch": 0.51072, + "grad_norm": 3.9649102687835693, + "learning_rate": 5e-06, + "loss": 1.4398, + "mean_token_accuracy": 0.6443544700741768, + "num_tokens": 10381537.0, + "step": 798 + }, + { + "epoch": 0.51136, + "grad_norm": 3.3141987323760986, + "learning_rate": 5e-06, + "loss": 1.4374, + "mean_token_accuracy": 0.6308365762233734, + "num_tokens": 10396539.0, + "step": 799 + }, + { + "epoch": 0.512, + "grad_norm": 3.1399598121643066, + "learning_rate": 5e-06, + "loss": 1.3463, + "mean_token_accuracy": 0.6553637236356735, + "num_tokens": 10412962.0, + "step": 800 + }, + { + "epoch": 0.51264, + "grad_norm": 3.690521478652954, + "learning_rate": 5e-06, + "loss": 1.4432, + "mean_token_accuracy": 0.6525074169039726, + "num_tokens": 10426314.0, + "step": 801 + }, + { + "epoch": 0.51328, + "grad_norm": 3.5348432064056396, + "learning_rate": 5e-06, + "loss": 1.5491, + "mean_token_accuracy": 0.5989897102117538, + "num_tokens": 10439348.0, + "step": 802 + }, + { + "epoch": 0.51392, + "grad_norm": 3.760218620300293, + "learning_rate": 5e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6977821663022041, + "num_tokens": 10449973.0, + "step": 803 + }, + { + "epoch": 0.51456, + "grad_norm": 3.7311551570892334, + "learning_rate": 5e-06, + "loss": 1.4811, + "mean_token_accuracy": 0.630589596927166, + "num_tokens": 10462019.0, + "step": 804 + }, + { + "epoch": 0.5152, + "grad_norm": 3.4669084548950195, + "learning_rate": 5e-06, + "loss": 1.3778, + "mean_token_accuracy": 0.6389184445142746, + "num_tokens": 10474583.0, + "step": 805 + }, + { + "epoch": 0.51584, + "grad_norm": 3.2782742977142334, + "learning_rate": 5e-06, + "loss": 1.2256, + "mean_token_accuracy": 0.6817874610424042, + "num_tokens": 10488568.0, + "step": 806 + }, + { + "epoch": 0.51648, + "grad_norm": 4.345005035400391, + "learning_rate": 5e-06, + "loss": 1.3498, + "mean_token_accuracy": 0.6508120521903038, + "num_tokens": 10500968.0, + "step": 807 + }, + { + "epoch": 0.51712, + "grad_norm": 3.8742589950561523, + "learning_rate": 5e-06, + "loss": 1.4047, + "mean_token_accuracy": 0.634697936475277, + "num_tokens": 10513586.0, + "step": 808 + }, + { + "epoch": 0.51776, + "grad_norm": 3.4968934059143066, + "learning_rate": 5e-06, + "loss": 1.4397, + "mean_token_accuracy": 0.6320827975869179, + "num_tokens": 10528263.0, + "step": 809 + }, + { + "epoch": 0.5184, + "grad_norm": 3.89860463142395, + "learning_rate": 5e-06, + "loss": 1.299, + "mean_token_accuracy": 0.6727789714932442, + "num_tokens": 10539352.0, + "step": 810 + }, + { + "epoch": 0.51904, + "grad_norm": 3.1833720207214355, + "learning_rate": 5e-06, + "loss": 1.2753, + "mean_token_accuracy": 0.6482102572917938, + "num_tokens": 10553515.0, + "step": 811 + }, + { + "epoch": 0.51968, + "grad_norm": 3.3082292079925537, + "learning_rate": 5e-06, + "loss": 1.2998, + "mean_token_accuracy": 0.6546992510557175, + "num_tokens": 10566663.0, + "step": 812 + }, + { + "epoch": 0.52032, + "grad_norm": 3.6185340881347656, + "learning_rate": 5e-06, + "loss": 1.3815, + "mean_token_accuracy": 0.6625709310173988, + "num_tokens": 10579584.0, + "step": 813 + }, + { + "epoch": 0.52096, + "grad_norm": 3.55534291267395, + "learning_rate": 5e-06, + "loss": 1.1128, + "mean_token_accuracy": 0.6956586241722107, + "num_tokens": 10591791.0, + "step": 814 + }, + { + "epoch": 0.5216, + "grad_norm": 3.650907516479492, + "learning_rate": 5e-06, + "loss": 1.4477, + "mean_token_accuracy": 0.624105378985405, + "num_tokens": 10604975.0, + "step": 815 + }, + { + "epoch": 0.52224, + "grad_norm": 3.9432995319366455, + "learning_rate": 5e-06, + "loss": 1.3038, + "mean_token_accuracy": 0.6531900316476822, + "num_tokens": 10616449.0, + "step": 816 + }, + { + "epoch": 0.52288, + "grad_norm": 3.8777639865875244, + "learning_rate": 5e-06, + "loss": 1.47, + "mean_token_accuracy": 0.6426081731915474, + "num_tokens": 10626994.0, + "step": 817 + }, + { + "epoch": 0.52352, + "grad_norm": 3.064539909362793, + "learning_rate": 5e-06, + "loss": 1.2586, + "mean_token_accuracy": 0.6581793427467346, + "num_tokens": 10642855.0, + "step": 818 + }, + { + "epoch": 0.52416, + "grad_norm": 3.3149263858795166, + "learning_rate": 5e-06, + "loss": 1.4482, + "mean_token_accuracy": 0.6364821642637253, + "num_tokens": 10656056.0, + "step": 819 + }, + { + "epoch": 0.5248, + "grad_norm": 4.199079990386963, + "learning_rate": 5e-06, + "loss": 1.4089, + "mean_token_accuracy": 0.6289070248603821, + "num_tokens": 10668057.0, + "step": 820 + }, + { + "epoch": 0.52544, + "grad_norm": 3.142550468444824, + "learning_rate": 5e-06, + "loss": 1.526, + "mean_token_accuracy": 0.6158655360341072, + "num_tokens": 10682235.0, + "step": 821 + }, + { + "epoch": 0.52608, + "grad_norm": 3.3503355979919434, + "learning_rate": 5e-06, + "loss": 1.2846, + "mean_token_accuracy": 0.6406090259552002, + "num_tokens": 10697588.0, + "step": 822 + }, + { + "epoch": 0.52672, + "grad_norm": 3.7879579067230225, + "learning_rate": 5e-06, + "loss": 1.4343, + "mean_token_accuracy": 0.6635532379150391, + "num_tokens": 10708737.0, + "step": 823 + }, + { + "epoch": 0.52736, + "grad_norm": 3.9766318798065186, + "learning_rate": 5e-06, + "loss": 1.352, + "mean_token_accuracy": 0.6262383349239826, + "num_tokens": 10721121.0, + "step": 824 + }, + { + "epoch": 0.528, + "grad_norm": 3.3426828384399414, + "learning_rate": 5e-06, + "loss": 1.2434, + "mean_token_accuracy": 0.6687643975019455, + "num_tokens": 10734364.0, + "step": 825 + }, + { + "epoch": 0.52864, + "grad_norm": 3.411301612854004, + "learning_rate": 5e-06, + "loss": 1.0815, + "mean_token_accuracy": 0.695383183658123, + "num_tokens": 10747101.0, + "step": 826 + }, + { + "epoch": 0.52928, + "grad_norm": 4.2775044441223145, + "learning_rate": 5e-06, + "loss": 1.1268, + "mean_token_accuracy": 0.7007181644439697, + "num_tokens": 10757435.0, + "step": 827 + }, + { + "epoch": 0.52992, + "grad_norm": 3.670020341873169, + "learning_rate": 5e-06, + "loss": 1.2358, + "mean_token_accuracy": 0.6553446725010872, + "num_tokens": 10768682.0, + "step": 828 + }, + { + "epoch": 0.53056, + "grad_norm": 3.6720056533813477, + "learning_rate": 5e-06, + "loss": 1.4772, + "mean_token_accuracy": 0.6402060613036156, + "num_tokens": 10781720.0, + "step": 829 + }, + { + "epoch": 0.5312, + "grad_norm": 4.194923400878906, + "learning_rate": 5e-06, + "loss": 1.1888, + "mean_token_accuracy": 0.6686526387929916, + "num_tokens": 10794142.0, + "step": 830 + }, + { + "epoch": 0.53184, + "grad_norm": 3.1744613647460938, + "learning_rate": 5e-06, + "loss": 1.4092, + "mean_token_accuracy": 0.6328324228525162, + "num_tokens": 10809011.0, + "step": 831 + }, + { + "epoch": 0.53248, + "grad_norm": 3.844196319580078, + "learning_rate": 5e-06, + "loss": 1.0955, + "mean_token_accuracy": 0.6974828243255615, + "num_tokens": 10819963.0, + "step": 832 + }, + { + "epoch": 0.53312, + "grad_norm": 3.668311834335327, + "learning_rate": 5e-06, + "loss": 1.2182, + "mean_token_accuracy": 0.6790367737412453, + "num_tokens": 10834237.0, + "step": 833 + }, + { + "epoch": 0.53376, + "grad_norm": 3.610236406326294, + "learning_rate": 5e-06, + "loss": 1.078, + "mean_token_accuracy": 0.7012447938323021, + "num_tokens": 10847095.0, + "step": 834 + }, + { + "epoch": 0.5344, + "grad_norm": 3.7682337760925293, + "learning_rate": 5e-06, + "loss": 1.1083, + "mean_token_accuracy": 0.6979804188013077, + "num_tokens": 10859861.0, + "step": 835 + }, + { + "epoch": 0.53504, + "grad_norm": 3.720351457595825, + "learning_rate": 5e-06, + "loss": 1.3023, + "mean_token_accuracy": 0.6436434611678123, + "num_tokens": 10871706.0, + "step": 836 + }, + { + "epoch": 0.53568, + "grad_norm": 3.608431816101074, + "learning_rate": 5e-06, + "loss": 1.5091, + "mean_token_accuracy": 0.613635927438736, + "num_tokens": 10884615.0, + "step": 837 + }, + { + "epoch": 0.53632, + "grad_norm": 3.321657419204712, + "learning_rate": 5e-06, + "loss": 1.1323, + "mean_token_accuracy": 0.6927084550261497, + "num_tokens": 10898152.0, + "step": 838 + }, + { + "epoch": 0.53696, + "grad_norm": 2.9468841552734375, + "learning_rate": 5e-06, + "loss": 1.3507, + "mean_token_accuracy": 0.6446966454386711, + "num_tokens": 10915868.0, + "step": 839 + }, + { + "epoch": 0.5376, + "grad_norm": 3.565668821334839, + "learning_rate": 5e-06, + "loss": 1.4081, + "mean_token_accuracy": 0.6363908722996712, + "num_tokens": 10930562.0, + "step": 840 + }, + { + "epoch": 0.53824, + "grad_norm": 3.9890897274017334, + "learning_rate": 5e-06, + "loss": 1.4644, + "mean_token_accuracy": 0.6419349610805511, + "num_tokens": 10942958.0, + "step": 841 + }, + { + "epoch": 0.53888, + "grad_norm": 3.5691657066345215, + "learning_rate": 5e-06, + "loss": 1.461, + "mean_token_accuracy": 0.6422456279397011, + "num_tokens": 10955566.0, + "step": 842 + }, + { + "epoch": 0.53952, + "grad_norm": 3.0054261684417725, + "learning_rate": 5e-06, + "loss": 1.32, + "mean_token_accuracy": 0.6669855192303658, + "num_tokens": 10971064.0, + "step": 843 + }, + { + "epoch": 0.54016, + "grad_norm": 2.9434778690338135, + "learning_rate": 5e-06, + "loss": 1.343, + "mean_token_accuracy": 0.6634941324591637, + "num_tokens": 10987737.0, + "step": 844 + }, + { + "epoch": 0.5408, + "grad_norm": 4.207048416137695, + "learning_rate": 5e-06, + "loss": 1.338, + "mean_token_accuracy": 0.6447890773415565, + "num_tokens": 10998859.0, + "step": 845 + }, + { + "epoch": 0.54144, + "grad_norm": 3.3798792362213135, + "learning_rate": 5e-06, + "loss": 1.5105, + "mean_token_accuracy": 0.6433539763092995, + "num_tokens": 11013214.0, + "step": 846 + }, + { + "epoch": 0.54208, + "grad_norm": 3.163572311401367, + "learning_rate": 5e-06, + "loss": 1.3399, + "mean_token_accuracy": 0.6501183435320854, + "num_tokens": 11028566.0, + "step": 847 + }, + { + "epoch": 0.54272, + "grad_norm": 3.5735156536102295, + "learning_rate": 5e-06, + "loss": 1.6305, + "mean_token_accuracy": 0.6012577600777149, + "num_tokens": 11043246.0, + "step": 848 + }, + { + "epoch": 0.54336, + "grad_norm": 4.034946441650391, + "learning_rate": 5e-06, + "loss": 1.1873, + "mean_token_accuracy": 0.6620588451623917, + "num_tokens": 11055228.0, + "step": 849 + }, + { + "epoch": 0.544, + "grad_norm": 3.2156589031219482, + "learning_rate": 5e-06, + "loss": 1.4072, + "mean_token_accuracy": 0.6418510600924492, + "num_tokens": 11068267.0, + "step": 850 + }, + { + "epoch": 0.54464, + "grad_norm": 4.0673723220825195, + "learning_rate": 5e-06, + "loss": 1.2545, + "mean_token_accuracy": 0.6594027280807495, + "num_tokens": 11080978.0, + "step": 851 + }, + { + "epoch": 0.54528, + "grad_norm": 3.5857112407684326, + "learning_rate": 5e-06, + "loss": 1.4054, + "mean_token_accuracy": 0.6293277516961098, + "num_tokens": 11096210.0, + "step": 852 + }, + { + "epoch": 0.54592, + "grad_norm": 3.829974889755249, + "learning_rate": 5e-06, + "loss": 1.3174, + "mean_token_accuracy": 0.6636649072170258, + "num_tokens": 11108254.0, + "step": 853 + }, + { + "epoch": 0.54656, + "grad_norm": 3.5567145347595215, + "learning_rate": 5e-06, + "loss": 1.2567, + "mean_token_accuracy": 0.677757516503334, + "num_tokens": 11120465.0, + "step": 854 + }, + { + "epoch": 0.5472, + "grad_norm": 4.473601341247559, + "learning_rate": 5e-06, + "loss": 1.2163, + "mean_token_accuracy": 0.6928970888257027, + "num_tokens": 11131956.0, + "step": 855 + }, + { + "epoch": 0.54784, + "grad_norm": 3.658292293548584, + "learning_rate": 5e-06, + "loss": 1.2401, + "mean_token_accuracy": 0.6552563831210136, + "num_tokens": 11144655.0, + "step": 856 + }, + { + "epoch": 0.54848, + "grad_norm": 3.061565399169922, + "learning_rate": 5e-06, + "loss": 1.1552, + "mean_token_accuracy": 0.6949977725744247, + "num_tokens": 11159566.0, + "step": 857 + }, + { + "epoch": 0.54912, + "grad_norm": 3.8165862560272217, + "learning_rate": 5e-06, + "loss": 1.2363, + "mean_token_accuracy": 0.6678136140108109, + "num_tokens": 11172278.0, + "step": 858 + }, + { + "epoch": 0.54976, + "grad_norm": 3.937960147857666, + "learning_rate": 5e-06, + "loss": 1.1055, + "mean_token_accuracy": 0.7062254995107651, + "num_tokens": 11183273.0, + "step": 859 + }, + { + "epoch": 0.5504, + "grad_norm": 3.9735426902770996, + "learning_rate": 5e-06, + "loss": 1.2674, + "mean_token_accuracy": 0.6709722802042961, + "num_tokens": 11194956.0, + "step": 860 + }, + { + "epoch": 0.55104, + "grad_norm": 3.741502523422241, + "learning_rate": 5e-06, + "loss": 1.4785, + "mean_token_accuracy": 0.6148476675152779, + "num_tokens": 11209741.0, + "step": 861 + }, + { + "epoch": 0.55168, + "grad_norm": 3.544828176498413, + "learning_rate": 5e-06, + "loss": 1.3682, + "mean_token_accuracy": 0.6354441791772842, + "num_tokens": 11222629.0, + "step": 862 + }, + { + "epoch": 0.55232, + "grad_norm": 3.3560214042663574, + "learning_rate": 5e-06, + "loss": 1.1048, + "mean_token_accuracy": 0.6948365420103073, + "num_tokens": 11237834.0, + "step": 863 + }, + { + "epoch": 0.55296, + "grad_norm": 3.512924909591675, + "learning_rate": 5e-06, + "loss": 1.181, + "mean_token_accuracy": 0.6836326494812965, + "num_tokens": 11250638.0, + "step": 864 + }, + { + "epoch": 0.5536, + "grad_norm": 4.28767728805542, + "learning_rate": 5e-06, + "loss": 1.475, + "mean_token_accuracy": 0.6338236667215824, + "num_tokens": 11261887.0, + "step": 865 + }, + { + "epoch": 0.55424, + "grad_norm": 3.2134881019592285, + "learning_rate": 5e-06, + "loss": 1.3229, + "mean_token_accuracy": 0.6470964848995209, + "num_tokens": 11275931.0, + "step": 866 + }, + { + "epoch": 0.55488, + "grad_norm": 3.689152240753174, + "learning_rate": 5e-06, + "loss": 1.3717, + "mean_token_accuracy": 0.6691553071141243, + "num_tokens": 11287528.0, + "step": 867 + }, + { + "epoch": 0.55552, + "grad_norm": 3.289281129837036, + "learning_rate": 5e-06, + "loss": 1.2839, + "mean_token_accuracy": 0.6697202101349831, + "num_tokens": 11300231.0, + "step": 868 + }, + { + "epoch": 0.55616, + "grad_norm": 3.278754234313965, + "learning_rate": 5e-06, + "loss": 1.1297, + "mean_token_accuracy": 0.6946472376585007, + "num_tokens": 11315024.0, + "step": 869 + }, + { + "epoch": 0.5568, + "grad_norm": 3.2673239707946777, + "learning_rate": 5e-06, + "loss": 1.4874, + "mean_token_accuracy": 0.6627454794943333, + "num_tokens": 11329475.0, + "step": 870 + }, + { + "epoch": 0.55744, + "grad_norm": 3.1076149940490723, + "learning_rate": 5e-06, + "loss": 1.3683, + "mean_token_accuracy": 0.6290438398718834, + "num_tokens": 11343973.0, + "step": 871 + }, + { + "epoch": 0.55808, + "grad_norm": 3.526763439178467, + "learning_rate": 5e-06, + "loss": 1.3713, + "mean_token_accuracy": 0.6168685257434845, + "num_tokens": 11356517.0, + "step": 872 + }, + { + "epoch": 0.55872, + "grad_norm": 3.46929931640625, + "learning_rate": 5e-06, + "loss": 1.3024, + "mean_token_accuracy": 0.6537005454301834, + "num_tokens": 11369229.0, + "step": 873 + }, + { + "epoch": 0.55936, + "grad_norm": 3.599717617034912, + "learning_rate": 5e-06, + "loss": 1.3816, + "mean_token_accuracy": 0.641513504087925, + "num_tokens": 11382702.0, + "step": 874 + }, + { + "epoch": 0.56, + "grad_norm": 3.80094313621521, + "learning_rate": 5e-06, + "loss": 1.5008, + "mean_token_accuracy": 0.6274667903780937, + "num_tokens": 11396562.0, + "step": 875 + }, + { + "epoch": 0.56064, + "grad_norm": 4.2999067306518555, + "learning_rate": 5e-06, + "loss": 1.2018, + "mean_token_accuracy": 0.6762094050645828, + "num_tokens": 11406774.0, + "step": 876 + }, + { + "epoch": 0.56128, + "grad_norm": 3.715298652648926, + "learning_rate": 5e-06, + "loss": 1.2514, + "mean_token_accuracy": 0.6620960757136345, + "num_tokens": 11418251.0, + "step": 877 + }, + { + "epoch": 0.56192, + "grad_norm": 3.0805916786193848, + "learning_rate": 5e-06, + "loss": 1.1502, + "mean_token_accuracy": 0.684480644762516, + "num_tokens": 11433197.0, + "step": 878 + }, + { + "epoch": 0.56256, + "grad_norm": 3.6326444149017334, + "learning_rate": 5e-06, + "loss": 1.2656, + "mean_token_accuracy": 0.660808764398098, + "num_tokens": 11446639.0, + "step": 879 + }, + { + "epoch": 0.5632, + "grad_norm": 12.266148567199707, + "learning_rate": 5e-06, + "loss": 1.286, + "mean_token_accuracy": 0.6693281307816505, + "num_tokens": 11458609.0, + "step": 880 + }, + { + "epoch": 0.56384, + "grad_norm": 3.6536591053009033, + "learning_rate": 5e-06, + "loss": 1.2049, + "mean_token_accuracy": 0.6694196611642838, + "num_tokens": 11470645.0, + "step": 881 + }, + { + "epoch": 0.56448, + "grad_norm": 3.287473201751709, + "learning_rate": 5e-06, + "loss": 1.3294, + "mean_token_accuracy": 0.6692755967378616, + "num_tokens": 11484303.0, + "step": 882 + }, + { + "epoch": 0.56512, + "grad_norm": 3.7565791606903076, + "learning_rate": 5e-06, + "loss": 1.251, + "mean_token_accuracy": 0.6664244830608368, + "num_tokens": 11496299.0, + "step": 883 + }, + { + "epoch": 0.56576, + "grad_norm": 3.544475793838501, + "learning_rate": 5e-06, + "loss": 1.4526, + "mean_token_accuracy": 0.6100342273712158, + "num_tokens": 11510676.0, + "step": 884 + }, + { + "epoch": 0.5664, + "grad_norm": 3.682511568069458, + "learning_rate": 5e-06, + "loss": 1.4142, + "mean_token_accuracy": 0.6500721573829651, + "num_tokens": 11523371.0, + "step": 885 + }, + { + "epoch": 0.56704, + "grad_norm": 3.6271486282348633, + "learning_rate": 5e-06, + "loss": 1.1237, + "mean_token_accuracy": 0.6834971457719803, + "num_tokens": 11536061.0, + "step": 886 + }, + { + "epoch": 0.56768, + "grad_norm": 3.1198318004608154, + "learning_rate": 5e-06, + "loss": 1.2309, + "mean_token_accuracy": 0.658136211335659, + "num_tokens": 11550795.0, + "step": 887 + }, + { + "epoch": 0.56832, + "grad_norm": 3.9022724628448486, + "learning_rate": 5e-06, + "loss": 1.3044, + "mean_token_accuracy": 0.6979828551411629, + "num_tokens": 11564562.0, + "step": 888 + }, + { + "epoch": 0.56896, + "grad_norm": 3.295694351196289, + "learning_rate": 5e-06, + "loss": 1.4184, + "mean_token_accuracy": 0.60136728733778, + "num_tokens": 11578577.0, + "step": 889 + }, + { + "epoch": 0.5696, + "grad_norm": 3.0561180114746094, + "learning_rate": 5e-06, + "loss": 1.3529, + "mean_token_accuracy": 0.6563450619578362, + "num_tokens": 11594404.0, + "step": 890 + }, + { + "epoch": 0.57024, + "grad_norm": 3.44431471824646, + "learning_rate": 5e-06, + "loss": 1.1605, + "mean_token_accuracy": 0.6642890870571136, + "num_tokens": 11605723.0, + "step": 891 + }, + { + "epoch": 0.57088, + "grad_norm": 4.037685871124268, + "learning_rate": 5e-06, + "loss": 1.2558, + "mean_token_accuracy": 0.6648613065481186, + "num_tokens": 11619031.0, + "step": 892 + }, + { + "epoch": 0.57152, + "grad_norm": 3.2583799362182617, + "learning_rate": 5e-06, + "loss": 1.3105, + "mean_token_accuracy": 0.6500160917639732, + "num_tokens": 11634316.0, + "step": 893 + }, + { + "epoch": 0.57216, + "grad_norm": 3.2072439193725586, + "learning_rate": 5e-06, + "loss": 1.4559, + "mean_token_accuracy": 0.6469361782073975, + "num_tokens": 11650239.0, + "step": 894 + }, + { + "epoch": 0.5728, + "grad_norm": 3.4376208782196045, + "learning_rate": 5e-06, + "loss": 1.3858, + "mean_token_accuracy": 0.6572685986757278, + "num_tokens": 11662751.0, + "step": 895 + }, + { + "epoch": 0.57344, + "grad_norm": 3.647529363632202, + "learning_rate": 5e-06, + "loss": 1.3375, + "mean_token_accuracy": 0.6592177748680115, + "num_tokens": 11675377.0, + "step": 896 + }, + { + "epoch": 0.57408, + "grad_norm": 3.332850217819214, + "learning_rate": 5e-06, + "loss": 1.3343, + "mean_token_accuracy": 0.6491860672831535, + "num_tokens": 11688675.0, + "step": 897 + }, + { + "epoch": 0.57472, + "grad_norm": 4.066124439239502, + "learning_rate": 5e-06, + "loss": 1.3361, + "mean_token_accuracy": 0.6541391238570213, + "num_tokens": 11700393.0, + "step": 898 + }, + { + "epoch": 0.57536, + "grad_norm": 3.341097593307495, + "learning_rate": 5e-06, + "loss": 1.2515, + "mean_token_accuracy": 0.6574959680438042, + "num_tokens": 11714611.0, + "step": 899 + }, + { + "epoch": 0.576, + "grad_norm": 3.0946879386901855, + "learning_rate": 5e-06, + "loss": 1.242, + "mean_token_accuracy": 0.6756256222724915, + "num_tokens": 11731479.0, + "step": 900 + }, + { + "epoch": 0.57664, + "grad_norm": 3.3247451782226562, + "learning_rate": 5e-06, + "loss": 1.4695, + "mean_token_accuracy": 0.6269465908408165, + "num_tokens": 11748872.0, + "step": 901 + }, + { + "epoch": 0.57728, + "grad_norm": 3.942417860031128, + "learning_rate": 5e-06, + "loss": 1.3982, + "mean_token_accuracy": 0.6202432103455067, + "num_tokens": 11760014.0, + "step": 902 + }, + { + "epoch": 0.57792, + "grad_norm": 3.633100986480713, + "learning_rate": 5e-06, + "loss": 1.4247, + "mean_token_accuracy": 0.6394501402974129, + "num_tokens": 11773867.0, + "step": 903 + }, + { + "epoch": 0.57856, + "grad_norm": 3.383073568344116, + "learning_rate": 5e-06, + "loss": 1.1386, + "mean_token_accuracy": 0.697243720293045, + "num_tokens": 11787283.0, + "step": 904 + }, + { + "epoch": 0.5792, + "grad_norm": 3.678783416748047, + "learning_rate": 5e-06, + "loss": 1.3926, + "mean_token_accuracy": 0.6641874313354492, + "num_tokens": 11799263.0, + "step": 905 + }, + { + "epoch": 0.57984, + "grad_norm": 3.2661468982696533, + "learning_rate": 5e-06, + "loss": 1.5136, + "mean_token_accuracy": 0.6076219081878662, + "num_tokens": 11815606.0, + "step": 906 + }, + { + "epoch": 0.58048, + "grad_norm": 3.52829909324646, + "learning_rate": 5e-06, + "loss": 1.2213, + "mean_token_accuracy": 0.6809684634208679, + "num_tokens": 11829589.0, + "step": 907 + }, + { + "epoch": 0.58112, + "grad_norm": 3.6113576889038086, + "learning_rate": 5e-06, + "loss": 1.3111, + "mean_token_accuracy": 0.6847885251045227, + "num_tokens": 11842023.0, + "step": 908 + }, + { + "epoch": 0.58176, + "grad_norm": 4.104685306549072, + "learning_rate": 5e-06, + "loss": 1.4434, + "mean_token_accuracy": 0.6598182618618011, + "num_tokens": 11852988.0, + "step": 909 + }, + { + "epoch": 0.5824, + "grad_norm": 3.4313085079193115, + "learning_rate": 5e-06, + "loss": 1.4167, + "mean_token_accuracy": 0.6736102141439915, + "num_tokens": 11866706.0, + "step": 910 + }, + { + "epoch": 0.58304, + "grad_norm": 3.2502808570861816, + "learning_rate": 5e-06, + "loss": 1.3682, + "mean_token_accuracy": 0.6428176760673523, + "num_tokens": 11882889.0, + "step": 911 + }, + { + "epoch": 0.58368, + "grad_norm": 3.662310838699341, + "learning_rate": 5e-06, + "loss": 1.0834, + "mean_token_accuracy": 0.715592160820961, + "num_tokens": 11895792.0, + "step": 912 + }, + { + "epoch": 0.58432, + "grad_norm": 3.0405428409576416, + "learning_rate": 5e-06, + "loss": 1.2439, + "mean_token_accuracy": 0.6653807386755943, + "num_tokens": 11911225.0, + "step": 913 + }, + { + "epoch": 0.58496, + "grad_norm": 3.550328016281128, + "learning_rate": 5e-06, + "loss": 1.3896, + "mean_token_accuracy": 0.6358233094215393, + "num_tokens": 11924073.0, + "step": 914 + }, + { + "epoch": 0.5856, + "grad_norm": 3.2749056816101074, + "learning_rate": 5e-06, + "loss": 1.3361, + "mean_token_accuracy": 0.6581274121999741, + "num_tokens": 11938332.0, + "step": 915 + }, + { + "epoch": 0.58624, + "grad_norm": 3.873444080352783, + "learning_rate": 5e-06, + "loss": 1.3269, + "mean_token_accuracy": 0.6423259451985359, + "num_tokens": 11950429.0, + "step": 916 + }, + { + "epoch": 0.58688, + "grad_norm": 3.691632032394409, + "learning_rate": 5e-06, + "loss": 1.3576, + "mean_token_accuracy": 0.6709922403097153, + "num_tokens": 11962979.0, + "step": 917 + }, + { + "epoch": 0.58752, + "grad_norm": 3.1465516090393066, + "learning_rate": 5e-06, + "loss": 1.3155, + "mean_token_accuracy": 0.6728775128722191, + "num_tokens": 11978492.0, + "step": 918 + }, + { + "epoch": 0.58816, + "grad_norm": 3.738511562347412, + "learning_rate": 5e-06, + "loss": 1.1487, + "mean_token_accuracy": 0.706301674246788, + "num_tokens": 11991139.0, + "step": 919 + }, + { + "epoch": 0.5888, + "grad_norm": 3.288872241973877, + "learning_rate": 5e-06, + "loss": 1.3725, + "mean_token_accuracy": 0.6539236456155777, + "num_tokens": 12005815.0, + "step": 920 + }, + { + "epoch": 0.58944, + "grad_norm": 3.644181966781616, + "learning_rate": 5e-06, + "loss": 1.2652, + "mean_token_accuracy": 0.6836251989006996, + "num_tokens": 12017897.0, + "step": 921 + }, + { + "epoch": 0.59008, + "grad_norm": 3.8078083992004395, + "learning_rate": 5e-06, + "loss": 1.3801, + "mean_token_accuracy": 0.6469420120120049, + "num_tokens": 12030359.0, + "step": 922 + }, + { + "epoch": 0.59072, + "grad_norm": 3.2687323093414307, + "learning_rate": 5e-06, + "loss": 1.4049, + "mean_token_accuracy": 0.6341100111603737, + "num_tokens": 12044729.0, + "step": 923 + }, + { + "epoch": 0.59136, + "grad_norm": 3.4478020668029785, + "learning_rate": 5e-06, + "loss": 1.4287, + "mean_token_accuracy": 0.6452651098370552, + "num_tokens": 12058788.0, + "step": 924 + }, + { + "epoch": 0.592, + "grad_norm": 4.092494010925293, + "learning_rate": 5e-06, + "loss": 1.1304, + "mean_token_accuracy": 0.6950362101197243, + "num_tokens": 12069502.0, + "step": 925 + }, + { + "epoch": 0.59264, + "grad_norm": 4.566901683807373, + "learning_rate": 5e-06, + "loss": 1.261, + "mean_token_accuracy": 0.6621346473693848, + "num_tokens": 12080932.0, + "step": 926 + }, + { + "epoch": 0.59328, + "grad_norm": 3.4059062004089355, + "learning_rate": 5e-06, + "loss": 1.291, + "mean_token_accuracy": 0.6705317497253418, + "num_tokens": 12094725.0, + "step": 927 + }, + { + "epoch": 0.59392, + "grad_norm": 4.018156051635742, + "learning_rate": 5e-06, + "loss": 1.4457, + "mean_token_accuracy": 0.6595090329647064, + "num_tokens": 12107082.0, + "step": 928 + }, + { + "epoch": 0.59456, + "grad_norm": 3.448580741882324, + "learning_rate": 5e-06, + "loss": 1.2716, + "mean_token_accuracy": 0.6856422200798988, + "num_tokens": 12121239.0, + "step": 929 + }, + { + "epoch": 0.5952, + "grad_norm": 3.425841808319092, + "learning_rate": 5e-06, + "loss": 1.3174, + "mean_token_accuracy": 0.6534383073449135, + "num_tokens": 12134626.0, + "step": 930 + }, + { + "epoch": 0.59584, + "grad_norm": 4.416814804077148, + "learning_rate": 5e-06, + "loss": 1.2661, + "mean_token_accuracy": 0.6484142020344734, + "num_tokens": 12145951.0, + "step": 931 + }, + { + "epoch": 0.59648, + "grad_norm": 3.968085765838623, + "learning_rate": 5e-06, + "loss": 1.4512, + "mean_token_accuracy": 0.6346799582242966, + "num_tokens": 12157958.0, + "step": 932 + }, + { + "epoch": 0.59712, + "grad_norm": 3.6708478927612305, + "learning_rate": 5e-06, + "loss": 1.2783, + "mean_token_accuracy": 0.6855080351233482, + "num_tokens": 12170548.0, + "step": 933 + }, + { + "epoch": 0.59776, + "grad_norm": 3.8740973472595215, + "learning_rate": 5e-06, + "loss": 1.1046, + "mean_token_accuracy": 0.704432986676693, + "num_tokens": 12185292.0, + "step": 934 + }, + { + "epoch": 0.5984, + "grad_norm": 3.4846086502075195, + "learning_rate": 5e-06, + "loss": 1.3035, + "mean_token_accuracy": 0.6759809032082558, + "num_tokens": 12198081.0, + "step": 935 + }, + { + "epoch": 0.59904, + "grad_norm": 3.027975082397461, + "learning_rate": 5e-06, + "loss": 1.2897, + "mean_token_accuracy": 0.6573361679911613, + "num_tokens": 12214742.0, + "step": 936 + }, + { + "epoch": 0.59968, + "grad_norm": 3.879801034927368, + "learning_rate": 5e-06, + "loss": 1.3042, + "mean_token_accuracy": 0.641165092587471, + "num_tokens": 12225671.0, + "step": 937 + }, + { + "epoch": 0.60032, + "grad_norm": 3.933652877807617, + "learning_rate": 5e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.7146632373332977, + "num_tokens": 12239781.0, + "step": 938 + }, + { + "epoch": 0.60096, + "grad_norm": 4.3125786781311035, + "learning_rate": 5e-06, + "loss": 1.2013, + "mean_token_accuracy": 0.6595223546028137, + "num_tokens": 12250660.0, + "step": 939 + }, + { + "epoch": 0.6016, + "grad_norm": 3.671967029571533, + "learning_rate": 5e-06, + "loss": 1.4127, + "mean_token_accuracy": 0.6495495587587357, + "num_tokens": 12261626.0, + "step": 940 + }, + { + "epoch": 0.60224, + "grad_norm": 3.524958610534668, + "learning_rate": 5e-06, + "loss": 1.5488, + "mean_token_accuracy": 0.6084389686584473, + "num_tokens": 12275245.0, + "step": 941 + }, + { + "epoch": 0.60288, + "grad_norm": 3.6148650646209717, + "learning_rate": 5e-06, + "loss": 1.3679, + "mean_token_accuracy": 0.6441814675927162, + "num_tokens": 12289591.0, + "step": 942 + }, + { + "epoch": 0.60352, + "grad_norm": 3.531022071838379, + "learning_rate": 5e-06, + "loss": 1.4175, + "mean_token_accuracy": 0.6326302289962769, + "num_tokens": 12303061.0, + "step": 943 + }, + { + "epoch": 0.60416, + "grad_norm": 3.5599935054779053, + "learning_rate": 5e-06, + "loss": 1.4981, + "mean_token_accuracy": 0.6270119249820709, + "num_tokens": 12317689.0, + "step": 944 + }, + { + "epoch": 0.6048, + "grad_norm": 3.125378370285034, + "learning_rate": 5e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.7116389200091362, + "num_tokens": 12332378.0, + "step": 945 + }, + { + "epoch": 0.60544, + "grad_norm": 3.8127193450927734, + "learning_rate": 5e-06, + "loss": 1.3282, + "mean_token_accuracy": 0.6464278548955917, + "num_tokens": 12345052.0, + "step": 946 + }, + { + "epoch": 0.60608, + "grad_norm": 3.636815309524536, + "learning_rate": 5e-06, + "loss": 1.4728, + "mean_token_accuracy": 0.6262509748339653, + "num_tokens": 12356621.0, + "step": 947 + }, + { + "epoch": 0.60672, + "grad_norm": 3.3789074420928955, + "learning_rate": 5e-06, + "loss": 1.2371, + "mean_token_accuracy": 0.6846612468361855, + "num_tokens": 12370824.0, + "step": 948 + }, + { + "epoch": 0.60736, + "grad_norm": 3.5147576332092285, + "learning_rate": 5e-06, + "loss": 1.2429, + "mean_token_accuracy": 0.6510372906923294, + "num_tokens": 12382166.0, + "step": 949 + }, + { + "epoch": 0.608, + "grad_norm": 4.723844528198242, + "learning_rate": 5e-06, + "loss": 1.3264, + "mean_token_accuracy": 0.6632048487663269, + "num_tokens": 12394366.0, + "step": 950 + }, + { + "epoch": 0.60864, + "grad_norm": 3.679612398147583, + "learning_rate": 5e-06, + "loss": 1.3556, + "mean_token_accuracy": 0.6747320145368576, + "num_tokens": 12408326.0, + "step": 951 + }, + { + "epoch": 0.60928, + "grad_norm": 3.3034772872924805, + "learning_rate": 5e-06, + "loss": 1.5539, + "mean_token_accuracy": 0.6177156269550323, + "num_tokens": 12422379.0, + "step": 952 + }, + { + "epoch": 0.60992, + "grad_norm": 7.560748100280762, + "learning_rate": 5e-06, + "loss": 1.3543, + "mean_token_accuracy": 0.6603868454694748, + "num_tokens": 12434809.0, + "step": 953 + }, + { + "epoch": 0.61056, + "grad_norm": 4.265347003936768, + "learning_rate": 5e-06, + "loss": 1.4282, + "mean_token_accuracy": 0.6390318870544434, + "num_tokens": 12447614.0, + "step": 954 + }, + { + "epoch": 0.6112, + "grad_norm": 3.8850181102752686, + "learning_rate": 5e-06, + "loss": 1.2699, + "mean_token_accuracy": 0.6917356178164482, + "num_tokens": 12459891.0, + "step": 955 + }, + { + "epoch": 0.61184, + "grad_norm": 3.479156255722046, + "learning_rate": 5e-06, + "loss": 1.1499, + "mean_token_accuracy": 0.6750801056623459, + "num_tokens": 12474836.0, + "step": 956 + }, + { + "epoch": 0.61248, + "grad_norm": 2.7899651527404785, + "learning_rate": 5e-06, + "loss": 1.2879, + "mean_token_accuracy": 0.6665042042732239, + "num_tokens": 12493558.0, + "step": 957 + }, + { + "epoch": 0.61312, + "grad_norm": 3.6457180976867676, + "learning_rate": 5e-06, + "loss": 1.2007, + "mean_token_accuracy": 0.6936507746577263, + "num_tokens": 12506849.0, + "step": 958 + }, + { + "epoch": 0.61376, + "grad_norm": 3.0956859588623047, + "learning_rate": 5e-06, + "loss": 1.4628, + "mean_token_accuracy": 0.6274904161691666, + "num_tokens": 12522171.0, + "step": 959 + }, + { + "epoch": 0.6144, + "grad_norm": 3.615293264389038, + "learning_rate": 5e-06, + "loss": 1.2786, + "mean_token_accuracy": 0.679816409945488, + "num_tokens": 12537702.0, + "step": 960 + }, + { + "epoch": 0.61504, + "grad_norm": 3.4518120288848877, + "learning_rate": 5e-06, + "loss": 1.3591, + "mean_token_accuracy": 0.6446092203259468, + "num_tokens": 12550526.0, + "step": 961 + }, + { + "epoch": 0.61568, + "grad_norm": 3.4621338844299316, + "learning_rate": 5e-06, + "loss": 1.3075, + "mean_token_accuracy": 0.665081262588501, + "num_tokens": 12564227.0, + "step": 962 + }, + { + "epoch": 0.61632, + "grad_norm": 3.3471479415893555, + "learning_rate": 5e-06, + "loss": 1.4756, + "mean_token_accuracy": 0.6200397908687592, + "num_tokens": 12578106.0, + "step": 963 + }, + { + "epoch": 0.61696, + "grad_norm": 3.874799966812134, + "learning_rate": 5e-06, + "loss": 1.2777, + "mean_token_accuracy": 0.6613158509135246, + "num_tokens": 12589750.0, + "step": 964 + }, + { + "epoch": 0.6176, + "grad_norm": 4.006873607635498, + "learning_rate": 5e-06, + "loss": 1.408, + "mean_token_accuracy": 0.6419458091259003, + "num_tokens": 12602450.0, + "step": 965 + }, + { + "epoch": 0.61824, + "grad_norm": 3.674241542816162, + "learning_rate": 5e-06, + "loss": 1.2756, + "mean_token_accuracy": 0.6584514081478119, + "num_tokens": 12613871.0, + "step": 966 + }, + { + "epoch": 0.61888, + "grad_norm": 3.7405648231506348, + "learning_rate": 5e-06, + "loss": 1.3301, + "mean_token_accuracy": 0.6810361295938492, + "num_tokens": 12626220.0, + "step": 967 + }, + { + "epoch": 0.61952, + "grad_norm": 3.660600185394287, + "learning_rate": 5e-06, + "loss": 1.2219, + "mean_token_accuracy": 0.6716256737709045, + "num_tokens": 12636440.0, + "step": 968 + }, + { + "epoch": 0.62016, + "grad_norm": 3.4270999431610107, + "learning_rate": 5e-06, + "loss": 1.0976, + "mean_token_accuracy": 0.7019147500395775, + "num_tokens": 12649545.0, + "step": 969 + }, + { + "epoch": 0.6208, + "grad_norm": 3.562014102935791, + "learning_rate": 5e-06, + "loss": 1.2243, + "mean_token_accuracy": 0.6942142397165298, + "num_tokens": 12660841.0, + "step": 970 + }, + { + "epoch": 0.62144, + "grad_norm": 4.004054069519043, + "learning_rate": 5e-06, + "loss": 1.0961, + "mean_token_accuracy": 0.6962654888629913, + "num_tokens": 12675967.0, + "step": 971 + }, + { + "epoch": 0.62208, + "grad_norm": 3.749152898788452, + "learning_rate": 5e-06, + "loss": 1.489, + "mean_token_accuracy": 0.6146213822066784, + "num_tokens": 12687203.0, + "step": 972 + }, + { + "epoch": 0.62272, + "grad_norm": 3.2638871669769287, + "learning_rate": 5e-06, + "loss": 1.1979, + "mean_token_accuracy": 0.687263160943985, + "num_tokens": 12700697.0, + "step": 973 + }, + { + "epoch": 0.62336, + "grad_norm": 3.310070037841797, + "learning_rate": 5e-06, + "loss": 1.4779, + "mean_token_accuracy": 0.632742814719677, + "num_tokens": 12716668.0, + "step": 974 + }, + { + "epoch": 0.624, + "grad_norm": 3.3164589405059814, + "learning_rate": 5e-06, + "loss": 1.301, + "mean_token_accuracy": 0.6563373729586601, + "num_tokens": 12729912.0, + "step": 975 + }, + { + "epoch": 0.62464, + "grad_norm": 3.2415506839752197, + "learning_rate": 5e-06, + "loss": 1.393, + "mean_token_accuracy": 0.644082136452198, + "num_tokens": 12745520.0, + "step": 976 + }, + { + "epoch": 0.62528, + "grad_norm": 3.333308458328247, + "learning_rate": 5e-06, + "loss": 1.1238, + "mean_token_accuracy": 0.6863394901156425, + "num_tokens": 12759203.0, + "step": 977 + }, + { + "epoch": 0.62592, + "grad_norm": 4.198854923248291, + "learning_rate": 5e-06, + "loss": 1.3601, + "mean_token_accuracy": 0.6624843999743462, + "num_tokens": 12770322.0, + "step": 978 + }, + { + "epoch": 0.62656, + "grad_norm": 3.849907636642456, + "learning_rate": 5e-06, + "loss": 1.2947, + "mean_token_accuracy": 0.6675618216395378, + "num_tokens": 12782951.0, + "step": 979 + }, + { + "epoch": 0.6272, + "grad_norm": 3.4649503231048584, + "learning_rate": 5e-06, + "loss": 1.1915, + "mean_token_accuracy": 0.6806611344218254, + "num_tokens": 12795383.0, + "step": 980 + }, + { + "epoch": 0.62784, + "grad_norm": 3.63466739654541, + "learning_rate": 5e-06, + "loss": 1.3124, + "mean_token_accuracy": 0.6731822267174721, + "num_tokens": 12808692.0, + "step": 981 + }, + { + "epoch": 0.62848, + "grad_norm": 4.293845176696777, + "learning_rate": 5e-06, + "loss": 1.1757, + "mean_token_accuracy": 0.6780604794621468, + "num_tokens": 12821099.0, + "step": 982 + }, + { + "epoch": 0.62912, + "grad_norm": 3.565584897994995, + "learning_rate": 5e-06, + "loss": 1.1746, + "mean_token_accuracy": 0.6787943094968796, + "num_tokens": 12832201.0, + "step": 983 + }, + { + "epoch": 0.62976, + "grad_norm": 3.517613410949707, + "learning_rate": 5e-06, + "loss": 1.2167, + "mean_token_accuracy": 0.6914558485150337, + "num_tokens": 12845465.0, + "step": 984 + }, + { + "epoch": 0.6304, + "grad_norm": 3.6170578002929688, + "learning_rate": 5e-06, + "loss": 1.3226, + "mean_token_accuracy": 0.6587705016136169, + "num_tokens": 12857366.0, + "step": 985 + }, + { + "epoch": 0.63104, + "grad_norm": 3.504154682159424, + "learning_rate": 5e-06, + "loss": 1.4641, + "mean_token_accuracy": 0.6085046976804733, + "num_tokens": 12871695.0, + "step": 986 + }, + { + "epoch": 0.63168, + "grad_norm": 3.543142557144165, + "learning_rate": 5e-06, + "loss": 1.1252, + "mean_token_accuracy": 0.7007554769515991, + "num_tokens": 12884113.0, + "step": 987 + }, + { + "epoch": 0.63232, + "grad_norm": 3.9888851642608643, + "learning_rate": 5e-06, + "loss": 1.2741, + "mean_token_accuracy": 0.656329832971096, + "num_tokens": 12898706.0, + "step": 988 + }, + { + "epoch": 0.63296, + "grad_norm": 3.472778081893921, + "learning_rate": 5e-06, + "loss": 1.2431, + "mean_token_accuracy": 0.6751798540353775, + "num_tokens": 12911380.0, + "step": 989 + }, + { + "epoch": 0.6336, + "grad_norm": 3.3277764320373535, + "learning_rate": 5e-06, + "loss": 1.475, + "mean_token_accuracy": 0.632897637784481, + "num_tokens": 12925697.0, + "step": 990 + }, + { + "epoch": 0.63424, + "grad_norm": 3.047473669052124, + "learning_rate": 5e-06, + "loss": 1.2663, + "mean_token_accuracy": 0.6683759167790413, + "num_tokens": 12939473.0, + "step": 991 + }, + { + "epoch": 0.63488, + "grad_norm": 3.483201503753662, + "learning_rate": 5e-06, + "loss": 1.3407, + "mean_token_accuracy": 0.6652352660894394, + "num_tokens": 12952439.0, + "step": 992 + }, + { + "epoch": 0.63552, + "grad_norm": 4.43934965133667, + "learning_rate": 5e-06, + "loss": 1.0979, + "mean_token_accuracy": 0.6819510236382484, + "num_tokens": 12963232.0, + "step": 993 + }, + { + "epoch": 0.63616, + "grad_norm": 3.2107748985290527, + "learning_rate": 5e-06, + "loss": 1.2219, + "mean_token_accuracy": 0.6648012548685074, + "num_tokens": 12976396.0, + "step": 994 + }, + { + "epoch": 0.6368, + "grad_norm": 3.8679394721984863, + "learning_rate": 5e-06, + "loss": 1.3487, + "mean_token_accuracy": 0.6491437703371048, + "num_tokens": 12989586.0, + "step": 995 + }, + { + "epoch": 0.63744, + "grad_norm": 3.75811767578125, + "learning_rate": 5e-06, + "loss": 1.2384, + "mean_token_accuracy": 0.6684290617704391, + "num_tokens": 13002145.0, + "step": 996 + }, + { + "epoch": 0.63808, + "grad_norm": 4.223326206207275, + "learning_rate": 5e-06, + "loss": 1.3218, + "mean_token_accuracy": 0.6605047658085823, + "num_tokens": 13011853.0, + "step": 997 + }, + { + "epoch": 0.63872, + "grad_norm": 4.10746955871582, + "learning_rate": 5e-06, + "loss": 1.2647, + "mean_token_accuracy": 0.6529572680592537, + "num_tokens": 13022296.0, + "step": 998 + }, + { + "epoch": 0.63936, + "grad_norm": 3.858157157897949, + "learning_rate": 5e-06, + "loss": 1.3031, + "mean_token_accuracy": 0.6564661860466003, + "num_tokens": 13032768.0, + "step": 999 + }, + { + "epoch": 0.64, + "grad_norm": 3.4283535480499268, + "learning_rate": 5e-06, + "loss": 1.3122, + "mean_token_accuracy": 0.6764922738075256, + "num_tokens": 13045249.0, + "step": 1000 + }, + { + "epoch": 0.64064, + "grad_norm": 3.5663790702819824, + "learning_rate": 5e-06, + "loss": 1.3038, + "mean_token_accuracy": 0.6534328386187553, + "num_tokens": 13057087.0, + "step": 1001 + }, + { + "epoch": 0.64128, + "grad_norm": 4.08723783493042, + "learning_rate": 5e-06, + "loss": 1.311, + "mean_token_accuracy": 0.6497639790177345, + "num_tokens": 13069353.0, + "step": 1002 + }, + { + "epoch": 0.64192, + "grad_norm": 3.1709539890289307, + "learning_rate": 5e-06, + "loss": 1.4385, + "mean_token_accuracy": 0.6491308063268661, + "num_tokens": 13084511.0, + "step": 1003 + }, + { + "epoch": 0.64256, + "grad_norm": 3.7724292278289795, + "learning_rate": 5e-06, + "loss": 1.2049, + "mean_token_accuracy": 0.6716037020087242, + "num_tokens": 13095572.0, + "step": 1004 + }, + { + "epoch": 0.6432, + "grad_norm": 3.4885339736938477, + "learning_rate": 5e-06, + "loss": 1.1454, + "mean_token_accuracy": 0.7053978741168976, + "num_tokens": 13108817.0, + "step": 1005 + }, + { + "epoch": 0.64384, + "grad_norm": 3.718435287475586, + "learning_rate": 5e-06, + "loss": 1.2653, + "mean_token_accuracy": 0.6529537960886955, + "num_tokens": 13119845.0, + "step": 1006 + }, + { + "epoch": 0.64448, + "grad_norm": 3.7939629554748535, + "learning_rate": 5e-06, + "loss": 1.2563, + "mean_token_accuracy": 0.6783741563558578, + "num_tokens": 13131590.0, + "step": 1007 + }, + { + "epoch": 0.64512, + "grad_norm": 3.0090038776397705, + "learning_rate": 5e-06, + "loss": 1.3032, + "mean_token_accuracy": 0.6621121242642403, + "num_tokens": 13146614.0, + "step": 1008 + }, + { + "epoch": 0.64576, + "grad_norm": 3.3267111778259277, + "learning_rate": 5e-06, + "loss": 1.4166, + "mean_token_accuracy": 0.6354531794786453, + "num_tokens": 13160682.0, + "step": 1009 + }, + { + "epoch": 0.6464, + "grad_norm": 3.528743267059326, + "learning_rate": 5e-06, + "loss": 1.246, + "mean_token_accuracy": 0.6703604385256767, + "num_tokens": 13172998.0, + "step": 1010 + }, + { + "epoch": 0.64704, + "grad_norm": 3.2315750122070312, + "learning_rate": 5e-06, + "loss": 1.2159, + "mean_token_accuracy": 0.6840131431818008, + "num_tokens": 13187182.0, + "step": 1011 + }, + { + "epoch": 0.64768, + "grad_norm": 3.885690689086914, + "learning_rate": 5e-06, + "loss": 1.3193, + "mean_token_accuracy": 0.6602044403553009, + "num_tokens": 13198449.0, + "step": 1012 + }, + { + "epoch": 0.64832, + "grad_norm": 4.214417934417725, + "learning_rate": 5e-06, + "loss": 1.3289, + "mean_token_accuracy": 0.638420894742012, + "num_tokens": 13208888.0, + "step": 1013 + }, + { + "epoch": 0.64896, + "grad_norm": 3.303224563598633, + "learning_rate": 5e-06, + "loss": 1.3866, + "mean_token_accuracy": 0.6286184787750244, + "num_tokens": 13222042.0, + "step": 1014 + }, + { + "epoch": 0.6496, + "grad_norm": 3.879709482192993, + "learning_rate": 5e-06, + "loss": 1.4231, + "mean_token_accuracy": 0.6209117695689201, + "num_tokens": 13234662.0, + "step": 1015 + }, + { + "epoch": 0.65024, + "grad_norm": 3.770817995071411, + "learning_rate": 5e-06, + "loss": 1.3315, + "mean_token_accuracy": 0.6555488482117653, + "num_tokens": 13245357.0, + "step": 1016 + }, + { + "epoch": 0.65088, + "grad_norm": 3.627957582473755, + "learning_rate": 5e-06, + "loss": 1.2457, + "mean_token_accuracy": 0.6971615925431252, + "num_tokens": 13258154.0, + "step": 1017 + }, + { + "epoch": 0.65152, + "grad_norm": 3.818009853363037, + "learning_rate": 5e-06, + "loss": 1.2174, + "mean_token_accuracy": 0.6713634058833122, + "num_tokens": 13270001.0, + "step": 1018 + }, + { + "epoch": 0.65216, + "grad_norm": 3.7726924419403076, + "learning_rate": 5e-06, + "loss": 1.2554, + "mean_token_accuracy": 0.6766888722777367, + "num_tokens": 13281334.0, + "step": 1019 + }, + { + "epoch": 0.6528, + "grad_norm": 3.608661413192749, + "learning_rate": 5e-06, + "loss": 1.4083, + "mean_token_accuracy": 0.6521065756678581, + "num_tokens": 13294860.0, + "step": 1020 + }, + { + "epoch": 0.65344, + "grad_norm": 3.7841391563415527, + "learning_rate": 5e-06, + "loss": 1.4197, + "mean_token_accuracy": 0.6439206749200821, + "num_tokens": 13308758.0, + "step": 1021 + }, + { + "epoch": 0.65408, + "grad_norm": 3.836831569671631, + "learning_rate": 5e-06, + "loss": 1.3922, + "mean_token_accuracy": 0.6572518870234489, + "num_tokens": 13319883.0, + "step": 1022 + }, + { + "epoch": 0.65472, + "grad_norm": 3.774944305419922, + "learning_rate": 5e-06, + "loss": 1.1792, + "mean_token_accuracy": 0.6656563580036163, + "num_tokens": 13331846.0, + "step": 1023 + }, + { + "epoch": 0.65536, + "grad_norm": 4.0701751708984375, + "learning_rate": 5e-06, + "loss": 1.2333, + "mean_token_accuracy": 0.6843068152666092, + "num_tokens": 13343010.0, + "step": 1024 + }, + { + "epoch": 0.656, + "grad_norm": 3.7170510292053223, + "learning_rate": 5e-06, + "loss": 1.1427, + "mean_token_accuracy": 0.6941706016659737, + "num_tokens": 13356096.0, + "step": 1025 + }, + { + "epoch": 0.65664, + "grad_norm": 3.4047844409942627, + "learning_rate": 5e-06, + "loss": 1.2994, + "mean_token_accuracy": 0.6502626538276672, + "num_tokens": 13370415.0, + "step": 1026 + }, + { + "epoch": 0.65728, + "grad_norm": 3.013894557952881, + "learning_rate": 5e-06, + "loss": 1.2778, + "mean_token_accuracy": 0.6713566333055496, + "num_tokens": 13385621.0, + "step": 1027 + }, + { + "epoch": 0.65792, + "grad_norm": 3.8273723125457764, + "learning_rate": 5e-06, + "loss": 1.4213, + "mean_token_accuracy": 0.6448807269334793, + "num_tokens": 13399913.0, + "step": 1028 + }, + { + "epoch": 0.65856, + "grad_norm": 4.501821041107178, + "learning_rate": 5e-06, + "loss": 1.3451, + "mean_token_accuracy": 0.6397663801908493, + "num_tokens": 13410323.0, + "step": 1029 + }, + { + "epoch": 0.6592, + "grad_norm": 3.656630516052246, + "learning_rate": 5e-06, + "loss": 1.3693, + "mean_token_accuracy": 0.6572180986404419, + "num_tokens": 13421431.0, + "step": 1030 + }, + { + "epoch": 0.65984, + "grad_norm": 3.761538505554199, + "learning_rate": 5e-06, + "loss": 1.3701, + "mean_token_accuracy": 0.6800885275006294, + "num_tokens": 13433250.0, + "step": 1031 + }, + { + "epoch": 0.66048, + "grad_norm": 3.5799546241760254, + "learning_rate": 5e-06, + "loss": 1.3473, + "mean_token_accuracy": 0.6503010243177414, + "num_tokens": 13446010.0, + "step": 1032 + }, + { + "epoch": 0.66112, + "grad_norm": 3.578547239303589, + "learning_rate": 5e-06, + "loss": 1.2915, + "mean_token_accuracy": 0.7072044536471367, + "num_tokens": 13458978.0, + "step": 1033 + }, + { + "epoch": 0.66176, + "grad_norm": 3.554094076156616, + "learning_rate": 5e-06, + "loss": 1.1486, + "mean_token_accuracy": 0.6704866662621498, + "num_tokens": 13471248.0, + "step": 1034 + }, + { + "epoch": 0.6624, + "grad_norm": 3.5921144485473633, + "learning_rate": 5e-06, + "loss": 1.3039, + "mean_token_accuracy": 0.6752297282218933, + "num_tokens": 13483651.0, + "step": 1035 + }, + { + "epoch": 0.66304, + "grad_norm": 3.580885648727417, + "learning_rate": 5e-06, + "loss": 1.3638, + "mean_token_accuracy": 0.6531356200575829, + "num_tokens": 13496365.0, + "step": 1036 + }, + { + "epoch": 0.66368, + "grad_norm": 3.6400530338287354, + "learning_rate": 5e-06, + "loss": 1.2824, + "mean_token_accuracy": 0.6671290174126625, + "num_tokens": 13509196.0, + "step": 1037 + }, + { + "epoch": 0.66432, + "grad_norm": 3.050649404525757, + "learning_rate": 5e-06, + "loss": 1.2883, + "mean_token_accuracy": 0.6613614112138748, + "num_tokens": 13524592.0, + "step": 1038 + }, + { + "epoch": 0.66496, + "grad_norm": 3.1810715198516846, + "learning_rate": 5e-06, + "loss": 1.4794, + "mean_token_accuracy": 0.633582279086113, + "num_tokens": 13539709.0, + "step": 1039 + }, + { + "epoch": 0.6656, + "grad_norm": 3.488229751586914, + "learning_rate": 5e-06, + "loss": 1.1359, + "mean_token_accuracy": 0.6704655885696411, + "num_tokens": 13551656.0, + "step": 1040 + }, + { + "epoch": 0.66624, + "grad_norm": 3.1657679080963135, + "learning_rate": 5e-06, + "loss": 1.405, + "mean_token_accuracy": 0.6457558646798134, + "num_tokens": 13566681.0, + "step": 1041 + }, + { + "epoch": 0.66688, + "grad_norm": 3.7111074924468994, + "learning_rate": 5e-06, + "loss": 1.1577, + "mean_token_accuracy": 0.6851859763264656, + "num_tokens": 13578777.0, + "step": 1042 + }, + { + "epoch": 0.66752, + "grad_norm": 3.803246021270752, + "learning_rate": 5e-06, + "loss": 1.7394, + "mean_token_accuracy": 0.5742352418601513, + "num_tokens": 13591184.0, + "step": 1043 + }, + { + "epoch": 0.66816, + "grad_norm": 3.44681453704834, + "learning_rate": 5e-06, + "loss": 1.249, + "mean_token_accuracy": 0.6815991401672363, + "num_tokens": 13603011.0, + "step": 1044 + }, + { + "epoch": 0.6688, + "grad_norm": 3.4363629817962646, + "learning_rate": 5e-06, + "loss": 1.207, + "mean_token_accuracy": 0.6451572626829147, + "num_tokens": 13617577.0, + "step": 1045 + }, + { + "epoch": 0.66944, + "grad_norm": 3.9714715480804443, + "learning_rate": 5e-06, + "loss": 1.316, + "mean_token_accuracy": 0.6733849868178368, + "num_tokens": 13628510.0, + "step": 1046 + }, + { + "epoch": 0.67008, + "grad_norm": 3.5095605850219727, + "learning_rate": 5e-06, + "loss": 1.2265, + "mean_token_accuracy": 0.6774598509073257, + "num_tokens": 13639965.0, + "step": 1047 + }, + { + "epoch": 0.67072, + "grad_norm": 3.4731342792510986, + "learning_rate": 5e-06, + "loss": 1.514, + "mean_token_accuracy": 0.6291368082165718, + "num_tokens": 13652881.0, + "step": 1048 + }, + { + "epoch": 0.67136, + "grad_norm": 3.4731788635253906, + "learning_rate": 5e-06, + "loss": 1.2134, + "mean_token_accuracy": 0.7064939141273499, + "num_tokens": 13665986.0, + "step": 1049 + }, + { + "epoch": 0.672, + "grad_norm": 3.885256052017212, + "learning_rate": 5e-06, + "loss": 1.2582, + "mean_token_accuracy": 0.6920784562826157, + "num_tokens": 13677555.0, + "step": 1050 + }, + { + "epoch": 0.67264, + "grad_norm": 3.5971357822418213, + "learning_rate": 5e-06, + "loss": 1.2803, + "mean_token_accuracy": 0.6860932558774948, + "num_tokens": 13689439.0, + "step": 1051 + }, + { + "epoch": 0.67328, + "grad_norm": 3.4999284744262695, + "learning_rate": 5e-06, + "loss": 1.2694, + "mean_token_accuracy": 0.6710788905620575, + "num_tokens": 13703306.0, + "step": 1052 + }, + { + "epoch": 0.67392, + "grad_norm": 3.894716262817383, + "learning_rate": 5e-06, + "loss": 1.1895, + "mean_token_accuracy": 0.694163866341114, + "num_tokens": 13715107.0, + "step": 1053 + }, + { + "epoch": 0.67456, + "grad_norm": 3.8361921310424805, + "learning_rate": 5e-06, + "loss": 1.3142, + "mean_token_accuracy": 0.6533372104167938, + "num_tokens": 13726324.0, + "step": 1054 + }, + { + "epoch": 0.6752, + "grad_norm": 3.5220136642456055, + "learning_rate": 5e-06, + "loss": 1.5138, + "mean_token_accuracy": 0.6085778698325157, + "num_tokens": 13739132.0, + "step": 1055 + }, + { + "epoch": 0.67584, + "grad_norm": 3.4445347785949707, + "learning_rate": 5e-06, + "loss": 1.3869, + "mean_token_accuracy": 0.6502392590045929, + "num_tokens": 13751996.0, + "step": 1056 + }, + { + "epoch": 0.67648, + "grad_norm": 4.514054298400879, + "learning_rate": 5e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7111445441842079, + "num_tokens": 13762912.0, + "step": 1057 + }, + { + "epoch": 0.67712, + "grad_norm": 3.4511091709136963, + "learning_rate": 5e-06, + "loss": 1.2244, + "mean_token_accuracy": 0.6701223999261856, + "num_tokens": 13776094.0, + "step": 1058 + }, + { + "epoch": 0.67776, + "grad_norm": 3.518554449081421, + "learning_rate": 5e-06, + "loss": 1.4949, + "mean_token_accuracy": 0.6190471276640892, + "num_tokens": 13789681.0, + "step": 1059 + }, + { + "epoch": 0.6784, + "grad_norm": 3.177955150604248, + "learning_rate": 5e-06, + "loss": 1.4254, + "mean_token_accuracy": 0.6302540749311447, + "num_tokens": 13802777.0, + "step": 1060 + }, + { + "epoch": 0.67904, + "grad_norm": 3.7214250564575195, + "learning_rate": 5e-06, + "loss": 1.3088, + "mean_token_accuracy": 0.6556456014513969, + "num_tokens": 13815229.0, + "step": 1061 + }, + { + "epoch": 0.67968, + "grad_norm": 3.726001739501953, + "learning_rate": 5e-06, + "loss": 1.2057, + "mean_token_accuracy": 0.6688976883888245, + "num_tokens": 13826932.0, + "step": 1062 + }, + { + "epoch": 0.68032, + "grad_norm": 3.1761860847473145, + "learning_rate": 5e-06, + "loss": 1.4054, + "mean_token_accuracy": 0.6317546740174294, + "num_tokens": 13842144.0, + "step": 1063 + }, + { + "epoch": 0.68096, + "grad_norm": 4.224031448364258, + "learning_rate": 5e-06, + "loss": 1.1421, + "mean_token_accuracy": 0.6879750266671181, + "num_tokens": 13852498.0, + "step": 1064 + }, + { + "epoch": 0.6816, + "grad_norm": 3.1462998390197754, + "learning_rate": 5e-06, + "loss": 1.3639, + "mean_token_accuracy": 0.6389659121632576, + "num_tokens": 13867842.0, + "step": 1065 + }, + { + "epoch": 0.68224, + "grad_norm": 3.7994680404663086, + "learning_rate": 5e-06, + "loss": 1.2442, + "mean_token_accuracy": 0.6608950793743134, + "num_tokens": 13878904.0, + "step": 1066 + }, + { + "epoch": 0.68288, + "grad_norm": 3.3029258251190186, + "learning_rate": 5e-06, + "loss": 1.1501, + "mean_token_accuracy": 0.6978383362293243, + "num_tokens": 13892970.0, + "step": 1067 + }, + { + "epoch": 0.68352, + "grad_norm": 4.019161224365234, + "learning_rate": 5e-06, + "loss": 1.2156, + "mean_token_accuracy": 0.689938597381115, + "num_tokens": 13904207.0, + "step": 1068 + }, + { + "epoch": 0.68416, + "grad_norm": 3.9899635314941406, + "learning_rate": 5e-06, + "loss": 1.4744, + "mean_token_accuracy": 0.6247128024697304, + "num_tokens": 13917327.0, + "step": 1069 + }, + { + "epoch": 0.6848, + "grad_norm": 5.03689432144165, + "learning_rate": 5e-06, + "loss": 1.3551, + "mean_token_accuracy": 0.6756569370627403, + "num_tokens": 13927551.0, + "step": 1070 + }, + { + "epoch": 0.68544, + "grad_norm": 3.43404221534729, + "learning_rate": 5e-06, + "loss": 1.1618, + "mean_token_accuracy": 0.6679700240492821, + "num_tokens": 13939630.0, + "step": 1071 + }, + { + "epoch": 0.68608, + "grad_norm": 4.027390956878662, + "learning_rate": 5e-06, + "loss": 1.3842, + "mean_token_accuracy": 0.6630447581410408, + "num_tokens": 13949082.0, + "step": 1072 + }, + { + "epoch": 0.68672, + "grad_norm": 3.764420986175537, + "learning_rate": 5e-06, + "loss": 1.293, + "mean_token_accuracy": 0.6628079935908318, + "num_tokens": 13962085.0, + "step": 1073 + }, + { + "epoch": 0.68736, + "grad_norm": 3.617522954940796, + "learning_rate": 5e-06, + "loss": 1.5355, + "mean_token_accuracy": 0.6268560141324997, + "num_tokens": 13973510.0, + "step": 1074 + }, + { + "epoch": 0.688, + "grad_norm": 3.6434836387634277, + "learning_rate": 5e-06, + "loss": 1.3283, + "mean_token_accuracy": 0.6506749242544174, + "num_tokens": 13986487.0, + "step": 1075 + }, + { + "epoch": 0.68864, + "grad_norm": 3.4601213932037354, + "learning_rate": 5e-06, + "loss": 1.1938, + "mean_token_accuracy": 0.6761009320616722, + "num_tokens": 13998818.0, + "step": 1076 + }, + { + "epoch": 0.68928, + "grad_norm": 3.537867307662964, + "learning_rate": 5e-06, + "loss": 1.2904, + "mean_token_accuracy": 0.6788460239768028, + "num_tokens": 14011012.0, + "step": 1077 + }, + { + "epoch": 0.68992, + "grad_norm": 3.204850435256958, + "learning_rate": 5e-06, + "loss": 1.257, + "mean_token_accuracy": 0.650609090924263, + "num_tokens": 14026133.0, + "step": 1078 + }, + { + "epoch": 0.69056, + "grad_norm": 3.1684117317199707, + "learning_rate": 5e-06, + "loss": 1.3857, + "mean_token_accuracy": 0.669170081615448, + "num_tokens": 14041463.0, + "step": 1079 + }, + { + "epoch": 0.6912, + "grad_norm": 2.97310209274292, + "learning_rate": 5e-06, + "loss": 1.1807, + "mean_token_accuracy": 0.678413525223732, + "num_tokens": 14055206.0, + "step": 1080 + }, + { + "epoch": 0.69184, + "grad_norm": 3.415344476699829, + "learning_rate": 5e-06, + "loss": 1.335, + "mean_token_accuracy": 0.6698315292596817, + "num_tokens": 14067043.0, + "step": 1081 + }, + { + "epoch": 0.69248, + "grad_norm": 3.2605786323547363, + "learning_rate": 5e-06, + "loss": 1.23, + "mean_token_accuracy": 0.6607236042618752, + "num_tokens": 14081150.0, + "step": 1082 + }, + { + "epoch": 0.69312, + "grad_norm": 3.5928292274475098, + "learning_rate": 5e-06, + "loss": 1.3275, + "mean_token_accuracy": 0.6353218033909798, + "num_tokens": 14093199.0, + "step": 1083 + }, + { + "epoch": 0.69376, + "grad_norm": 3.6726202964782715, + "learning_rate": 5e-06, + "loss": 1.2071, + "mean_token_accuracy": 0.6714643463492393, + "num_tokens": 14106118.0, + "step": 1084 + }, + { + "epoch": 0.6944, + "grad_norm": 3.3273112773895264, + "learning_rate": 5e-06, + "loss": 1.2143, + "mean_token_accuracy": 0.6702926605939865, + "num_tokens": 14119919.0, + "step": 1085 + }, + { + "epoch": 0.69504, + "grad_norm": 3.3181533813476562, + "learning_rate": 5e-06, + "loss": 1.525, + "mean_token_accuracy": 0.6393495798110962, + "num_tokens": 14134965.0, + "step": 1086 + }, + { + "epoch": 0.69568, + "grad_norm": 3.290024995803833, + "learning_rate": 5e-06, + "loss": 1.4868, + "mean_token_accuracy": 0.641509011387825, + "num_tokens": 14150873.0, + "step": 1087 + }, + { + "epoch": 0.69632, + "grad_norm": 3.179009199142456, + "learning_rate": 5e-06, + "loss": 1.3236, + "mean_token_accuracy": 0.6799350008368492, + "num_tokens": 14165423.0, + "step": 1088 + }, + { + "epoch": 0.69696, + "grad_norm": 4.067260265350342, + "learning_rate": 5e-06, + "loss": 1.3399, + "mean_token_accuracy": 0.6480335146188736, + "num_tokens": 14177467.0, + "step": 1089 + }, + { + "epoch": 0.6976, + "grad_norm": 3.0903289318084717, + "learning_rate": 5e-06, + "loss": 1.2387, + "mean_token_accuracy": 0.6776180788874626, + "num_tokens": 14192746.0, + "step": 1090 + }, + { + "epoch": 0.69824, + "grad_norm": 3.60392165184021, + "learning_rate": 5e-06, + "loss": 1.4149, + "mean_token_accuracy": 0.6424715965986252, + "num_tokens": 14205814.0, + "step": 1091 + }, + { + "epoch": 0.69888, + "grad_norm": 3.857509136199951, + "learning_rate": 5e-06, + "loss": 1.2077, + "mean_token_accuracy": 0.6793344393372536, + "num_tokens": 14216791.0, + "step": 1092 + }, + { + "epoch": 0.69952, + "grad_norm": 3.376009702682495, + "learning_rate": 5e-06, + "loss": 1.329, + "mean_token_accuracy": 0.6632150262594223, + "num_tokens": 14231190.0, + "step": 1093 + }, + { + "epoch": 0.70016, + "grad_norm": 3.522667407989502, + "learning_rate": 5e-06, + "loss": 1.3746, + "mean_token_accuracy": 0.6448647379875183, + "num_tokens": 14246266.0, + "step": 1094 + }, + { + "epoch": 0.7008, + "grad_norm": 3.88810658454895, + "learning_rate": 5e-06, + "loss": 1.4056, + "mean_token_accuracy": 0.6266975551843643, + "num_tokens": 14258900.0, + "step": 1095 + }, + { + "epoch": 0.70144, + "grad_norm": 4.134660243988037, + "learning_rate": 5e-06, + "loss": 1.4136, + "mean_token_accuracy": 0.6337871551513672, + "num_tokens": 14270135.0, + "step": 1096 + }, + { + "epoch": 0.70208, + "grad_norm": 2.7987403869628906, + "learning_rate": 5e-06, + "loss": 1.2978, + "mean_token_accuracy": 0.6591611802577972, + "num_tokens": 14287885.0, + "step": 1097 + }, + { + "epoch": 0.70272, + "grad_norm": 3.6904680728912354, + "learning_rate": 5e-06, + "loss": 1.3421, + "mean_token_accuracy": 0.6668279096484184, + "num_tokens": 14299211.0, + "step": 1098 + }, + { + "epoch": 0.70336, + "grad_norm": 3.754704475402832, + "learning_rate": 5e-06, + "loss": 1.2728, + "mean_token_accuracy": 0.6614086180925369, + "num_tokens": 14310772.0, + "step": 1099 + }, + { + "epoch": 0.704, + "grad_norm": 4.1148529052734375, + "learning_rate": 5e-06, + "loss": 1.3038, + "mean_token_accuracy": 0.6619797796010971, + "num_tokens": 14321538.0, + "step": 1100 + }, + { + "epoch": 0.70464, + "grad_norm": 3.9892449378967285, + "learning_rate": 5e-06, + "loss": 1.4351, + "mean_token_accuracy": 0.6713762879371643, + "num_tokens": 14332186.0, + "step": 1101 + }, + { + "epoch": 0.70528, + "grad_norm": 2.8868937492370605, + "learning_rate": 5e-06, + "loss": 1.3939, + "mean_token_accuracy": 0.6400659307837486, + "num_tokens": 14349043.0, + "step": 1102 + }, + { + "epoch": 0.70592, + "grad_norm": 3.4299302101135254, + "learning_rate": 5e-06, + "loss": 1.2674, + "mean_token_accuracy": 0.6643095165491104, + "num_tokens": 14363765.0, + "step": 1103 + }, + { + "epoch": 0.70656, + "grad_norm": 3.3706107139587402, + "learning_rate": 5e-06, + "loss": 1.3314, + "mean_token_accuracy": 0.6502428278326988, + "num_tokens": 14377935.0, + "step": 1104 + }, + { + "epoch": 0.7072, + "grad_norm": 3.353766441345215, + "learning_rate": 5e-06, + "loss": 1.4449, + "mean_token_accuracy": 0.6395809948444366, + "num_tokens": 14391497.0, + "step": 1105 + }, + { + "epoch": 0.70784, + "grad_norm": 3.7346043586730957, + "learning_rate": 5e-06, + "loss": 1.2955, + "mean_token_accuracy": 0.6504843458533287, + "num_tokens": 14403120.0, + "step": 1106 + }, + { + "epoch": 0.70848, + "grad_norm": 3.9729044437408447, + "learning_rate": 5e-06, + "loss": 1.2647, + "mean_token_accuracy": 0.6647339016199112, + "num_tokens": 14415923.0, + "step": 1107 + }, + { + "epoch": 0.70912, + "grad_norm": 4.029970169067383, + "learning_rate": 5e-06, + "loss": 1.227, + "mean_token_accuracy": 0.6661604270339012, + "num_tokens": 14427554.0, + "step": 1108 + }, + { + "epoch": 0.70976, + "grad_norm": 3.4321465492248535, + "learning_rate": 5e-06, + "loss": 1.1746, + "mean_token_accuracy": 0.6878796294331551, + "num_tokens": 14441354.0, + "step": 1109 + }, + { + "epoch": 0.7104, + "grad_norm": 3.303091287612915, + "learning_rate": 5e-06, + "loss": 1.3507, + "mean_token_accuracy": 0.6451763212680817, + "num_tokens": 14455989.0, + "step": 1110 + }, + { + "epoch": 0.71104, + "grad_norm": 3.9641027450561523, + "learning_rate": 5e-06, + "loss": 1.287, + "mean_token_accuracy": 0.661915197968483, + "num_tokens": 14466700.0, + "step": 1111 + }, + { + "epoch": 0.71168, + "grad_norm": 3.4277381896972656, + "learning_rate": 5e-06, + "loss": 1.1418, + "mean_token_accuracy": 0.6885363236069679, + "num_tokens": 14479735.0, + "step": 1112 + }, + { + "epoch": 0.71232, + "grad_norm": 3.531708240509033, + "learning_rate": 5e-06, + "loss": 1.1786, + "mean_token_accuracy": 0.6798023506999016, + "num_tokens": 14492428.0, + "step": 1113 + }, + { + "epoch": 0.71296, + "grad_norm": 3.962233304977417, + "learning_rate": 5e-06, + "loss": 1.5488, + "mean_token_accuracy": 0.6183573752641678, + "num_tokens": 14505050.0, + "step": 1114 + }, + { + "epoch": 0.7136, + "grad_norm": 3.1472697257995605, + "learning_rate": 5e-06, + "loss": 1.426, + "mean_token_accuracy": 0.6550877764821053, + "num_tokens": 14519128.0, + "step": 1115 + }, + { + "epoch": 0.71424, + "grad_norm": 3.8537216186523438, + "learning_rate": 5e-06, + "loss": 1.1991, + "mean_token_accuracy": 0.682333379983902, + "num_tokens": 14530594.0, + "step": 1116 + }, + { + "epoch": 0.71488, + "grad_norm": 3.527343511581421, + "learning_rate": 5e-06, + "loss": 1.207, + "mean_token_accuracy": 0.6722685918211937, + "num_tokens": 14541472.0, + "step": 1117 + }, + { + "epoch": 0.71552, + "grad_norm": 3.790855646133423, + "learning_rate": 5e-06, + "loss": 1.2531, + "mean_token_accuracy": 0.6708436533808708, + "num_tokens": 14552813.0, + "step": 1118 + }, + { + "epoch": 0.71616, + "grad_norm": 3.553488254547119, + "learning_rate": 5e-06, + "loss": 1.1016, + "mean_token_accuracy": 0.7148231789469719, + "num_tokens": 14565447.0, + "step": 1119 + }, + { + "epoch": 0.7168, + "grad_norm": 3.887118339538574, + "learning_rate": 5e-06, + "loss": 1.1646, + "mean_token_accuracy": 0.6907912865281105, + "num_tokens": 14577377.0, + "step": 1120 + }, + { + "epoch": 0.71744, + "grad_norm": 3.0343868732452393, + "learning_rate": 5e-06, + "loss": 1.142, + "mean_token_accuracy": 0.6866175085306168, + "num_tokens": 14591768.0, + "step": 1121 + }, + { + "epoch": 0.71808, + "grad_norm": 4.561229705810547, + "learning_rate": 5e-06, + "loss": 1.4004, + "mean_token_accuracy": 0.6330222748219967, + "num_tokens": 14603193.0, + "step": 1122 + }, + { + "epoch": 0.71872, + "grad_norm": 3.5638325214385986, + "learning_rate": 5e-06, + "loss": 1.1526, + "mean_token_accuracy": 0.6880695223808289, + "num_tokens": 14617007.0, + "step": 1123 + }, + { + "epoch": 0.71936, + "grad_norm": 3.810415267944336, + "learning_rate": 5e-06, + "loss": 1.279, + "mean_token_accuracy": 0.670023150742054, + "num_tokens": 14629048.0, + "step": 1124 + }, + { + "epoch": 0.72, + "grad_norm": 4.179751396179199, + "learning_rate": 5e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.7168305143713951, + "num_tokens": 14639669.0, + "step": 1125 + }, + { + "epoch": 0.72064, + "grad_norm": 3.539612054824829, + "learning_rate": 5e-06, + "loss": 1.3306, + "mean_token_accuracy": 0.6881062537431717, + "num_tokens": 14652435.0, + "step": 1126 + }, + { + "epoch": 0.72128, + "grad_norm": 3.597693681716919, + "learning_rate": 5e-06, + "loss": 1.3782, + "mean_token_accuracy": 0.6435956582427025, + "num_tokens": 14664804.0, + "step": 1127 + }, + { + "epoch": 0.72192, + "grad_norm": 3.3020715713500977, + "learning_rate": 5e-06, + "loss": 1.3158, + "mean_token_accuracy": 0.6422711089253426, + "num_tokens": 14679481.0, + "step": 1128 + }, + { + "epoch": 0.72256, + "grad_norm": 3.4006054401397705, + "learning_rate": 5e-06, + "loss": 1.4268, + "mean_token_accuracy": 0.6340702697634697, + "num_tokens": 14693009.0, + "step": 1129 + }, + { + "epoch": 0.7232, + "grad_norm": 3.6534066200256348, + "learning_rate": 5e-06, + "loss": 1.3942, + "mean_token_accuracy": 0.6490144804120064, + "num_tokens": 14704415.0, + "step": 1130 + }, + { + "epoch": 0.72384, + "grad_norm": 4.022477149963379, + "learning_rate": 5e-06, + "loss": 1.4425, + "mean_token_accuracy": 0.6391843035817146, + "num_tokens": 14718972.0, + "step": 1131 + }, + { + "epoch": 0.72448, + "grad_norm": 3.717512369155884, + "learning_rate": 5e-06, + "loss": 1.337, + "mean_token_accuracy": 0.6694408059120178, + "num_tokens": 14731864.0, + "step": 1132 + }, + { + "epoch": 0.72512, + "grad_norm": 3.640937566757202, + "learning_rate": 5e-06, + "loss": 1.2358, + "mean_token_accuracy": 0.6641776859760284, + "num_tokens": 14743467.0, + "step": 1133 + }, + { + "epoch": 0.72576, + "grad_norm": 3.5870702266693115, + "learning_rate": 5e-06, + "loss": 1.2264, + "mean_token_accuracy": 0.6706337183713913, + "num_tokens": 14755081.0, + "step": 1134 + }, + { + "epoch": 0.7264, + "grad_norm": 3.6272132396698, + "learning_rate": 5e-06, + "loss": 1.3878, + "mean_token_accuracy": 0.6861530616879463, + "num_tokens": 14766730.0, + "step": 1135 + }, + { + "epoch": 0.72704, + "grad_norm": 3.349130392074585, + "learning_rate": 5e-06, + "loss": 1.428, + "mean_token_accuracy": 0.6495333984494209, + "num_tokens": 14780158.0, + "step": 1136 + }, + { + "epoch": 0.72768, + "grad_norm": 3.8108246326446533, + "learning_rate": 5e-06, + "loss": 1.2034, + "mean_token_accuracy": 0.6748805195093155, + "num_tokens": 14793884.0, + "step": 1137 + }, + { + "epoch": 0.72832, + "grad_norm": 3.4483556747436523, + "learning_rate": 5e-06, + "loss": 1.3949, + "mean_token_accuracy": 0.6391731649637222, + "num_tokens": 14806967.0, + "step": 1138 + }, + { + "epoch": 0.72896, + "grad_norm": 3.3666470050811768, + "learning_rate": 5e-06, + "loss": 1.2185, + "mean_token_accuracy": 0.6691764742136002, + "num_tokens": 14820303.0, + "step": 1139 + }, + { + "epoch": 0.7296, + "grad_norm": 3.32536244392395, + "learning_rate": 5e-06, + "loss": 1.2849, + "mean_token_accuracy": 0.6658271849155426, + "num_tokens": 14834025.0, + "step": 1140 + }, + { + "epoch": 0.73024, + "grad_norm": 3.825983762741089, + "learning_rate": 5e-06, + "loss": 1.6181, + "mean_token_accuracy": 0.602071076631546, + "num_tokens": 14847478.0, + "step": 1141 + }, + { + "epoch": 0.73088, + "grad_norm": 4.397375106811523, + "learning_rate": 5e-06, + "loss": 1.3697, + "mean_token_accuracy": 0.6505918800830841, + "num_tokens": 14859179.0, + "step": 1142 + }, + { + "epoch": 0.73152, + "grad_norm": 4.159323215484619, + "learning_rate": 5e-06, + "loss": 1.1297, + "mean_token_accuracy": 0.7010925114154816, + "num_tokens": 14869522.0, + "step": 1143 + }, + { + "epoch": 0.73216, + "grad_norm": 3.4876530170440674, + "learning_rate": 5e-06, + "loss": 1.245, + "mean_token_accuracy": 0.652951605618, + "num_tokens": 14883372.0, + "step": 1144 + }, + { + "epoch": 0.7328, + "grad_norm": 3.0746846199035645, + "learning_rate": 5e-06, + "loss": 1.5279, + "mean_token_accuracy": 0.6031446754932404, + "num_tokens": 14899033.0, + "step": 1145 + }, + { + "epoch": 0.73344, + "grad_norm": 3.7521297931671143, + "learning_rate": 5e-06, + "loss": 1.186, + "mean_token_accuracy": 0.6688085421919823, + "num_tokens": 14913040.0, + "step": 1146 + }, + { + "epoch": 0.73408, + "grad_norm": 3.9737706184387207, + "learning_rate": 5e-06, + "loss": 1.314, + "mean_token_accuracy": 0.6808184832334518, + "num_tokens": 14927324.0, + "step": 1147 + }, + { + "epoch": 0.73472, + "grad_norm": 3.6961631774902344, + "learning_rate": 5e-06, + "loss": 1.264, + "mean_token_accuracy": 0.6656776443123817, + "num_tokens": 14938251.0, + "step": 1148 + }, + { + "epoch": 0.73536, + "grad_norm": 4.080604553222656, + "learning_rate": 5e-06, + "loss": 1.4443, + "mean_token_accuracy": 0.6355870217084885, + "num_tokens": 14948864.0, + "step": 1149 + }, + { + "epoch": 0.736, + "grad_norm": 3.284268617630005, + "learning_rate": 5e-06, + "loss": 1.1416, + "mean_token_accuracy": 0.6982963308691978, + "num_tokens": 14962968.0, + "step": 1150 + }, + { + "epoch": 0.73664, + "grad_norm": 3.623760223388672, + "learning_rate": 5e-06, + "loss": 1.206, + "mean_token_accuracy": 0.6715554222464561, + "num_tokens": 14974492.0, + "step": 1151 + }, + { + "epoch": 0.73728, + "grad_norm": 3.6222383975982666, + "learning_rate": 5e-06, + "loss": 1.2002, + "mean_token_accuracy": 0.665322557091713, + "num_tokens": 14987080.0, + "step": 1152 + }, + { + "epoch": 0.73792, + "grad_norm": 4.134393692016602, + "learning_rate": 5e-06, + "loss": 1.3446, + "mean_token_accuracy": 0.6732967086136341, + "num_tokens": 14997376.0, + "step": 1153 + }, + { + "epoch": 0.73856, + "grad_norm": 3.1004269123077393, + "learning_rate": 5e-06, + "loss": 1.1475, + "mean_token_accuracy": 0.6673456728458405, + "num_tokens": 15011696.0, + "step": 1154 + }, + { + "epoch": 0.7392, + "grad_norm": 3.437642812728882, + "learning_rate": 5e-06, + "loss": 1.3756, + "mean_token_accuracy": 0.6303885355591774, + "num_tokens": 15026552.0, + "step": 1155 + }, + { + "epoch": 0.73984, + "grad_norm": 8.039863586425781, + "learning_rate": 5e-06, + "loss": 1.2034, + "mean_token_accuracy": 0.7016506418585777, + "num_tokens": 15038554.0, + "step": 1156 + }, + { + "epoch": 0.74048, + "grad_norm": 3.248920440673828, + "learning_rate": 5e-06, + "loss": 1.2926, + "mean_token_accuracy": 0.6631775796413422, + "num_tokens": 15054408.0, + "step": 1157 + }, + { + "epoch": 0.74112, + "grad_norm": 3.959541082382202, + "learning_rate": 5e-06, + "loss": 1.45, + "mean_token_accuracy": 0.6382048651576042, + "num_tokens": 15065684.0, + "step": 1158 + }, + { + "epoch": 0.74176, + "grad_norm": 4.347902297973633, + "learning_rate": 5e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.7234738394618034, + "num_tokens": 15077041.0, + "step": 1159 + }, + { + "epoch": 0.7424, + "grad_norm": 4.224346160888672, + "learning_rate": 5e-06, + "loss": 1.272, + "mean_token_accuracy": 0.6782936900854111, + "num_tokens": 15088347.0, + "step": 1160 + }, + { + "epoch": 0.74304, + "grad_norm": 3.770258903503418, + "learning_rate": 5e-06, + "loss": 1.3465, + "mean_token_accuracy": 0.661977045238018, + "num_tokens": 15101460.0, + "step": 1161 + }, + { + "epoch": 0.74368, + "grad_norm": 3.7153191566467285, + "learning_rate": 5e-06, + "loss": 1.3216, + "mean_token_accuracy": 0.6487752310931683, + "num_tokens": 15115486.0, + "step": 1162 + }, + { + "epoch": 0.74432, + "grad_norm": 4.508492469787598, + "learning_rate": 5e-06, + "loss": 1.2035, + "mean_token_accuracy": 0.6841593757271767, + "num_tokens": 15125039.0, + "step": 1163 + }, + { + "epoch": 0.74496, + "grad_norm": 3.0245108604431152, + "learning_rate": 5e-06, + "loss": 1.3148, + "mean_token_accuracy": 0.6497530564665794, + "num_tokens": 15142007.0, + "step": 1164 + }, + { + "epoch": 0.7456, + "grad_norm": 3.7130560874938965, + "learning_rate": 5e-06, + "loss": 1.365, + "mean_token_accuracy": 0.6439146772027016, + "num_tokens": 15154070.0, + "step": 1165 + }, + { + "epoch": 0.74624, + "grad_norm": 4.014090538024902, + "learning_rate": 5e-06, + "loss": 1.5601, + "mean_token_accuracy": 0.6075103767216206, + "num_tokens": 15165816.0, + "step": 1166 + }, + { + "epoch": 0.74688, + "grad_norm": 3.5442097187042236, + "learning_rate": 5e-06, + "loss": 1.2896, + "mean_token_accuracy": 0.6592826843261719, + "num_tokens": 15180213.0, + "step": 1167 + }, + { + "epoch": 0.74752, + "grad_norm": 3.3585143089294434, + "learning_rate": 5e-06, + "loss": 1.3492, + "mean_token_accuracy": 0.650349847972393, + "num_tokens": 15195358.0, + "step": 1168 + }, + { + "epoch": 0.74816, + "grad_norm": 3.3249661922454834, + "learning_rate": 5e-06, + "loss": 1.2987, + "mean_token_accuracy": 0.6541887670755386, + "num_tokens": 15208187.0, + "step": 1169 + }, + { + "epoch": 0.7488, + "grad_norm": 3.2732949256896973, + "learning_rate": 5e-06, + "loss": 1.4745, + "mean_token_accuracy": 0.6346693634986877, + "num_tokens": 15224302.0, + "step": 1170 + }, + { + "epoch": 0.74944, + "grad_norm": 3.717664957046509, + "learning_rate": 5e-06, + "loss": 1.2258, + "mean_token_accuracy": 0.6592052280902863, + "num_tokens": 15235541.0, + "step": 1171 + }, + { + "epoch": 0.75008, + "grad_norm": 3.3119561672210693, + "learning_rate": 5e-06, + "loss": 1.2281, + "mean_token_accuracy": 0.6707274541258812, + "num_tokens": 15249155.0, + "step": 1172 + }, + { + "epoch": 0.75072, + "grad_norm": 3.4180824756622314, + "learning_rate": 5e-06, + "loss": 1.2323, + "mean_token_accuracy": 0.6759630665183067, + "num_tokens": 15262745.0, + "step": 1173 + }, + { + "epoch": 0.75136, + "grad_norm": 3.351557970046997, + "learning_rate": 5e-06, + "loss": 1.2131, + "mean_token_accuracy": 0.6812815740704536, + "num_tokens": 15276413.0, + "step": 1174 + }, + { + "epoch": 0.752, + "grad_norm": 4.228631973266602, + "learning_rate": 5e-06, + "loss": 1.1811, + "mean_token_accuracy": 0.694750115275383, + "num_tokens": 15287419.0, + "step": 1175 + }, + { + "epoch": 0.75264, + "grad_norm": 3.346228837966919, + "learning_rate": 5e-06, + "loss": 1.4694, + "mean_token_accuracy": 0.6221407428383827, + "num_tokens": 15304245.0, + "step": 1176 + }, + { + "epoch": 0.75328, + "grad_norm": 3.899305582046509, + "learning_rate": 5e-06, + "loss": 1.3843, + "mean_token_accuracy": 0.6404564082622528, + "num_tokens": 15315843.0, + "step": 1177 + }, + { + "epoch": 0.75392, + "grad_norm": 3.3452677726745605, + "learning_rate": 5e-06, + "loss": 1.4037, + "mean_token_accuracy": 0.6525731533765793, + "num_tokens": 15330084.0, + "step": 1178 + }, + { + "epoch": 0.75456, + "grad_norm": 3.4091222286224365, + "learning_rate": 5e-06, + "loss": 1.4005, + "mean_token_accuracy": 0.6623276621103287, + "num_tokens": 15342406.0, + "step": 1179 + }, + { + "epoch": 0.7552, + "grad_norm": 3.5373282432556152, + "learning_rate": 5e-06, + "loss": 1.1613, + "mean_token_accuracy": 0.6908905506134033, + "num_tokens": 15355306.0, + "step": 1180 + }, + { + "epoch": 0.75584, + "grad_norm": 3.9077682495117188, + "learning_rate": 5e-06, + "loss": 1.2934, + "mean_token_accuracy": 0.6559719070792198, + "num_tokens": 15365562.0, + "step": 1181 + }, + { + "epoch": 0.75648, + "grad_norm": 4.251070022583008, + "learning_rate": 5e-06, + "loss": 1.1809, + "mean_token_accuracy": 0.7090832963585854, + "num_tokens": 15377910.0, + "step": 1182 + }, + { + "epoch": 0.75712, + "grad_norm": 3.6916239261627197, + "learning_rate": 5e-06, + "loss": 1.1891, + "mean_token_accuracy": 0.6642716750502586, + "num_tokens": 15390158.0, + "step": 1183 + }, + { + "epoch": 0.75776, + "grad_norm": 3.235966682434082, + "learning_rate": 5e-06, + "loss": 1.3514, + "mean_token_accuracy": 0.654154047369957, + "num_tokens": 15405763.0, + "step": 1184 + }, + { + "epoch": 0.7584, + "grad_norm": 3.0988378524780273, + "learning_rate": 5e-06, + "loss": 1.3606, + "mean_token_accuracy": 0.6405491232872009, + "num_tokens": 15421326.0, + "step": 1185 + }, + { + "epoch": 0.75904, + "grad_norm": 3.5612781047821045, + "learning_rate": 5e-06, + "loss": 1.3463, + "mean_token_accuracy": 0.6501871645450592, + "num_tokens": 15434530.0, + "step": 1186 + }, + { + "epoch": 0.75968, + "grad_norm": 3.6004257202148438, + "learning_rate": 5e-06, + "loss": 1.346, + "mean_token_accuracy": 0.6661404147744179, + "num_tokens": 15448462.0, + "step": 1187 + }, + { + "epoch": 0.76032, + "grad_norm": 4.093327045440674, + "learning_rate": 5e-06, + "loss": 1.257, + "mean_token_accuracy": 0.6833517551422119, + "num_tokens": 15460521.0, + "step": 1188 + }, + { + "epoch": 0.76096, + "grad_norm": 3.7774133682250977, + "learning_rate": 5e-06, + "loss": 1.4023, + "mean_token_accuracy": 0.6524300873279572, + "num_tokens": 15472815.0, + "step": 1189 + }, + { + "epoch": 0.7616, + "grad_norm": 3.2685515880584717, + "learning_rate": 5e-06, + "loss": 1.1868, + "mean_token_accuracy": 0.6791387870907784, + "num_tokens": 15488314.0, + "step": 1190 + }, + { + "epoch": 0.76224, + "grad_norm": 3.4335551261901855, + "learning_rate": 5e-06, + "loss": 1.4461, + "mean_token_accuracy": 0.6327428966760635, + "num_tokens": 15502345.0, + "step": 1191 + }, + { + "epoch": 0.76288, + "grad_norm": 3.3318638801574707, + "learning_rate": 5e-06, + "loss": 1.4262, + "mean_token_accuracy": 0.632860004901886, + "num_tokens": 15518148.0, + "step": 1192 + }, + { + "epoch": 0.76352, + "grad_norm": 3.1482911109924316, + "learning_rate": 5e-06, + "loss": 1.2723, + "mean_token_accuracy": 0.696734681725502, + "num_tokens": 15532567.0, + "step": 1193 + }, + { + "epoch": 0.76416, + "grad_norm": 4.470282554626465, + "learning_rate": 5e-06, + "loss": 1.4906, + "mean_token_accuracy": 0.6341715455055237, + "num_tokens": 15544296.0, + "step": 1194 + }, + { + "epoch": 0.7648, + "grad_norm": 3.548245429992676, + "learning_rate": 5e-06, + "loss": 1.4285, + "mean_token_accuracy": 0.628907784819603, + "num_tokens": 15556178.0, + "step": 1195 + }, + { + "epoch": 0.76544, + "grad_norm": 3.0455758571624756, + "learning_rate": 5e-06, + "loss": 1.2669, + "mean_token_accuracy": 0.6598797962069511, + "num_tokens": 15570402.0, + "step": 1196 + }, + { + "epoch": 0.76608, + "grad_norm": 3.394630193710327, + "learning_rate": 5e-06, + "loss": 1.3457, + "mean_token_accuracy": 0.6525625661015511, + "num_tokens": 15583933.0, + "step": 1197 + }, + { + "epoch": 0.76672, + "grad_norm": 3.572402238845825, + "learning_rate": 5e-06, + "loss": 1.1829, + "mean_token_accuracy": 0.6823357492685318, + "num_tokens": 15596838.0, + "step": 1198 + }, + { + "epoch": 0.76736, + "grad_norm": 4.091769695281982, + "learning_rate": 5e-06, + "loss": 1.3601, + "mean_token_accuracy": 0.6605678722262383, + "num_tokens": 15609200.0, + "step": 1199 + }, + { + "epoch": 0.768, + "grad_norm": 3.402550220489502, + "learning_rate": 5e-06, + "loss": 1.2345, + "mean_token_accuracy": 0.6593906283378601, + "num_tokens": 15623053.0, + "step": 1200 + }, + { + "epoch": 0.76864, + "grad_norm": 3.7215263843536377, + "learning_rate": 5e-06, + "loss": 1.3201, + "mean_token_accuracy": 0.6674540042877197, + "num_tokens": 15635540.0, + "step": 1201 + }, + { + "epoch": 0.76928, + "grad_norm": 3.5162336826324463, + "learning_rate": 5e-06, + "loss": 1.3041, + "mean_token_accuracy": 0.662396639585495, + "num_tokens": 15648152.0, + "step": 1202 + }, + { + "epoch": 0.76992, + "grad_norm": 3.8758740425109863, + "learning_rate": 5e-06, + "loss": 1.2048, + "mean_token_accuracy": 0.6618586331605911, + "num_tokens": 15659825.0, + "step": 1203 + }, + { + "epoch": 0.77056, + "grad_norm": 3.6302740573883057, + "learning_rate": 5e-06, + "loss": 1.463, + "mean_token_accuracy": 0.6243670582771301, + "num_tokens": 15675357.0, + "step": 1204 + }, + { + "epoch": 0.7712, + "grad_norm": 3.250278949737549, + "learning_rate": 5e-06, + "loss": 1.3677, + "mean_token_accuracy": 0.6358913704752922, + "num_tokens": 15690114.0, + "step": 1205 + }, + { + "epoch": 0.77184, + "grad_norm": 3.5102968215942383, + "learning_rate": 5e-06, + "loss": 1.1182, + "mean_token_accuracy": 0.6968755125999451, + "num_tokens": 15704497.0, + "step": 1206 + }, + { + "epoch": 0.77248, + "grad_norm": 3.386099100112915, + "learning_rate": 5e-06, + "loss": 1.1454, + "mean_token_accuracy": 0.6858478710055351, + "num_tokens": 15718228.0, + "step": 1207 + }, + { + "epoch": 0.77312, + "grad_norm": 3.6120481491088867, + "learning_rate": 5e-06, + "loss": 1.3312, + "mean_token_accuracy": 0.669069878757, + "num_tokens": 15731116.0, + "step": 1208 + }, + { + "epoch": 0.77376, + "grad_norm": 3.7133243083953857, + "learning_rate": 5e-06, + "loss": 1.5032, + "mean_token_accuracy": 0.6273391470313072, + "num_tokens": 15743678.0, + "step": 1209 + }, + { + "epoch": 0.7744, + "grad_norm": 3.4095213413238525, + "learning_rate": 5e-06, + "loss": 1.3583, + "mean_token_accuracy": 0.6577341482043266, + "num_tokens": 15757250.0, + "step": 1210 + }, + { + "epoch": 0.77504, + "grad_norm": 4.357828140258789, + "learning_rate": 5e-06, + "loss": 1.4194, + "mean_token_accuracy": 0.6646361202001572, + "num_tokens": 15767568.0, + "step": 1211 + }, + { + "epoch": 0.77568, + "grad_norm": 3.3669044971466064, + "learning_rate": 5e-06, + "loss": 1.3806, + "mean_token_accuracy": 0.6653162240982056, + "num_tokens": 15781457.0, + "step": 1212 + }, + { + "epoch": 0.77632, + "grad_norm": 3.057096004486084, + "learning_rate": 5e-06, + "loss": 1.2735, + "mean_token_accuracy": 0.6667918264865875, + "num_tokens": 15796444.0, + "step": 1213 + }, + { + "epoch": 0.77696, + "grad_norm": 3.549315929412842, + "learning_rate": 5e-06, + "loss": 1.255, + "mean_token_accuracy": 0.668325200676918, + "num_tokens": 15807909.0, + "step": 1214 + }, + { + "epoch": 0.7776, + "grad_norm": 4.293363571166992, + "learning_rate": 5e-06, + "loss": 1.2011, + "mean_token_accuracy": 0.7006052732467651, + "num_tokens": 15818410.0, + "step": 1215 + }, + { + "epoch": 0.77824, + "grad_norm": 3.4453113079071045, + "learning_rate": 5e-06, + "loss": 1.4502, + "mean_token_accuracy": 0.6456183791160583, + "num_tokens": 15830410.0, + "step": 1216 + }, + { + "epoch": 0.77888, + "grad_norm": 3.340660572052002, + "learning_rate": 5e-06, + "loss": 1.3797, + "mean_token_accuracy": 0.6471363380551338, + "num_tokens": 15843270.0, + "step": 1217 + }, + { + "epoch": 0.77952, + "grad_norm": 3.578989267349243, + "learning_rate": 5e-06, + "loss": 1.3165, + "mean_token_accuracy": 0.6591121554374695, + "num_tokens": 15856513.0, + "step": 1218 + }, + { + "epoch": 0.78016, + "grad_norm": 3.311697483062744, + "learning_rate": 5e-06, + "loss": 1.3268, + "mean_token_accuracy": 0.6524678990244865, + "num_tokens": 15869453.0, + "step": 1219 + }, + { + "epoch": 0.7808, + "grad_norm": 3.2292022705078125, + "learning_rate": 5e-06, + "loss": 1.2939, + "mean_token_accuracy": 0.6603180393576622, + "num_tokens": 15884284.0, + "step": 1220 + }, + { + "epoch": 0.78144, + "grad_norm": 3.189804792404175, + "learning_rate": 5e-06, + "loss": 1.3883, + "mean_token_accuracy": 0.6416840329766273, + "num_tokens": 15898664.0, + "step": 1221 + }, + { + "epoch": 0.78208, + "grad_norm": 3.1236817836761475, + "learning_rate": 5e-06, + "loss": 1.3237, + "mean_token_accuracy": 0.6408574059605598, + "num_tokens": 15913125.0, + "step": 1222 + }, + { + "epoch": 0.78272, + "grad_norm": 4.161830902099609, + "learning_rate": 5e-06, + "loss": 1.4025, + "mean_token_accuracy": 0.630896121263504, + "num_tokens": 15925782.0, + "step": 1223 + }, + { + "epoch": 0.78336, + "grad_norm": 3.626995086669922, + "learning_rate": 5e-06, + "loss": 1.087, + "mean_token_accuracy": 0.6901156529784203, + "num_tokens": 15936307.0, + "step": 1224 + }, + { + "epoch": 0.784, + "grad_norm": 3.5811476707458496, + "learning_rate": 5e-06, + "loss": 1.1955, + "mean_token_accuracy": 0.6784915700554848, + "num_tokens": 15947469.0, + "step": 1225 + }, + { + "epoch": 0.78464, + "grad_norm": 3.4900920391082764, + "learning_rate": 5e-06, + "loss": 1.3206, + "mean_token_accuracy": 0.6642494723200798, + "num_tokens": 15961802.0, + "step": 1226 + }, + { + "epoch": 0.78528, + "grad_norm": 3.295171022415161, + "learning_rate": 5e-06, + "loss": 1.366, + "mean_token_accuracy": 0.6388103812932968, + "num_tokens": 15973542.0, + "step": 1227 + }, + { + "epoch": 0.78592, + "grad_norm": 3.179863214492798, + "learning_rate": 5e-06, + "loss": 1.4789, + "mean_token_accuracy": 0.6259790062904358, + "num_tokens": 15987918.0, + "step": 1228 + }, + { + "epoch": 0.78656, + "grad_norm": 3.5669660568237305, + "learning_rate": 5e-06, + "loss": 1.3716, + "mean_token_accuracy": 0.6410401687026024, + "num_tokens": 16000309.0, + "step": 1229 + }, + { + "epoch": 0.7872, + "grad_norm": 3.2992517948150635, + "learning_rate": 5e-06, + "loss": 1.3798, + "mean_token_accuracy": 0.6566397473216057, + "num_tokens": 16014417.0, + "step": 1230 + }, + { + "epoch": 0.78784, + "grad_norm": 3.6735100746154785, + "learning_rate": 5e-06, + "loss": 1.2311, + "mean_token_accuracy": 0.6738264411687851, + "num_tokens": 16026169.0, + "step": 1231 + }, + { + "epoch": 0.78848, + "grad_norm": 4.013977527618408, + "learning_rate": 5e-06, + "loss": 1.3946, + "mean_token_accuracy": 0.6297749131917953, + "num_tokens": 16036396.0, + "step": 1232 + }, + { + "epoch": 0.78912, + "grad_norm": 3.506371259689331, + "learning_rate": 5e-06, + "loss": 1.3798, + "mean_token_accuracy": 0.6475069150328636, + "num_tokens": 16049573.0, + "step": 1233 + }, + { + "epoch": 0.78976, + "grad_norm": 3.0766477584838867, + "learning_rate": 5e-06, + "loss": 1.4281, + "mean_token_accuracy": 0.6643402278423309, + "num_tokens": 16064639.0, + "step": 1234 + }, + { + "epoch": 0.7904, + "grad_norm": 3.5113558769226074, + "learning_rate": 5e-06, + "loss": 1.1854, + "mean_token_accuracy": 0.6849528402090073, + "num_tokens": 16078167.0, + "step": 1235 + }, + { + "epoch": 0.79104, + "grad_norm": 3.223271369934082, + "learning_rate": 5e-06, + "loss": 1.3133, + "mean_token_accuracy": 0.6554304733872414, + "num_tokens": 16093693.0, + "step": 1236 + }, + { + "epoch": 0.79168, + "grad_norm": 3.661078691482544, + "learning_rate": 5e-06, + "loss": 1.2121, + "mean_token_accuracy": 0.7052098885178566, + "num_tokens": 16105008.0, + "step": 1237 + }, + { + "epoch": 0.79232, + "grad_norm": 3.4575560092926025, + "learning_rate": 5e-06, + "loss": 1.3498, + "mean_token_accuracy": 0.6544990688562393, + "num_tokens": 16117846.0, + "step": 1238 + }, + { + "epoch": 0.79296, + "grad_norm": 3.559100866317749, + "learning_rate": 5e-06, + "loss": 1.4116, + "mean_token_accuracy": 0.6313577368855476, + "num_tokens": 16130981.0, + "step": 1239 + }, + { + "epoch": 0.7936, + "grad_norm": 3.2983896732330322, + "learning_rate": 5e-06, + "loss": 1.3647, + "mean_token_accuracy": 0.6640519946813583, + "num_tokens": 16144847.0, + "step": 1240 + }, + { + "epoch": 0.79424, + "grad_norm": 3.622084856033325, + "learning_rate": 5e-06, + "loss": 1.292, + "mean_token_accuracy": 0.6699836328625679, + "num_tokens": 16156779.0, + "step": 1241 + }, + { + "epoch": 0.79488, + "grad_norm": 4.421840190887451, + "learning_rate": 5e-06, + "loss": 1.2792, + "mean_token_accuracy": 0.673715990036726, + "num_tokens": 16166792.0, + "step": 1242 + }, + { + "epoch": 0.79552, + "grad_norm": 3.312913656234741, + "learning_rate": 5e-06, + "loss": 1.4633, + "mean_token_accuracy": 0.6244674026966095, + "num_tokens": 16181336.0, + "step": 1243 + }, + { + "epoch": 0.79616, + "grad_norm": 3.5397815704345703, + "learning_rate": 5e-06, + "loss": 1.4338, + "mean_token_accuracy": 0.6298167407512665, + "num_tokens": 16194684.0, + "step": 1244 + }, + { + "epoch": 0.7968, + "grad_norm": 3.798386335372925, + "learning_rate": 5e-06, + "loss": 1.3008, + "mean_token_accuracy": 0.6577273011207581, + "num_tokens": 16206010.0, + "step": 1245 + }, + { + "epoch": 0.79744, + "grad_norm": 3.379908561706543, + "learning_rate": 5e-06, + "loss": 1.0665, + "mean_token_accuracy": 0.6839143261313438, + "num_tokens": 16220119.0, + "step": 1246 + }, + { + "epoch": 0.79808, + "grad_norm": 3.7385215759277344, + "learning_rate": 5e-06, + "loss": 1.2292, + "mean_token_accuracy": 0.6770885214209557, + "num_tokens": 16230676.0, + "step": 1247 + }, + { + "epoch": 0.79872, + "grad_norm": 3.6756489276885986, + "learning_rate": 5e-06, + "loss": 1.2106, + "mean_token_accuracy": 0.6802457198500633, + "num_tokens": 16243393.0, + "step": 1248 + }, + { + "epoch": 0.79936, + "grad_norm": 3.861645221710205, + "learning_rate": 5e-06, + "loss": 1.2558, + "mean_token_accuracy": 0.6762053146958351, + "num_tokens": 16254097.0, + "step": 1249 + }, + { + "epoch": 0.8, + "grad_norm": 3.3169620037078857, + "learning_rate": 5e-06, + "loss": 1.2539, + "mean_token_accuracy": 0.6607455164194107, + "num_tokens": 16268379.0, + "step": 1250 + }, + { + "epoch": 0.80064, + "grad_norm": 3.2894480228424072, + "learning_rate": 5e-06, + "loss": 1.3166, + "mean_token_accuracy": 0.6549070253968239, + "num_tokens": 16283564.0, + "step": 1251 + }, + { + "epoch": 0.80128, + "grad_norm": 3.8048436641693115, + "learning_rate": 5e-06, + "loss": 1.424, + "mean_token_accuracy": 0.6509182900190353, + "num_tokens": 16295386.0, + "step": 1252 + }, + { + "epoch": 0.80192, + "grad_norm": 3.7577552795410156, + "learning_rate": 5e-06, + "loss": 1.1979, + "mean_token_accuracy": 0.6759278625249863, + "num_tokens": 16309119.0, + "step": 1253 + }, + { + "epoch": 0.80256, + "grad_norm": 3.8013439178466797, + "learning_rate": 5e-06, + "loss": 1.3405, + "mean_token_accuracy": 0.668271005153656, + "num_tokens": 16320088.0, + "step": 1254 + }, + { + "epoch": 0.8032, + "grad_norm": 3.75661039352417, + "learning_rate": 5e-06, + "loss": 1.2675, + "mean_token_accuracy": 0.6828467771410942, + "num_tokens": 16332206.0, + "step": 1255 + }, + { + "epoch": 0.80384, + "grad_norm": 4.377762794494629, + "learning_rate": 5e-06, + "loss": 1.49, + "mean_token_accuracy": 0.6417308822274208, + "num_tokens": 16344154.0, + "step": 1256 + }, + { + "epoch": 0.80448, + "grad_norm": 3.524298906326294, + "learning_rate": 5e-06, + "loss": 1.2527, + "mean_token_accuracy": 0.6651709750294685, + "num_tokens": 16357771.0, + "step": 1257 + }, + { + "epoch": 0.80512, + "grad_norm": 3.6572201251983643, + "learning_rate": 5e-06, + "loss": 1.3117, + "mean_token_accuracy": 0.6568779051303864, + "num_tokens": 16369715.0, + "step": 1258 + }, + { + "epoch": 0.80576, + "grad_norm": 3.557985305786133, + "learning_rate": 5e-06, + "loss": 1.178, + "mean_token_accuracy": 0.688338540494442, + "num_tokens": 16381110.0, + "step": 1259 + }, + { + "epoch": 0.8064, + "grad_norm": 3.9126033782958984, + "learning_rate": 5e-06, + "loss": 1.5385, + "mean_token_accuracy": 0.6207233294844627, + "num_tokens": 16393612.0, + "step": 1260 + }, + { + "epoch": 0.80704, + "grad_norm": 3.5483007431030273, + "learning_rate": 5e-06, + "loss": 1.1773, + "mean_token_accuracy": 0.6948479861021042, + "num_tokens": 16406331.0, + "step": 1261 + }, + { + "epoch": 0.80768, + "grad_norm": 3.6159143447875977, + "learning_rate": 5e-06, + "loss": 1.3108, + "mean_token_accuracy": 0.6626102104783058, + "num_tokens": 16418794.0, + "step": 1262 + }, + { + "epoch": 0.80832, + "grad_norm": 3.201352834701538, + "learning_rate": 5e-06, + "loss": 1.346, + "mean_token_accuracy": 0.6304316557943821, + "num_tokens": 16429613.0, + "step": 1263 + }, + { + "epoch": 0.80896, + "grad_norm": 3.9572861194610596, + "learning_rate": 5e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.7080657631158829, + "num_tokens": 16440987.0, + "step": 1264 + }, + { + "epoch": 0.8096, + "grad_norm": 3.182184934616089, + "learning_rate": 5e-06, + "loss": 1.356, + "mean_token_accuracy": 0.6579003632068634, + "num_tokens": 16455570.0, + "step": 1265 + }, + { + "epoch": 0.81024, + "grad_norm": 3.835308313369751, + "learning_rate": 5e-06, + "loss": 1.4172, + "mean_token_accuracy": 0.6239579617977142, + "num_tokens": 16468080.0, + "step": 1266 + }, + { + "epoch": 0.81088, + "grad_norm": 3.3559696674346924, + "learning_rate": 5e-06, + "loss": 1.1735, + "mean_token_accuracy": 0.6860703229904175, + "num_tokens": 16481207.0, + "step": 1267 + }, + { + "epoch": 0.81152, + "grad_norm": 3.19657039642334, + "learning_rate": 5e-06, + "loss": 1.3224, + "mean_token_accuracy": 0.6673007681965828, + "num_tokens": 16495219.0, + "step": 1268 + }, + { + "epoch": 0.81216, + "grad_norm": 3.2514398097991943, + "learning_rate": 5e-06, + "loss": 1.4182, + "mean_token_accuracy": 0.6229546442627907, + "num_tokens": 16511488.0, + "step": 1269 + }, + { + "epoch": 0.8128, + "grad_norm": 2.9578235149383545, + "learning_rate": 5e-06, + "loss": 1.0946, + "mean_token_accuracy": 0.6994348987936974, + "num_tokens": 16527874.0, + "step": 1270 + }, + { + "epoch": 0.81344, + "grad_norm": 3.202214479446411, + "learning_rate": 5e-06, + "loss": 1.4316, + "mean_token_accuracy": 0.625508576631546, + "num_tokens": 16542263.0, + "step": 1271 + }, + { + "epoch": 0.81408, + "grad_norm": 3.9414408206939697, + "learning_rate": 5e-06, + "loss": 1.2243, + "mean_token_accuracy": 0.6666690483689308, + "num_tokens": 16554216.0, + "step": 1272 + }, + { + "epoch": 0.81472, + "grad_norm": 3.792768955230713, + "learning_rate": 5e-06, + "loss": 1.2563, + "mean_token_accuracy": 0.6469622924923897, + "num_tokens": 16566251.0, + "step": 1273 + }, + { + "epoch": 0.81536, + "grad_norm": 3.4059951305389404, + "learning_rate": 5e-06, + "loss": 1.3999, + "mean_token_accuracy": 0.6422073394060135, + "num_tokens": 16579765.0, + "step": 1274 + }, + { + "epoch": 0.816, + "grad_norm": 4.562513828277588, + "learning_rate": 5e-06, + "loss": 1.2946, + "mean_token_accuracy": 0.6693150997161865, + "num_tokens": 16589352.0, + "step": 1275 + }, + { + "epoch": 0.81664, + "grad_norm": 4.269272327423096, + "learning_rate": 5e-06, + "loss": 1.2899, + "mean_token_accuracy": 0.6878708451986313, + "num_tokens": 16598911.0, + "step": 1276 + }, + { + "epoch": 0.81728, + "grad_norm": 3.5766615867614746, + "learning_rate": 5e-06, + "loss": 1.3643, + "mean_token_accuracy": 0.6596589758992195, + "num_tokens": 16612073.0, + "step": 1277 + }, + { + "epoch": 0.81792, + "grad_norm": 3.2693169116973877, + "learning_rate": 5e-06, + "loss": 1.2538, + "mean_token_accuracy": 0.6582971885800362, + "num_tokens": 16626673.0, + "step": 1278 + }, + { + "epoch": 0.81856, + "grad_norm": 3.7346718311309814, + "learning_rate": 5e-06, + "loss": 1.4059, + "mean_token_accuracy": 0.6605831310153008, + "num_tokens": 16640222.0, + "step": 1279 + }, + { + "epoch": 0.8192, + "grad_norm": 3.571347951889038, + "learning_rate": 5e-06, + "loss": 1.378, + "mean_token_accuracy": 0.6396291702985764, + "num_tokens": 16652331.0, + "step": 1280 + }, + { + "epoch": 0.81984, + "grad_norm": 3.3202948570251465, + "learning_rate": 5e-06, + "loss": 1.3002, + "mean_token_accuracy": 0.656744010746479, + "num_tokens": 16664098.0, + "step": 1281 + }, + { + "epoch": 0.82048, + "grad_norm": 3.2276108264923096, + "learning_rate": 5e-06, + "loss": 1.1796, + "mean_token_accuracy": 0.6782404407858849, + "num_tokens": 16678834.0, + "step": 1282 + }, + { + "epoch": 0.82112, + "grad_norm": 3.5021538734436035, + "learning_rate": 5e-06, + "loss": 1.1757, + "mean_token_accuracy": 0.6985038220882416, + "num_tokens": 16692514.0, + "step": 1283 + }, + { + "epoch": 0.82176, + "grad_norm": 3.8361024856567383, + "learning_rate": 5e-06, + "loss": 1.3879, + "mean_token_accuracy": 0.6511978656053543, + "num_tokens": 16705296.0, + "step": 1284 + }, + { + "epoch": 0.8224, + "grad_norm": 3.3450541496276855, + "learning_rate": 5e-06, + "loss": 1.4618, + "mean_token_accuracy": 0.6236701160669327, + "num_tokens": 16719506.0, + "step": 1285 + }, + { + "epoch": 0.82304, + "grad_norm": 3.344872236251831, + "learning_rate": 5e-06, + "loss": 1.4341, + "mean_token_accuracy": 0.6452220380306244, + "num_tokens": 16733788.0, + "step": 1286 + }, + { + "epoch": 0.82368, + "grad_norm": 3.2765679359436035, + "learning_rate": 5e-06, + "loss": 1.3945, + "mean_token_accuracy": 0.6469878405332565, + "num_tokens": 16746762.0, + "step": 1287 + }, + { + "epoch": 0.82432, + "grad_norm": 3.3606464862823486, + "learning_rate": 5e-06, + "loss": 1.442, + "mean_token_accuracy": 0.6300052553415298, + "num_tokens": 16762035.0, + "step": 1288 + }, + { + "epoch": 0.82496, + "grad_norm": 3.9703168869018555, + "learning_rate": 5e-06, + "loss": 1.4146, + "mean_token_accuracy": 0.6354392319917679, + "num_tokens": 16772696.0, + "step": 1289 + }, + { + "epoch": 0.8256, + "grad_norm": 3.2966363430023193, + "learning_rate": 5e-06, + "loss": 1.2722, + "mean_token_accuracy": 0.665034607052803, + "num_tokens": 16787285.0, + "step": 1290 + }, + { + "epoch": 0.82624, + "grad_norm": 3.6354568004608154, + "learning_rate": 5e-06, + "loss": 1.2903, + "mean_token_accuracy": 0.6690637767314911, + "num_tokens": 16799868.0, + "step": 1291 + }, + { + "epoch": 0.82688, + "grad_norm": 3.9511008262634277, + "learning_rate": 5e-06, + "loss": 1.3668, + "mean_token_accuracy": 0.6623431816697121, + "num_tokens": 16811306.0, + "step": 1292 + }, + { + "epoch": 0.82752, + "grad_norm": 3.4990999698638916, + "learning_rate": 5e-06, + "loss": 1.2118, + "mean_token_accuracy": 0.6844175234436989, + "num_tokens": 16824295.0, + "step": 1293 + }, + { + "epoch": 0.82816, + "grad_norm": 3.638296604156494, + "learning_rate": 5e-06, + "loss": 1.1873, + "mean_token_accuracy": 0.6708608791232109, + "num_tokens": 16836129.0, + "step": 1294 + }, + { + "epoch": 0.8288, + "grad_norm": 3.5374062061309814, + "learning_rate": 5e-06, + "loss": 1.3716, + "mean_token_accuracy": 0.662922739982605, + "num_tokens": 16849257.0, + "step": 1295 + }, + { + "epoch": 0.82944, + "grad_norm": 4.183645725250244, + "learning_rate": 5e-06, + "loss": 1.2535, + "mean_token_accuracy": 0.6610106378793716, + "num_tokens": 16860223.0, + "step": 1296 + }, + { + "epoch": 0.83008, + "grad_norm": 3.551673412322998, + "learning_rate": 5e-06, + "loss": 1.2743, + "mean_token_accuracy": 0.6478192396461964, + "num_tokens": 16871987.0, + "step": 1297 + }, + { + "epoch": 0.83072, + "grad_norm": 3.2299296855926514, + "learning_rate": 5e-06, + "loss": 1.3783, + "mean_token_accuracy": 0.6627595871686935, + "num_tokens": 16886179.0, + "step": 1298 + }, + { + "epoch": 0.83136, + "grad_norm": 3.688389301300049, + "learning_rate": 5e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.7124327570199966, + "num_tokens": 16898088.0, + "step": 1299 + }, + { + "epoch": 0.832, + "grad_norm": 3.371751070022583, + "learning_rate": 5e-06, + "loss": 1.4761, + "mean_token_accuracy": 0.624034658074379, + "num_tokens": 16912488.0, + "step": 1300 + }, + { + "epoch": 0.83264, + "grad_norm": 3.6259591579437256, + "learning_rate": 5e-06, + "loss": 1.1865, + "mean_token_accuracy": 0.6735802069306374, + "num_tokens": 16926127.0, + "step": 1301 + }, + { + "epoch": 0.83328, + "grad_norm": 3.571916103363037, + "learning_rate": 5e-06, + "loss": 1.5566, + "mean_token_accuracy": 0.62827018648386, + "num_tokens": 16939816.0, + "step": 1302 + }, + { + "epoch": 0.83392, + "grad_norm": 3.3074350357055664, + "learning_rate": 5e-06, + "loss": 1.4043, + "mean_token_accuracy": 0.639069065451622, + "num_tokens": 16953696.0, + "step": 1303 + }, + { + "epoch": 0.83456, + "grad_norm": 3.573622941970825, + "learning_rate": 5e-06, + "loss": 1.3567, + "mean_token_accuracy": 0.6488766446709633, + "num_tokens": 16965974.0, + "step": 1304 + }, + { + "epoch": 0.8352, + "grad_norm": 3.201739549636841, + "learning_rate": 5e-06, + "loss": 1.2488, + "mean_token_accuracy": 0.6712930873036385, + "num_tokens": 16980031.0, + "step": 1305 + }, + { + "epoch": 0.83584, + "grad_norm": 3.284263849258423, + "learning_rate": 5e-06, + "loss": 1.3163, + "mean_token_accuracy": 0.6636942848563194, + "num_tokens": 16993264.0, + "step": 1306 + }, + { + "epoch": 0.83648, + "grad_norm": 3.39267897605896, + "learning_rate": 5e-06, + "loss": 1.2675, + "mean_token_accuracy": 0.6717317998409271, + "num_tokens": 17005939.0, + "step": 1307 + }, + { + "epoch": 0.83712, + "grad_norm": 3.601962089538574, + "learning_rate": 5e-06, + "loss": 1.2444, + "mean_token_accuracy": 0.7010955587029457, + "num_tokens": 17019858.0, + "step": 1308 + }, + { + "epoch": 0.83776, + "grad_norm": 4.25007438659668, + "learning_rate": 5e-06, + "loss": 1.2578, + "mean_token_accuracy": 0.6884395852684975, + "num_tokens": 17031840.0, + "step": 1309 + }, + { + "epoch": 0.8384, + "grad_norm": 3.216642379760742, + "learning_rate": 5e-06, + "loss": 1.07, + "mean_token_accuracy": 0.6811397597193718, + "num_tokens": 17043624.0, + "step": 1310 + }, + { + "epoch": 0.83904, + "grad_norm": 4.06812858581543, + "learning_rate": 5e-06, + "loss": 1.2633, + "mean_token_accuracy": 0.6581188440322876, + "num_tokens": 17055221.0, + "step": 1311 + }, + { + "epoch": 0.83968, + "grad_norm": 4.409648418426514, + "learning_rate": 5e-06, + "loss": 1.4064, + "mean_token_accuracy": 0.6504970565438271, + "num_tokens": 17065224.0, + "step": 1312 + }, + { + "epoch": 0.84032, + "grad_norm": 3.070948839187622, + "learning_rate": 5e-06, + "loss": 1.3761, + "mean_token_accuracy": 0.6479237154126167, + "num_tokens": 17078405.0, + "step": 1313 + }, + { + "epoch": 0.84096, + "grad_norm": 3.568082094192505, + "learning_rate": 5e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6967073529958725, + "num_tokens": 17091614.0, + "step": 1314 + }, + { + "epoch": 0.8416, + "grad_norm": 3.664025068283081, + "learning_rate": 5e-06, + "loss": 1.4398, + "mean_token_accuracy": 0.6788545474410057, + "num_tokens": 17104109.0, + "step": 1315 + }, + { + "epoch": 0.84224, + "grad_norm": 3.4449939727783203, + "learning_rate": 5e-06, + "loss": 1.203, + "mean_token_accuracy": 0.6785411536693573, + "num_tokens": 17116517.0, + "step": 1316 + }, + { + "epoch": 0.84288, + "grad_norm": 3.2764899730682373, + "learning_rate": 5e-06, + "loss": 1.2928, + "mean_token_accuracy": 0.6439727321267128, + "num_tokens": 17130912.0, + "step": 1317 + }, + { + "epoch": 0.84352, + "grad_norm": 3.6440088748931885, + "learning_rate": 5e-06, + "loss": 1.076, + "mean_token_accuracy": 0.7138596475124359, + "num_tokens": 17143603.0, + "step": 1318 + }, + { + "epoch": 0.84416, + "grad_norm": 3.7815802097320557, + "learning_rate": 5e-06, + "loss": 1.4247, + "mean_token_accuracy": 0.6309964135289192, + "num_tokens": 17156597.0, + "step": 1319 + }, + { + "epoch": 0.8448, + "grad_norm": 3.145379066467285, + "learning_rate": 5e-06, + "loss": 1.0981, + "mean_token_accuracy": 0.7020114660263062, + "num_tokens": 17170210.0, + "step": 1320 + }, + { + "epoch": 0.84544, + "grad_norm": 4.029253005981445, + "learning_rate": 5e-06, + "loss": 1.4513, + "mean_token_accuracy": 0.6537614092230797, + "num_tokens": 17182328.0, + "step": 1321 + }, + { + "epoch": 0.84608, + "grad_norm": 3.2656235694885254, + "learning_rate": 5e-06, + "loss": 1.5357, + "mean_token_accuracy": 0.641093410551548, + "num_tokens": 17197005.0, + "step": 1322 + }, + { + "epoch": 0.84672, + "grad_norm": 3.559967041015625, + "learning_rate": 5e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.7045318782329559, + "num_tokens": 17208973.0, + "step": 1323 + }, + { + "epoch": 0.84736, + "grad_norm": 3.366745710372925, + "learning_rate": 5e-06, + "loss": 1.3679, + "mean_token_accuracy": 0.6683920547366142, + "num_tokens": 17221909.0, + "step": 1324 + }, + { + "epoch": 0.848, + "grad_norm": 3.4706954956054688, + "learning_rate": 5e-06, + "loss": 1.317, + "mean_token_accuracy": 0.6562648341059685, + "num_tokens": 17234739.0, + "step": 1325 + }, + { + "epoch": 0.84864, + "grad_norm": 3.4657156467437744, + "learning_rate": 5e-06, + "loss": 1.4667, + "mean_token_accuracy": 0.6328883245587349, + "num_tokens": 17249245.0, + "step": 1326 + }, + { + "epoch": 0.84928, + "grad_norm": 3.4521939754486084, + "learning_rate": 5e-06, + "loss": 1.4047, + "mean_token_accuracy": 0.6429140567779541, + "num_tokens": 17263466.0, + "step": 1327 + }, + { + "epoch": 0.84992, + "grad_norm": 3.3580243587493896, + "learning_rate": 5e-06, + "loss": 1.4, + "mean_token_accuracy": 0.6485451236367226, + "num_tokens": 17277966.0, + "step": 1328 + }, + { + "epoch": 0.85056, + "grad_norm": 3.6181726455688477, + "learning_rate": 5e-06, + "loss": 1.4906, + "mean_token_accuracy": 0.6203976050019264, + "num_tokens": 17290080.0, + "step": 1329 + }, + { + "epoch": 0.8512, + "grad_norm": 3.0654401779174805, + "learning_rate": 5e-06, + "loss": 1.3708, + "mean_token_accuracy": 0.6423755809664726, + "num_tokens": 17307462.0, + "step": 1330 + }, + { + "epoch": 0.85184, + "grad_norm": 3.682450294494629, + "learning_rate": 5e-06, + "loss": 1.4412, + "mean_token_accuracy": 0.6563334167003632, + "num_tokens": 17320760.0, + "step": 1331 + }, + { + "epoch": 0.85248, + "grad_norm": 4.22981071472168, + "learning_rate": 5e-06, + "loss": 1.1568, + "mean_token_accuracy": 0.687714472413063, + "num_tokens": 17330799.0, + "step": 1332 + }, + { + "epoch": 0.85312, + "grad_norm": 3.9495580196380615, + "learning_rate": 5e-06, + "loss": 1.385, + "mean_token_accuracy": 0.643461637198925, + "num_tokens": 17340826.0, + "step": 1333 + }, + { + "epoch": 0.85376, + "grad_norm": 3.5318918228149414, + "learning_rate": 5e-06, + "loss": 1.2977, + "mean_token_accuracy": 0.6616112142801285, + "num_tokens": 17353805.0, + "step": 1334 + }, + { + "epoch": 0.8544, + "grad_norm": 3.967776298522949, + "learning_rate": 5e-06, + "loss": 1.2952, + "mean_token_accuracy": 0.6506235525012016, + "num_tokens": 17366394.0, + "step": 1335 + }, + { + "epoch": 0.85504, + "grad_norm": 3.663810968399048, + "learning_rate": 5e-06, + "loss": 1.148, + "mean_token_accuracy": 0.6877422258257866, + "num_tokens": 17377970.0, + "step": 1336 + }, + { + "epoch": 0.85568, + "grad_norm": 3.229074478149414, + "learning_rate": 5e-06, + "loss": 1.2273, + "mean_token_accuracy": 0.6863315925002098, + "num_tokens": 17392020.0, + "step": 1337 + }, + { + "epoch": 0.85632, + "grad_norm": 3.3477957248687744, + "learning_rate": 5e-06, + "loss": 1.054, + "mean_token_accuracy": 0.7150156199932098, + "num_tokens": 17404119.0, + "step": 1338 + }, + { + "epoch": 0.85696, + "grad_norm": 3.4252710342407227, + "learning_rate": 5e-06, + "loss": 1.4177, + "mean_token_accuracy": 0.6546554416418076, + "num_tokens": 17418373.0, + "step": 1339 + }, + { + "epoch": 0.8576, + "grad_norm": 3.3960907459259033, + "learning_rate": 5e-06, + "loss": 1.2424, + "mean_token_accuracy": 0.6713818609714508, + "num_tokens": 17430650.0, + "step": 1340 + }, + { + "epoch": 0.85824, + "grad_norm": 3.5569021701812744, + "learning_rate": 5e-06, + "loss": 1.4048, + "mean_token_accuracy": 0.6662162095308304, + "num_tokens": 17443979.0, + "step": 1341 + }, + { + "epoch": 0.85888, + "grad_norm": 3.508941650390625, + "learning_rate": 5e-06, + "loss": 1.552, + "mean_token_accuracy": 0.6020488813519478, + "num_tokens": 17458181.0, + "step": 1342 + }, + { + "epoch": 0.85952, + "grad_norm": 3.9543237686157227, + "learning_rate": 5e-06, + "loss": 1.5179, + "mean_token_accuracy": 0.6510177925229073, + "num_tokens": 17469923.0, + "step": 1343 + }, + { + "epoch": 0.86016, + "grad_norm": 4.113687515258789, + "learning_rate": 5e-06, + "loss": 1.3311, + "mean_token_accuracy": 0.652983695268631, + "num_tokens": 17483696.0, + "step": 1344 + }, + { + "epoch": 0.8608, + "grad_norm": 3.756329298019409, + "learning_rate": 5e-06, + "loss": 1.2371, + "mean_token_accuracy": 0.6460439562797546, + "num_tokens": 17496524.0, + "step": 1345 + }, + { + "epoch": 0.86144, + "grad_norm": 3.375931978225708, + "learning_rate": 5e-06, + "loss": 1.3934, + "mean_token_accuracy": 0.6481117159128189, + "num_tokens": 17510332.0, + "step": 1346 + }, + { + "epoch": 0.86208, + "grad_norm": 4.059141635894775, + "learning_rate": 5e-06, + "loss": 1.0956, + "mean_token_accuracy": 0.6906588524580002, + "num_tokens": 17520886.0, + "step": 1347 + }, + { + "epoch": 0.86272, + "grad_norm": 2.9917287826538086, + "learning_rate": 5e-06, + "loss": 1.2105, + "mean_token_accuracy": 0.6634574681520462, + "num_tokens": 17536190.0, + "step": 1348 + }, + { + "epoch": 0.86336, + "grad_norm": 3.9010698795318604, + "learning_rate": 5e-06, + "loss": 1.2638, + "mean_token_accuracy": 0.6710032075643539, + "num_tokens": 17548427.0, + "step": 1349 + }, + { + "epoch": 0.864, + "grad_norm": 3.535780668258667, + "learning_rate": 5e-06, + "loss": 1.2962, + "mean_token_accuracy": 0.6431876122951508, + "num_tokens": 17561319.0, + "step": 1350 + }, + { + "epoch": 0.86464, + "grad_norm": 3.2573955059051514, + "learning_rate": 5e-06, + "loss": 1.233, + "mean_token_accuracy": 0.6907457932829857, + "num_tokens": 17573944.0, + "step": 1351 + }, + { + "epoch": 0.86528, + "grad_norm": 3.478487491607666, + "learning_rate": 5e-06, + "loss": 1.1014, + "mean_token_accuracy": 0.6931867897510529, + "num_tokens": 17587912.0, + "step": 1352 + }, + { + "epoch": 0.86592, + "grad_norm": 3.618330955505371, + "learning_rate": 5e-06, + "loss": 1.4492, + "mean_token_accuracy": 0.6480180844664574, + "num_tokens": 17600364.0, + "step": 1353 + }, + { + "epoch": 0.86656, + "grad_norm": 3.834172248840332, + "learning_rate": 5e-06, + "loss": 1.4564, + "mean_token_accuracy": 0.6302541047334671, + "num_tokens": 17614033.0, + "step": 1354 + }, + { + "epoch": 0.8672, + "grad_norm": 3.973057746887207, + "learning_rate": 5e-06, + "loss": 1.4296, + "mean_token_accuracy": 0.6398394256830215, + "num_tokens": 17626618.0, + "step": 1355 + }, + { + "epoch": 0.86784, + "grad_norm": 3.6730847358703613, + "learning_rate": 5e-06, + "loss": 1.3343, + "mean_token_accuracy": 0.6507444530725479, + "num_tokens": 17638206.0, + "step": 1356 + }, + { + "epoch": 0.86848, + "grad_norm": 3.6375482082366943, + "learning_rate": 5e-06, + "loss": 1.3747, + "mean_token_accuracy": 0.6530400216579437, + "num_tokens": 17650041.0, + "step": 1357 + }, + { + "epoch": 0.86912, + "grad_norm": 3.4408140182495117, + "learning_rate": 5e-06, + "loss": 1.1361, + "mean_token_accuracy": 0.6785493567585945, + "num_tokens": 17661144.0, + "step": 1358 + }, + { + "epoch": 0.86976, + "grad_norm": 3.449578046798706, + "learning_rate": 5e-06, + "loss": 1.3709, + "mean_token_accuracy": 0.6438928842544556, + "num_tokens": 17674539.0, + "step": 1359 + }, + { + "epoch": 0.8704, + "grad_norm": 5.356245994567871, + "learning_rate": 5e-06, + "loss": 1.4153, + "mean_token_accuracy": 0.6556727215647697, + "num_tokens": 17685582.0, + "step": 1360 + }, + { + "epoch": 0.87104, + "grad_norm": 3.2209205627441406, + "learning_rate": 5e-06, + "loss": 1.3495, + "mean_token_accuracy": 0.6475684642791748, + "num_tokens": 17700534.0, + "step": 1361 + }, + { + "epoch": 0.87168, + "grad_norm": 4.095639705657959, + "learning_rate": 5e-06, + "loss": 1.3171, + "mean_token_accuracy": 0.6621948033571243, + "num_tokens": 17712854.0, + "step": 1362 + }, + { + "epoch": 0.87232, + "grad_norm": 4.265082359313965, + "learning_rate": 5e-06, + "loss": 1.3075, + "mean_token_accuracy": 0.6697151511907578, + "num_tokens": 17723003.0, + "step": 1363 + }, + { + "epoch": 0.87296, + "grad_norm": 3.368932008743286, + "learning_rate": 5e-06, + "loss": 1.4089, + "mean_token_accuracy": 0.6478553786873817, + "num_tokens": 17736730.0, + "step": 1364 + }, + { + "epoch": 0.8736, + "grad_norm": 3.5103371143341064, + "learning_rate": 5e-06, + "loss": 1.1961, + "mean_token_accuracy": 0.6800166815519333, + "num_tokens": 17749195.0, + "step": 1365 + }, + { + "epoch": 0.87424, + "grad_norm": 3.6628217697143555, + "learning_rate": 5e-06, + "loss": 1.2206, + "mean_token_accuracy": 0.7043485268950462, + "num_tokens": 17761716.0, + "step": 1366 + }, + { + "epoch": 0.87488, + "grad_norm": 3.283897638320923, + "learning_rate": 5e-06, + "loss": 1.2755, + "mean_token_accuracy": 0.6614806577563286, + "num_tokens": 17776711.0, + "step": 1367 + }, + { + "epoch": 0.87552, + "grad_norm": 4.253682613372803, + "learning_rate": 5e-06, + "loss": 1.1304, + "mean_token_accuracy": 0.7110883370041847, + "num_tokens": 17787618.0, + "step": 1368 + }, + { + "epoch": 0.87616, + "grad_norm": 3.7107419967651367, + "learning_rate": 5e-06, + "loss": 1.353, + "mean_token_accuracy": 0.6722783967852592, + "num_tokens": 17798686.0, + "step": 1369 + }, + { + "epoch": 0.8768, + "grad_norm": 4.0010271072387695, + "learning_rate": 5e-06, + "loss": 1.2605, + "mean_token_accuracy": 0.660023458302021, + "num_tokens": 17812008.0, + "step": 1370 + }, + { + "epoch": 0.87744, + "grad_norm": 3.8963913917541504, + "learning_rate": 5e-06, + "loss": 1.0612, + "mean_token_accuracy": 0.6989761069417, + "num_tokens": 17822062.0, + "step": 1371 + }, + { + "epoch": 0.87808, + "grad_norm": 3.409618854522705, + "learning_rate": 5e-06, + "loss": 1.5595, + "mean_token_accuracy": 0.6186339408159256, + "num_tokens": 17836142.0, + "step": 1372 + }, + { + "epoch": 0.87872, + "grad_norm": 2.955591917037964, + "learning_rate": 5e-06, + "loss": 1.2973, + "mean_token_accuracy": 0.6578470319509506, + "num_tokens": 17850508.0, + "step": 1373 + }, + { + "epoch": 0.87936, + "grad_norm": 3.400749921798706, + "learning_rate": 5e-06, + "loss": 1.1947, + "mean_token_accuracy": 0.6701619401574135, + "num_tokens": 17864240.0, + "step": 1374 + }, + { + "epoch": 0.88, + "grad_norm": 3.2822978496551514, + "learning_rate": 5e-06, + "loss": 1.46, + "mean_token_accuracy": 0.6225104928016663, + "num_tokens": 17879022.0, + "step": 1375 + }, + { + "epoch": 0.88064, + "grad_norm": 3.9761667251586914, + "learning_rate": 5e-06, + "loss": 1.1623, + "mean_token_accuracy": 0.6682100668549538, + "num_tokens": 17890289.0, + "step": 1376 + }, + { + "epoch": 0.88128, + "grad_norm": 3.6653897762298584, + "learning_rate": 5e-06, + "loss": 1.3524, + "mean_token_accuracy": 0.6353943534195423, + "num_tokens": 17903080.0, + "step": 1377 + }, + { + "epoch": 0.88192, + "grad_norm": 4.603322505950928, + "learning_rate": 5e-06, + "loss": 1.6278, + "mean_token_accuracy": 0.6351469904184341, + "num_tokens": 17912567.0, + "step": 1378 + }, + { + "epoch": 0.88256, + "grad_norm": 3.411752700805664, + "learning_rate": 5e-06, + "loss": 1.2195, + "mean_token_accuracy": 0.675203487277031, + "num_tokens": 17927030.0, + "step": 1379 + }, + { + "epoch": 0.8832, + "grad_norm": 4.03117036819458, + "learning_rate": 5e-06, + "loss": 1.1379, + "mean_token_accuracy": 0.6800655201077461, + "num_tokens": 17936846.0, + "step": 1380 + }, + { + "epoch": 0.88384, + "grad_norm": 3.4626095294952393, + "learning_rate": 5e-06, + "loss": 1.2256, + "mean_token_accuracy": 0.6742624565958977, + "num_tokens": 17949176.0, + "step": 1381 + }, + { + "epoch": 0.88448, + "grad_norm": 3.326813220977783, + "learning_rate": 5e-06, + "loss": 1.2921, + "mean_token_accuracy": 0.6827266663312912, + "num_tokens": 17962574.0, + "step": 1382 + }, + { + "epoch": 0.88512, + "grad_norm": 3.539931535720825, + "learning_rate": 5e-06, + "loss": 1.0815, + "mean_token_accuracy": 0.7009731009602547, + "num_tokens": 17975312.0, + "step": 1383 + }, + { + "epoch": 0.88576, + "grad_norm": 3.1076414585113525, + "learning_rate": 5e-06, + "loss": 1.2004, + "mean_token_accuracy": 0.6538084149360657, + "num_tokens": 17992044.0, + "step": 1384 + }, + { + "epoch": 0.8864, + "grad_norm": 3.54392147064209, + "learning_rate": 5e-06, + "loss": 1.3749, + "mean_token_accuracy": 0.6528435945510864, + "num_tokens": 18004880.0, + "step": 1385 + }, + { + "epoch": 0.88704, + "grad_norm": 3.1049365997314453, + "learning_rate": 5e-06, + "loss": 1.4558, + "mean_token_accuracy": 0.6254525259137154, + "num_tokens": 18018899.0, + "step": 1386 + }, + { + "epoch": 0.88768, + "grad_norm": 3.872276782989502, + "learning_rate": 5e-06, + "loss": 1.3721, + "mean_token_accuracy": 0.6540368646383286, + "num_tokens": 18031425.0, + "step": 1387 + }, + { + "epoch": 0.88832, + "grad_norm": 4.218468189239502, + "learning_rate": 5e-06, + "loss": 1.1603, + "mean_token_accuracy": 0.6857082098722458, + "num_tokens": 18042604.0, + "step": 1388 + }, + { + "epoch": 0.88896, + "grad_norm": 3.564180374145508, + "learning_rate": 5e-06, + "loss": 1.3641, + "mean_token_accuracy": 0.6548919975757599, + "num_tokens": 18055105.0, + "step": 1389 + }, + { + "epoch": 0.8896, + "grad_norm": 3.4216361045837402, + "learning_rate": 5e-06, + "loss": 1.3133, + "mean_token_accuracy": 0.6648430451750755, + "num_tokens": 18068895.0, + "step": 1390 + }, + { + "epoch": 0.89024, + "grad_norm": 3.466216564178467, + "learning_rate": 5e-06, + "loss": 1.426, + "mean_token_accuracy": 0.675171747803688, + "num_tokens": 18082806.0, + "step": 1391 + }, + { + "epoch": 0.89088, + "grad_norm": 4.009366512298584, + "learning_rate": 5e-06, + "loss": 1.151, + "mean_token_accuracy": 0.6659594774246216, + "num_tokens": 18093287.0, + "step": 1392 + }, + { + "epoch": 0.89152, + "grad_norm": 3.287865161895752, + "learning_rate": 5e-06, + "loss": 1.1818, + "mean_token_accuracy": 0.697388269007206, + "num_tokens": 18107755.0, + "step": 1393 + }, + { + "epoch": 0.89216, + "grad_norm": 3.865363597869873, + "learning_rate": 5e-06, + "loss": 1.2494, + "mean_token_accuracy": 0.6640971228480339, + "num_tokens": 18118665.0, + "step": 1394 + }, + { + "epoch": 0.8928, + "grad_norm": 3.694581985473633, + "learning_rate": 5e-06, + "loss": 1.2886, + "mean_token_accuracy": 0.6522090062499046, + "num_tokens": 18130135.0, + "step": 1395 + }, + { + "epoch": 0.89344, + "grad_norm": 3.5079498291015625, + "learning_rate": 5e-06, + "loss": 1.2224, + "mean_token_accuracy": 0.6756654903292656, + "num_tokens": 18143669.0, + "step": 1396 + }, + { + "epoch": 0.89408, + "grad_norm": 3.9231410026550293, + "learning_rate": 5e-06, + "loss": 1.2382, + "mean_token_accuracy": 0.6565620601177216, + "num_tokens": 18155787.0, + "step": 1397 + }, + { + "epoch": 0.89472, + "grad_norm": 3.2922706604003906, + "learning_rate": 5e-06, + "loss": 1.3624, + "mean_token_accuracy": 0.657343290746212, + "num_tokens": 18169330.0, + "step": 1398 + }, + { + "epoch": 0.89536, + "grad_norm": 4.219677448272705, + "learning_rate": 5e-06, + "loss": 1.3696, + "mean_token_accuracy": 0.6795709133148193, + "num_tokens": 18181111.0, + "step": 1399 + }, + { + "epoch": 0.896, + "grad_norm": 3.3847157955169678, + "learning_rate": 5e-06, + "loss": 1.2803, + "mean_token_accuracy": 0.6801963672041893, + "num_tokens": 18194826.0, + "step": 1400 + }, + { + "epoch": 0.89664, + "grad_norm": 3.3101882934570312, + "learning_rate": 5e-06, + "loss": 1.2146, + "mean_token_accuracy": 0.6810724586248398, + "num_tokens": 18207891.0, + "step": 1401 + }, + { + "epoch": 0.89728, + "grad_norm": 4.586159706115723, + "learning_rate": 5e-06, + "loss": 1.234, + "mean_token_accuracy": 0.6616763696074486, + "num_tokens": 18219068.0, + "step": 1402 + }, + { + "epoch": 0.89792, + "grad_norm": 2.9213805198669434, + "learning_rate": 5e-06, + "loss": 1.5959, + "mean_token_accuracy": 0.6127992421388626, + "num_tokens": 18234945.0, + "step": 1403 + }, + { + "epoch": 0.89856, + "grad_norm": 3.180678606033325, + "learning_rate": 5e-06, + "loss": 1.2227, + "mean_token_accuracy": 0.680058054625988, + "num_tokens": 18249768.0, + "step": 1404 + }, + { + "epoch": 0.8992, + "grad_norm": 3.4679532051086426, + "learning_rate": 5e-06, + "loss": 1.2374, + "mean_token_accuracy": 0.67696313560009, + "num_tokens": 18265924.0, + "step": 1405 + }, + { + "epoch": 0.89984, + "grad_norm": 3.4234979152679443, + "learning_rate": 5e-06, + "loss": 1.5505, + "mean_token_accuracy": 0.6274235621094704, + "num_tokens": 18280819.0, + "step": 1406 + }, + { + "epoch": 0.90048, + "grad_norm": 4.96069860458374, + "learning_rate": 5e-06, + "loss": 1.4236, + "mean_token_accuracy": 0.607517022639513, + "num_tokens": 18291686.0, + "step": 1407 + }, + { + "epoch": 0.90112, + "grad_norm": 3.1977005004882812, + "learning_rate": 5e-06, + "loss": 1.3486, + "mean_token_accuracy": 0.6483859121799469, + "num_tokens": 18304993.0, + "step": 1408 + }, + { + "epoch": 0.90176, + "grad_norm": 3.5749099254608154, + "learning_rate": 5e-06, + "loss": 1.2922, + "mean_token_accuracy": 0.6452238261699677, + "num_tokens": 18319373.0, + "step": 1409 + }, + { + "epoch": 0.9024, + "grad_norm": 3.388899803161621, + "learning_rate": 5e-06, + "loss": 1.3281, + "mean_token_accuracy": 0.6475742906332016, + "num_tokens": 18331998.0, + "step": 1410 + }, + { + "epoch": 0.90304, + "grad_norm": 3.4031882286071777, + "learning_rate": 5e-06, + "loss": 1.3355, + "mean_token_accuracy": 0.6812806725502014, + "num_tokens": 18344632.0, + "step": 1411 + }, + { + "epoch": 0.90368, + "grad_norm": 3.8880221843719482, + "learning_rate": 5e-06, + "loss": 1.3898, + "mean_token_accuracy": 0.6473888382315636, + "num_tokens": 18356350.0, + "step": 1412 + }, + { + "epoch": 0.90432, + "grad_norm": 3.5985724925994873, + "learning_rate": 5e-06, + "loss": 1.2345, + "mean_token_accuracy": 0.6557316966354847, + "num_tokens": 18368400.0, + "step": 1413 + }, + { + "epoch": 0.90496, + "grad_norm": 3.6234962940216064, + "learning_rate": 5e-06, + "loss": 1.1942, + "mean_token_accuracy": 0.6906508356332779, + "num_tokens": 18379118.0, + "step": 1414 + }, + { + "epoch": 0.9056, + "grad_norm": 3.8934993743896484, + "learning_rate": 5e-06, + "loss": 1.2382, + "mean_token_accuracy": 0.6724176928400993, + "num_tokens": 18391595.0, + "step": 1415 + }, + { + "epoch": 0.90624, + "grad_norm": 3.603591203689575, + "learning_rate": 5e-06, + "loss": 1.3737, + "mean_token_accuracy": 0.6498560681939125, + "num_tokens": 18403595.0, + "step": 1416 + }, + { + "epoch": 0.90688, + "grad_norm": 3.2106738090515137, + "learning_rate": 5e-06, + "loss": 1.2911, + "mean_token_accuracy": 0.6614857837557793, + "num_tokens": 18418034.0, + "step": 1417 + }, + { + "epoch": 0.90752, + "grad_norm": 3.0255284309387207, + "learning_rate": 5e-06, + "loss": 1.2975, + "mean_token_accuracy": 0.653803177177906, + "num_tokens": 18434798.0, + "step": 1418 + }, + { + "epoch": 0.90816, + "grad_norm": 3.696108818054199, + "learning_rate": 5e-06, + "loss": 1.3184, + "mean_token_accuracy": 0.6341993510723114, + "num_tokens": 18446612.0, + "step": 1419 + }, + { + "epoch": 0.9088, + "grad_norm": 4.0753254890441895, + "learning_rate": 5e-06, + "loss": 1.2244, + "mean_token_accuracy": 0.6535854563117027, + "num_tokens": 18458141.0, + "step": 1420 + }, + { + "epoch": 0.90944, + "grad_norm": 3.655604124069214, + "learning_rate": 5e-06, + "loss": 1.3088, + "mean_token_accuracy": 0.6778343543410301, + "num_tokens": 18471653.0, + "step": 1421 + }, + { + "epoch": 0.91008, + "grad_norm": 3.4860193729400635, + "learning_rate": 5e-06, + "loss": 1.2065, + "mean_token_accuracy": 0.6871431916952133, + "num_tokens": 18482903.0, + "step": 1422 + }, + { + "epoch": 0.91072, + "grad_norm": 3.5701212882995605, + "learning_rate": 5e-06, + "loss": 1.161, + "mean_token_accuracy": 0.6876110881567001, + "num_tokens": 18495519.0, + "step": 1423 + }, + { + "epoch": 0.91136, + "grad_norm": 4.311164855957031, + "learning_rate": 5e-06, + "loss": 1.2691, + "mean_token_accuracy": 0.6963246017694473, + "num_tokens": 18506391.0, + "step": 1424 + }, + { + "epoch": 0.912, + "grad_norm": 3.228339672088623, + "learning_rate": 5e-06, + "loss": 1.2486, + "mean_token_accuracy": 0.6647578254342079, + "num_tokens": 18521751.0, + "step": 1425 + }, + { + "epoch": 0.91264, + "grad_norm": 3.649463176727295, + "learning_rate": 5e-06, + "loss": 1.2265, + "mean_token_accuracy": 0.6655023992061615, + "num_tokens": 18533605.0, + "step": 1426 + }, + { + "epoch": 0.91328, + "grad_norm": 3.822047710418701, + "learning_rate": 5e-06, + "loss": 1.2303, + "mean_token_accuracy": 0.6853557825088501, + "num_tokens": 18545920.0, + "step": 1427 + }, + { + "epoch": 0.91392, + "grad_norm": 3.622427463531494, + "learning_rate": 5e-06, + "loss": 1.3153, + "mean_token_accuracy": 0.6682358086109161, + "num_tokens": 18558370.0, + "step": 1428 + }, + { + "epoch": 0.91456, + "grad_norm": 3.013226270675659, + "learning_rate": 5e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.7230858653783798, + "num_tokens": 18572388.0, + "step": 1429 + }, + { + "epoch": 0.9152, + "grad_norm": 2.999063730239868, + "learning_rate": 5e-06, + "loss": 1.2757, + "mean_token_accuracy": 0.658422015607357, + "num_tokens": 18587001.0, + "step": 1430 + }, + { + "epoch": 0.91584, + "grad_norm": 3.246445417404175, + "learning_rate": 5e-06, + "loss": 1.1428, + "mean_token_accuracy": 0.7174563780426979, + "num_tokens": 18600196.0, + "step": 1431 + }, + { + "epoch": 0.91648, + "grad_norm": 3.52728533744812, + "learning_rate": 5e-06, + "loss": 1.2719, + "mean_token_accuracy": 0.6571086049079895, + "num_tokens": 18612602.0, + "step": 1432 + }, + { + "epoch": 0.91712, + "grad_norm": 3.3236947059631348, + "learning_rate": 5e-06, + "loss": 1.3722, + "mean_token_accuracy": 0.6516182944178581, + "num_tokens": 18628569.0, + "step": 1433 + }, + { + "epoch": 0.91776, + "grad_norm": 3.9207522869110107, + "learning_rate": 5e-06, + "loss": 1.289, + "mean_token_accuracy": 0.6646075919270515, + "num_tokens": 18639375.0, + "step": 1434 + }, + { + "epoch": 0.9184, + "grad_norm": 3.3679165840148926, + "learning_rate": 5e-06, + "loss": 1.3844, + "mean_token_accuracy": 0.6545412912964821, + "num_tokens": 18652531.0, + "step": 1435 + }, + { + "epoch": 0.91904, + "grad_norm": 3.58003830909729, + "learning_rate": 5e-06, + "loss": 1.3116, + "mean_token_accuracy": 0.655610017478466, + "num_tokens": 18665160.0, + "step": 1436 + }, + { + "epoch": 0.91968, + "grad_norm": 3.827817916870117, + "learning_rate": 5e-06, + "loss": 1.1945, + "mean_token_accuracy": 0.6569493412971497, + "num_tokens": 18676671.0, + "step": 1437 + }, + { + "epoch": 0.92032, + "grad_norm": 3.6998956203460693, + "learning_rate": 5e-06, + "loss": 1.5481, + "mean_token_accuracy": 0.6249835789203644, + "num_tokens": 18690078.0, + "step": 1438 + }, + { + "epoch": 0.92096, + "grad_norm": 3.2389333248138428, + "learning_rate": 5e-06, + "loss": 1.2938, + "mean_token_accuracy": 0.65943942964077, + "num_tokens": 18703678.0, + "step": 1439 + }, + { + "epoch": 0.9216, + "grad_norm": 2.924175262451172, + "learning_rate": 5e-06, + "loss": 1.2873, + "mean_token_accuracy": 0.6494470685720444, + "num_tokens": 18719576.0, + "step": 1440 + }, + { + "epoch": 0.92224, + "grad_norm": 3.7290942668914795, + "learning_rate": 5e-06, + "loss": 1.2667, + "mean_token_accuracy": 0.6728792116045952, + "num_tokens": 18732712.0, + "step": 1441 + }, + { + "epoch": 0.92288, + "grad_norm": 3.406003952026367, + "learning_rate": 5e-06, + "loss": 1.1128, + "mean_token_accuracy": 0.7027332484722137, + "num_tokens": 18745929.0, + "step": 1442 + }, + { + "epoch": 0.92352, + "grad_norm": 3.9130918979644775, + "learning_rate": 5e-06, + "loss": 1.1714, + "mean_token_accuracy": 0.6731210052967072, + "num_tokens": 18755977.0, + "step": 1443 + }, + { + "epoch": 0.92416, + "grad_norm": 3.678868055343628, + "learning_rate": 5e-06, + "loss": 1.3613, + "mean_token_accuracy": 0.6376957893371582, + "num_tokens": 18767848.0, + "step": 1444 + }, + { + "epoch": 0.9248, + "grad_norm": 3.355009078979492, + "learning_rate": 5e-06, + "loss": 1.4501, + "mean_token_accuracy": 0.6530297324061394, + "num_tokens": 18781692.0, + "step": 1445 + }, + { + "epoch": 0.92544, + "grad_norm": 3.197375774383545, + "learning_rate": 5e-06, + "loss": 1.4667, + "mean_token_accuracy": 0.6258358731865883, + "num_tokens": 18796361.0, + "step": 1446 + }, + { + "epoch": 0.92608, + "grad_norm": 3.364900588989258, + "learning_rate": 5e-06, + "loss": 1.4204, + "mean_token_accuracy": 0.6358629465103149, + "num_tokens": 18810771.0, + "step": 1447 + }, + { + "epoch": 0.92672, + "grad_norm": 3.323707342147827, + "learning_rate": 5e-06, + "loss": 1.1537, + "mean_token_accuracy": 0.700812466442585, + "num_tokens": 18824895.0, + "step": 1448 + }, + { + "epoch": 0.92736, + "grad_norm": 3.5423851013183594, + "learning_rate": 5e-06, + "loss": 1.1198, + "mean_token_accuracy": 0.6927414685487747, + "num_tokens": 18838244.0, + "step": 1449 + }, + { + "epoch": 0.928, + "grad_norm": 3.5557827949523926, + "learning_rate": 5e-06, + "loss": 1.3942, + "mean_token_accuracy": 0.6344395503401756, + "num_tokens": 18850747.0, + "step": 1450 + }, + { + "epoch": 0.92864, + "grad_norm": 3.8772428035736084, + "learning_rate": 5e-06, + "loss": 1.1849, + "mean_token_accuracy": 0.6797264739871025, + "num_tokens": 18863209.0, + "step": 1451 + }, + { + "epoch": 0.92928, + "grad_norm": 3.387641668319702, + "learning_rate": 5e-06, + "loss": 1.4152, + "mean_token_accuracy": 0.6333313882350922, + "num_tokens": 18876056.0, + "step": 1452 + }, + { + "epoch": 0.92992, + "grad_norm": 3.554407835006714, + "learning_rate": 5e-06, + "loss": 1.1832, + "mean_token_accuracy": 0.6640536859631538, + "num_tokens": 18890920.0, + "step": 1453 + }, + { + "epoch": 0.93056, + "grad_norm": 3.302236795425415, + "learning_rate": 5e-06, + "loss": 1.5489, + "mean_token_accuracy": 0.6134847179055214, + "num_tokens": 18905793.0, + "step": 1454 + }, + { + "epoch": 0.9312, + "grad_norm": 3.531574010848999, + "learning_rate": 5e-06, + "loss": 1.2801, + "mean_token_accuracy": 0.6507202833890915, + "num_tokens": 18920224.0, + "step": 1455 + }, + { + "epoch": 0.93184, + "grad_norm": 3.5933139324188232, + "learning_rate": 5e-06, + "loss": 1.3922, + "mean_token_accuracy": 0.6551200449466705, + "num_tokens": 18932613.0, + "step": 1456 + }, + { + "epoch": 0.93248, + "grad_norm": 3.254462480545044, + "learning_rate": 5e-06, + "loss": 1.3985, + "mean_token_accuracy": 0.6505570337176323, + "num_tokens": 18946774.0, + "step": 1457 + }, + { + "epoch": 0.93312, + "grad_norm": 3.2945821285247803, + "learning_rate": 5e-06, + "loss": 1.5279, + "mean_token_accuracy": 0.6084811314940453, + "num_tokens": 18961275.0, + "step": 1458 + }, + { + "epoch": 0.93376, + "grad_norm": 3.2776741981506348, + "learning_rate": 5e-06, + "loss": 1.3401, + "mean_token_accuracy": 0.640129804611206, + "num_tokens": 18975529.0, + "step": 1459 + }, + { + "epoch": 0.9344, + "grad_norm": 3.2493832111358643, + "learning_rate": 5e-06, + "loss": 1.1077, + "mean_token_accuracy": 0.6907928735017776, + "num_tokens": 18988267.0, + "step": 1460 + }, + { + "epoch": 0.93504, + "grad_norm": 3.765650987625122, + "learning_rate": 5e-06, + "loss": 1.3092, + "mean_token_accuracy": 0.6711199656128883, + "num_tokens": 19000229.0, + "step": 1461 + }, + { + "epoch": 0.93568, + "grad_norm": 3.1340558528900146, + "learning_rate": 5e-06, + "loss": 1.4336, + "mean_token_accuracy": 0.6485133692622185, + "num_tokens": 19014356.0, + "step": 1462 + }, + { + "epoch": 0.93632, + "grad_norm": 3.672553300857544, + "learning_rate": 5e-06, + "loss": 1.1751, + "mean_token_accuracy": 0.664104662835598, + "num_tokens": 19025717.0, + "step": 1463 + }, + { + "epoch": 0.93696, + "grad_norm": 3.753906726837158, + "learning_rate": 5e-06, + "loss": 1.1003, + "mean_token_accuracy": 0.6995716020464897, + "num_tokens": 19037864.0, + "step": 1464 + }, + { + "epoch": 0.9376, + "grad_norm": 3.1207399368286133, + "learning_rate": 5e-06, + "loss": 1.2334, + "mean_token_accuracy": 0.6692882552742958, + "num_tokens": 19052336.0, + "step": 1465 + }, + { + "epoch": 0.93824, + "grad_norm": 3.639620065689087, + "learning_rate": 5e-06, + "loss": 1.396, + "mean_token_accuracy": 0.6677844971418381, + "num_tokens": 19065183.0, + "step": 1466 + }, + { + "epoch": 0.93888, + "grad_norm": 3.5665981769561768, + "learning_rate": 5e-06, + "loss": 1.3489, + "mean_token_accuracy": 0.66384107619524, + "num_tokens": 19078765.0, + "step": 1467 + }, + { + "epoch": 0.93952, + "grad_norm": 3.5918264389038086, + "learning_rate": 5e-06, + "loss": 1.4087, + "mean_token_accuracy": 0.6427194476127625, + "num_tokens": 19091098.0, + "step": 1468 + }, + { + "epoch": 0.94016, + "grad_norm": 3.3692591190338135, + "learning_rate": 5e-06, + "loss": 1.3897, + "mean_token_accuracy": 0.6431680992245674, + "num_tokens": 19105664.0, + "step": 1469 + }, + { + "epoch": 0.9408, + "grad_norm": 3.6854288578033447, + "learning_rate": 5e-06, + "loss": 1.3319, + "mean_token_accuracy": 0.6552760303020477, + "num_tokens": 19118215.0, + "step": 1470 + }, + { + "epoch": 0.94144, + "grad_norm": 3.3998701572418213, + "learning_rate": 5e-06, + "loss": 1.1683, + "mean_token_accuracy": 0.675237774848938, + "num_tokens": 19130126.0, + "step": 1471 + }, + { + "epoch": 0.94208, + "grad_norm": 3.5668833255767822, + "learning_rate": 5e-06, + "loss": 1.4991, + "mean_token_accuracy": 0.6222522705793381, + "num_tokens": 19142375.0, + "step": 1472 + }, + { + "epoch": 0.94272, + "grad_norm": 3.275745153427124, + "learning_rate": 5e-06, + "loss": 1.3953, + "mean_token_accuracy": 0.6239468678832054, + "num_tokens": 19157943.0, + "step": 1473 + }, + { + "epoch": 0.94336, + "grad_norm": 4.061445236206055, + "learning_rate": 5e-06, + "loss": 1.3817, + "mean_token_accuracy": 0.6464495584368706, + "num_tokens": 19169261.0, + "step": 1474 + }, + { + "epoch": 0.944, + "grad_norm": 3.1921486854553223, + "learning_rate": 5e-06, + "loss": 1.284, + "mean_token_accuracy": 0.6610319390892982, + "num_tokens": 19184566.0, + "step": 1475 + }, + { + "epoch": 0.94464, + "grad_norm": 3.192448139190674, + "learning_rate": 5e-06, + "loss": 1.2544, + "mean_token_accuracy": 0.670927107334137, + "num_tokens": 19199161.0, + "step": 1476 + }, + { + "epoch": 0.94528, + "grad_norm": 3.534567356109619, + "learning_rate": 5e-06, + "loss": 1.2898, + "mean_token_accuracy": 0.6620035171508789, + "num_tokens": 19210216.0, + "step": 1477 + }, + { + "epoch": 0.94592, + "grad_norm": 3.4070894718170166, + "learning_rate": 5e-06, + "loss": 1.2067, + "mean_token_accuracy": 0.665832906961441, + "num_tokens": 19222748.0, + "step": 1478 + }, + { + "epoch": 0.94656, + "grad_norm": 3.373779058456421, + "learning_rate": 5e-06, + "loss": 1.3304, + "mean_token_accuracy": 0.6508694216609001, + "num_tokens": 19236471.0, + "step": 1479 + }, + { + "epoch": 0.9472, + "grad_norm": 3.518333911895752, + "learning_rate": 5e-06, + "loss": 1.4454, + "mean_token_accuracy": 0.645517073571682, + "num_tokens": 19249438.0, + "step": 1480 + }, + { + "epoch": 0.94784, + "grad_norm": 3.995748519897461, + "learning_rate": 5e-06, + "loss": 1.4204, + "mean_token_accuracy": 0.6810062602162361, + "num_tokens": 19262043.0, + "step": 1481 + }, + { + "epoch": 0.94848, + "grad_norm": 3.0706183910369873, + "learning_rate": 5e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7076255902647972, + "num_tokens": 19277307.0, + "step": 1482 + }, + { + "epoch": 0.94912, + "grad_norm": 3.0978240966796875, + "learning_rate": 5e-06, + "loss": 1.3144, + "mean_token_accuracy": 0.6533934101462364, + "num_tokens": 19292657.0, + "step": 1483 + }, + { + "epoch": 0.94976, + "grad_norm": 3.988011121749878, + "learning_rate": 5e-06, + "loss": 1.3691, + "mean_token_accuracy": 0.6342190653085709, + "num_tokens": 19303123.0, + "step": 1484 + }, + { + "epoch": 0.9504, + "grad_norm": 3.7990894317626953, + "learning_rate": 5e-06, + "loss": 1.107, + "mean_token_accuracy": 0.7004605457186699, + "num_tokens": 19314275.0, + "step": 1485 + }, + { + "epoch": 0.95104, + "grad_norm": 3.5531113147735596, + "learning_rate": 5e-06, + "loss": 1.3478, + "mean_token_accuracy": 0.6372592151165009, + "num_tokens": 19327717.0, + "step": 1486 + }, + { + "epoch": 0.95168, + "grad_norm": 3.129286050796509, + "learning_rate": 5e-06, + "loss": 1.5809, + "mean_token_accuracy": 0.6213468164205551, + "num_tokens": 19341237.0, + "step": 1487 + }, + { + "epoch": 0.95232, + "grad_norm": 3.394064426422119, + "learning_rate": 5e-06, + "loss": 1.3591, + "mean_token_accuracy": 0.6372789964079857, + "num_tokens": 19355577.0, + "step": 1488 + }, + { + "epoch": 0.95296, + "grad_norm": 3.2110018730163574, + "learning_rate": 5e-06, + "loss": 1.2399, + "mean_token_accuracy": 0.679095022380352, + "num_tokens": 19371326.0, + "step": 1489 + }, + { + "epoch": 0.9536, + "grad_norm": 3.3202333450317383, + "learning_rate": 5e-06, + "loss": 1.3916, + "mean_token_accuracy": 0.6611816883087158, + "num_tokens": 19385868.0, + "step": 1490 + }, + { + "epoch": 0.95424, + "grad_norm": 3.5390098094940186, + "learning_rate": 5e-06, + "loss": 1.2219, + "mean_token_accuracy": 0.6761639937758446, + "num_tokens": 19398025.0, + "step": 1491 + }, + { + "epoch": 0.95488, + "grad_norm": 3.390742778778076, + "learning_rate": 5e-06, + "loss": 1.5499, + "mean_token_accuracy": 0.6107296124100685, + "num_tokens": 19412343.0, + "step": 1492 + }, + { + "epoch": 0.95552, + "grad_norm": 2.821200132369995, + "learning_rate": 5e-06, + "loss": 1.2155, + "mean_token_accuracy": 0.6577084437012672, + "num_tokens": 19428748.0, + "step": 1493 + }, + { + "epoch": 0.95616, + "grad_norm": 3.292036771774292, + "learning_rate": 5e-06, + "loss": 1.3155, + "mean_token_accuracy": 0.644202746450901, + "num_tokens": 19440656.0, + "step": 1494 + }, + { + "epoch": 0.9568, + "grad_norm": 3.416463851928711, + "learning_rate": 5e-06, + "loss": 1.2269, + "mean_token_accuracy": 0.6907675266265869, + "num_tokens": 19452544.0, + "step": 1495 + }, + { + "epoch": 0.95744, + "grad_norm": 3.6329751014709473, + "learning_rate": 5e-06, + "loss": 1.3323, + "mean_token_accuracy": 0.6382646858692169, + "num_tokens": 19465315.0, + "step": 1496 + }, + { + "epoch": 0.95808, + "grad_norm": 3.5367205142974854, + "learning_rate": 5e-06, + "loss": 1.373, + "mean_token_accuracy": 0.6586090922355652, + "num_tokens": 19480115.0, + "step": 1497 + }, + { + "epoch": 0.95872, + "grad_norm": 3.5177509784698486, + "learning_rate": 5e-06, + "loss": 1.2388, + "mean_token_accuracy": 0.6645878851413727, + "num_tokens": 19494388.0, + "step": 1498 + }, + { + "epoch": 0.95936, + "grad_norm": 3.709169626235962, + "learning_rate": 5e-06, + "loss": 1.3733, + "mean_token_accuracy": 0.6607565060257912, + "num_tokens": 19505621.0, + "step": 1499 + }, + { + "epoch": 0.96, + "grad_norm": 3.3196604251861572, + "learning_rate": 5e-06, + "loss": 1.1325, + "mean_token_accuracy": 0.6826166063547134, + "num_tokens": 19519830.0, + "step": 1500 + }, + { + "epoch": 0.96064, + "grad_norm": 4.17763090133667, + "learning_rate": 5e-06, + "loss": 1.2355, + "mean_token_accuracy": 0.6763554587960243, + "num_tokens": 19532118.0, + "step": 1501 + }, + { + "epoch": 0.96128, + "grad_norm": 3.9797887802124023, + "learning_rate": 5e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7308862134814262, + "num_tokens": 19543422.0, + "step": 1502 + }, + { + "epoch": 0.96192, + "grad_norm": 3.3593435287475586, + "learning_rate": 5e-06, + "loss": 1.121, + "mean_token_accuracy": 0.6892295926809311, + "num_tokens": 19555897.0, + "step": 1503 + }, + { + "epoch": 0.96256, + "grad_norm": 3.6559438705444336, + "learning_rate": 5e-06, + "loss": 1.1375, + "mean_token_accuracy": 0.6769029051065445, + "num_tokens": 19567248.0, + "step": 1504 + }, + { + "epoch": 0.9632, + "grad_norm": 3.6883292198181152, + "learning_rate": 5e-06, + "loss": 1.3164, + "mean_token_accuracy": 0.643324077129364, + "num_tokens": 19579310.0, + "step": 1505 + }, + { + "epoch": 0.96384, + "grad_norm": 3.5200116634368896, + "learning_rate": 5e-06, + "loss": 1.2694, + "mean_token_accuracy": 0.6747664734721184, + "num_tokens": 19592537.0, + "step": 1506 + }, + { + "epoch": 0.96448, + "grad_norm": 3.3167619705200195, + "learning_rate": 5e-06, + "loss": 1.2958, + "mean_token_accuracy": 0.6770147830247879, + "num_tokens": 19606932.0, + "step": 1507 + }, + { + "epoch": 0.96512, + "grad_norm": 2.7224249839782715, + "learning_rate": 5e-06, + "loss": 1.3125, + "mean_token_accuracy": 0.6614532843232155, + "num_tokens": 19624296.0, + "step": 1508 + }, + { + "epoch": 0.96576, + "grad_norm": 3.4137089252471924, + "learning_rate": 5e-06, + "loss": 1.2778, + "mean_token_accuracy": 0.662353903055191, + "num_tokens": 19637049.0, + "step": 1509 + }, + { + "epoch": 0.9664, + "grad_norm": 3.7370848655700684, + "learning_rate": 5e-06, + "loss": 1.4503, + "mean_token_accuracy": 0.639873132109642, + "num_tokens": 19649788.0, + "step": 1510 + }, + { + "epoch": 0.96704, + "grad_norm": 3.4333293437957764, + "learning_rate": 5e-06, + "loss": 1.4996, + "mean_token_accuracy": 0.63913669064641, + "num_tokens": 19662956.0, + "step": 1511 + }, + { + "epoch": 0.96768, + "grad_norm": 3.8436150550842285, + "learning_rate": 5e-06, + "loss": 1.2372, + "mean_token_accuracy": 0.6666671261191368, + "num_tokens": 19674701.0, + "step": 1512 + }, + { + "epoch": 0.96832, + "grad_norm": 3.4364569187164307, + "learning_rate": 5e-06, + "loss": 1.4256, + "mean_token_accuracy": 0.6375450566411018, + "num_tokens": 19688356.0, + "step": 1513 + }, + { + "epoch": 0.96896, + "grad_norm": 3.1849286556243896, + "learning_rate": 5e-06, + "loss": 1.3019, + "mean_token_accuracy": 0.654203861951828, + "num_tokens": 19703055.0, + "step": 1514 + }, + { + "epoch": 0.9696, + "grad_norm": 3.790954828262329, + "learning_rate": 5e-06, + "loss": 1.1957, + "mean_token_accuracy": 0.6805417165160179, + "num_tokens": 19715360.0, + "step": 1515 + }, + { + "epoch": 0.97024, + "grad_norm": 3.696563243865967, + "learning_rate": 5e-06, + "loss": 1.2499, + "mean_token_accuracy": 0.6584246829152107, + "num_tokens": 19726044.0, + "step": 1516 + }, + { + "epoch": 0.97088, + "grad_norm": 4.10850191116333, + "learning_rate": 5e-06, + "loss": 1.4378, + "mean_token_accuracy": 0.6355271711945534, + "num_tokens": 19739139.0, + "step": 1517 + }, + { + "epoch": 0.97152, + "grad_norm": 3.1323556900024414, + "learning_rate": 5e-06, + "loss": 1.3652, + "mean_token_accuracy": 0.6413158774375916, + "num_tokens": 19753058.0, + "step": 1518 + }, + { + "epoch": 0.97216, + "grad_norm": 3.334622859954834, + "learning_rate": 5e-06, + "loss": 1.2963, + "mean_token_accuracy": 0.6517771631479263, + "num_tokens": 19765569.0, + "step": 1519 + }, + { + "epoch": 0.9728, + "grad_norm": 5.364054203033447, + "learning_rate": 5e-06, + "loss": 1.1438, + "mean_token_accuracy": 0.6952068582177162, + "num_tokens": 19778058.0, + "step": 1520 + }, + { + "epoch": 0.97344, + "grad_norm": 3.416874408721924, + "learning_rate": 5e-06, + "loss": 1.1759, + "mean_token_accuracy": 0.6735500246286392, + "num_tokens": 19792728.0, + "step": 1521 + }, + { + "epoch": 0.97408, + "grad_norm": 3.164233922958374, + "learning_rate": 5e-06, + "loss": 1.1211, + "mean_token_accuracy": 0.6952219977974892, + "num_tokens": 19807085.0, + "step": 1522 + }, + { + "epoch": 0.97472, + "grad_norm": 3.73028564453125, + "learning_rate": 5e-06, + "loss": 1.3345, + "mean_token_accuracy": 0.6841987073421478, + "num_tokens": 19821681.0, + "step": 1523 + }, + { + "epoch": 0.97536, + "grad_norm": 3.401895761489868, + "learning_rate": 5e-06, + "loss": 1.3681, + "mean_token_accuracy": 0.6333037242293358, + "num_tokens": 19834796.0, + "step": 1524 + }, + { + "epoch": 0.976, + "grad_norm": 3.8067119121551514, + "learning_rate": 5e-06, + "loss": 1.0905, + "mean_token_accuracy": 0.6978934183716774, + "num_tokens": 19846639.0, + "step": 1525 + }, + { + "epoch": 0.97664, + "grad_norm": 3.070439338684082, + "learning_rate": 5e-06, + "loss": 1.2461, + "mean_token_accuracy": 0.653811477124691, + "num_tokens": 19860465.0, + "step": 1526 + }, + { + "epoch": 0.97728, + "grad_norm": 3.186588764190674, + "learning_rate": 5e-06, + "loss": 1.1821, + "mean_token_accuracy": 0.7026697173714638, + "num_tokens": 19876272.0, + "step": 1527 + }, + { + "epoch": 0.97792, + "grad_norm": 3.122529983520508, + "learning_rate": 5e-06, + "loss": 1.1799, + "mean_token_accuracy": 0.6770785599946976, + "num_tokens": 19892221.0, + "step": 1528 + }, + { + "epoch": 0.97856, + "grad_norm": 3.7920093536376953, + "learning_rate": 5e-06, + "loss": 1.3852, + "mean_token_accuracy": 0.6517146974802017, + "num_tokens": 19903219.0, + "step": 1529 + }, + { + "epoch": 0.9792, + "grad_norm": 3.9800093173980713, + "learning_rate": 5e-06, + "loss": 1.3666, + "mean_token_accuracy": 0.6671213582158089, + "num_tokens": 19914283.0, + "step": 1530 + }, + { + "epoch": 0.97984, + "grad_norm": 4.115480899810791, + "learning_rate": 5e-06, + "loss": 1.6462, + "mean_token_accuracy": 0.6288831681013107, + "num_tokens": 19924831.0, + "step": 1531 + }, + { + "epoch": 0.98048, + "grad_norm": 3.8407366275787354, + "learning_rate": 5e-06, + "loss": 1.3123, + "mean_token_accuracy": 0.646103672683239, + "num_tokens": 19935368.0, + "step": 1532 + }, + { + "epoch": 0.98112, + "grad_norm": 3.036931276321411, + "learning_rate": 5e-06, + "loss": 1.3947, + "mean_token_accuracy": 0.6490734815597534, + "num_tokens": 19950888.0, + "step": 1533 + }, + { + "epoch": 0.98176, + "grad_norm": 3.3416826725006104, + "learning_rate": 5e-06, + "loss": 1.3709, + "mean_token_accuracy": 0.6444736868143082, + "num_tokens": 19964717.0, + "step": 1534 + }, + { + "epoch": 0.9824, + "grad_norm": 3.184088945388794, + "learning_rate": 5e-06, + "loss": 1.3429, + "mean_token_accuracy": 0.673894077539444, + "num_tokens": 19977976.0, + "step": 1535 + }, + { + "epoch": 0.98304, + "grad_norm": 3.382946491241455, + "learning_rate": 5e-06, + "loss": 1.481, + "mean_token_accuracy": 0.6425464749336243, + "num_tokens": 19991312.0, + "step": 1536 + }, + { + "epoch": 0.98368, + "grad_norm": 3.7429699897766113, + "learning_rate": 5e-06, + "loss": 1.2422, + "mean_token_accuracy": 0.6737086698412895, + "num_tokens": 20002300.0, + "step": 1537 + }, + { + "epoch": 0.98432, + "grad_norm": 3.6931872367858887, + "learning_rate": 5e-06, + "loss": 1.3122, + "mean_token_accuracy": 0.6581440344452858, + "num_tokens": 20015107.0, + "step": 1538 + }, + { + "epoch": 0.98496, + "grad_norm": 4.0337300300598145, + "learning_rate": 5e-06, + "loss": 1.3912, + "mean_token_accuracy": 0.6898427382111549, + "num_tokens": 20027265.0, + "step": 1539 + }, + { + "epoch": 0.9856, + "grad_norm": 3.514187812805176, + "learning_rate": 5e-06, + "loss": 1.0613, + "mean_token_accuracy": 0.7012772336602211, + "num_tokens": 20038919.0, + "step": 1540 + }, + { + "epoch": 0.98624, + "grad_norm": 3.5034477710723877, + "learning_rate": 5e-06, + "loss": 1.4009, + "mean_token_accuracy": 0.6428939253091812, + "num_tokens": 20052482.0, + "step": 1541 + }, + { + "epoch": 0.98688, + "grad_norm": 3.3519279956817627, + "learning_rate": 5e-06, + "loss": 1.4362, + "mean_token_accuracy": 0.6396335512399673, + "num_tokens": 20067032.0, + "step": 1542 + }, + { + "epoch": 0.98752, + "grad_norm": 3.7068188190460205, + "learning_rate": 5e-06, + "loss": 1.2301, + "mean_token_accuracy": 0.6891591548919678, + "num_tokens": 20079146.0, + "step": 1543 + }, + { + "epoch": 0.98816, + "grad_norm": 3.6617250442504883, + "learning_rate": 5e-06, + "loss": 1.211, + "mean_token_accuracy": 0.7132939025759697, + "num_tokens": 20089620.0, + "step": 1544 + }, + { + "epoch": 0.9888, + "grad_norm": 3.217038631439209, + "learning_rate": 5e-06, + "loss": 1.3661, + "mean_token_accuracy": 0.6576998308300972, + "num_tokens": 20103587.0, + "step": 1545 + }, + { + "epoch": 0.98944, + "grad_norm": 3.996293783187866, + "learning_rate": 5e-06, + "loss": 1.2923, + "mean_token_accuracy": 0.6637570187449455, + "num_tokens": 20115402.0, + "step": 1546 + }, + { + "epoch": 0.99008, + "grad_norm": 3.543278932571411, + "learning_rate": 5e-06, + "loss": 1.2429, + "mean_token_accuracy": 0.6742196753621101, + "num_tokens": 20126222.0, + "step": 1547 + }, + { + "epoch": 0.99072, + "grad_norm": 3.501190662384033, + "learning_rate": 5e-06, + "loss": 1.2304, + "mean_token_accuracy": 0.6541951596736908, + "num_tokens": 20137476.0, + "step": 1548 + }, + { + "epoch": 0.99136, + "grad_norm": 3.904467821121216, + "learning_rate": 5e-06, + "loss": 1.2723, + "mean_token_accuracy": 0.6750770211219788, + "num_tokens": 20149377.0, + "step": 1549 + }, + { + "epoch": 0.992, + "grad_norm": 3.557426691055298, + "learning_rate": 5e-06, + "loss": 1.4754, + "mean_token_accuracy": 0.6633486226201057, + "num_tokens": 20161955.0, + "step": 1550 + }, + { + "epoch": 0.99264, + "grad_norm": 3.5321543216705322, + "learning_rate": 5e-06, + "loss": 1.3909, + "mean_token_accuracy": 0.6640786305069923, + "num_tokens": 20174432.0, + "step": 1551 + }, + { + "epoch": 0.99328, + "grad_norm": 4.1432929039001465, + "learning_rate": 5e-06, + "loss": 1.2162, + "mean_token_accuracy": 0.6733951196074486, + "num_tokens": 20186656.0, + "step": 1552 + }, + { + "epoch": 0.99392, + "grad_norm": 3.221876859664917, + "learning_rate": 5e-06, + "loss": 1.2039, + "mean_token_accuracy": 0.6700675636529922, + "num_tokens": 20200325.0, + "step": 1553 + }, + { + "epoch": 0.99456, + "grad_norm": 3.4923529624938965, + "learning_rate": 5e-06, + "loss": 1.2479, + "mean_token_accuracy": 0.6704057157039642, + "num_tokens": 20211958.0, + "step": 1554 + }, + { + "epoch": 0.9952, + "grad_norm": 3.4751315116882324, + "learning_rate": 5e-06, + "loss": 1.2513, + "mean_token_accuracy": 0.6954710930585861, + "num_tokens": 20224457.0, + "step": 1555 + }, + { + "epoch": 0.99584, + "grad_norm": 3.4763216972351074, + "learning_rate": 5e-06, + "loss": 1.1645, + "mean_token_accuracy": 0.6789154633879662, + "num_tokens": 20236259.0, + "step": 1556 + }, + { + "epoch": 0.99648, + "grad_norm": 3.582597017288208, + "learning_rate": 5e-06, + "loss": 1.383, + "mean_token_accuracy": 0.6580745279788971, + "num_tokens": 20250508.0, + "step": 1557 + }, + { + "epoch": 0.99712, + "grad_norm": 4.058999061584473, + "learning_rate": 5e-06, + "loss": 1.3162, + "mean_token_accuracy": 0.6591609418392181, + "num_tokens": 20262337.0, + "step": 1558 + }, + { + "epoch": 0.99776, + "grad_norm": 3.842996597290039, + "learning_rate": 5e-06, + "loss": 1.3768, + "mean_token_accuracy": 0.6542828008532524, + "num_tokens": 20273865.0, + "step": 1559 + }, + { + "epoch": 0.9984, + "grad_norm": 3.5340254306793213, + "learning_rate": 5e-06, + "loss": 1.2762, + "mean_token_accuracy": 0.6779467761516571, + "num_tokens": 20286723.0, + "step": 1560 + }, + { + "epoch": 0.99904, + "grad_norm": 3.087484836578369, + "learning_rate": 5e-06, + "loss": 1.3845, + "mean_token_accuracy": 0.6415645852684975, + "num_tokens": 20302849.0, + "step": 1561 + }, + { + "epoch": 0.99968, + "grad_norm": 3.4678475856781006, + "learning_rate": 5e-06, + "loss": 1.2984, + "mean_token_accuracy": 0.6632586568593979, + "num_tokens": 20315462.0, + "step": 1562 + } + ], + "logging_steps": 1, + "max_steps": 1562, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 56623305523200.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}