{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994972347913524, "eval_steps": 750, "global_step": 1491, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033517680576504107, "grad_norm": 14.694869995117188, "learning_rate": 6.666666666666667e-06, "loss": 53.6406, "mean_token_accuracy": 0.5338318642228842, "step": 5 }, { "epoch": 0.006703536115300821, "grad_norm": 14.033230781555176, "learning_rate": 1.3333333333333333e-05, "loss": 52.3838, "mean_token_accuracy": 0.5248840853571892, "step": 10 }, { "epoch": 0.010055304172951232, "grad_norm": 6.804769039154053, "learning_rate": 2e-05, "loss": 47.9105, "mean_token_accuracy": 0.5399681400507689, "step": 15 }, { "epoch": 0.013407072230601643, "grad_norm": 7.750083923339844, "learning_rate": 2.6666666666666667e-05, "loss": 41.8861, "mean_token_accuracy": 0.55653104968369, "step": 20 }, { "epoch": 0.01675884028825205, "grad_norm": 6.184543132781982, "learning_rate": 3.3333333333333335e-05, "loss": 37.33, "mean_token_accuracy": 0.5655230440199375, "step": 25 }, { "epoch": 0.020110608345902465, "grad_norm": 4.537179946899414, "learning_rate": 4e-05, "loss": 32.7503, "mean_token_accuracy": 0.587661711126566, "step": 30 }, { "epoch": 0.023462376403552875, "grad_norm": 3.6645753383636475, "learning_rate": 4.666666666666667e-05, "loss": 29.1892, "mean_token_accuracy": 0.6075583577156067, "step": 35 }, { "epoch": 0.026814144461203285, "grad_norm": 3.7526533603668213, "learning_rate": 5.333333333333333e-05, "loss": 26.3524, "mean_token_accuracy": 0.6198613092303276, "step": 40 }, { "epoch": 0.030165912518853696, "grad_norm": 3.0561397075653076, "learning_rate": 6e-05, "loss": 24.1513, "mean_token_accuracy": 0.6353930421173573, "step": 45 }, { "epoch": 0.0335176805765041, "grad_norm": 2.857618808746338, "learning_rate": 6.666666666666667e-05, "loss": 23.5029, "mean_token_accuracy": 0.6437373287975788, "step": 50 }, { "epoch": 0.03686944863415452, "grad_norm": 2.7901978492736816, "learning_rate": 7.333333333333333e-05, "loss": 22.9387, "mean_token_accuracy": 0.646886795759201, "step": 55 }, { "epoch": 0.04022121669180493, "grad_norm": 2.8266501426696777, "learning_rate": 8e-05, "loss": 22.0359, "mean_token_accuracy": 0.6525138475000858, "step": 60 }, { "epoch": 0.04357298474945534, "grad_norm": 2.5010733604431152, "learning_rate": 8.666666666666667e-05, "loss": 21.5158, "mean_token_accuracy": 0.6548139773309231, "step": 65 }, { "epoch": 0.04692475280710575, "grad_norm": 2.5834386348724365, "learning_rate": 9.333333333333334e-05, "loss": 21.5409, "mean_token_accuracy": 0.6478891499340534, "step": 70 }, { "epoch": 0.05027652086475616, "grad_norm": 2.6927576065063477, "learning_rate": 0.0001, "loss": 20.1017, "mean_token_accuracy": 0.6757474772632122, "step": 75 }, { "epoch": 0.05362828892240657, "grad_norm": 2.0276572704315186, "learning_rate": 9.964689265536724e-05, "loss": 19.9912, "mean_token_accuracy": 0.6763999305665493, "step": 80 }, { "epoch": 0.05698005698005698, "grad_norm": 2.4628567695617676, "learning_rate": 9.929378531073446e-05, "loss": 19.9089, "mean_token_accuracy": 0.672279854118824, "step": 85 }, { "epoch": 0.06033182503770739, "grad_norm": 2.258838415145874, "learning_rate": 9.89406779661017e-05, "loss": 19.7132, "mean_token_accuracy": 0.6713059276342392, "step": 90 }, { "epoch": 0.0636835930953578, "grad_norm": 2.447565793991089, "learning_rate": 9.858757062146892e-05, "loss": 18.7631, "mean_token_accuracy": 0.6825208596885204, "step": 95 }, { "epoch": 0.0670353611530082, "grad_norm": 2.1105902194976807, "learning_rate": 9.823446327683616e-05, "loss": 19.4631, "mean_token_accuracy": 0.6674435302615166, "step": 100 }, { "epoch": 0.07038712921065862, "grad_norm": 2.309248447418213, "learning_rate": 9.78813559322034e-05, "loss": 19.0249, "mean_token_accuracy": 0.6734571024775505, "step": 105 }, { "epoch": 0.07373889726830904, "grad_norm": 2.101681709289551, "learning_rate": 9.752824858757063e-05, "loss": 18.593, "mean_token_accuracy": 0.6875097192823887, "step": 110 }, { "epoch": 0.07709066532595944, "grad_norm": 2.157726526260376, "learning_rate": 9.717514124293787e-05, "loss": 18.5973, "mean_token_accuracy": 0.6829216606914997, "step": 115 }, { "epoch": 0.08044243338360986, "grad_norm": 2.0711209774017334, "learning_rate": 9.682203389830509e-05, "loss": 19.1541, "mean_token_accuracy": 0.6785640828311443, "step": 120 }, { "epoch": 0.08379420144126026, "grad_norm": 2.015594959259033, "learning_rate": 9.646892655367233e-05, "loss": 18.9493, "mean_token_accuracy": 0.6861244946718216, "step": 125 }, { "epoch": 0.08714596949891068, "grad_norm": 2.1295998096466064, "learning_rate": 9.611581920903955e-05, "loss": 18.5125, "mean_token_accuracy": 0.6793887488543987, "step": 130 }, { "epoch": 0.09049773755656108, "grad_norm": 2.2496395111083984, "learning_rate": 9.576271186440679e-05, "loss": 18.4019, "mean_token_accuracy": 0.6890006221830844, "step": 135 }, { "epoch": 0.0938495056142115, "grad_norm": 2.1168577671051025, "learning_rate": 9.540960451977402e-05, "loss": 18.7305, "mean_token_accuracy": 0.6841622419655323, "step": 140 }, { "epoch": 0.0972012736718619, "grad_norm": 1.8554915189743042, "learning_rate": 9.505649717514125e-05, "loss": 18.6606, "mean_token_accuracy": 0.6859239712357521, "step": 145 }, { "epoch": 0.10055304172951232, "grad_norm": 1.9698066711425781, "learning_rate": 9.470338983050848e-05, "loss": 19.1065, "mean_token_accuracy": 0.6759489566087723, "step": 150 }, { "epoch": 0.10390480978716272, "grad_norm": 2.2483623027801514, "learning_rate": 9.43502824858757e-05, "loss": 18.8041, "mean_token_accuracy": 0.68142851293087, "step": 155 }, { "epoch": 0.10725657784481314, "grad_norm": 1.8570690155029297, "learning_rate": 9.399717514124294e-05, "loss": 18.8862, "mean_token_accuracy": 0.6791303649544715, "step": 160 }, { "epoch": 0.11060834590246355, "grad_norm": 2.143021583557129, "learning_rate": 9.364406779661016e-05, "loss": 18.7605, "mean_token_accuracy": 0.681893227249384, "step": 165 }, { "epoch": 0.11396011396011396, "grad_norm": 1.8951307535171509, "learning_rate": 9.32909604519774e-05, "loss": 18.3005, "mean_token_accuracy": 0.6897541806101799, "step": 170 }, { "epoch": 0.11731188201776437, "grad_norm": 1.971745252609253, "learning_rate": 9.293785310734464e-05, "loss": 18.8995, "mean_token_accuracy": 0.6820204116404056, "step": 175 }, { "epoch": 0.12066365007541478, "grad_norm": 1.910328984260559, "learning_rate": 9.258474576271187e-05, "loss": 18.8808, "mean_token_accuracy": 0.6812884464859963, "step": 180 }, { "epoch": 0.12401541813306519, "grad_norm": 1.730974555015564, "learning_rate": 9.223163841807911e-05, "loss": 18.0871, "mean_token_accuracy": 0.6907590143382549, "step": 185 }, { "epoch": 0.1273671861907156, "grad_norm": 2.125452995300293, "learning_rate": 9.187853107344633e-05, "loss": 18.1569, "mean_token_accuracy": 0.689236406236887, "step": 190 }, { "epoch": 0.13071895424836602, "grad_norm": 2.0234949588775635, "learning_rate": 9.152542372881357e-05, "loss": 18.3342, "mean_token_accuracy": 0.6902932204306126, "step": 195 }, { "epoch": 0.1340707223060164, "grad_norm": 1.9802364110946655, "learning_rate": 9.11723163841808e-05, "loss": 18.7942, "mean_token_accuracy": 0.6788501650094986, "step": 200 }, { "epoch": 0.13742249036366683, "grad_norm": 1.8897534608840942, "learning_rate": 9.081920903954803e-05, "loss": 18.4679, "mean_token_accuracy": 0.6900524459779263, "step": 205 }, { "epoch": 0.14077425842131724, "grad_norm": 1.9040635824203491, "learning_rate": 9.046610169491526e-05, "loss": 18.0058, "mean_token_accuracy": 0.690093420445919, "step": 210 }, { "epoch": 0.14412602647896766, "grad_norm": 2.0558955669403076, "learning_rate": 9.011299435028249e-05, "loss": 17.5489, "mean_token_accuracy": 0.7006829999387264, "step": 215 }, { "epoch": 0.14747779453661808, "grad_norm": 1.7952055931091309, "learning_rate": 8.975988700564972e-05, "loss": 18.2907, "mean_token_accuracy": 0.6876891441643238, "step": 220 }, { "epoch": 0.15082956259426847, "grad_norm": 1.8588192462921143, "learning_rate": 8.940677966101694e-05, "loss": 18.4005, "mean_token_accuracy": 0.6897859051823616, "step": 225 }, { "epoch": 0.15418133065191889, "grad_norm": 1.9269477128982544, "learning_rate": 8.905367231638418e-05, "loss": 18.2096, "mean_token_accuracy": 0.6909494370222091, "step": 230 }, { "epoch": 0.1575330987095693, "grad_norm": 1.8693301677703857, "learning_rate": 8.870056497175142e-05, "loss": 18.394, "mean_token_accuracy": 0.6836515329778194, "step": 235 }, { "epoch": 0.16088486676721972, "grad_norm": 1.787061333656311, "learning_rate": 8.834745762711864e-05, "loss": 18.1503, "mean_token_accuracy": 0.6907145738601684, "step": 240 }, { "epoch": 0.1642366348248701, "grad_norm": 1.8895225524902344, "learning_rate": 8.799435028248588e-05, "loss": 18.3026, "mean_token_accuracy": 0.6878940775990486, "step": 245 }, { "epoch": 0.16758840288252053, "grad_norm": 1.835693120956421, "learning_rate": 8.764124293785311e-05, "loss": 17.9347, "mean_token_accuracy": 0.6917316012084485, "step": 250 }, { "epoch": 0.17094017094017094, "grad_norm": 1.7408661842346191, "learning_rate": 8.728813559322035e-05, "loss": 18.0051, "mean_token_accuracy": 0.689583633840084, "step": 255 }, { "epoch": 0.17429193899782136, "grad_norm": 1.9096996784210205, "learning_rate": 8.693502824858759e-05, "loss": 17.6064, "mean_token_accuracy": 0.6965925216674804, "step": 260 }, { "epoch": 0.17764370705547175, "grad_norm": 1.9822146892547607, "learning_rate": 8.658192090395481e-05, "loss": 17.6301, "mean_token_accuracy": 0.7005406267940998, "step": 265 }, { "epoch": 0.18099547511312217, "grad_norm": 1.8383901119232178, "learning_rate": 8.622881355932204e-05, "loss": 17.9114, "mean_token_accuracy": 0.6876685306429863, "step": 270 }, { "epoch": 0.18434724317077258, "grad_norm": 1.7920355796813965, "learning_rate": 8.587570621468927e-05, "loss": 18.1271, "mean_token_accuracy": 0.689356567710638, "step": 275 }, { "epoch": 0.187699011228423, "grad_norm": 1.6455663442611694, "learning_rate": 8.55225988700565e-05, "loss": 17.787, "mean_token_accuracy": 0.6919776491820813, "step": 280 }, { "epoch": 0.1910507792860734, "grad_norm": 1.9442647695541382, "learning_rate": 8.516949152542373e-05, "loss": 17.6019, "mean_token_accuracy": 0.6980393722653389, "step": 285 }, { "epoch": 0.1944025473437238, "grad_norm": 2.294377565383911, "learning_rate": 8.481638418079096e-05, "loss": 17.8778, "mean_token_accuracy": 0.6954585202038288, "step": 290 }, { "epoch": 0.19775431540137423, "grad_norm": 1.8009259700775146, "learning_rate": 8.44632768361582e-05, "loss": 17.5257, "mean_token_accuracy": 0.6998075112700463, "step": 295 }, { "epoch": 0.20110608345902464, "grad_norm": 2.015516757965088, "learning_rate": 8.411016949152542e-05, "loss": 17.7554, "mean_token_accuracy": 0.6968327619135379, "step": 300 }, { "epoch": 0.20445785151667506, "grad_norm": 1.5640082359313965, "learning_rate": 8.375706214689266e-05, "loss": 17.3438, "mean_token_accuracy": 0.69996168166399, "step": 305 }, { "epoch": 0.20780961957432545, "grad_norm": 1.9527899026870728, "learning_rate": 8.340395480225988e-05, "loss": 17.6883, "mean_token_accuracy": 0.6988407798111439, "step": 310 }, { "epoch": 0.21116138763197587, "grad_norm": 1.8222606182098389, "learning_rate": 8.305084745762712e-05, "loss": 17.0646, "mean_token_accuracy": 0.7061679445207119, "step": 315 }, { "epoch": 0.21451315568962628, "grad_norm": 1.8560868501663208, "learning_rate": 8.269774011299435e-05, "loss": 17.8875, "mean_token_accuracy": 0.6941629223525524, "step": 320 }, { "epoch": 0.2178649237472767, "grad_norm": 1.7588037252426147, "learning_rate": 8.234463276836159e-05, "loss": 17.6412, "mean_token_accuracy": 0.6954927705228329, "step": 325 }, { "epoch": 0.2212166918049271, "grad_norm": 1.738242268562317, "learning_rate": 8.199152542372883e-05, "loss": 17.8251, "mean_token_accuracy": 0.6898994512856007, "step": 330 }, { "epoch": 0.2245684598625775, "grad_norm": 1.8485089540481567, "learning_rate": 8.163841807909605e-05, "loss": 17.3078, "mean_token_accuracy": 0.7000270999968052, "step": 335 }, { "epoch": 0.22792022792022792, "grad_norm": 1.8579105138778687, "learning_rate": 8.128531073446328e-05, "loss": 17.3078, "mean_token_accuracy": 0.6995702408254146, "step": 340 }, { "epoch": 0.23127199597787834, "grad_norm": 1.7994352579116821, "learning_rate": 8.093220338983051e-05, "loss": 17.7557, "mean_token_accuracy": 0.6928035505115986, "step": 345 }, { "epoch": 0.23462376403552873, "grad_norm": 1.9240634441375732, "learning_rate": 8.057909604519774e-05, "loss": 17.4329, "mean_token_accuracy": 0.6960855178534985, "step": 350 }, { "epoch": 0.23797553209317915, "grad_norm": 1.6718952655792236, "learning_rate": 8.022598870056498e-05, "loss": 17.5951, "mean_token_accuracy": 0.6947735913097859, "step": 355 }, { "epoch": 0.24132730015082957, "grad_norm": 1.6835826635360718, "learning_rate": 7.98728813559322e-05, "loss": 18.1085, "mean_token_accuracy": 0.6882089108228684, "step": 360 }, { "epoch": 0.24467906820847998, "grad_norm": 1.7387073040008545, "learning_rate": 7.951977401129944e-05, "loss": 17.799, "mean_token_accuracy": 0.6932998545467853, "step": 365 }, { "epoch": 0.24803083626613037, "grad_norm": 2.0071725845336914, "learning_rate": 7.916666666666666e-05, "loss": 17.4076, "mean_token_accuracy": 0.6961173862218857, "step": 370 }, { "epoch": 0.2513826043237808, "grad_norm": 2.326915740966797, "learning_rate": 7.88135593220339e-05, "loss": 17.3121, "mean_token_accuracy": 0.7005321949720382, "step": 375 }, { "epoch": 0.2547343723814312, "grad_norm": 2.1876060962677, "learning_rate": 7.846045197740113e-05, "loss": 17.9069, "mean_token_accuracy": 0.6906426399946213, "step": 380 }, { "epoch": 0.2580861404390816, "grad_norm": 1.849671483039856, "learning_rate": 7.810734463276837e-05, "loss": 17.483, "mean_token_accuracy": 0.7000573620200157, "step": 385 }, { "epoch": 0.26143790849673204, "grad_norm": 1.6676862239837646, "learning_rate": 7.775423728813561e-05, "loss": 16.8936, "mean_token_accuracy": 0.7045633904635906, "step": 390 }, { "epoch": 0.26478967655438246, "grad_norm": 1.6702505350112915, "learning_rate": 7.740112994350283e-05, "loss": 17.904, "mean_token_accuracy": 0.6874841086566448, "step": 395 }, { "epoch": 0.2681414446120328, "grad_norm": 1.7280704975128174, "learning_rate": 7.704802259887007e-05, "loss": 17.4515, "mean_token_accuracy": 0.7018027983605861, "step": 400 }, { "epoch": 0.27149321266968324, "grad_norm": 1.8801991939544678, "learning_rate": 7.669491525423729e-05, "loss": 17.43, "mean_token_accuracy": 0.7009049601852894, "step": 405 }, { "epoch": 0.27484498072733365, "grad_norm": 1.9758073091506958, "learning_rate": 7.634180790960453e-05, "loss": 17.5984, "mean_token_accuracy": 0.6948069363832474, "step": 410 }, { "epoch": 0.27819674878498407, "grad_norm": 1.5747147798538208, "learning_rate": 7.598870056497176e-05, "loss": 18.3079, "mean_token_accuracy": 0.6853139907121658, "step": 415 }, { "epoch": 0.2815485168426345, "grad_norm": 1.6292234659194946, "learning_rate": 7.563559322033898e-05, "loss": 17.4527, "mean_token_accuracy": 0.697540608048439, "step": 420 }, { "epoch": 0.2849002849002849, "grad_norm": 1.6185086965560913, "learning_rate": 7.528248587570622e-05, "loss": 17.4193, "mean_token_accuracy": 0.7012022204697133, "step": 425 }, { "epoch": 0.2882520529579353, "grad_norm": 1.8361762762069702, "learning_rate": 7.492937853107344e-05, "loss": 17.4544, "mean_token_accuracy": 0.698820473998785, "step": 430 }, { "epoch": 0.29160382101558574, "grad_norm": 1.7740592956542969, "learning_rate": 7.457627118644068e-05, "loss": 18.0507, "mean_token_accuracy": 0.6881603226065636, "step": 435 }, { "epoch": 0.29495558907323616, "grad_norm": 1.8252911567687988, "learning_rate": 7.42231638418079e-05, "loss": 17.155, "mean_token_accuracy": 0.7065504610538482, "step": 440 }, { "epoch": 0.2983073571308865, "grad_norm": 1.8424382209777832, "learning_rate": 7.387005649717514e-05, "loss": 17.3055, "mean_token_accuracy": 0.6978819817304611, "step": 445 }, { "epoch": 0.30165912518853694, "grad_norm": 1.7494243383407593, "learning_rate": 7.351694915254238e-05, "loss": 16.8365, "mean_token_accuracy": 0.7099504336714745, "step": 450 }, { "epoch": 0.30501089324618735, "grad_norm": 1.936540961265564, "learning_rate": 7.316384180790961e-05, "loss": 18.2753, "mean_token_accuracy": 0.6913827233016491, "step": 455 }, { "epoch": 0.30836266130383777, "grad_norm": 1.810272216796875, "learning_rate": 7.281073446327685e-05, "loss": 17.0536, "mean_token_accuracy": 0.6986232809722424, "step": 460 }, { "epoch": 0.3117144293614882, "grad_norm": 1.6832094192504883, "learning_rate": 7.245762711864407e-05, "loss": 17.2231, "mean_token_accuracy": 0.702030860632658, "step": 465 }, { "epoch": 0.3150661974191386, "grad_norm": 1.8872151374816895, "learning_rate": 7.21045197740113e-05, "loss": 17.5502, "mean_token_accuracy": 0.6932449921965599, "step": 470 }, { "epoch": 0.318417965476789, "grad_norm": 1.788021445274353, "learning_rate": 7.175141242937854e-05, "loss": 16.8596, "mean_token_accuracy": 0.7096694305539131, "step": 475 }, { "epoch": 0.32176973353443944, "grad_norm": 1.8025559186935425, "learning_rate": 7.139830508474577e-05, "loss": 16.662, "mean_token_accuracy": 0.7063573338091373, "step": 480 }, { "epoch": 0.3251215015920898, "grad_norm": 2.274674654006958, "learning_rate": 7.1045197740113e-05, "loss": 17.5965, "mean_token_accuracy": 0.6934389650821686, "step": 485 }, { "epoch": 0.3284732696497402, "grad_norm": 1.6426053047180176, "learning_rate": 7.069209039548022e-05, "loss": 17.0914, "mean_token_accuracy": 0.7049042917788029, "step": 490 }, { "epoch": 0.33182503770739064, "grad_norm": 1.6252586841583252, "learning_rate": 7.033898305084746e-05, "loss": 17.6078, "mean_token_accuracy": 0.6924709647893905, "step": 495 }, { "epoch": 0.33517680576504105, "grad_norm": 1.7185930013656616, "learning_rate": 6.998587570621468e-05, "loss": 17.314, "mean_token_accuracy": 0.7039985358715057, "step": 500 }, { "epoch": 0.33852857382269147, "grad_norm": 1.7891852855682373, "learning_rate": 6.963276836158192e-05, "loss": 17.2188, "mean_token_accuracy": 0.6977060906589031, "step": 505 }, { "epoch": 0.3418803418803419, "grad_norm": 1.9103929996490479, "learning_rate": 6.927966101694916e-05, "loss": 17.4467, "mean_token_accuracy": 0.6982413403689861, "step": 510 }, { "epoch": 0.3452321099379923, "grad_norm": 1.8996375799179077, "learning_rate": 6.892655367231638e-05, "loss": 16.9608, "mean_token_accuracy": 0.7054095402359962, "step": 515 }, { "epoch": 0.3485838779956427, "grad_norm": 2.0335419178009033, "learning_rate": 6.857344632768362e-05, "loss": 17.3361, "mean_token_accuracy": 0.7016568422317505, "step": 520 }, { "epoch": 0.35193564605329314, "grad_norm": 1.9008755683898926, "learning_rate": 6.822033898305085e-05, "loss": 16.9694, "mean_token_accuracy": 0.7059390284121037, "step": 525 }, { "epoch": 0.3552874141109435, "grad_norm": 1.8340988159179688, "learning_rate": 6.786723163841809e-05, "loss": 17.3528, "mean_token_accuracy": 0.7033507622778415, "step": 530 }, { "epoch": 0.3586391821685939, "grad_norm": 1.6903594732284546, "learning_rate": 6.751412429378532e-05, "loss": 17.3021, "mean_token_accuracy": 0.7001501135528088, "step": 535 }, { "epoch": 0.36199095022624433, "grad_norm": 1.8101950883865356, "learning_rate": 6.716101694915255e-05, "loss": 17.938, "mean_token_accuracy": 0.6908830553293228, "step": 540 }, { "epoch": 0.36534271828389475, "grad_norm": 1.6470075845718384, "learning_rate": 6.680790960451978e-05, "loss": 17.6612, "mean_token_accuracy": 0.6923478744924069, "step": 545 }, { "epoch": 0.36869448634154517, "grad_norm": 2.1860337257385254, "learning_rate": 6.6454802259887e-05, "loss": 17.5684, "mean_token_accuracy": 0.6983748801052571, "step": 550 }, { "epoch": 0.3720462543991956, "grad_norm": 1.717653512954712, "learning_rate": 6.610169491525424e-05, "loss": 17.1166, "mean_token_accuracy": 0.7025655619800091, "step": 555 }, { "epoch": 0.375398022456846, "grad_norm": 1.9525723457336426, "learning_rate": 6.574858757062147e-05, "loss": 17.2908, "mean_token_accuracy": 0.6997996769845486, "step": 560 }, { "epoch": 0.3787497905144964, "grad_norm": 1.6053602695465088, "learning_rate": 6.53954802259887e-05, "loss": 17.3894, "mean_token_accuracy": 0.698741364479065, "step": 565 }, { "epoch": 0.3821015585721468, "grad_norm": 1.7356934547424316, "learning_rate": 6.504237288135594e-05, "loss": 17.1546, "mean_token_accuracy": 0.7013543620705605, "step": 570 }, { "epoch": 0.3854533266297972, "grad_norm": 1.7188559770584106, "learning_rate": 6.468926553672316e-05, "loss": 17.7637, "mean_token_accuracy": 0.6936320647597313, "step": 575 }, { "epoch": 0.3888050946874476, "grad_norm": 1.8413478136062622, "learning_rate": 6.43361581920904e-05, "loss": 17.8498, "mean_token_accuracy": 0.695782047510147, "step": 580 }, { "epoch": 0.39215686274509803, "grad_norm": 1.5715190172195435, "learning_rate": 6.398305084745762e-05, "loss": 17.4304, "mean_token_accuracy": 0.6989135831594467, "step": 585 }, { "epoch": 0.39550863080274845, "grad_norm": 1.8729442358016968, "learning_rate": 6.362994350282486e-05, "loss": 16.9125, "mean_token_accuracy": 0.708356649428606, "step": 590 }, { "epoch": 0.39886039886039887, "grad_norm": 2.099592685699463, "learning_rate": 6.327683615819209e-05, "loss": 17.542, "mean_token_accuracy": 0.6888726130127907, "step": 595 }, { "epoch": 0.4022121669180493, "grad_norm": 1.6204314231872559, "learning_rate": 6.292372881355933e-05, "loss": 16.9305, "mean_token_accuracy": 0.7038852870464325, "step": 600 }, { "epoch": 0.4055639349756997, "grad_norm": 2.12034010887146, "learning_rate": 6.257062146892656e-05, "loss": 17.0389, "mean_token_accuracy": 0.704576326906681, "step": 605 }, { "epoch": 0.4089157030333501, "grad_norm": 1.6821502447128296, "learning_rate": 6.221751412429379e-05, "loss": 16.788, "mean_token_accuracy": 0.7000284940004349, "step": 610 }, { "epoch": 0.4122674710910005, "grad_norm": 1.8137435913085938, "learning_rate": 6.186440677966102e-05, "loss": 17.5926, "mean_token_accuracy": 0.6961537927389145, "step": 615 }, { "epoch": 0.4156192391486509, "grad_norm": 1.6652235984802246, "learning_rate": 6.151129943502825e-05, "loss": 17.3539, "mean_token_accuracy": 0.7028377398848533, "step": 620 }, { "epoch": 0.4189710072063013, "grad_norm": 1.766480803489685, "learning_rate": 6.115819209039548e-05, "loss": 17.529, "mean_token_accuracy": 0.6905739739537239, "step": 625 }, { "epoch": 0.42232277526395173, "grad_norm": 1.6319854259490967, "learning_rate": 6.080508474576272e-05, "loss": 16.9847, "mean_token_accuracy": 0.7060947254300117, "step": 630 }, { "epoch": 0.42567454332160215, "grad_norm": 2.1006696224212646, "learning_rate": 6.045197740112994e-05, "loss": 16.9317, "mean_token_accuracy": 0.7015593230724335, "step": 635 }, { "epoch": 0.42902631137925257, "grad_norm": 1.7353427410125732, "learning_rate": 6.009887005649718e-05, "loss": 17.4744, "mean_token_accuracy": 0.7001501567661762, "step": 640 }, { "epoch": 0.432378079436903, "grad_norm": 1.9449700117111206, "learning_rate": 5.974576271186441e-05, "loss": 16.8705, "mean_token_accuracy": 0.7026407413184643, "step": 645 }, { "epoch": 0.4357298474945534, "grad_norm": 1.6030067205429077, "learning_rate": 5.9392655367231644e-05, "loss": 16.8924, "mean_token_accuracy": 0.702277285605669, "step": 650 }, { "epoch": 0.43908161555220376, "grad_norm": 1.5722424983978271, "learning_rate": 5.903954802259888e-05, "loss": 17.364, "mean_token_accuracy": 0.6959278948605061, "step": 655 }, { "epoch": 0.4424333836098542, "grad_norm": 1.8168216943740845, "learning_rate": 5.86864406779661e-05, "loss": 16.704, "mean_token_accuracy": 0.7045813865959645, "step": 660 }, { "epoch": 0.4457851516675046, "grad_norm": 1.905402660369873, "learning_rate": 5.833333333333334e-05, "loss": 16.8896, "mean_token_accuracy": 0.7026248089969158, "step": 665 }, { "epoch": 0.449136919725155, "grad_norm": 1.7437454462051392, "learning_rate": 5.798022598870056e-05, "loss": 17.0496, "mean_token_accuracy": 0.702862861007452, "step": 670 }, { "epoch": 0.45248868778280543, "grad_norm": 1.7496871948242188, "learning_rate": 5.76271186440678e-05, "loss": 16.7024, "mean_token_accuracy": 0.7073140636086463, "step": 675 }, { "epoch": 0.45584045584045585, "grad_norm": 1.6521803140640259, "learning_rate": 5.727401129943503e-05, "loss": 17.4437, "mean_token_accuracy": 0.6910906590521335, "step": 680 }, { "epoch": 0.45919222389810627, "grad_norm": 1.7904677391052246, "learning_rate": 5.6920903954802264e-05, "loss": 17.4803, "mean_token_accuracy": 0.6987466789782047, "step": 685 }, { "epoch": 0.4625439919557567, "grad_norm": 2.4545388221740723, "learning_rate": 5.65677966101695e-05, "loss": 17.2987, "mean_token_accuracy": 0.699196208268404, "step": 690 }, { "epoch": 0.46589576001340705, "grad_norm": 1.6428866386413574, "learning_rate": 5.6214689265536723e-05, "loss": 16.7636, "mean_token_accuracy": 0.7029999569058418, "step": 695 }, { "epoch": 0.46924752807105746, "grad_norm": 1.9685977697372437, "learning_rate": 5.586158192090396e-05, "loss": 17.3887, "mean_token_accuracy": 0.6938736639916897, "step": 700 }, { "epoch": 0.4725992961287079, "grad_norm": 1.5567928552627563, "learning_rate": 5.550847457627118e-05, "loss": 17.1879, "mean_token_accuracy": 0.7024729043245316, "step": 705 }, { "epoch": 0.4759510641863583, "grad_norm": 1.6846567392349243, "learning_rate": 5.515536723163842e-05, "loss": 16.8679, "mean_token_accuracy": 0.7025640495121479, "step": 710 }, { "epoch": 0.4793028322440087, "grad_norm": 1.6596832275390625, "learning_rate": 5.480225988700565e-05, "loss": 16.7137, "mean_token_accuracy": 0.7031160019338131, "step": 715 }, { "epoch": 0.48265460030165913, "grad_norm": 2.04453444480896, "learning_rate": 5.4449152542372885e-05, "loss": 17.0646, "mean_token_accuracy": 0.7018779084086418, "step": 720 }, { "epoch": 0.48600636835930955, "grad_norm": 1.7244528532028198, "learning_rate": 5.409604519774012e-05, "loss": 17.1897, "mean_token_accuracy": 0.6981223806738853, "step": 725 }, { "epoch": 0.48935813641695997, "grad_norm": 1.6929802894592285, "learning_rate": 5.3742937853107344e-05, "loss": 17.2678, "mean_token_accuracy": 0.6996262572705746, "step": 730 }, { "epoch": 0.4927099044746104, "grad_norm": 1.7945303916931152, "learning_rate": 5.338983050847458e-05, "loss": 17.1465, "mean_token_accuracy": 0.7002299666404724, "step": 735 }, { "epoch": 0.49606167253226074, "grad_norm": 1.5936013460159302, "learning_rate": 5.30367231638418e-05, "loss": 17.0265, "mean_token_accuracy": 0.6998031720519066, "step": 740 }, { "epoch": 0.49941344058991116, "grad_norm": 1.553004264831543, "learning_rate": 5.268361581920904e-05, "loss": 16.7301, "mean_token_accuracy": 0.7022854961454869, "step": 745 }, { "epoch": 0.5027652086475616, "grad_norm": 1.7667690515518188, "learning_rate": 5.2330508474576275e-05, "loss": 16.8576, "mean_token_accuracy": 0.7085686258971691, "step": 750 }, { "epoch": 0.5027652086475616, "eval_loss": 1.0600364208221436, "eval_mean_token_accuracy": 0.7049777010093035, "eval_runtime": 1736.5707, "eval_samples_per_second": 1.392, "eval_steps_per_second": 0.174, "step": 750 }, { "epoch": 0.506116976705212, "grad_norm": 1.4901829957962036, "learning_rate": 5.1977401129943505e-05, "loss": 17.0004, "mean_token_accuracy": 0.6990960523486137, "step": 755 }, { "epoch": 0.5094687447628624, "grad_norm": 1.8451662063598633, "learning_rate": 5.162429378531074e-05, "loss": 17.2012, "mean_token_accuracy": 0.7007680244743824, "step": 760 }, { "epoch": 0.5128205128205128, "grad_norm": 1.6952011585235596, "learning_rate": 5.1271186440677964e-05, "loss": 17.612, "mean_token_accuracy": 0.6927438467741013, "step": 765 }, { "epoch": 0.5161722808781632, "grad_norm": 1.7307817935943604, "learning_rate": 5.09180790960452e-05, "loss": 16.8776, "mean_token_accuracy": 0.706513649225235, "step": 770 }, { "epoch": 0.5195240489358136, "grad_norm": 1.6692585945129395, "learning_rate": 5.056497175141243e-05, "loss": 17.0364, "mean_token_accuracy": 0.704279126226902, "step": 775 }, { "epoch": 0.5228758169934641, "grad_norm": 1.6963402032852173, "learning_rate": 5.0211864406779666e-05, "loss": 16.8957, "mean_token_accuracy": 0.7085353158414364, "step": 780 }, { "epoch": 0.5262275850511144, "grad_norm": 1.678458571434021, "learning_rate": 4.9858757062146896e-05, "loss": 17.7932, "mean_token_accuracy": 0.6964584030210972, "step": 785 }, { "epoch": 0.5295793531087649, "grad_norm": 1.7449827194213867, "learning_rate": 4.9505649717514125e-05, "loss": 16.8765, "mean_token_accuracy": 0.7036922007799149, "step": 790 }, { "epoch": 0.5329311211664153, "grad_norm": 1.7107524871826172, "learning_rate": 4.915254237288136e-05, "loss": 17.243, "mean_token_accuracy": 0.6997682720422744, "step": 795 }, { "epoch": 0.5362828892240656, "grad_norm": 1.6416223049163818, "learning_rate": 4.879943502824859e-05, "loss": 16.7253, "mean_token_accuracy": 0.7050332672894001, "step": 800 }, { "epoch": 0.5396346572817161, "grad_norm": 1.867213249206543, "learning_rate": 4.844632768361582e-05, "loss": 16.8566, "mean_token_accuracy": 0.7032786093652248, "step": 805 }, { "epoch": 0.5429864253393665, "grad_norm": 1.6539360284805298, "learning_rate": 4.809322033898305e-05, "loss": 16.6993, "mean_token_accuracy": 0.7117977932095527, "step": 810 }, { "epoch": 0.546338193397017, "grad_norm": 1.752715826034546, "learning_rate": 4.7740112994350286e-05, "loss": 17.5809, "mean_token_accuracy": 0.6992670528590679, "step": 815 }, { "epoch": 0.5496899614546673, "grad_norm": 1.806174397468567, "learning_rate": 4.7387005649717516e-05, "loss": 17.1588, "mean_token_accuracy": 0.6960965767502785, "step": 820 }, { "epoch": 0.5530417295123178, "grad_norm": 1.719764232635498, "learning_rate": 4.703389830508475e-05, "loss": 16.8685, "mean_token_accuracy": 0.7025568410754204, "step": 825 }, { "epoch": 0.5563934975699681, "grad_norm": 1.7800629138946533, "learning_rate": 4.668079096045198e-05, "loss": 16.8872, "mean_token_accuracy": 0.6994628652930259, "step": 830 }, { "epoch": 0.5597452656276186, "grad_norm": 1.7011103630065918, "learning_rate": 4.632768361581921e-05, "loss": 17.2342, "mean_token_accuracy": 0.7006913289427757, "step": 835 }, { "epoch": 0.563097033685269, "grad_norm": 1.6887695789337158, "learning_rate": 4.597457627118644e-05, "loss": 16.7385, "mean_token_accuracy": 0.7045929700136184, "step": 840 }, { "epoch": 0.5664488017429193, "grad_norm": 1.9496142864227295, "learning_rate": 4.562146892655367e-05, "loss": 16.8387, "mean_token_accuracy": 0.7083131410181522, "step": 845 }, { "epoch": 0.5698005698005698, "grad_norm": 1.7757388353347778, "learning_rate": 4.5268361581920906e-05, "loss": 17.3856, "mean_token_accuracy": 0.6994826771318913, "step": 850 }, { "epoch": 0.5731523378582202, "grad_norm": 1.7115302085876465, "learning_rate": 4.491525423728814e-05, "loss": 16.5993, "mean_token_accuracy": 0.7093915119767189, "step": 855 }, { "epoch": 0.5765041059158706, "grad_norm": 1.7968231439590454, "learning_rate": 4.456214689265537e-05, "loss": 16.8983, "mean_token_accuracy": 0.7087731070816516, "step": 860 }, { "epoch": 0.579855873973521, "grad_norm": 1.6066899299621582, "learning_rate": 4.42090395480226e-05, "loss": 16.7126, "mean_token_accuracy": 0.7053335346281528, "step": 865 }, { "epoch": 0.5832076420311715, "grad_norm": 1.6380205154418945, "learning_rate": 4.385593220338983e-05, "loss": 17.0037, "mean_token_accuracy": 0.7038719221949578, "step": 870 }, { "epoch": 0.5865594100888218, "grad_norm": 1.8956695795059204, "learning_rate": 4.350282485875706e-05, "loss": 16.9679, "mean_token_accuracy": 0.6983371920883655, "step": 875 }, { "epoch": 0.5899111781464723, "grad_norm": 1.625135064125061, "learning_rate": 4.314971751412429e-05, "loss": 17.0642, "mean_token_accuracy": 0.7067640118300915, "step": 880 }, { "epoch": 0.5932629462041227, "grad_norm": 1.6344581842422485, "learning_rate": 4.279661016949153e-05, "loss": 16.3079, "mean_token_accuracy": 0.7225491903722286, "step": 885 }, { "epoch": 0.596614714261773, "grad_norm": 1.7680976390838623, "learning_rate": 4.244350282485876e-05, "loss": 16.7187, "mean_token_accuracy": 0.7041032016277313, "step": 890 }, { "epoch": 0.5999664823194235, "grad_norm": 1.8056613206863403, "learning_rate": 4.209039548022599e-05, "loss": 17.3536, "mean_token_accuracy": 0.6975419208407402, "step": 895 }, { "epoch": 0.6033182503770739, "grad_norm": 1.8398966789245605, "learning_rate": 4.173728813559322e-05, "loss": 16.6245, "mean_token_accuracy": 0.7088275127112865, "step": 900 }, { "epoch": 0.6066700184347243, "grad_norm": 1.8332566022872925, "learning_rate": 4.138418079096045e-05, "loss": 17.0128, "mean_token_accuracy": 0.7018843114376068, "step": 905 }, { "epoch": 0.6100217864923747, "grad_norm": 1.6582337617874146, "learning_rate": 4.103107344632768e-05, "loss": 16.8948, "mean_token_accuracy": 0.7051651798188686, "step": 910 }, { "epoch": 0.6133735545500252, "grad_norm": 1.7373839616775513, "learning_rate": 4.067796610169492e-05, "loss": 16.9138, "mean_token_accuracy": 0.7022108249366283, "step": 915 }, { "epoch": 0.6167253226076755, "grad_norm": 1.6373577117919922, "learning_rate": 4.0324858757062154e-05, "loss": 17.0573, "mean_token_accuracy": 0.7042267486453057, "step": 920 }, { "epoch": 0.620077090665326, "grad_norm": 1.581024408340454, "learning_rate": 3.997175141242938e-05, "loss": 16.6234, "mean_token_accuracy": 0.7054463028907776, "step": 925 }, { "epoch": 0.6234288587229764, "grad_norm": 1.6900616884231567, "learning_rate": 3.961864406779661e-05, "loss": 17.0468, "mean_token_accuracy": 0.7014504976570606, "step": 930 }, { "epoch": 0.6267806267806267, "grad_norm": 1.6560430526733398, "learning_rate": 3.926553672316384e-05, "loss": 16.909, "mean_token_accuracy": 0.7064756542444229, "step": 935 }, { "epoch": 0.6301323948382772, "grad_norm": 1.8687000274658203, "learning_rate": 3.891242937853107e-05, "loss": 17.0047, "mean_token_accuracy": 0.7055176287889481, "step": 940 }, { "epoch": 0.6334841628959276, "grad_norm": 1.777716040611267, "learning_rate": 3.855932203389831e-05, "loss": 16.556, "mean_token_accuracy": 0.7047871246933937, "step": 945 }, { "epoch": 0.636835930953578, "grad_norm": 1.6830016374588013, "learning_rate": 3.820621468926554e-05, "loss": 16.5832, "mean_token_accuracy": 0.7049862682819367, "step": 950 }, { "epoch": 0.6401876990112284, "grad_norm": 1.5959638357162476, "learning_rate": 3.7853107344632774e-05, "loss": 16.8336, "mean_token_accuracy": 0.7072055459022522, "step": 955 }, { "epoch": 0.6435394670688789, "grad_norm": 1.82794189453125, "learning_rate": 3.7500000000000003e-05, "loss": 16.6644, "mean_token_accuracy": 0.7058505766093731, "step": 960 }, { "epoch": 0.6468912351265292, "grad_norm": 1.6554478406906128, "learning_rate": 3.714689265536723e-05, "loss": 16.2796, "mean_token_accuracy": 0.7101977132260799, "step": 965 }, { "epoch": 0.6502430031841796, "grad_norm": 1.8698370456695557, "learning_rate": 3.679378531073446e-05, "loss": 16.1934, "mean_token_accuracy": 0.7142874717712402, "step": 970 }, { "epoch": 0.6535947712418301, "grad_norm": 1.8040566444396973, "learning_rate": 3.644067796610169e-05, "loss": 16.5345, "mean_token_accuracy": 0.7125143676996231, "step": 975 }, { "epoch": 0.6569465392994804, "grad_norm": 1.6644558906555176, "learning_rate": 3.608757062146893e-05, "loss": 16.508, "mean_token_accuracy": 0.7078846462070942, "step": 980 }, { "epoch": 0.6602983073571309, "grad_norm": 1.7228506803512573, "learning_rate": 3.573446327683616e-05, "loss": 16.8474, "mean_token_accuracy": 0.7084795109927654, "step": 985 }, { "epoch": 0.6636500754147813, "grad_norm": 1.486241102218628, "learning_rate": 3.5381355932203394e-05, "loss": 17.1453, "mean_token_accuracy": 0.6975291892886162, "step": 990 }, { "epoch": 0.6670018434724317, "grad_norm": 1.7130765914916992, "learning_rate": 3.5028248587570624e-05, "loss": 16.458, "mean_token_accuracy": 0.7106956362724304, "step": 995 }, { "epoch": 0.6703536115300821, "grad_norm": 1.863926649093628, "learning_rate": 3.467514124293785e-05, "loss": 17.3095, "mean_token_accuracy": 0.6962033234536648, "step": 1000 }, { "epoch": 0.6737053795877326, "grad_norm": 1.6535072326660156, "learning_rate": 3.432203389830508e-05, "loss": 16.6846, "mean_token_accuracy": 0.7084034703671932, "step": 1005 }, { "epoch": 0.6770571476453829, "grad_norm": 1.7278594970703125, "learning_rate": 3.396892655367232e-05, "loss": 16.9805, "mean_token_accuracy": 0.7026786416769027, "step": 1010 }, { "epoch": 0.6804089157030333, "grad_norm": 1.9055004119873047, "learning_rate": 3.361581920903955e-05, "loss": 17.2562, "mean_token_accuracy": 0.6977267302572727, "step": 1015 }, { "epoch": 0.6837606837606838, "grad_norm": 1.6398614645004272, "learning_rate": 3.326271186440678e-05, "loss": 17.3378, "mean_token_accuracy": 0.6958214737474918, "step": 1020 }, { "epoch": 0.6871124518183341, "grad_norm": 1.926950454711914, "learning_rate": 3.2909604519774014e-05, "loss": 16.6536, "mean_token_accuracy": 0.7083842910826206, "step": 1025 }, { "epoch": 0.6904642198759846, "grad_norm": 1.8061659336090088, "learning_rate": 3.2556497175141244e-05, "loss": 16.643, "mean_token_accuracy": 0.7093963578343392, "step": 1030 }, { "epoch": 0.693815987933635, "grad_norm": 1.6816084384918213, "learning_rate": 3.2203389830508473e-05, "loss": 16.9696, "mean_token_accuracy": 0.7000316813588142, "step": 1035 }, { "epoch": 0.6971677559912854, "grad_norm": 1.630842685699463, "learning_rate": 3.185028248587571e-05, "loss": 16.587, "mean_token_accuracy": 0.7107978977262974, "step": 1040 }, { "epoch": 0.7005195240489358, "grad_norm": 1.755123257637024, "learning_rate": 3.149717514124294e-05, "loss": 17.0736, "mean_token_accuracy": 0.7017260067164898, "step": 1045 }, { "epoch": 0.7038712921065863, "grad_norm": 1.4850029945373535, "learning_rate": 3.114406779661017e-05, "loss": 16.3165, "mean_token_accuracy": 0.7119720429182053, "step": 1050 }, { "epoch": 0.7072230601642366, "grad_norm": 1.916961908340454, "learning_rate": 3.0790960451977405e-05, "loss": 17.0237, "mean_token_accuracy": 0.6976533338427544, "step": 1055 }, { "epoch": 0.710574828221887, "grad_norm": 1.5003294944763184, "learning_rate": 3.043785310734463e-05, "loss": 16.8504, "mean_token_accuracy": 0.7056308597326278, "step": 1060 }, { "epoch": 0.7139265962795375, "grad_norm": 1.9166836738586426, "learning_rate": 3.0084745762711864e-05, "loss": 16.8231, "mean_token_accuracy": 0.7023352533578873, "step": 1065 }, { "epoch": 0.7172783643371878, "grad_norm": 1.7789411544799805, "learning_rate": 2.97316384180791e-05, "loss": 17.3132, "mean_token_accuracy": 0.6994914725422859, "step": 1070 }, { "epoch": 0.7206301323948383, "grad_norm": 1.7289875745773315, "learning_rate": 2.937853107344633e-05, "loss": 17.3902, "mean_token_accuracy": 0.69447166249156, "step": 1075 }, { "epoch": 0.7239819004524887, "grad_norm": 1.4835467338562012, "learning_rate": 2.902542372881356e-05, "loss": 16.751, "mean_token_accuracy": 0.7052346661686897, "step": 1080 }, { "epoch": 0.7273336685101391, "grad_norm": 1.5802119970321655, "learning_rate": 2.8672316384180792e-05, "loss": 16.6574, "mean_token_accuracy": 0.7059398606419564, "step": 1085 }, { "epoch": 0.7306854365677895, "grad_norm": 1.8420851230621338, "learning_rate": 2.8319209039548022e-05, "loss": 16.9315, "mean_token_accuracy": 0.7063411138951778, "step": 1090 }, { "epoch": 0.7340372046254399, "grad_norm": 1.7593777179718018, "learning_rate": 2.7966101694915255e-05, "loss": 16.8653, "mean_token_accuracy": 0.7089171193540096, "step": 1095 }, { "epoch": 0.7373889726830903, "grad_norm": 1.681443452835083, "learning_rate": 2.7612994350282488e-05, "loss": 16.9878, "mean_token_accuracy": 0.7057393230497837, "step": 1100 }, { "epoch": 0.7407407407407407, "grad_norm": 1.6064281463623047, "learning_rate": 2.725988700564972e-05, "loss": 16.6153, "mean_token_accuracy": 0.7038764618337154, "step": 1105 }, { "epoch": 0.7440925087983912, "grad_norm": 1.5632483959197998, "learning_rate": 2.690677966101695e-05, "loss": 16.0927, "mean_token_accuracy": 0.7171440742909908, "step": 1110 }, { "epoch": 0.7474442768560415, "grad_norm": 1.8588156700134277, "learning_rate": 2.6553672316384183e-05, "loss": 16.5765, "mean_token_accuracy": 0.7098327249288559, "step": 1115 }, { "epoch": 0.750796044913692, "grad_norm": 1.5576221942901611, "learning_rate": 2.6200564971751413e-05, "loss": 16.6568, "mean_token_accuracy": 0.7029327027499676, "step": 1120 }, { "epoch": 0.7541478129713424, "grad_norm": 1.645244836807251, "learning_rate": 2.5847457627118642e-05, "loss": 16.7294, "mean_token_accuracy": 0.7060277953743934, "step": 1125 }, { "epoch": 0.7574995810289928, "grad_norm": 1.4038984775543213, "learning_rate": 2.549435028248588e-05, "loss": 16.5925, "mean_token_accuracy": 0.7068064086139202, "step": 1130 }, { "epoch": 0.7608513490866432, "grad_norm": 1.7987641096115112, "learning_rate": 2.514124293785311e-05, "loss": 16.6834, "mean_token_accuracy": 0.7070130936801433, "step": 1135 }, { "epoch": 0.7642031171442936, "grad_norm": 1.5423444509506226, "learning_rate": 2.478813559322034e-05, "loss": 16.4551, "mean_token_accuracy": 0.7121224895119667, "step": 1140 }, { "epoch": 0.767554885201944, "grad_norm": 1.7546942234039307, "learning_rate": 2.443502824858757e-05, "loss": 16.9741, "mean_token_accuracy": 0.7010989025235176, "step": 1145 }, { "epoch": 0.7709066532595944, "grad_norm": 1.8481935262680054, "learning_rate": 2.4081920903954803e-05, "loss": 16.6323, "mean_token_accuracy": 0.7058765202760696, "step": 1150 }, { "epoch": 0.7742584213172449, "grad_norm": 1.6855909824371338, "learning_rate": 2.3728813559322036e-05, "loss": 16.6844, "mean_token_accuracy": 0.7119428858160972, "step": 1155 }, { "epoch": 0.7776101893748952, "grad_norm": 1.9828130006790161, "learning_rate": 2.3375706214689266e-05, "loss": 16.866, "mean_token_accuracy": 0.7036800056695938, "step": 1160 }, { "epoch": 0.7809619574325457, "grad_norm": 1.5005120038986206, "learning_rate": 2.30225988700565e-05, "loss": 16.3539, "mean_token_accuracy": 0.711839384585619, "step": 1165 }, { "epoch": 0.7843137254901961, "grad_norm": 2.262735366821289, "learning_rate": 2.266949152542373e-05, "loss": 16.4102, "mean_token_accuracy": 0.7110463745892048, "step": 1170 }, { "epoch": 0.7876654935478465, "grad_norm": 1.6699568033218384, "learning_rate": 2.231638418079096e-05, "loss": 17.1027, "mean_token_accuracy": 0.7031991191208362, "step": 1175 }, { "epoch": 0.7910172616054969, "grad_norm": 1.6248890161514282, "learning_rate": 2.196327683615819e-05, "loss": 16.3399, "mean_token_accuracy": 0.7143234215676785, "step": 1180 }, { "epoch": 0.7943690296631473, "grad_norm": 1.7570775747299194, "learning_rate": 2.1610169491525427e-05, "loss": 16.2255, "mean_token_accuracy": 0.7123358778655529, "step": 1185 }, { "epoch": 0.7977207977207977, "grad_norm": 1.9391677379608154, "learning_rate": 2.1257062146892657e-05, "loss": 16.3472, "mean_token_accuracy": 0.711616413295269, "step": 1190 }, { "epoch": 0.8010725657784481, "grad_norm": 1.8997981548309326, "learning_rate": 2.0903954802259886e-05, "loss": 16.5601, "mean_token_accuracy": 0.7071553356945515, "step": 1195 }, { "epoch": 0.8044243338360986, "grad_norm": 1.6094359159469604, "learning_rate": 2.055084745762712e-05, "loss": 16.622, "mean_token_accuracy": 0.7043877936899662, "step": 1200 }, { "epoch": 0.8077761018937489, "grad_norm": 1.7940973043441772, "learning_rate": 2.0197740112994352e-05, "loss": 16.6535, "mean_token_accuracy": 0.705554535984993, "step": 1205 }, { "epoch": 0.8111278699513994, "grad_norm": 1.6890041828155518, "learning_rate": 1.984463276836158e-05, "loss": 17.2328, "mean_token_accuracy": 0.6988375537097454, "step": 1210 }, { "epoch": 0.8144796380090498, "grad_norm": 1.5568735599517822, "learning_rate": 1.9491525423728814e-05, "loss": 16.9753, "mean_token_accuracy": 0.7015632651746273, "step": 1215 }, { "epoch": 0.8178314060667002, "grad_norm": 1.7157835960388184, "learning_rate": 1.9138418079096047e-05, "loss": 16.3668, "mean_token_accuracy": 0.7098449252545833, "step": 1220 }, { "epoch": 0.8211831741243506, "grad_norm": 1.7175644636154175, "learning_rate": 1.8785310734463277e-05, "loss": 16.8061, "mean_token_accuracy": 0.7032932281494141, "step": 1225 }, { "epoch": 0.824534942182001, "grad_norm": 1.7225829362869263, "learning_rate": 1.843220338983051e-05, "loss": 16.5716, "mean_token_accuracy": 0.7074852548539639, "step": 1230 }, { "epoch": 0.8278867102396514, "grad_norm": 1.8654727935791016, "learning_rate": 1.8079096045197743e-05, "loss": 16.8172, "mean_token_accuracy": 0.7035241700708866, "step": 1235 }, { "epoch": 0.8312384782973018, "grad_norm": 1.9604694843292236, "learning_rate": 1.7725988700564972e-05, "loss": 16.2992, "mean_token_accuracy": 0.714275274425745, "step": 1240 }, { "epoch": 0.8345902463549523, "grad_norm": 1.7569185495376587, "learning_rate": 1.7372881355932205e-05, "loss": 16.6269, "mean_token_accuracy": 0.7052666112780571, "step": 1245 }, { "epoch": 0.8379420144126026, "grad_norm": 1.6537069082260132, "learning_rate": 1.7019774011299435e-05, "loss": 16.5978, "mean_token_accuracy": 0.708269502967596, "step": 1250 }, { "epoch": 0.8412937824702531, "grad_norm": 1.8623359203338623, "learning_rate": 1.6666666666666667e-05, "loss": 16.1831, "mean_token_accuracy": 0.7164609245955944, "step": 1255 }, { "epoch": 0.8446455505279035, "grad_norm": 1.7004101276397705, "learning_rate": 1.63135593220339e-05, "loss": 16.9611, "mean_token_accuracy": 0.7057129152119159, "step": 1260 }, { "epoch": 0.8479973185855538, "grad_norm": 1.8294973373413086, "learning_rate": 1.596045197740113e-05, "loss": 16.8036, "mean_token_accuracy": 0.7046464517712593, "step": 1265 }, { "epoch": 0.8513490866432043, "grad_norm": 1.7992702722549438, "learning_rate": 1.5607344632768363e-05, "loss": 16.139, "mean_token_accuracy": 0.7126708298921585, "step": 1270 }, { "epoch": 0.8547008547008547, "grad_norm": 2.033846855163574, "learning_rate": 1.5254237288135596e-05, "loss": 16.49, "mean_token_accuracy": 0.707030464708805, "step": 1275 }, { "epoch": 0.8580526227585051, "grad_norm": 1.690617561340332, "learning_rate": 1.4901129943502825e-05, "loss": 16.7829, "mean_token_accuracy": 0.7026272863149643, "step": 1280 }, { "epoch": 0.8614043908161555, "grad_norm": 1.7161706686019897, "learning_rate": 1.4548022598870056e-05, "loss": 16.4907, "mean_token_accuracy": 0.7054763376712799, "step": 1285 }, { "epoch": 0.864756158873806, "grad_norm": 1.5910500288009644, "learning_rate": 1.419491525423729e-05, "loss": 16.3073, "mean_token_accuracy": 0.7165283918380737, "step": 1290 }, { "epoch": 0.8681079269314563, "grad_norm": 1.5939749479293823, "learning_rate": 1.384180790960452e-05, "loss": 16.6524, "mean_token_accuracy": 0.705347529053688, "step": 1295 }, { "epoch": 0.8714596949891068, "grad_norm": 1.7478996515274048, "learning_rate": 1.3488700564971752e-05, "loss": 17.1832, "mean_token_accuracy": 0.6956523738801479, "step": 1300 }, { "epoch": 0.8748114630467572, "grad_norm": 1.6442205905914307, "learning_rate": 1.3135593220338985e-05, "loss": 16.3978, "mean_token_accuracy": 0.7132278561592102, "step": 1305 }, { "epoch": 0.8781632311044075, "grad_norm": 1.7201565504074097, "learning_rate": 1.2782485875706216e-05, "loss": 16.3159, "mean_token_accuracy": 0.711051919311285, "step": 1310 }, { "epoch": 0.881514999162058, "grad_norm": 1.829209327697754, "learning_rate": 1.2429378531073447e-05, "loss": 16.7987, "mean_token_accuracy": 0.7058401651680469, "step": 1315 }, { "epoch": 0.8848667672197084, "grad_norm": 1.4660886526107788, "learning_rate": 1.2076271186440678e-05, "loss": 16.7297, "mean_token_accuracy": 0.7092804253101349, "step": 1320 }, { "epoch": 0.8882185352773588, "grad_norm": 1.4927663803100586, "learning_rate": 1.172316384180791e-05, "loss": 15.9333, "mean_token_accuracy": 0.7158772744238376, "step": 1325 }, { "epoch": 0.8915703033350092, "grad_norm": 1.6522186994552612, "learning_rate": 1.137005649717514e-05, "loss": 16.4156, "mean_token_accuracy": 0.7134528748691082, "step": 1330 }, { "epoch": 0.8949220713926597, "grad_norm": 1.7809523344039917, "learning_rate": 1.1016949152542374e-05, "loss": 16.2625, "mean_token_accuracy": 0.7148336976766586, "step": 1335 }, { "epoch": 0.89827383945031, "grad_norm": 1.8860619068145752, "learning_rate": 1.0663841807909605e-05, "loss": 16.6187, "mean_token_accuracy": 0.7087382405996323, "step": 1340 }, { "epoch": 0.9016256075079605, "grad_norm": 1.854195475578308, "learning_rate": 1.0310734463276836e-05, "loss": 16.5843, "mean_token_accuracy": 0.7144103929400444, "step": 1345 }, { "epoch": 0.9049773755656109, "grad_norm": 1.7052239179611206, "learning_rate": 9.957627118644067e-06, "loss": 16.3345, "mean_token_accuracy": 0.7125584341585636, "step": 1350 }, { "epoch": 0.9083291436232612, "grad_norm": 1.5887420177459717, "learning_rate": 9.6045197740113e-06, "loss": 16.2409, "mean_token_accuracy": 0.7080107174813748, "step": 1355 }, { "epoch": 0.9116809116809117, "grad_norm": 1.6052732467651367, "learning_rate": 9.251412429378532e-06, "loss": 16.2373, "mean_token_accuracy": 0.7137157171964645, "step": 1360 }, { "epoch": 0.9150326797385621, "grad_norm": 1.7612617015838623, "learning_rate": 8.898305084745763e-06, "loss": 16.0292, "mean_token_accuracy": 0.7181592255830764, "step": 1365 }, { "epoch": 0.9183844477962125, "grad_norm": 1.8271749019622803, "learning_rate": 8.545197740112996e-06, "loss": 16.8757, "mean_token_accuracy": 0.701992305368185, "step": 1370 }, { "epoch": 0.9217362158538629, "grad_norm": 1.6350926160812378, "learning_rate": 8.192090395480225e-06, "loss": 16.6061, "mean_token_accuracy": 0.7089238859713077, "step": 1375 }, { "epoch": 0.9250879839115134, "grad_norm": 1.7321621179580688, "learning_rate": 7.838983050847458e-06, "loss": 16.2532, "mean_token_accuracy": 0.7115737572312355, "step": 1380 }, { "epoch": 0.9284397519691637, "grad_norm": 1.8958040475845337, "learning_rate": 7.48587570621469e-06, "loss": 16.5068, "mean_token_accuracy": 0.7108790181577206, "step": 1385 }, { "epoch": 0.9317915200268141, "grad_norm": 1.629992127418518, "learning_rate": 7.1327683615819206e-06, "loss": 16.2367, "mean_token_accuracy": 0.7134776934981346, "step": 1390 }, { "epoch": 0.9351432880844646, "grad_norm": 1.904123067855835, "learning_rate": 6.779661016949153e-06, "loss": 16.3444, "mean_token_accuracy": 0.7045241884887219, "step": 1395 }, { "epoch": 0.9384950561421149, "grad_norm": 1.6319600343704224, "learning_rate": 6.426553672316385e-06, "loss": 16.3, "mean_token_accuracy": 0.7118948072195053, "step": 1400 }, { "epoch": 0.9418468241997654, "grad_norm": 1.6921709775924683, "learning_rate": 6.073446327683617e-06, "loss": 16.5816, "mean_token_accuracy": 0.7079687170684338, "step": 1405 }, { "epoch": 0.9451985922574158, "grad_norm": 1.636551856994629, "learning_rate": 5.720338983050848e-06, "loss": 16.785, "mean_token_accuracy": 0.7054948009550571, "step": 1410 }, { "epoch": 0.9485503603150662, "grad_norm": 1.6171858310699463, "learning_rate": 5.367231638418079e-06, "loss": 16.6877, "mean_token_accuracy": 0.7033485405147075, "step": 1415 }, { "epoch": 0.9519021283727166, "grad_norm": 1.6833641529083252, "learning_rate": 5.014124293785311e-06, "loss": 16.5803, "mean_token_accuracy": 0.706027402728796, "step": 1420 }, { "epoch": 0.9552538964303671, "grad_norm": 2.0238494873046875, "learning_rate": 4.6610169491525425e-06, "loss": 16.4305, "mean_token_accuracy": 0.7110757566988468, "step": 1425 }, { "epoch": 0.9586056644880174, "grad_norm": 1.5262683629989624, "learning_rate": 4.307909604519774e-06, "loss": 16.105, "mean_token_accuracy": 0.7173994883894921, "step": 1430 }, { "epoch": 0.9619574325456678, "grad_norm": 1.6822128295898438, "learning_rate": 3.954802259887006e-06, "loss": 17.0064, "mean_token_accuracy": 0.7033144362270832, "step": 1435 }, { "epoch": 0.9653092006033183, "grad_norm": 2.1382946968078613, "learning_rate": 3.6016949152542374e-06, "loss": 16.6567, "mean_token_accuracy": 0.7085098147392273, "step": 1440 }, { "epoch": 0.9686609686609686, "grad_norm": 1.6137080192565918, "learning_rate": 3.248587570621469e-06, "loss": 16.4193, "mean_token_accuracy": 0.7077061600983143, "step": 1445 }, { "epoch": 0.9720127367186191, "grad_norm": 1.6318018436431885, "learning_rate": 2.8954802259887007e-06, "loss": 16.5904, "mean_token_accuracy": 0.7037704810500145, "step": 1450 }, { "epoch": 0.9753645047762695, "grad_norm": 1.6723519563674927, "learning_rate": 2.5423728813559323e-06, "loss": 16.351, "mean_token_accuracy": 0.715372896194458, "step": 1455 }, { "epoch": 0.9787162728339199, "grad_norm": 2.6915719509124756, "learning_rate": 2.189265536723164e-06, "loss": 16.5627, "mean_token_accuracy": 0.706637478619814, "step": 1460 }, { "epoch": 0.9820680408915703, "grad_norm": 1.9349390268325806, "learning_rate": 1.8361581920903956e-06, "loss": 16.7821, "mean_token_accuracy": 0.7010103747248649, "step": 1465 }, { "epoch": 0.9854198089492208, "grad_norm": 1.6685172319412231, "learning_rate": 1.4830508474576273e-06, "loss": 16.7016, "mean_token_accuracy": 0.7086931586265564, "step": 1470 }, { "epoch": 0.9887715770068711, "grad_norm": 1.7148998975753784, "learning_rate": 1.129943502824859e-06, "loss": 16.4809, "mean_token_accuracy": 0.7131018862128258, "step": 1475 }, { "epoch": 0.9921233450645215, "grad_norm": 1.8873836994171143, "learning_rate": 7.768361581920904e-07, "loss": 16.5183, "mean_token_accuracy": 0.7111847102642059, "step": 1480 }, { "epoch": 0.995475113122172, "grad_norm": 1.8390552997589111, "learning_rate": 4.2372881355932204e-07, "loss": 16.1742, "mean_token_accuracy": 0.7128683432936669, "step": 1485 }, { "epoch": 0.9988268811798223, "grad_norm": 1.8799461126327515, "learning_rate": 7.062146892655368e-08, "loss": 17.1633, "mean_token_accuracy": 0.6963419988751411, "step": 1490 } ], "logging_steps": 5, "max_steps": 1491, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 750, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5012213304045076e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }