diff --git "a/outputs/qlora-out/checkpoint-2655/trainer_state.json" "b/outputs/qlora-out/checkpoint-2655/trainer_state.json" new file mode 100644--- /dev/null +++ "b/outputs/qlora-out/checkpoint-2655/trainer_state.json" @@ -0,0 +1,18698 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.994915254237288, + "eval_steps": 295, + "global_step": 2655, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011296243998870376, + "grad_norm": 0.03433802351355553, + "learning_rate": 2e-05, + "loss": 1.1396, + "step": 1 + }, + { + "epoch": 0.0011296243998870376, + "eval_loss": 1.1312694549560547, + "eval_runtime": 554.2279, + "eval_samples_per_second": 17.652, + "eval_steps_per_second": 8.827, + "step": 1 + }, + { + "epoch": 0.0022592487997740753, + "grad_norm": 0.03576268255710602, + "learning_rate": 4e-05, + "loss": 1.1188, + "step": 2 + }, + { + "epoch": 0.003388873199661113, + "grad_norm": 0.03207174688577652, + "learning_rate": 6e-05, + "loss": 1.2194, + "step": 3 + }, + { + "epoch": 0.0045184975995481505, + "grad_norm": 0.03231927007436752, + "learning_rate": 8e-05, + "loss": 1.1735, + "step": 4 + }, + { + "epoch": 0.005648121999435188, + "grad_norm": 0.03306754678487778, + "learning_rate": 0.0001, + "loss": 1.1689, + "step": 5 + }, + { + "epoch": 0.006777746399322226, + "grad_norm": 0.035009413957595825, + "learning_rate": 0.00012, + "loss": 1.1808, + "step": 6 + }, + { + "epoch": 0.007907370799209263, + "grad_norm": 0.035900842398405075, + "learning_rate": 0.00014, + "loss": 1.0441, + "step": 7 + }, + { + "epoch": 0.009036995199096301, + "grad_norm": 0.028419604524970055, + "learning_rate": 0.00016, + "loss": 1.0858, + "step": 8 + }, + { + "epoch": 0.010166619598983339, + "grad_norm": 0.024826928973197937, + "learning_rate": 0.00018, + "loss": 1.1937, + "step": 9 + }, + { + "epoch": 0.011296243998870376, + "grad_norm": 0.026519587263464928, + "learning_rate": 0.0002, + "loss": 1.0726, + "step": 10 + }, + { + "epoch": 0.012425868398757414, + "grad_norm": 0.024713166058063507, + "learning_rate": 0.00019999992946277893, + "loss": 1.1191, + "step": 11 + }, + { + "epoch": 0.013555492798644452, + "grad_norm": 0.02494538575410843, + "learning_rate": 0.00019999971785121523, + "loss": 1.1751, + "step": 12 + }, + { + "epoch": 0.014685117198531489, + "grad_norm": 0.023202786222100258, + "learning_rate": 0.00019999936516560744, + "loss": 1.1801, + "step": 13 + }, + { + "epoch": 0.015814741598418527, + "grad_norm": 0.021448194980621338, + "learning_rate": 0.00019999887140645308, + "loss": 1.1676, + "step": 14 + }, + { + "epoch": 0.016944365998305563, + "grad_norm": 0.021449347957968712, + "learning_rate": 0.00019999823657444873, + "loss": 1.1224, + "step": 15 + }, + { + "epoch": 0.018073990398192602, + "grad_norm": 0.021438075229525566, + "learning_rate": 0.00019999746067049, + "loss": 1.0713, + "step": 16 + }, + { + "epoch": 0.019203614798079638, + "grad_norm": 0.020727725699543953, + "learning_rate": 0.00019999654369567147, + "loss": 1.1317, + "step": 17 + }, + { + "epoch": 0.020333239197966677, + "grad_norm": 0.023839153349399567, + "learning_rate": 0.00019999548565128678, + "loss": 0.994, + "step": 18 + }, + { + "epoch": 0.021462863597853713, + "grad_norm": 0.020854901522397995, + "learning_rate": 0.0001999942865388285, + "loss": 1.0086, + "step": 19 + }, + { + "epoch": 0.022592487997740753, + "grad_norm": 0.019373737275600433, + "learning_rate": 0.00019999294635998833, + "loss": 1.0284, + "step": 20 + }, + { + "epoch": 0.02372211239762779, + "grad_norm": 0.020383458584547043, + "learning_rate": 0.00019999146511665692, + "loss": 1.0926, + "step": 21 + }, + { + "epoch": 0.024851736797514828, + "grad_norm": 0.018995020538568497, + "learning_rate": 0.0001999898428109239, + "loss": 1.0333, + "step": 22 + }, + { + "epoch": 0.025981361197401864, + "grad_norm": 0.018192732706665993, + "learning_rate": 0.00019998807944507791, + "loss": 1.1471, + "step": 23 + }, + { + "epoch": 0.027110985597288903, + "grad_norm": 0.017383620142936707, + "learning_rate": 0.00019998617502160664, + "loss": 1.0417, + "step": 24 + }, + { + "epoch": 0.02824060999717594, + "grad_norm": 0.0169441569596529, + "learning_rate": 0.00019998412954319675, + "loss": 1.0375, + "step": 25 + }, + { + "epoch": 0.029370234397062978, + "grad_norm": 0.017483294010162354, + "learning_rate": 0.0001999819430127339, + "loss": 1.1357, + "step": 26 + }, + { + "epoch": 0.030499858796950014, + "grad_norm": 0.016769247129559517, + "learning_rate": 0.00019997961543330269, + "loss": 1.0641, + "step": 27 + }, + { + "epoch": 0.031629483196837054, + "grad_norm": 0.017466655001044273, + "learning_rate": 0.00019997714680818673, + "loss": 1.0919, + "step": 28 + }, + { + "epoch": 0.03275910759672409, + "grad_norm": 0.01684914343059063, + "learning_rate": 0.00019997453714086866, + "loss": 1.1458, + "step": 29 + }, + { + "epoch": 0.033888731996611125, + "grad_norm": 0.017144447192549706, + "learning_rate": 0.00019997178643503004, + "loss": 1.0499, + "step": 30 + }, + { + "epoch": 0.03501835639649816, + "grad_norm": 0.01914755254983902, + "learning_rate": 0.0001999688946945514, + "loss": 1.03, + "step": 31 + }, + { + "epoch": 0.036147980796385204, + "grad_norm": 0.017046675086021423, + "learning_rate": 0.00019996586192351225, + "loss": 1.0747, + "step": 32 + }, + { + "epoch": 0.03727760519627224, + "grad_norm": 0.01652969978749752, + "learning_rate": 0.00019996268812619107, + "loss": 1.1694, + "step": 33 + }, + { + "epoch": 0.038407229596159276, + "grad_norm": 0.016702843829989433, + "learning_rate": 0.00019995937330706526, + "loss": 1.0339, + "step": 34 + }, + { + "epoch": 0.03953685399604631, + "grad_norm": 0.016058262437582016, + "learning_rate": 0.00019995591747081122, + "loss": 1.1002, + "step": 35 + }, + { + "epoch": 0.040666478395933355, + "grad_norm": 0.01609026826918125, + "learning_rate": 0.0001999523206223042, + "loss": 1.0934, + "step": 36 + }, + { + "epoch": 0.04179610279582039, + "grad_norm": 0.017004741355776787, + "learning_rate": 0.00019994858276661844, + "loss": 1.0748, + "step": 37 + }, + { + "epoch": 0.042925727195707426, + "grad_norm": 0.017046496272087097, + "learning_rate": 0.00019994470390902712, + "loss": 1.0851, + "step": 38 + }, + { + "epoch": 0.04405535159559446, + "grad_norm": 0.016196010634303093, + "learning_rate": 0.0001999406840550023, + "loss": 0.9796, + "step": 39 + }, + { + "epoch": 0.045184975995481505, + "grad_norm": 0.01703856885433197, + "learning_rate": 0.000199936523210215, + "loss": 1.0511, + "step": 40 + }, + { + "epoch": 0.04631460039536854, + "grad_norm": 0.017249640077352524, + "learning_rate": 0.00019993222138053507, + "loss": 0.9521, + "step": 41 + }, + { + "epoch": 0.04744422479525558, + "grad_norm": 0.01665697991847992, + "learning_rate": 0.0001999277785720313, + "loss": 1.0732, + "step": 42 + }, + { + "epoch": 0.04857384919514261, + "grad_norm": 0.016691800206899643, + "learning_rate": 0.0001999231947909714, + "loss": 1.1504, + "step": 43 + }, + { + "epoch": 0.049703473595029656, + "grad_norm": 0.016521582379937172, + "learning_rate": 0.00019991847004382186, + "loss": 1.1022, + "step": 44 + }, + { + "epoch": 0.05083309799491669, + "grad_norm": 0.017150631174445152, + "learning_rate": 0.00019991360433724813, + "loss": 1.0843, + "step": 45 + }, + { + "epoch": 0.05196272239480373, + "grad_norm": 0.016666896641254425, + "learning_rate": 0.00019990859767811444, + "loss": 1.0328, + "step": 46 + }, + { + "epoch": 0.05309234679469076, + "grad_norm": 0.016831668093800545, + "learning_rate": 0.0001999034500734839, + "loss": 1.0285, + "step": 47 + }, + { + "epoch": 0.054221971194577806, + "grad_norm": 0.01656479388475418, + "learning_rate": 0.00019989816153061853, + "loss": 0.9825, + "step": 48 + }, + { + "epoch": 0.05535159559446484, + "grad_norm": 0.016812896355986595, + "learning_rate": 0.00019989273205697904, + "loss": 1.0912, + "step": 49 + }, + { + "epoch": 0.05648121999435188, + "grad_norm": 0.01740356534719467, + "learning_rate": 0.0001998871616602251, + "loss": 1.0601, + "step": 50 + }, + { + "epoch": 0.057610844394238914, + "grad_norm": 0.017337938770651817, + "learning_rate": 0.00019988145034821502, + "loss": 1.1482, + "step": 51 + }, + { + "epoch": 0.058740468794125957, + "grad_norm": 0.017546923831105232, + "learning_rate": 0.0001998755981290061, + "loss": 1.0472, + "step": 52 + }, + { + "epoch": 0.05987009319401299, + "grad_norm": 0.017129387706518173, + "learning_rate": 0.00019986960501085428, + "loss": 1.1083, + "step": 53 + }, + { + "epoch": 0.06099971759390003, + "grad_norm": 0.01688031479716301, + "learning_rate": 0.00019986347100221433, + "loss": 1.0589, + "step": 54 + }, + { + "epoch": 0.062129341993787064, + "grad_norm": 0.017208745703101158, + "learning_rate": 0.00019985719611173973, + "loss": 1.1292, + "step": 55 + }, + { + "epoch": 0.06325896639367411, + "grad_norm": 0.01749996654689312, + "learning_rate": 0.0001998507803482828, + "loss": 1.1252, + "step": 56 + }, + { + "epoch": 0.06438859079356114, + "grad_norm": 0.017861831933259964, + "learning_rate": 0.00019984422372089453, + "loss": 1.104, + "step": 57 + }, + { + "epoch": 0.06551821519344818, + "grad_norm": 0.01797177456319332, + "learning_rate": 0.00019983752623882462, + "loss": 1.0569, + "step": 58 + }, + { + "epoch": 0.06664783959333521, + "grad_norm": 0.01702985167503357, + "learning_rate": 0.00019983068791152152, + "loss": 1.0238, + "step": 59 + }, + { + "epoch": 0.06777746399322225, + "grad_norm": 0.01766786351799965, + "learning_rate": 0.00019982370874863236, + "loss": 1.0509, + "step": 60 + }, + { + "epoch": 0.06890708839310929, + "grad_norm": 0.017592614516615868, + "learning_rate": 0.00019981658876000298, + "loss": 1.0613, + "step": 61 + }, + { + "epoch": 0.07003671279299632, + "grad_norm": 0.018118126317858696, + "learning_rate": 0.00019980932795567782, + "loss": 1.1727, + "step": 62 + }, + { + "epoch": 0.07116633719288337, + "grad_norm": 0.017846032977104187, + "learning_rate": 0.00019980192634590007, + "loss": 1.0042, + "step": 63 + }, + { + "epoch": 0.07229596159277041, + "grad_norm": 0.018457984551787376, + "learning_rate": 0.00019979438394111145, + "loss": 1.0648, + "step": 64 + }, + { + "epoch": 0.07342558599265744, + "grad_norm": 0.019480090588331223, + "learning_rate": 0.0001997867007519524, + "loss": 1.0307, + "step": 65 + }, + { + "epoch": 0.07455521039254448, + "grad_norm": 0.018102938309311867, + "learning_rate": 0.00019977887678926195, + "loss": 1.1129, + "step": 66 + }, + { + "epoch": 0.07568483479243152, + "grad_norm": 0.017858000472187996, + "learning_rate": 0.00019977091206407768, + "loss": 1.1574, + "step": 67 + }, + { + "epoch": 0.07681445919231855, + "grad_norm": 0.019296329468488693, + "learning_rate": 0.0001997628065876358, + "loss": 1.046, + "step": 68 + }, + { + "epoch": 0.07794408359220559, + "grad_norm": 0.017915885895490646, + "learning_rate": 0.0001997545603713711, + "loss": 1.0357, + "step": 69 + }, + { + "epoch": 0.07907370799209262, + "grad_norm": 0.01840789057314396, + "learning_rate": 0.00019974617342691678, + "loss": 1.0383, + "step": 70 + }, + { + "epoch": 0.08020333239197967, + "grad_norm": 0.019900545477867126, + "learning_rate": 0.00019973764576610478, + "loss": 0.9994, + "step": 71 + }, + { + "epoch": 0.08133295679186671, + "grad_norm": 0.0189906544983387, + "learning_rate": 0.0001997289774009654, + "loss": 0.9418, + "step": 72 + }, + { + "epoch": 0.08246258119175374, + "grad_norm": 0.01873377151787281, + "learning_rate": 0.00019972016834372749, + "loss": 0.9937, + "step": 73 + }, + { + "epoch": 0.08359220559164078, + "grad_norm": 0.019596470519900322, + "learning_rate": 0.0001997112186068184, + "loss": 1.0887, + "step": 74 + }, + { + "epoch": 0.08472182999152782, + "grad_norm": 0.020303891971707344, + "learning_rate": 0.00019970212820286394, + "loss": 1.0142, + "step": 75 + }, + { + "epoch": 0.08585145439141485, + "grad_norm": 0.019804317504167557, + "learning_rate": 0.00019969289714468825, + "loss": 1.0394, + "step": 76 + }, + { + "epoch": 0.08698107879130189, + "grad_norm": 0.019536610692739487, + "learning_rate": 0.0001996835254453141, + "loss": 0.9576, + "step": 77 + }, + { + "epoch": 0.08811070319118892, + "grad_norm": 0.019902685657143593, + "learning_rate": 0.0001996740131179625, + "loss": 0.9835, + "step": 78 + }, + { + "epoch": 0.08924032759107597, + "grad_norm": 0.01986609399318695, + "learning_rate": 0.00019966436017605297, + "loss": 1.0597, + "step": 79 + }, + { + "epoch": 0.09036995199096301, + "grad_norm": 0.019435487687587738, + "learning_rate": 0.00019965456663320329, + "loss": 1.0863, + "step": 80 + }, + { + "epoch": 0.09149957639085005, + "grad_norm": 0.019000260159373283, + "learning_rate": 0.00019964463250322966, + "loss": 1.0935, + "step": 81 + }, + { + "epoch": 0.09262920079073708, + "grad_norm": 0.018888210877776146, + "learning_rate": 0.0001996345578001466, + "loss": 1.0399, + "step": 82 + }, + { + "epoch": 0.09375882519062412, + "grad_norm": 0.019765490666031837, + "learning_rate": 0.00019962434253816694, + "loss": 1.0265, + "step": 83 + }, + { + "epoch": 0.09488844959051115, + "grad_norm": 0.01926722563803196, + "learning_rate": 0.00019961398673170181, + "loss": 1.0307, + "step": 84 + }, + { + "epoch": 0.09601807399039819, + "grad_norm": 0.019572502002120018, + "learning_rate": 0.00019960349039536062, + "loss": 1.0217, + "step": 85 + }, + { + "epoch": 0.09714769839028523, + "grad_norm": 0.024138784036040306, + "learning_rate": 0.000199592853543951, + "loss": 1.1376, + "step": 86 + }, + { + "epoch": 0.09827732279017226, + "grad_norm": 0.03155818581581116, + "learning_rate": 0.0001995820761924788, + "loss": 1.1029, + "step": 87 + }, + { + "epoch": 0.09940694719005931, + "grad_norm": 0.019589390605688095, + "learning_rate": 0.00019957115835614816, + "loss": 1.0353, + "step": 88 + }, + { + "epoch": 0.10053657158994635, + "grad_norm": 0.020419439300894737, + "learning_rate": 0.00019956010005036133, + "loss": 1.0228, + "step": 89 + }, + { + "epoch": 0.10166619598983338, + "grad_norm": 0.02149847149848938, + "learning_rate": 0.00019954890129071876, + "loss": 1.1214, + "step": 90 + }, + { + "epoch": 0.10279582038972042, + "grad_norm": 0.01982825994491577, + "learning_rate": 0.00019953756209301903, + "loss": 1.0302, + "step": 91 + }, + { + "epoch": 0.10392544478960745, + "grad_norm": 0.01985330507159233, + "learning_rate": 0.00019952608247325885, + "loss": 1.0674, + "step": 92 + }, + { + "epoch": 0.10505506918949449, + "grad_norm": 0.020196454599499702, + "learning_rate": 0.00019951446244763309, + "loss": 1.0113, + "step": 93 + }, + { + "epoch": 0.10618469358938153, + "grad_norm": 0.020652327686548233, + "learning_rate": 0.00019950270203253454, + "loss": 1.0635, + "step": 94 + }, + { + "epoch": 0.10731431798926856, + "grad_norm": 0.020714478567242622, + "learning_rate": 0.00019949080124455416, + "loss": 1.0226, + "step": 95 + }, + { + "epoch": 0.10844394238915561, + "grad_norm": 0.021647842600941658, + "learning_rate": 0.000199478760100481, + "loss": 1.0575, + "step": 96 + }, + { + "epoch": 0.10957356678904265, + "grad_norm": 0.02076675370335579, + "learning_rate": 0.00019946657861730194, + "loss": 1.1146, + "step": 97 + }, + { + "epoch": 0.11070319118892968, + "grad_norm": 0.02103651687502861, + "learning_rate": 0.000199454256812202, + "loss": 0.9953, + "step": 98 + }, + { + "epoch": 0.11183281558881672, + "grad_norm": 0.02276523970067501, + "learning_rate": 0.00019944179470256405, + "loss": 1.021, + "step": 99 + }, + { + "epoch": 0.11296243998870376, + "grad_norm": 0.020957166329026222, + "learning_rate": 0.00019942919230596896, + "loss": 0.9838, + "step": 100 + }, + { + "epoch": 0.11409206438859079, + "grad_norm": 0.022394055500626564, + "learning_rate": 0.00019941644964019552, + "loss": 1.0169, + "step": 101 + }, + { + "epoch": 0.11522168878847783, + "grad_norm": 0.02139163948595524, + "learning_rate": 0.00019940356672322037, + "loss": 1.0788, + "step": 102 + }, + { + "epoch": 0.11635131318836486, + "grad_norm": 0.021381577476859093, + "learning_rate": 0.00019939054357321799, + "loss": 1.0669, + "step": 103 + }, + { + "epoch": 0.11748093758825191, + "grad_norm": 0.02302641049027443, + "learning_rate": 0.00019937738020856072, + "loss": 1.0122, + "step": 104 + }, + { + "epoch": 0.11861056198813895, + "grad_norm": 0.021372724324464798, + "learning_rate": 0.00019936407664781868, + "loss": 1.0974, + "step": 105 + }, + { + "epoch": 0.11974018638802598, + "grad_norm": 0.021260784938931465, + "learning_rate": 0.00019935063290975986, + "loss": 0.9996, + "step": 106 + }, + { + "epoch": 0.12086981078791302, + "grad_norm": 0.021557705476880074, + "learning_rate": 0.0001993370490133499, + "loss": 1.0215, + "step": 107 + }, + { + "epoch": 0.12199943518780006, + "grad_norm": 0.023252975195646286, + "learning_rate": 0.00019932332497775215, + "loss": 1.0908, + "step": 108 + }, + { + "epoch": 0.12312905958768709, + "grad_norm": 0.02185026742517948, + "learning_rate": 0.00019930946082232783, + "loss": 1.0751, + "step": 109 + }, + { + "epoch": 0.12425868398757413, + "grad_norm": 0.022223595529794693, + "learning_rate": 0.00019929545656663562, + "loss": 0.9737, + "step": 110 + }, + { + "epoch": 0.12538830838746118, + "grad_norm": 0.021415019407868385, + "learning_rate": 0.000199281312230432, + "loss": 1.0864, + "step": 111 + }, + { + "epoch": 0.12651793278734821, + "grad_norm": 0.02144046686589718, + "learning_rate": 0.000199267027833671, + "loss": 0.9984, + "step": 112 + }, + { + "epoch": 0.12764755718723525, + "grad_norm": 0.0225879717618227, + "learning_rate": 0.00019925260339650428, + "loss": 1.0685, + "step": 113 + }, + { + "epoch": 0.12877718158712229, + "grad_norm": 0.022809404879808426, + "learning_rate": 0.000199238038939281, + "loss": 1.0733, + "step": 114 + }, + { + "epoch": 0.12990680598700932, + "grad_norm": 0.023381488397717476, + "learning_rate": 0.00019922333448254786, + "loss": 1.0107, + "step": 115 + }, + { + "epoch": 0.13103643038689636, + "grad_norm": 0.022633766755461693, + "learning_rate": 0.00019920849004704914, + "loss": 0.9885, + "step": 116 + }, + { + "epoch": 0.1321660547867834, + "grad_norm": 0.02235741913318634, + "learning_rate": 0.00019919350565372656, + "loss": 1.0714, + "step": 117 + }, + { + "epoch": 0.13329567918667043, + "grad_norm": 0.02206304483115673, + "learning_rate": 0.00019917838132371923, + "loss": 1.0749, + "step": 118 + }, + { + "epoch": 0.13442530358655747, + "grad_norm": 0.022310512140393257, + "learning_rate": 0.0001991631170783637, + "loss": 1.0437, + "step": 119 + }, + { + "epoch": 0.1355549279864445, + "grad_norm": 0.021498341113328934, + "learning_rate": 0.00019914771293919395, + "loss": 1.0317, + "step": 120 + }, + { + "epoch": 0.13668455238633154, + "grad_norm": 0.021773051470518112, + "learning_rate": 0.0001991321689279413, + "loss": 1.0489, + "step": 121 + }, + { + "epoch": 0.13781417678621857, + "grad_norm": 0.021639568731188774, + "learning_rate": 0.0001991164850665343, + "loss": 0.9893, + "step": 122 + }, + { + "epoch": 0.1389438011861056, + "grad_norm": 0.022304847836494446, + "learning_rate": 0.00019910066137709896, + "loss": 1.0542, + "step": 123 + }, + { + "epoch": 0.14007342558599264, + "grad_norm": 0.022173380479216576, + "learning_rate": 0.0001990846978819584, + "loss": 1.0776, + "step": 124 + }, + { + "epoch": 0.1412030499858797, + "grad_norm": 0.023623231798410416, + "learning_rate": 0.00019906859460363307, + "loss": 1.1279, + "step": 125 + }, + { + "epoch": 0.14233267438576674, + "grad_norm": 0.022697214037179947, + "learning_rate": 0.0001990523515648406, + "loss": 0.9973, + "step": 126 + }, + { + "epoch": 0.14346229878565378, + "grad_norm": 0.02267601527273655, + "learning_rate": 0.00019903596878849568, + "loss": 1.1131, + "step": 127 + }, + { + "epoch": 0.14459192318554082, + "grad_norm": 0.02244328148663044, + "learning_rate": 0.0001990194462977103, + "loss": 1.0157, + "step": 128 + }, + { + "epoch": 0.14572154758542785, + "grad_norm": 0.02371121197938919, + "learning_rate": 0.00019900278411579344, + "loss": 0.9888, + "step": 129 + }, + { + "epoch": 0.1468511719853149, + "grad_norm": 0.0227819811552763, + "learning_rate": 0.00019898598226625119, + "loss": 1.0003, + "step": 130 + }, + { + "epoch": 0.14798079638520192, + "grad_norm": 0.02316221408545971, + "learning_rate": 0.00019896904077278663, + "loss": 1.0181, + "step": 131 + }, + { + "epoch": 0.14911042078508896, + "grad_norm": 0.022808320820331573, + "learning_rate": 0.00019895195965929994, + "loss": 1.0546, + "step": 132 + }, + { + "epoch": 0.150240045184976, + "grad_norm": 0.022865859791636467, + "learning_rate": 0.00019893473894988815, + "loss": 1.1513, + "step": 133 + }, + { + "epoch": 0.15136966958486303, + "grad_norm": 0.024973077699542046, + "learning_rate": 0.0001989173786688453, + "loss": 1.0109, + "step": 134 + }, + { + "epoch": 0.15249929398475007, + "grad_norm": 0.02237241342663765, + "learning_rate": 0.00019889987884066237, + "loss": 1.0991, + "step": 135 + }, + { + "epoch": 0.1536289183846371, + "grad_norm": 0.023280750960111618, + "learning_rate": 0.000198882239490027, + "loss": 1.0501, + "step": 136 + }, + { + "epoch": 0.15475854278452414, + "grad_norm": 0.022803954780101776, + "learning_rate": 0.00019886446064182396, + "loss": 1.0033, + "step": 137 + }, + { + "epoch": 0.15588816718441117, + "grad_norm": 0.02270156517624855, + "learning_rate": 0.0001988465423211346, + "loss": 1.0715, + "step": 138 + }, + { + "epoch": 0.1570177915842982, + "grad_norm": 0.023484455421566963, + "learning_rate": 0.00019882848455323704, + "loss": 1.1598, + "step": 139 + }, + { + "epoch": 0.15814741598418525, + "grad_norm": 0.02300065942108631, + "learning_rate": 0.00019881028736360622, + "loss": 1.0813, + "step": 140 + }, + { + "epoch": 0.15927704038407228, + "grad_norm": 0.02320142462849617, + "learning_rate": 0.00019879195077791376, + "loss": 1.0169, + "step": 141 + }, + { + "epoch": 0.16040666478395935, + "grad_norm": 0.02317328006029129, + "learning_rate": 0.00019877347482202785, + "loss": 1.0301, + "step": 142 + }, + { + "epoch": 0.16153628918384638, + "grad_norm": 0.02378895878791809, + "learning_rate": 0.0001987548595220133, + "loss": 0.977, + "step": 143 + }, + { + "epoch": 0.16266591358373342, + "grad_norm": 0.023976027965545654, + "learning_rate": 0.00019873610490413166, + "loss": 1.0859, + "step": 144 + }, + { + "epoch": 0.16379553798362045, + "grad_norm": 0.022843923419713974, + "learning_rate": 0.0001987172109948408, + "loss": 1.0103, + "step": 145 + }, + { + "epoch": 0.1649251623835075, + "grad_norm": 0.023723525926470757, + "learning_rate": 0.00019869817782079525, + "loss": 1.0704, + "step": 146 + }, + { + "epoch": 0.16605478678339453, + "grad_norm": 0.02391059510409832, + "learning_rate": 0.00019867900540884592, + "loss": 1.058, + "step": 147 + }, + { + "epoch": 0.16718441118328156, + "grad_norm": 0.023995989933609962, + "learning_rate": 0.0001986596937860402, + "loss": 1.0034, + "step": 148 + }, + { + "epoch": 0.1683140355831686, + "grad_norm": 0.024091636762022972, + "learning_rate": 0.00019864024297962186, + "loss": 1.1214, + "step": 149 + }, + { + "epoch": 0.16944365998305563, + "grad_norm": 0.024035949259996414, + "learning_rate": 0.000198620653017031, + "loss": 1.0219, + "step": 150 + }, + { + "epoch": 0.17057328438294267, + "grad_norm": 0.02359904535114765, + "learning_rate": 0.00019860092392590408, + "loss": 0.9627, + "step": 151 + }, + { + "epoch": 0.1717029087828297, + "grad_norm": 0.023622050881385803, + "learning_rate": 0.00019858105573407377, + "loss": 1.0582, + "step": 152 + }, + { + "epoch": 0.17283253318271674, + "grad_norm": 0.02392633818089962, + "learning_rate": 0.00019856104846956906, + "loss": 1.0089, + "step": 153 + }, + { + "epoch": 0.17396215758260378, + "grad_norm": 0.024305053055286407, + "learning_rate": 0.00019854090216061502, + "loss": 1.0222, + "step": 154 + }, + { + "epoch": 0.1750917819824908, + "grad_norm": 0.024212822318077087, + "learning_rate": 0.00019852061683563296, + "loss": 1.0429, + "step": 155 + }, + { + "epoch": 0.17622140638237785, + "grad_norm": 0.024261048063635826, + "learning_rate": 0.00019850019252324032, + "loss": 1.0506, + "step": 156 + }, + { + "epoch": 0.17735103078226488, + "grad_norm": 0.022689295932650566, + "learning_rate": 0.0001984796292522506, + "loss": 0.9999, + "step": 157 + }, + { + "epoch": 0.17848065518215195, + "grad_norm": 0.023288823664188385, + "learning_rate": 0.00019845892705167324, + "loss": 1.0242, + "step": 158 + }, + { + "epoch": 0.17961027958203898, + "grad_norm": 0.030173135921359062, + "learning_rate": 0.00019843808595071383, + "loss": 1.0641, + "step": 159 + }, + { + "epoch": 0.18073990398192602, + "grad_norm": 0.024562738835811615, + "learning_rate": 0.00019841710597877382, + "loss": 0.9781, + "step": 160 + }, + { + "epoch": 0.18186952838181306, + "grad_norm": 0.024897055700421333, + "learning_rate": 0.00019839598716545057, + "loss": 1.1015, + "step": 161 + }, + { + "epoch": 0.1829991527817001, + "grad_norm": 0.023950692266225815, + "learning_rate": 0.00019837472954053732, + "loss": 1.125, + "step": 162 + }, + { + "epoch": 0.18412877718158713, + "grad_norm": 0.025099674239754677, + "learning_rate": 0.00019835333313402318, + "loss": 1.0359, + "step": 163 + }, + { + "epoch": 0.18525840158147416, + "grad_norm": 0.025351393967866898, + "learning_rate": 0.000198331797976093, + "loss": 1.0281, + "step": 164 + }, + { + "epoch": 0.1863880259813612, + "grad_norm": 0.024693114683032036, + "learning_rate": 0.00019831012409712737, + "loss": 1.1521, + "step": 165 + }, + { + "epoch": 0.18751765038124824, + "grad_norm": 0.024255136027932167, + "learning_rate": 0.0001982883115277026, + "loss": 1.0842, + "step": 166 + }, + { + "epoch": 0.18864727478113527, + "grad_norm": 0.02501499280333519, + "learning_rate": 0.00019826636029859066, + "loss": 0.9975, + "step": 167 + }, + { + "epoch": 0.1897768991810223, + "grad_norm": 0.025276506319642067, + "learning_rate": 0.00019824427044075912, + "loss": 1.0119, + "step": 168 + }, + { + "epoch": 0.19090652358090934, + "grad_norm": 0.024857770651578903, + "learning_rate": 0.0001982220419853711, + "loss": 0.9733, + "step": 169 + }, + { + "epoch": 0.19203614798079638, + "grad_norm": 0.02459135465323925, + "learning_rate": 0.0001981996749637853, + "loss": 1.0791, + "step": 170 + }, + { + "epoch": 0.19316577238068341, + "grad_norm": 0.026056725531816483, + "learning_rate": 0.00019817716940755586, + "loss": 1.0849, + "step": 171 + }, + { + "epoch": 0.19429539678057045, + "grad_norm": 0.024211106821894646, + "learning_rate": 0.0001981545253484324, + "loss": 1.0253, + "step": 172 + }, + { + "epoch": 0.1954250211804575, + "grad_norm": 0.024450423195958138, + "learning_rate": 0.00019813174281835982, + "loss": 1.1101, + "step": 173 + }, + { + "epoch": 0.19655464558034452, + "grad_norm": 0.02433086559176445, + "learning_rate": 0.0001981088218494785, + "loss": 0.9887, + "step": 174 + }, + { + "epoch": 0.1976842699802316, + "grad_norm": 0.02424442023038864, + "learning_rate": 0.0001980857624741241, + "loss": 1.074, + "step": 175 + }, + { + "epoch": 0.19881389438011862, + "grad_norm": 0.02318243682384491, + "learning_rate": 0.00019806256472482744, + "loss": 1.1045, + "step": 176 + }, + { + "epoch": 0.19994351878000566, + "grad_norm": 0.02407553791999817, + "learning_rate": 0.00019803922863431467, + "loss": 1.0062, + "step": 177 + }, + { + "epoch": 0.2010731431798927, + "grad_norm": 0.02463892102241516, + "learning_rate": 0.000198015754235507, + "loss": 1.0689, + "step": 178 + }, + { + "epoch": 0.20220276757977973, + "grad_norm": 0.023701028898358345, + "learning_rate": 0.00019799214156152083, + "loss": 1.0672, + "step": 179 + }, + { + "epoch": 0.20333239197966677, + "grad_norm": 0.02471453696489334, + "learning_rate": 0.00019796839064566761, + "loss": 1.033, + "step": 180 + }, + { + "epoch": 0.2044620163795538, + "grad_norm": 0.02426736056804657, + "learning_rate": 0.00019794450152145382, + "loss": 1.0831, + "step": 181 + }, + { + "epoch": 0.20559164077944084, + "grad_norm": 0.0243529062718153, + "learning_rate": 0.0001979204742225809, + "loss": 1.0815, + "step": 182 + }, + { + "epoch": 0.20672126517932787, + "grad_norm": 0.0243973471224308, + "learning_rate": 0.00019789630878294526, + "loss": 1.0541, + "step": 183 + }, + { + "epoch": 0.2078508895792149, + "grad_norm": 0.02461186796426773, + "learning_rate": 0.0001978720052366381, + "loss": 1.1203, + "step": 184 + }, + { + "epoch": 0.20898051397910195, + "grad_norm": 0.02479882724583149, + "learning_rate": 0.00019784756361794555, + "loss": 1.078, + "step": 185 + }, + { + "epoch": 0.21011013837898898, + "grad_norm": 0.02605288103222847, + "learning_rate": 0.00019782298396134844, + "loss": 1.01, + "step": 186 + }, + { + "epoch": 0.21123976277887602, + "grad_norm": 0.025911834090948105, + "learning_rate": 0.00019779826630152245, + "loss": 1.1173, + "step": 187 + }, + { + "epoch": 0.21236938717876305, + "grad_norm": 0.024420902132987976, + "learning_rate": 0.00019777341067333786, + "loss": 1.0023, + "step": 188 + }, + { + "epoch": 0.2134990115786501, + "grad_norm": 0.024010393768548965, + "learning_rate": 0.0001977484171118596, + "loss": 1.1382, + "step": 189 + }, + { + "epoch": 0.21462863597853712, + "grad_norm": 0.024915101006627083, + "learning_rate": 0.00019772328565234717, + "loss": 1.0734, + "step": 190 + }, + { + "epoch": 0.21575826037842416, + "grad_norm": 0.025032367557287216, + "learning_rate": 0.0001976980163302547, + "loss": 0.9585, + "step": 191 + }, + { + "epoch": 0.21688788477831122, + "grad_norm": 0.024727528914809227, + "learning_rate": 0.0001976726091812307, + "loss": 1.0731, + "step": 192 + }, + { + "epoch": 0.21801750917819826, + "grad_norm": 0.024914614856243134, + "learning_rate": 0.00019764706424111816, + "loss": 0.9522, + "step": 193 + }, + { + "epoch": 0.2191471335780853, + "grad_norm": 0.024750174954533577, + "learning_rate": 0.00019762138154595446, + "loss": 0.9646, + "step": 194 + }, + { + "epoch": 0.22027675797797233, + "grad_norm": 0.02512511797249317, + "learning_rate": 0.00019759556113197135, + "loss": 1.0643, + "step": 195 + }, + { + "epoch": 0.22140638237785937, + "grad_norm": 0.026546582579612732, + "learning_rate": 0.00019756960303559483, + "loss": 1.1158, + "step": 196 + }, + { + "epoch": 0.2225360067777464, + "grad_norm": 0.02506748028099537, + "learning_rate": 0.0001975435072934451, + "loss": 1.0261, + "step": 197 + }, + { + "epoch": 0.22366563117763344, + "grad_norm": 0.024585796520113945, + "learning_rate": 0.00019751727394233667, + "loss": 1.017, + "step": 198 + }, + { + "epoch": 0.22479525557752048, + "grad_norm": 0.02528531290590763, + "learning_rate": 0.00019749090301927796, + "loss": 1.042, + "step": 199 + }, + { + "epoch": 0.2259248799774075, + "grad_norm": 0.025023646652698517, + "learning_rate": 0.00019746439456147172, + "loss": 0.9618, + "step": 200 + }, + { + "epoch": 0.22705450437729455, + "grad_norm": 0.025859549641609192, + "learning_rate": 0.00019743774860631457, + "loss": 0.9982, + "step": 201 + }, + { + "epoch": 0.22818412877718158, + "grad_norm": 0.026021264493465424, + "learning_rate": 0.00019741096519139713, + "loss": 1.0131, + "step": 202 + }, + { + "epoch": 0.22931375317706862, + "grad_norm": 0.025675011798739433, + "learning_rate": 0.00019738404435450395, + "loss": 1.0186, + "step": 203 + }, + { + "epoch": 0.23044337757695565, + "grad_norm": 0.025758078321814537, + "learning_rate": 0.00019735698613361347, + "loss": 1.0869, + "step": 204 + }, + { + "epoch": 0.2315730019768427, + "grad_norm": 0.02666814811527729, + "learning_rate": 0.00019732979056689794, + "loss": 1.0894, + "step": 205 + }, + { + "epoch": 0.23270262637672973, + "grad_norm": 0.024690723046660423, + "learning_rate": 0.0001973024576927233, + "loss": 1.0898, + "step": 206 + }, + { + "epoch": 0.23383225077661676, + "grad_norm": 0.025678694248199463, + "learning_rate": 0.00019727498754964928, + "loss": 1.091, + "step": 207 + }, + { + "epoch": 0.23496187517650383, + "grad_norm": 0.025275958701968193, + "learning_rate": 0.00019724738017642924, + "loss": 1.089, + "step": 208 + }, + { + "epoch": 0.23609149957639086, + "grad_norm": 0.02560093067586422, + "learning_rate": 0.00019721963561201012, + "loss": 0.9755, + "step": 209 + }, + { + "epoch": 0.2372211239762779, + "grad_norm": 0.026244761422276497, + "learning_rate": 0.00019719175389553242, + "loss": 1.0696, + "step": 210 + }, + { + "epoch": 0.23835074837616493, + "grad_norm": 0.025443457067012787, + "learning_rate": 0.0001971637350663301, + "loss": 1.0032, + "step": 211 + }, + { + "epoch": 0.23948037277605197, + "grad_norm": 0.027356769889593124, + "learning_rate": 0.00019713557916393058, + "loss": 1.0393, + "step": 212 + }, + { + "epoch": 0.240609997175939, + "grad_norm": 0.025765880942344666, + "learning_rate": 0.0001971072862280546, + "loss": 1.015, + "step": 213 + }, + { + "epoch": 0.24173962157582604, + "grad_norm": 0.025718411430716515, + "learning_rate": 0.00019707885629861632, + "loss": 1.0343, + "step": 214 + }, + { + "epoch": 0.24286924597571308, + "grad_norm": 0.026691369712352753, + "learning_rate": 0.00019705028941572307, + "loss": 1.0896, + "step": 215 + }, + { + "epoch": 0.2439988703756001, + "grad_norm": 0.025440771132707596, + "learning_rate": 0.00019702158561967544, + "loss": 0.9986, + "step": 216 + }, + { + "epoch": 0.24512849477548715, + "grad_norm": 0.02483600750565529, + "learning_rate": 0.00019699274495096712, + "loss": 1.0287, + "step": 217 + }, + { + "epoch": 0.24625811917537418, + "grad_norm": 0.027423838153481483, + "learning_rate": 0.00019696376745028497, + "loss": 1.0626, + "step": 218 + }, + { + "epoch": 0.24738774357526122, + "grad_norm": 0.026005201041698456, + "learning_rate": 0.0001969346531585088, + "loss": 1.0203, + "step": 219 + }, + { + "epoch": 0.24851736797514826, + "grad_norm": 0.026350049301981926, + "learning_rate": 0.00019690540211671144, + "loss": 1.0482, + "step": 220 + }, + { + "epoch": 0.2496469923750353, + "grad_norm": 0.026930196210741997, + "learning_rate": 0.00019687601436615864, + "loss": 1.0258, + "step": 221 + }, + { + "epoch": 0.25077661677492236, + "grad_norm": 0.025890439748764038, + "learning_rate": 0.00019684648994830903, + "loss": 1.0886, + "step": 222 + }, + { + "epoch": 0.25190624117480936, + "grad_norm": 0.025864360854029655, + "learning_rate": 0.00019681682890481398, + "loss": 0.976, + "step": 223 + }, + { + "epoch": 0.25303586557469643, + "grad_norm": 0.025524241849780083, + "learning_rate": 0.00019678703127751763, + "loss": 1.0251, + "step": 224 + }, + { + "epoch": 0.25416548997458344, + "grad_norm": 0.02650127001106739, + "learning_rate": 0.00019675709710845687, + "loss": 1.0435, + "step": 225 + }, + { + "epoch": 0.2552951143744705, + "grad_norm": 0.025557860732078552, + "learning_rate": 0.00019672702643986113, + "loss": 1.0555, + "step": 226 + }, + { + "epoch": 0.2564247387743575, + "grad_norm": 0.027075499296188354, + "learning_rate": 0.0001966968193141524, + "loss": 0.9965, + "step": 227 + }, + { + "epoch": 0.25755436317424457, + "grad_norm": 0.025682270526885986, + "learning_rate": 0.00019666647577394527, + "loss": 1.0151, + "step": 228 + }, + { + "epoch": 0.2586839875741316, + "grad_norm": 0.026663288474082947, + "learning_rate": 0.00019663599586204673, + "loss": 1.0354, + "step": 229 + }, + { + "epoch": 0.25981361197401864, + "grad_norm": 0.026434747502207756, + "learning_rate": 0.0001966053796214561, + "loss": 1.0551, + "step": 230 + }, + { + "epoch": 0.26094323637390565, + "grad_norm": 0.025536926463246346, + "learning_rate": 0.0001965746270953651, + "loss": 0.9731, + "step": 231 + }, + { + "epoch": 0.2620728607737927, + "grad_norm": 0.07522192597389221, + "learning_rate": 0.0001965437383271577, + "loss": 0.9796, + "step": 232 + }, + { + "epoch": 0.2632024851736798, + "grad_norm": 0.027285447344183922, + "learning_rate": 0.00019651271336040997, + "loss": 1.011, + "step": 233 + }, + { + "epoch": 0.2643321095735668, + "grad_norm": 0.026399778202176094, + "learning_rate": 0.0001964815522388903, + "loss": 1.0199, + "step": 234 + }, + { + "epoch": 0.26546173397345385, + "grad_norm": 0.026532689109444618, + "learning_rate": 0.00019645025500655906, + "loss": 0.9918, + "step": 235 + }, + { + "epoch": 0.26659135837334086, + "grad_norm": 0.025576921179890633, + "learning_rate": 0.00019641882170756862, + "loss": 1.0198, + "step": 236 + }, + { + "epoch": 0.2677209827732279, + "grad_norm": 0.026158379390835762, + "learning_rate": 0.00019638725238626335, + "loss": 1.0204, + "step": 237 + }, + { + "epoch": 0.26885060717311493, + "grad_norm": 0.025530420243740082, + "learning_rate": 0.00019635554708717946, + "loss": 1.0885, + "step": 238 + }, + { + "epoch": 0.269980231573002, + "grad_norm": 0.02707337960600853, + "learning_rate": 0.00019632370585504502, + "loss": 1.0649, + "step": 239 + }, + { + "epoch": 0.271109855972889, + "grad_norm": 0.027028286829590797, + "learning_rate": 0.00019629172873477995, + "loss": 1.0544, + "step": 240 + }, + { + "epoch": 0.27223948037277607, + "grad_norm": 0.02564058266580105, + "learning_rate": 0.0001962596157714957, + "loss": 1.0481, + "step": 241 + }, + { + "epoch": 0.2733691047726631, + "grad_norm": 0.026479296386241913, + "learning_rate": 0.0001962273670104955, + "loss": 1.0413, + "step": 242 + }, + { + "epoch": 0.27449872917255014, + "grad_norm": 0.0330955870449543, + "learning_rate": 0.00019619498249727412, + "loss": 1.0292, + "step": 243 + }, + { + "epoch": 0.27562835357243715, + "grad_norm": 0.02611500211060047, + "learning_rate": 0.0001961624622775178, + "loss": 1.009, + "step": 244 + }, + { + "epoch": 0.2767579779723242, + "grad_norm": 0.026876097545027733, + "learning_rate": 0.00019612980639710428, + "loss": 0.9854, + "step": 245 + }, + { + "epoch": 0.2778876023722112, + "grad_norm": 0.02685077115893364, + "learning_rate": 0.00019609701490210264, + "loss": 1.0282, + "step": 246 + }, + { + "epoch": 0.2790172267720983, + "grad_norm": 0.026131028309464455, + "learning_rate": 0.00019606408783877334, + "loss": 1.0673, + "step": 247 + }, + { + "epoch": 0.2801468511719853, + "grad_norm": 0.02628222666680813, + "learning_rate": 0.00019603102525356798, + "loss": 1.0659, + "step": 248 + }, + { + "epoch": 0.28127647557187235, + "grad_norm": 0.027401477098464966, + "learning_rate": 0.00019599782719312948, + "loss": 0.9942, + "step": 249 + }, + { + "epoch": 0.2824060999717594, + "grad_norm": 0.02594529278576374, + "learning_rate": 0.00019596449370429183, + "loss": 1.0091, + "step": 250 + }, + { + "epoch": 0.2835357243716464, + "grad_norm": 0.028301890939474106, + "learning_rate": 0.00019593102483408, + "loss": 1.0083, + "step": 251 + }, + { + "epoch": 0.2846653487715335, + "grad_norm": 0.02808901108801365, + "learning_rate": 0.00019589742062971007, + "loss": 1.071, + "step": 252 + }, + { + "epoch": 0.2857949731714205, + "grad_norm": 0.02654552273452282, + "learning_rate": 0.00019586368113858892, + "loss": 1.0865, + "step": 253 + }, + { + "epoch": 0.28692459757130756, + "grad_norm": 0.02610975131392479, + "learning_rate": 0.00019582980640831443, + "loss": 1.1093, + "step": 254 + }, + { + "epoch": 0.28805422197119457, + "grad_norm": 0.027240293100476265, + "learning_rate": 0.0001957957964866751, + "loss": 1.0822, + "step": 255 + }, + { + "epoch": 0.28918384637108163, + "grad_norm": 0.027821950614452362, + "learning_rate": 0.00019576165142165032, + "loss": 1.0371, + "step": 256 + }, + { + "epoch": 0.29031347077096864, + "grad_norm": 0.02755453623831272, + "learning_rate": 0.00019572737126141002, + "loss": 1.0752, + "step": 257 + }, + { + "epoch": 0.2914430951708557, + "grad_norm": 0.02676587551832199, + "learning_rate": 0.0001956929560543147, + "loss": 1.0599, + "step": 258 + }, + { + "epoch": 0.2925727195707427, + "grad_norm": 0.02904544584453106, + "learning_rate": 0.00019565840584891549, + "loss": 1.0568, + "step": 259 + }, + { + "epoch": 0.2937023439706298, + "grad_norm": 0.027289781719446182, + "learning_rate": 0.00019562372069395384, + "loss": 1.0671, + "step": 260 + }, + { + "epoch": 0.2948319683705168, + "grad_norm": 0.025955747812986374, + "learning_rate": 0.00019558890063836167, + "loss": 0.9118, + "step": 261 + }, + { + "epoch": 0.29596159277040385, + "grad_norm": 0.028641648590564728, + "learning_rate": 0.00019555394573126118, + "loss": 1.0498, + "step": 262 + }, + { + "epoch": 0.29709121717029086, + "grad_norm": 0.028356773778796196, + "learning_rate": 0.0001955188560219648, + "loss": 1.0238, + "step": 263 + }, + { + "epoch": 0.2982208415701779, + "grad_norm": 0.02746075950562954, + "learning_rate": 0.00019548363155997517, + "loss": 1.0741, + "step": 264 + }, + { + "epoch": 0.2993504659700649, + "grad_norm": 0.02712567336857319, + "learning_rate": 0.000195448272394985, + "loss": 1.0861, + "step": 265 + }, + { + "epoch": 0.300480090369952, + "grad_norm": 0.026709580793976784, + "learning_rate": 0.00019541277857687694, + "loss": 1.0024, + "step": 266 + }, + { + "epoch": 0.30160971476983905, + "grad_norm": 0.027716003358364105, + "learning_rate": 0.00019537715015572382, + "loss": 1.0406, + "step": 267 + }, + { + "epoch": 0.30273933916972606, + "grad_norm": 0.02704858034849167, + "learning_rate": 0.00019534138718178818, + "loss": 1.0088, + "step": 268 + }, + { + "epoch": 0.3038689635696131, + "grad_norm": 0.026793915778398514, + "learning_rate": 0.00019530548970552247, + "loss": 1.0556, + "step": 269 + }, + { + "epoch": 0.30499858796950013, + "grad_norm": 0.028323287144303322, + "learning_rate": 0.00019526945777756879, + "loss": 1.057, + "step": 270 + }, + { + "epoch": 0.3061282123693872, + "grad_norm": 0.0279136560857296, + "learning_rate": 0.00019523329144875904, + "loss": 1.0654, + "step": 271 + }, + { + "epoch": 0.3072578367692742, + "grad_norm": 0.02878638356924057, + "learning_rate": 0.00019519699077011465, + "loss": 1.0357, + "step": 272 + }, + { + "epoch": 0.30838746116916127, + "grad_norm": 0.026021145284175873, + "learning_rate": 0.00019516055579284658, + "loss": 1.092, + "step": 273 + }, + { + "epoch": 0.3095170855690483, + "grad_norm": 0.0282638818025589, + "learning_rate": 0.00019512398656835528, + "loss": 1.0242, + "step": 274 + }, + { + "epoch": 0.31064670996893534, + "grad_norm": 0.0277785062789917, + "learning_rate": 0.00019508728314823062, + "loss": 1.0922, + "step": 275 + }, + { + "epoch": 0.31177633436882235, + "grad_norm": 0.027666205540299416, + "learning_rate": 0.00019505044558425168, + "loss": 1.0434, + "step": 276 + }, + { + "epoch": 0.3129059587687094, + "grad_norm": 0.02734490856528282, + "learning_rate": 0.0001950134739283869, + "loss": 1.0726, + "step": 277 + }, + { + "epoch": 0.3140355831685964, + "grad_norm": 0.026907166466116905, + "learning_rate": 0.0001949763682327938, + "loss": 1.0807, + "step": 278 + }, + { + "epoch": 0.3151652075684835, + "grad_norm": 0.02773541398346424, + "learning_rate": 0.00019493912854981905, + "loss": 1.0941, + "step": 279 + }, + { + "epoch": 0.3162948319683705, + "grad_norm": 0.027467425912618637, + "learning_rate": 0.00019490175493199833, + "loss": 1.031, + "step": 280 + }, + { + "epoch": 0.31742445636825756, + "grad_norm": 0.02712651528418064, + "learning_rate": 0.00019486424743205626, + "loss": 1.0015, + "step": 281 + }, + { + "epoch": 0.31855408076814457, + "grad_norm": 0.026572776958346367, + "learning_rate": 0.00019482660610290636, + "loss": 0.9459, + "step": 282 + }, + { + "epoch": 0.31968370516803163, + "grad_norm": 0.02701294608414173, + "learning_rate": 0.00019478883099765086, + "loss": 1.0652, + "step": 283 + }, + { + "epoch": 0.3208133295679187, + "grad_norm": 0.02713761292397976, + "learning_rate": 0.0001947509221695808, + "loss": 1.0455, + "step": 284 + }, + { + "epoch": 0.3219429539678057, + "grad_norm": 0.028251413255929947, + "learning_rate": 0.00019471287967217594, + "loss": 0.9885, + "step": 285 + }, + { + "epoch": 0.32307257836769276, + "grad_norm": 0.028362903743982315, + "learning_rate": 0.00019467470355910438, + "loss": 1.0896, + "step": 286 + }, + { + "epoch": 0.3242022027675798, + "grad_norm": 0.027835773304104805, + "learning_rate": 0.00019463639388422297, + "loss": 0.9381, + "step": 287 + }, + { + "epoch": 0.32533182716746684, + "grad_norm": 0.026659086346626282, + "learning_rate": 0.0001945979507015768, + "loss": 0.9987, + "step": 288 + }, + { + "epoch": 0.32646145156735384, + "grad_norm": 0.028285473585128784, + "learning_rate": 0.0001945593740653994, + "loss": 1.0055, + "step": 289 + }, + { + "epoch": 0.3275910759672409, + "grad_norm": 0.027459239587187767, + "learning_rate": 0.00019452066403011253, + "loss": 1.0468, + "step": 290 + }, + { + "epoch": 0.3287207003671279, + "grad_norm": 0.028836321085691452, + "learning_rate": 0.00019448182065032621, + "loss": 1.0855, + "step": 291 + }, + { + "epoch": 0.329850324767015, + "grad_norm": 0.029597043991088867, + "learning_rate": 0.00019444284398083847, + "loss": 1.1135, + "step": 292 + }, + { + "epoch": 0.330979949166902, + "grad_norm": 0.029845820739865303, + "learning_rate": 0.00019440373407663542, + "loss": 1.0117, + "step": 293 + }, + { + "epoch": 0.33210957356678905, + "grad_norm": 0.027042267844080925, + "learning_rate": 0.00019436449099289119, + "loss": 1.0173, + "step": 294 + }, + { + "epoch": 0.33323919796667606, + "grad_norm": 0.027646934613585472, + "learning_rate": 0.00019432511478496768, + "loss": 1.0777, + "step": 295 + }, + { + "epoch": 0.33323919796667606, + "eval_loss": 1.0277949571609497, + "eval_runtime": 565.1236, + "eval_samples_per_second": 17.311, + "eval_steps_per_second": 8.657, + "step": 295 + }, + { + "epoch": 0.3343688223665631, + "grad_norm": 0.026499278843402863, + "learning_rate": 0.00019428560550841472, + "loss": 0.9618, + "step": 296 + }, + { + "epoch": 0.33549844676645013, + "grad_norm": 0.027500445023179054, + "learning_rate": 0.00019424596321896976, + "loss": 0.9794, + "step": 297 + }, + { + "epoch": 0.3366280711663372, + "grad_norm": 0.027349818497896194, + "learning_rate": 0.00019420618797255795, + "loss": 1.1008, + "step": 298 + }, + { + "epoch": 0.3377576955662242, + "grad_norm": 0.027657683938741684, + "learning_rate": 0.000194166279825292, + "loss": 1.0801, + "step": 299 + }, + { + "epoch": 0.33888731996611127, + "grad_norm": 0.027384718880057335, + "learning_rate": 0.00019412623883347207, + "loss": 1.038, + "step": 300 + }, + { + "epoch": 0.34001694436599833, + "grad_norm": 0.026920663192868233, + "learning_rate": 0.00019408606505358583, + "loss": 0.9868, + "step": 301 + }, + { + "epoch": 0.34114656876588534, + "grad_norm": 0.028844624757766724, + "learning_rate": 0.00019404575854230818, + "loss": 1.0293, + "step": 302 + }, + { + "epoch": 0.3422761931657724, + "grad_norm": 0.02755833975970745, + "learning_rate": 0.00019400531935650128, + "loss": 1.0087, + "step": 303 + }, + { + "epoch": 0.3434058175656594, + "grad_norm": 0.027301400899887085, + "learning_rate": 0.00019396474755321456, + "loss": 1.0318, + "step": 304 + }, + { + "epoch": 0.3445354419655465, + "grad_norm": 0.02760390006005764, + "learning_rate": 0.0001939240431896844, + "loss": 0.9421, + "step": 305 + }, + { + "epoch": 0.3456650663654335, + "grad_norm": 0.027442464604973793, + "learning_rate": 0.00019388320632333429, + "loss": 1.0801, + "step": 306 + }, + { + "epoch": 0.34679469076532055, + "grad_norm": 0.027593247592449188, + "learning_rate": 0.00019384223701177455, + "loss": 1.0607, + "step": 307 + }, + { + "epoch": 0.34792431516520755, + "grad_norm": 0.028117630630731583, + "learning_rate": 0.00019380113531280245, + "loss": 1.054, + "step": 308 + }, + { + "epoch": 0.3490539395650946, + "grad_norm": 0.029217706993222237, + "learning_rate": 0.00019375990128440204, + "loss": 1.0997, + "step": 309 + }, + { + "epoch": 0.3501835639649816, + "grad_norm": 0.027274932712316513, + "learning_rate": 0.0001937185349847439, + "loss": 1.0051, + "step": 310 + }, + { + "epoch": 0.3513131883648687, + "grad_norm": 0.03279178589582443, + "learning_rate": 0.0001936770364721854, + "loss": 1.0293, + "step": 311 + }, + { + "epoch": 0.3524428127647557, + "grad_norm": 0.026957320049405098, + "learning_rate": 0.00019363540580527025, + "loss": 1.0358, + "step": 312 + }, + { + "epoch": 0.35357243716464276, + "grad_norm": 0.029469158500432968, + "learning_rate": 0.0001935936430427287, + "loss": 1.1446, + "step": 313 + }, + { + "epoch": 0.35470206156452977, + "grad_norm": 0.03025597333908081, + "learning_rate": 0.00019355174824347735, + "loss": 1.0722, + "step": 314 + }, + { + "epoch": 0.35583168596441683, + "grad_norm": 0.02727232687175274, + "learning_rate": 0.00019350972146661905, + "loss": 1.0592, + "step": 315 + }, + { + "epoch": 0.3569613103643039, + "grad_norm": 0.028911981731653214, + "learning_rate": 0.00019346756277144285, + "loss": 1.1644, + "step": 316 + }, + { + "epoch": 0.3580909347641909, + "grad_norm": 0.02783570997416973, + "learning_rate": 0.0001934252722174239, + "loss": 0.9406, + "step": 317 + }, + { + "epoch": 0.35922055916407797, + "grad_norm": 0.02677338756620884, + "learning_rate": 0.00019338284986422335, + "loss": 0.9287, + "step": 318 + }, + { + "epoch": 0.360350183563965, + "grad_norm": 0.027951853349804878, + "learning_rate": 0.00019334029577168827, + "loss": 0.9541, + "step": 319 + }, + { + "epoch": 0.36147980796385204, + "grad_norm": 0.028323214501142502, + "learning_rate": 0.00019329760999985167, + "loss": 1.1566, + "step": 320 + }, + { + "epoch": 0.36260943236373905, + "grad_norm": 0.027881423011422157, + "learning_rate": 0.00019325479260893223, + "loss": 1.0662, + "step": 321 + }, + { + "epoch": 0.3637390567636261, + "grad_norm": 0.02717737667262554, + "learning_rate": 0.00019321184365933433, + "loss": 1.0317, + "step": 322 + }, + { + "epoch": 0.3648686811635131, + "grad_norm": 0.028628146275877953, + "learning_rate": 0.00019316876321164798, + "loss": 1.0503, + "step": 323 + }, + { + "epoch": 0.3659983055634002, + "grad_norm": 0.02851051092147827, + "learning_rate": 0.0001931255513266487, + "loss": 1.0565, + "step": 324 + }, + { + "epoch": 0.3671279299632872, + "grad_norm": 0.02863175794482231, + "learning_rate": 0.00019308220806529738, + "loss": 1.0243, + "step": 325 + }, + { + "epoch": 0.36825755436317426, + "grad_norm": 0.03015504591166973, + "learning_rate": 0.0001930387334887403, + "loss": 1.0208, + "step": 326 + }, + { + "epoch": 0.36938717876306126, + "grad_norm": 0.02771030366420746, + "learning_rate": 0.00019299512765830895, + "loss": 1.0094, + "step": 327 + }, + { + "epoch": 0.3705168031629483, + "grad_norm": 0.027864158153533936, + "learning_rate": 0.00019295139063552007, + "loss": 0.9863, + "step": 328 + }, + { + "epoch": 0.37164642756283534, + "grad_norm": 0.028755534440279007, + "learning_rate": 0.00019290752248207537, + "loss": 1.0542, + "step": 329 + }, + { + "epoch": 0.3727760519627224, + "grad_norm": 0.029860056936740875, + "learning_rate": 0.00019286352325986164, + "loss": 1.0006, + "step": 330 + }, + { + "epoch": 0.3739056763626094, + "grad_norm": 0.027963971719145775, + "learning_rate": 0.0001928193930309505, + "loss": 0.9609, + "step": 331 + }, + { + "epoch": 0.37503530076249647, + "grad_norm": 0.02750619500875473, + "learning_rate": 0.00019277513185759844, + "loss": 1.0076, + "step": 332 + }, + { + "epoch": 0.37616492516238353, + "grad_norm": 0.02815542183816433, + "learning_rate": 0.0001927307398022467, + "loss": 1.04, + "step": 333 + }, + { + "epoch": 0.37729454956227054, + "grad_norm": 0.028742128983139992, + "learning_rate": 0.00019268621692752108, + "loss": 0.9947, + "step": 334 + }, + { + "epoch": 0.3784241739621576, + "grad_norm": 0.027735736221075058, + "learning_rate": 0.00019264156329623197, + "loss": 1.0265, + "step": 335 + }, + { + "epoch": 0.3795537983620446, + "grad_norm": 0.02745204232633114, + "learning_rate": 0.00019259677897137426, + "loss": 1.0308, + "step": 336 + }, + { + "epoch": 0.3806834227619317, + "grad_norm": 0.028459064662456512, + "learning_rate": 0.00019255186401612718, + "loss": 1.0069, + "step": 337 + }, + { + "epoch": 0.3818130471618187, + "grad_norm": 0.028107335790991783, + "learning_rate": 0.00019250681849385424, + "loss": 1.0812, + "step": 338 + }, + { + "epoch": 0.38294267156170575, + "grad_norm": 0.029490889981389046, + "learning_rate": 0.00019246164246810316, + "loss": 1.0247, + "step": 339 + }, + { + "epoch": 0.38407229596159276, + "grad_norm": 0.027926163747906685, + "learning_rate": 0.00019241633600260578, + "loss": 0.9761, + "step": 340 + }, + { + "epoch": 0.3852019203614798, + "grad_norm": 0.02847837097942829, + "learning_rate": 0.00019237089916127793, + "loss": 1.0841, + "step": 341 + }, + { + "epoch": 0.38633154476136683, + "grad_norm": 0.027178598567843437, + "learning_rate": 0.00019232533200821942, + "loss": 1.1123, + "step": 342 + }, + { + "epoch": 0.3874611691612539, + "grad_norm": 0.027773573994636536, + "learning_rate": 0.00019227963460771377, + "loss": 0.9871, + "step": 343 + }, + { + "epoch": 0.3885907935611409, + "grad_norm": 0.027409275993704796, + "learning_rate": 0.00019223380702422844, + "loss": 1.0916, + "step": 344 + }, + { + "epoch": 0.38972041796102797, + "grad_norm": 0.028152553364634514, + "learning_rate": 0.00019218784932241434, + "loss": 1.0301, + "step": 345 + }, + { + "epoch": 0.390850042360915, + "grad_norm": 0.028817711398005486, + "learning_rate": 0.00019214176156710612, + "loss": 1.0203, + "step": 346 + }, + { + "epoch": 0.39197966676080204, + "grad_norm": 0.02772883139550686, + "learning_rate": 0.0001920955438233218, + "loss": 0.9991, + "step": 347 + }, + { + "epoch": 0.39310929116068904, + "grad_norm": 0.028133943676948547, + "learning_rate": 0.00019204919615626275, + "loss": 0.9834, + "step": 348 + }, + { + "epoch": 0.3942389155605761, + "grad_norm": 0.02936532348394394, + "learning_rate": 0.00019200271863131375, + "loss": 1.0227, + "step": 349 + }, + { + "epoch": 0.3953685399604632, + "grad_norm": 0.028890248388051987, + "learning_rate": 0.0001919561113140427, + "loss": 0.9551, + "step": 350 + }, + { + "epoch": 0.3964981643603502, + "grad_norm": 0.02820666879415512, + "learning_rate": 0.0001919093742702006, + "loss": 1.0343, + "step": 351 + }, + { + "epoch": 0.39762778876023724, + "grad_norm": 0.029474567621946335, + "learning_rate": 0.00019186250756572144, + "loss": 0.9853, + "step": 352 + }, + { + "epoch": 0.39875741316012425, + "grad_norm": 0.02914329618215561, + "learning_rate": 0.0001918155112667222, + "loss": 0.9542, + "step": 353 + }, + { + "epoch": 0.3998870375600113, + "grad_norm": 0.028036657720804214, + "learning_rate": 0.00019176838543950267, + "loss": 0.945, + "step": 354 + }, + { + "epoch": 0.4010166619598983, + "grad_norm": 0.027309326454997063, + "learning_rate": 0.00019172113015054532, + "loss": 0.977, + "step": 355 + }, + { + "epoch": 0.4021462863597854, + "grad_norm": 0.027427159249782562, + "learning_rate": 0.00019167374546651526, + "loss": 1.0505, + "step": 356 + }, + { + "epoch": 0.4032759107596724, + "grad_norm": 0.03023376129567623, + "learning_rate": 0.0001916262314542602, + "loss": 1.1378, + "step": 357 + }, + { + "epoch": 0.40440553515955946, + "grad_norm": 0.027807191014289856, + "learning_rate": 0.00019157858818081026, + "loss": 1.0516, + "step": 358 + }, + { + "epoch": 0.40553515955944647, + "grad_norm": 0.028308499604463577, + "learning_rate": 0.00019153081571337795, + "loss": 1.0673, + "step": 359 + }, + { + "epoch": 0.40666478395933353, + "grad_norm": 0.028541473671793938, + "learning_rate": 0.00019148291411935796, + "loss": 1.0567, + "step": 360 + }, + { + "epoch": 0.40779440835922054, + "grad_norm": 0.027455326169729233, + "learning_rate": 0.00019143488346632723, + "loss": 1.0078, + "step": 361 + }, + { + "epoch": 0.4089240327591076, + "grad_norm": 0.02952658385038376, + "learning_rate": 0.00019138672382204471, + "loss": 1.0686, + "step": 362 + }, + { + "epoch": 0.4100536571589946, + "grad_norm": 0.028435127809643745, + "learning_rate": 0.0001913384352544514, + "loss": 0.9846, + "step": 363 + }, + { + "epoch": 0.4111832815588817, + "grad_norm": 0.028838949277997017, + "learning_rate": 0.00019129001783167005, + "loss": 1.0602, + "step": 364 + }, + { + "epoch": 0.4123129059587687, + "grad_norm": 0.029650872573256493, + "learning_rate": 0.00019124147162200535, + "loss": 0.9967, + "step": 365 + }, + { + "epoch": 0.41344253035865575, + "grad_norm": 0.028792966157197952, + "learning_rate": 0.00019119279669394353, + "loss": 1.0562, + "step": 366 + }, + { + "epoch": 0.4145721547585428, + "grad_norm": 0.029962720349431038, + "learning_rate": 0.00019114399311615253, + "loss": 1.0016, + "step": 367 + }, + { + "epoch": 0.4157017791584298, + "grad_norm": 0.029513955116271973, + "learning_rate": 0.00019109506095748167, + "loss": 1.007, + "step": 368 + }, + { + "epoch": 0.4168314035583169, + "grad_norm": 0.028869032859802246, + "learning_rate": 0.00019104600028696175, + "loss": 1.033, + "step": 369 + }, + { + "epoch": 0.4179610279582039, + "grad_norm": 0.02818440832197666, + "learning_rate": 0.00019099681117380486, + "loss": 0.9947, + "step": 370 + }, + { + "epoch": 0.41909065235809095, + "grad_norm": 0.030735397711396217, + "learning_rate": 0.00019094749368740423, + "loss": 1.031, + "step": 371 + }, + { + "epoch": 0.42022027675797796, + "grad_norm": 0.029516831040382385, + "learning_rate": 0.00019089804789733424, + "loss": 1.1093, + "step": 372 + }, + { + "epoch": 0.421349901157865, + "grad_norm": 0.028589509427547455, + "learning_rate": 0.00019084847387335025, + "loss": 1.0524, + "step": 373 + }, + { + "epoch": 0.42247952555775203, + "grad_norm": 0.029599323868751526, + "learning_rate": 0.00019079877168538855, + "loss": 1.0867, + "step": 374 + }, + { + "epoch": 0.4236091499576391, + "grad_norm": 0.029633615165948868, + "learning_rate": 0.00019074894140356624, + "loss": 1.0187, + "step": 375 + }, + { + "epoch": 0.4247387743575261, + "grad_norm": 0.029569542035460472, + "learning_rate": 0.00019069898309818106, + "loss": 1.0172, + "step": 376 + }, + { + "epoch": 0.42586839875741317, + "grad_norm": 0.02864873595535755, + "learning_rate": 0.00019064889683971149, + "loss": 1.0408, + "step": 377 + }, + { + "epoch": 0.4269980231573002, + "grad_norm": 0.02849559485912323, + "learning_rate": 0.0001905986826988164, + "loss": 1.0513, + "step": 378 + }, + { + "epoch": 0.42812764755718724, + "grad_norm": 0.028202759101986885, + "learning_rate": 0.00019054834074633506, + "loss": 1.0536, + "step": 379 + }, + { + "epoch": 0.42925727195707425, + "grad_norm": 0.02983192540705204, + "learning_rate": 0.00019049787105328715, + "loss": 1.0294, + "step": 380 + }, + { + "epoch": 0.4303868963569613, + "grad_norm": 0.028043275699019432, + "learning_rate": 0.0001904472736908725, + "loss": 0.9645, + "step": 381 + }, + { + "epoch": 0.4315165207568483, + "grad_norm": 0.02895670384168625, + "learning_rate": 0.0001903965487304711, + "loss": 1.154, + "step": 382 + }, + { + "epoch": 0.4326461451567354, + "grad_norm": 0.02832162007689476, + "learning_rate": 0.0001903456962436428, + "loss": 1.0332, + "step": 383 + }, + { + "epoch": 0.43377576955662245, + "grad_norm": 0.029863545671105385, + "learning_rate": 0.00019029471630212762, + "loss": 1.0002, + "step": 384 + }, + { + "epoch": 0.43490539395650946, + "grad_norm": 0.02890811115503311, + "learning_rate": 0.00019024360897784508, + "loss": 1.0644, + "step": 385 + }, + { + "epoch": 0.4360350183563965, + "grad_norm": 0.03050493635237217, + "learning_rate": 0.0001901923743428946, + "loss": 1.0324, + "step": 386 + }, + { + "epoch": 0.43716464275628353, + "grad_norm": 0.029246153309941292, + "learning_rate": 0.00019014101246955515, + "loss": 1.0591, + "step": 387 + }, + { + "epoch": 0.4382942671561706, + "grad_norm": 0.02876698225736618, + "learning_rate": 0.00019008952343028526, + "loss": 0.9519, + "step": 388 + }, + { + "epoch": 0.4394238915560576, + "grad_norm": 0.029059743508696556, + "learning_rate": 0.00019003790729772273, + "loss": 1.0165, + "step": 389 + }, + { + "epoch": 0.44055351595594466, + "grad_norm": 0.02885555475950241, + "learning_rate": 0.00018998616414468478, + "loss": 1.004, + "step": 390 + }, + { + "epoch": 0.44168314035583167, + "grad_norm": 0.02809917740523815, + "learning_rate": 0.00018993429404416773, + "loss": 0.9685, + "step": 391 + }, + { + "epoch": 0.44281276475571874, + "grad_norm": 0.028004605323076248, + "learning_rate": 0.0001898822970693471, + "loss": 0.9923, + "step": 392 + }, + { + "epoch": 0.44394238915560574, + "grad_norm": 0.029958872124552727, + "learning_rate": 0.00018983017329357729, + "loss": 1.0468, + "step": 393 + }, + { + "epoch": 0.4450720135554928, + "grad_norm": 0.03032870590686798, + "learning_rate": 0.00018977792279039162, + "loss": 0.9573, + "step": 394 + }, + { + "epoch": 0.4462016379553798, + "grad_norm": 0.029365211725234985, + "learning_rate": 0.0001897255456335022, + "loss": 0.9673, + "step": 395 + }, + { + "epoch": 0.4473312623552669, + "grad_norm": 0.03092394582927227, + "learning_rate": 0.00018967304189679984, + "loss": 1.1468, + "step": 396 + }, + { + "epoch": 0.4484608867551539, + "grad_norm": 0.029345886781811714, + "learning_rate": 0.00018962041165435388, + "loss": 1.1213, + "step": 397 + }, + { + "epoch": 0.44959051115504095, + "grad_norm": 0.029504388570785522, + "learning_rate": 0.0001895676549804121, + "loss": 1.0483, + "step": 398 + }, + { + "epoch": 0.450720135554928, + "grad_norm": 0.029384993016719818, + "learning_rate": 0.00018951477194940075, + "loss": 0.9973, + "step": 399 + }, + { + "epoch": 0.451849759954815, + "grad_norm": 0.02798447571694851, + "learning_rate": 0.0001894617626359242, + "loss": 1.0041, + "step": 400 + }, + { + "epoch": 0.4529793843547021, + "grad_norm": 0.028576720505952835, + "learning_rate": 0.00018940862711476513, + "loss": 1.0699, + "step": 401 + }, + { + "epoch": 0.4541090087545891, + "grad_norm": 0.029531830921769142, + "learning_rate": 0.0001893553654608841, + "loss": 1.0396, + "step": 402 + }, + { + "epoch": 0.45523863315447616, + "grad_norm": 0.02875913865864277, + "learning_rate": 0.00018930197774941974, + "loss": 1.0302, + "step": 403 + }, + { + "epoch": 0.45636825755436317, + "grad_norm": 0.02790944278240204, + "learning_rate": 0.00018924846405568845, + "loss": 1.1243, + "step": 404 + }, + { + "epoch": 0.45749788195425023, + "grad_norm": 0.02811037190258503, + "learning_rate": 0.00018919482445518436, + "loss": 1.0377, + "step": 405 + }, + { + "epoch": 0.45862750635413724, + "grad_norm": 0.029786163941025734, + "learning_rate": 0.00018914105902357925, + "loss": 0.9825, + "step": 406 + }, + { + "epoch": 0.4597571307540243, + "grad_norm": 0.028242526575922966, + "learning_rate": 0.0001890871678367224, + "loss": 1.0738, + "step": 407 + }, + { + "epoch": 0.4608867551539113, + "grad_norm": 0.028527051210403442, + "learning_rate": 0.00018903315097064055, + "loss": 1.0024, + "step": 408 + }, + { + "epoch": 0.4620163795537984, + "grad_norm": 0.02773975394666195, + "learning_rate": 0.0001889790085015376, + "loss": 1.0042, + "step": 409 + }, + { + "epoch": 0.4631460039536854, + "grad_norm": 0.028500793501734734, + "learning_rate": 0.0001889247405057948, + "loss": 1.0938, + "step": 410 + }, + { + "epoch": 0.46427562835357244, + "grad_norm": 0.028347400948405266, + "learning_rate": 0.0001888703470599704, + "loss": 0.9892, + "step": 411 + }, + { + "epoch": 0.46540525275345945, + "grad_norm": 0.030584534630179405, + "learning_rate": 0.00018881582824079965, + "loss": 0.9977, + "step": 412 + }, + { + "epoch": 0.4665348771533465, + "grad_norm": 0.030196473002433777, + "learning_rate": 0.0001887611841251947, + "loss": 1.0442, + "step": 413 + }, + { + "epoch": 0.4676645015532335, + "grad_norm": 0.02942134439945221, + "learning_rate": 0.00018870641479024438, + "loss": 1.0096, + "step": 414 + }, + { + "epoch": 0.4687941259531206, + "grad_norm": 0.0283603947609663, + "learning_rate": 0.00018865152031321427, + "loss": 1.1341, + "step": 415 + }, + { + "epoch": 0.46992375035300765, + "grad_norm": 0.02936590276658535, + "learning_rate": 0.0001885965007715464, + "loss": 1.0823, + "step": 416 + }, + { + "epoch": 0.47105337475289466, + "grad_norm": 0.029375478625297546, + "learning_rate": 0.00018854135624285935, + "loss": 1.1148, + "step": 417 + }, + { + "epoch": 0.4721829991527817, + "grad_norm": 0.02892325632274151, + "learning_rate": 0.00018848608680494788, + "loss": 1.0905, + "step": 418 + }, + { + "epoch": 0.47331262355266873, + "grad_norm": 0.028916003182530403, + "learning_rate": 0.00018843069253578312, + "loss": 1.0133, + "step": 419 + }, + { + "epoch": 0.4744422479525558, + "grad_norm": 0.03031068667769432, + "learning_rate": 0.00018837517351351214, + "loss": 0.9835, + "step": 420 + }, + { + "epoch": 0.4755718723524428, + "grad_norm": 0.02931569144129753, + "learning_rate": 0.00018831952981645817, + "loss": 0.9664, + "step": 421 + }, + { + "epoch": 0.47670149675232987, + "grad_norm": 0.029150547459721565, + "learning_rate": 0.0001882637615231202, + "loss": 0.9604, + "step": 422 + }, + { + "epoch": 0.4778311211522169, + "grad_norm": 0.03003125637769699, + "learning_rate": 0.00018820786871217305, + "loss": 1.0735, + "step": 423 + }, + { + "epoch": 0.47896074555210394, + "grad_norm": 0.030021261423826218, + "learning_rate": 0.00018815185146246716, + "loss": 1.0005, + "step": 424 + }, + { + "epoch": 0.48009036995199095, + "grad_norm": 0.029816657304763794, + "learning_rate": 0.00018809570985302862, + "loss": 0.9366, + "step": 425 + }, + { + "epoch": 0.481219994351878, + "grad_norm": 0.02971251681447029, + "learning_rate": 0.00018803944396305884, + "loss": 1.0121, + "step": 426 + }, + { + "epoch": 0.482349618751765, + "grad_norm": 0.03110647387802601, + "learning_rate": 0.00018798305387193463, + "loss": 1.0021, + "step": 427 + }, + { + "epoch": 0.4834792431516521, + "grad_norm": 0.030216267332434654, + "learning_rate": 0.000187926539659208, + "loss": 0.9594, + "step": 428 + }, + { + "epoch": 0.4846088675515391, + "grad_norm": 0.030311699956655502, + "learning_rate": 0.000187869901404606, + "loss": 1.0478, + "step": 429 + }, + { + "epoch": 0.48573849195142615, + "grad_norm": 0.028579862788319588, + "learning_rate": 0.00018781313918803086, + "loss": 0.9539, + "step": 430 + }, + { + "epoch": 0.48686811635131316, + "grad_norm": 0.03003637120127678, + "learning_rate": 0.00018775625308955942, + "loss": 1.0172, + "step": 431 + }, + { + "epoch": 0.4879977407512002, + "grad_norm": 0.03043578751385212, + "learning_rate": 0.0001876992431894435, + "loss": 0.9997, + "step": 432 + }, + { + "epoch": 0.4891273651510873, + "grad_norm": 0.03140099346637726, + "learning_rate": 0.0001876421095681095, + "loss": 1.0307, + "step": 433 + }, + { + "epoch": 0.4902569895509743, + "grad_norm": 0.03060254082083702, + "learning_rate": 0.00018758485230615837, + "loss": 0.9873, + "step": 434 + }, + { + "epoch": 0.49138661395086136, + "grad_norm": 0.030223416164517403, + "learning_rate": 0.00018752747148436543, + "loss": 1.0629, + "step": 435 + }, + { + "epoch": 0.49251623835074837, + "grad_norm": 0.030368085950613022, + "learning_rate": 0.00018746996718368037, + "loss": 0.9692, + "step": 436 + }, + { + "epoch": 0.49364586275063543, + "grad_norm": 0.03002486564218998, + "learning_rate": 0.00018741233948522707, + "loss": 1.0334, + "step": 437 + }, + { + "epoch": 0.49477548715052244, + "grad_norm": 0.029050812125205994, + "learning_rate": 0.0001873545884703035, + "loss": 0.9861, + "step": 438 + }, + { + "epoch": 0.4959051115504095, + "grad_norm": 0.030488910153508186, + "learning_rate": 0.0001872967142203815, + "loss": 1.1141, + "step": 439 + }, + { + "epoch": 0.4970347359502965, + "grad_norm": 0.029405072331428528, + "learning_rate": 0.00018723871681710697, + "loss": 1.0318, + "step": 440 + }, + { + "epoch": 0.4981643603501836, + "grad_norm": 0.030446210876107216, + "learning_rate": 0.0001871805963422993, + "loss": 0.9895, + "step": 441 + }, + { + "epoch": 0.4992939847500706, + "grad_norm": 0.029718847945332527, + "learning_rate": 0.00018712235287795176, + "loss": 1.1104, + "step": 442 + }, + { + "epoch": 0.5004236091499576, + "grad_norm": 0.03045968897640705, + "learning_rate": 0.00018706398650623088, + "loss": 0.9305, + "step": 443 + }, + { + "epoch": 0.5015532335498447, + "grad_norm": 0.030085409060120583, + "learning_rate": 0.0001870054973094767, + "loss": 1.0243, + "step": 444 + }, + { + "epoch": 0.5026828579497317, + "grad_norm": 0.030122725293040276, + "learning_rate": 0.0001869468853702026, + "loss": 1.0977, + "step": 445 + }, + { + "epoch": 0.5038124823496187, + "grad_norm": 0.03070569783449173, + "learning_rate": 0.00018688815077109498, + "loss": 1.0352, + "step": 446 + }, + { + "epoch": 0.5049421067495058, + "grad_norm": 0.029172202572226524, + "learning_rate": 0.00018682929359501338, + "loss": 1.0018, + "step": 447 + }, + { + "epoch": 0.5060717311493929, + "grad_norm": 0.02992609702050686, + "learning_rate": 0.00018677031392499023, + "loss": 1.0543, + "step": 448 + }, + { + "epoch": 0.5072013555492799, + "grad_norm": 0.03060738928616047, + "learning_rate": 0.00018671121184423076, + "loss": 0.9548, + "step": 449 + }, + { + "epoch": 0.5083309799491669, + "grad_norm": 0.03061763569712639, + "learning_rate": 0.0001866519874361129, + "loss": 1.0017, + "step": 450 + }, + { + "epoch": 0.5094606043490539, + "grad_norm": 0.031224450096488, + "learning_rate": 0.00018659264078418718, + "loss": 1.0203, + "step": 451 + }, + { + "epoch": 0.510590228748941, + "grad_norm": 0.028874509036540985, + "learning_rate": 0.00018653317197217653, + "loss": 1.0266, + "step": 452 + }, + { + "epoch": 0.5117198531488281, + "grad_norm": 0.029967116191983223, + "learning_rate": 0.00018647358108397625, + "loss": 1.0335, + "step": 453 + }, + { + "epoch": 0.512849477548715, + "grad_norm": 0.030794909223914146, + "learning_rate": 0.00018641386820365385, + "loss": 1.0284, + "step": 454 + }, + { + "epoch": 0.5139791019486021, + "grad_norm": 0.031100483611226082, + "learning_rate": 0.000186354033415449, + "loss": 1.0486, + "step": 455 + }, + { + "epoch": 0.5151087263484891, + "grad_norm": 0.030945099890232086, + "learning_rate": 0.00018629407680377318, + "loss": 1.0685, + "step": 456 + }, + { + "epoch": 0.5162383507483762, + "grad_norm": 0.030694004148244858, + "learning_rate": 0.00018623399845320993, + "loss": 0.9765, + "step": 457 + }, + { + "epoch": 0.5173679751482632, + "grad_norm": 0.03131450340151787, + "learning_rate": 0.00018617379844851443, + "loss": 1.0927, + "step": 458 + }, + { + "epoch": 0.5184975995481502, + "grad_norm": 0.030793707817792892, + "learning_rate": 0.00018611347687461349, + "loss": 0.9999, + "step": 459 + }, + { + "epoch": 0.5196272239480373, + "grad_norm": 0.029182102531194687, + "learning_rate": 0.00018605303381660543, + "loss": 0.967, + "step": 460 + }, + { + "epoch": 0.5207568483479244, + "grad_norm": 0.030693160369992256, + "learning_rate": 0.00018599246935976, + "loss": 1.084, + "step": 461 + }, + { + "epoch": 0.5218864727478113, + "grad_norm": 0.030196724459528923, + "learning_rate": 0.0001859317835895181, + "loss": 1.024, + "step": 462 + }, + { + "epoch": 0.5230160971476984, + "grad_norm": 0.029934274032711983, + "learning_rate": 0.0001858709765914919, + "loss": 1.0975, + "step": 463 + }, + { + "epoch": 0.5241457215475854, + "grad_norm": 0.030209926888346672, + "learning_rate": 0.00018581004845146453, + "loss": 1.0485, + "step": 464 + }, + { + "epoch": 0.5252753459474725, + "grad_norm": 0.0305222999304533, + "learning_rate": 0.00018574899925538998, + "loss": 1.0272, + "step": 465 + }, + { + "epoch": 0.5264049703473596, + "grad_norm": 0.029943542554974556, + "learning_rate": 0.00018568782908939309, + "loss": 1.0122, + "step": 466 + }, + { + "epoch": 0.5275345947472465, + "grad_norm": 0.02910439483821392, + "learning_rate": 0.00018562653803976936, + "loss": 0.8831, + "step": 467 + }, + { + "epoch": 0.5286642191471336, + "grad_norm": 0.030156375840306282, + "learning_rate": 0.00018556512619298472, + "loss": 1.0245, + "step": 468 + }, + { + "epoch": 0.5297938435470206, + "grad_norm": 0.029457733035087585, + "learning_rate": 0.00018550359363567567, + "loss": 0.9933, + "step": 469 + }, + { + "epoch": 0.5309234679469077, + "grad_norm": 0.03006352297961712, + "learning_rate": 0.00018544194045464886, + "loss": 0.9978, + "step": 470 + }, + { + "epoch": 0.5320530923467947, + "grad_norm": 0.03152355179190636, + "learning_rate": 0.0001853801667368812, + "loss": 0.9832, + "step": 471 + }, + { + "epoch": 0.5331827167466817, + "grad_norm": 0.02921919897198677, + "learning_rate": 0.00018531827256951962, + "loss": 0.9178, + "step": 472 + }, + { + "epoch": 0.5343123411465688, + "grad_norm": 0.031064407899975777, + "learning_rate": 0.00018525625803988104, + "loss": 1.0384, + "step": 473 + }, + { + "epoch": 0.5354419655464558, + "grad_norm": 0.029859617352485657, + "learning_rate": 0.00018519412323545194, + "loss": 0.9886, + "step": 474 + }, + { + "epoch": 0.5365715899463428, + "grad_norm": 0.030883649364113808, + "learning_rate": 0.00018513186824388879, + "loss": 1.1247, + "step": 475 + }, + { + "epoch": 0.5377012143462299, + "grad_norm": 0.030706819146871567, + "learning_rate": 0.00018506949315301742, + "loss": 0.9923, + "step": 476 + }, + { + "epoch": 0.5388308387461169, + "grad_norm": 0.02973487228155136, + "learning_rate": 0.00018500699805083318, + "loss": 0.9388, + "step": 477 + }, + { + "epoch": 0.539960463146004, + "grad_norm": 0.03165286406874657, + "learning_rate": 0.00018494438302550062, + "loss": 1.0297, + "step": 478 + }, + { + "epoch": 0.5410900875458909, + "grad_norm": 0.0324639268219471, + "learning_rate": 0.0001848816481653536, + "loss": 1.0399, + "step": 479 + }, + { + "epoch": 0.542219711945778, + "grad_norm": 0.03156152740120888, + "learning_rate": 0.00018481879355889495, + "loss": 0.9528, + "step": 480 + }, + { + "epoch": 0.5433493363456651, + "grad_norm": 0.030102282762527466, + "learning_rate": 0.00018475581929479646, + "loss": 0.9972, + "step": 481 + }, + { + "epoch": 0.5444789607455521, + "grad_norm": 0.03062708117067814, + "learning_rate": 0.0001846927254618987, + "loss": 0.9629, + "step": 482 + }, + { + "epoch": 0.5456085851454392, + "grad_norm": 0.02973772957921028, + "learning_rate": 0.000184629512149211, + "loss": 1.1059, + "step": 483 + }, + { + "epoch": 0.5467382095453261, + "grad_norm": 0.030491316691040993, + "learning_rate": 0.00018456617944591111, + "loss": 1.093, + "step": 484 + }, + { + "epoch": 0.5478678339452132, + "grad_norm": 0.029982471838593483, + "learning_rate": 0.00018450272744134532, + "loss": 1.0719, + "step": 485 + }, + { + "epoch": 0.5489974583451003, + "grad_norm": 0.03204856067895889, + "learning_rate": 0.00018443915622502822, + "loss": 1.0136, + "step": 486 + }, + { + "epoch": 0.5501270827449873, + "grad_norm": 0.030183738097548485, + "learning_rate": 0.00018437546588664252, + "loss": 1.0613, + "step": 487 + }, + { + "epoch": 0.5512567071448743, + "grad_norm": 0.03049345500767231, + "learning_rate": 0.00018431165651603903, + "loss": 0.9428, + "step": 488 + }, + { + "epoch": 0.5523863315447614, + "grad_norm": 0.030976206064224243, + "learning_rate": 0.00018424772820323644, + "loss": 0.9908, + "step": 489 + }, + { + "epoch": 0.5535159559446484, + "grad_norm": 0.030059922486543655, + "learning_rate": 0.00018418368103842125, + "loss": 0.9546, + "step": 490 + }, + { + "epoch": 0.5546455803445355, + "grad_norm": 0.029848681762814522, + "learning_rate": 0.0001841195151119477, + "loss": 1.0269, + "step": 491 + }, + { + "epoch": 0.5557752047444224, + "grad_norm": 0.03216058760881424, + "learning_rate": 0.00018405523051433743, + "loss": 0.9717, + "step": 492 + }, + { + "epoch": 0.5569048291443095, + "grad_norm": 0.030524935573339462, + "learning_rate": 0.00018399082733627965, + "loss": 1.0208, + "step": 493 + }, + { + "epoch": 0.5580344535441966, + "grad_norm": 0.03152266517281532, + "learning_rate": 0.00018392630566863076, + "loss": 1.0353, + "step": 494 + }, + { + "epoch": 0.5591640779440836, + "grad_norm": 0.03233015537261963, + "learning_rate": 0.00018386166560241434, + "loss": 1.1238, + "step": 495 + }, + { + "epoch": 0.5602937023439706, + "grad_norm": 0.031183136627078056, + "learning_rate": 0.000183796907228821, + "loss": 1.0266, + "step": 496 + }, + { + "epoch": 0.5614233267438576, + "grad_norm": 0.030228251591324806, + "learning_rate": 0.00018373203063920822, + "loss": 1.0074, + "step": 497 + }, + { + "epoch": 0.5625529511437447, + "grad_norm": 0.031268905848264694, + "learning_rate": 0.00018366703592510034, + "loss": 1.0106, + "step": 498 + }, + { + "epoch": 0.5636825755436318, + "grad_norm": 0.031185952946543694, + "learning_rate": 0.0001836019231781883, + "loss": 1.0476, + "step": 499 + }, + { + "epoch": 0.5648121999435188, + "grad_norm": 0.03026709146797657, + "learning_rate": 0.0001835366924903295, + "loss": 1.0619, + "step": 500 + }, + { + "epoch": 0.5659418243434058, + "grad_norm": 0.029817136004567146, + "learning_rate": 0.00018347134395354776, + "loss": 1.0016, + "step": 501 + }, + { + "epoch": 0.5670714487432928, + "grad_norm": 0.030526304617524147, + "learning_rate": 0.00018340587766003323, + "loss": 1.0559, + "step": 502 + }, + { + "epoch": 0.5682010731431799, + "grad_norm": 0.03136800602078438, + "learning_rate": 0.00018334029370214208, + "loss": 0.9867, + "step": 503 + }, + { + "epoch": 0.569330697543067, + "grad_norm": 0.030273810029029846, + "learning_rate": 0.0001832745921723965, + "loss": 0.9358, + "step": 504 + }, + { + "epoch": 0.5704603219429539, + "grad_norm": 0.02991536259651184, + "learning_rate": 0.00018320877316348454, + "loss": 0.9964, + "step": 505 + }, + { + "epoch": 0.571589946342841, + "grad_norm": 0.031318966299295425, + "learning_rate": 0.00018314283676826009, + "loss": 0.9946, + "step": 506 + }, + { + "epoch": 0.5727195707427281, + "grad_norm": 0.030620397999882698, + "learning_rate": 0.00018307678307974241, + "loss": 1.0597, + "step": 507 + }, + { + "epoch": 0.5738491951426151, + "grad_norm": 0.03023059107363224, + "learning_rate": 0.0001830106121911165, + "loss": 0.9825, + "step": 508 + }, + { + "epoch": 0.5749788195425021, + "grad_norm": 0.03067387081682682, + "learning_rate": 0.0001829443241957325, + "loss": 0.9863, + "step": 509 + }, + { + "epoch": 0.5761084439423891, + "grad_norm": 0.03259598836302757, + "learning_rate": 0.00018287791918710587, + "loss": 1.0366, + "step": 510 + }, + { + "epoch": 0.5772380683422762, + "grad_norm": 0.03081597201526165, + "learning_rate": 0.00018281139725891707, + "loss": 1.144, + "step": 511 + }, + { + "epoch": 0.5783676927421633, + "grad_norm": 0.03100423514842987, + "learning_rate": 0.00018274475850501158, + "loss": 1.011, + "step": 512 + }, + { + "epoch": 0.5794973171420502, + "grad_norm": 0.030796082690358162, + "learning_rate": 0.00018267800301939965, + "loss": 0.8843, + "step": 513 + }, + { + "epoch": 0.5806269415419373, + "grad_norm": 0.030977580696344376, + "learning_rate": 0.00018261113089625613, + "loss": 1.0606, + "step": 514 + }, + { + "epoch": 0.5817565659418243, + "grad_norm": 0.03037908300757408, + "learning_rate": 0.0001825441422299206, + "loss": 0.9751, + "step": 515 + }, + { + "epoch": 0.5828861903417114, + "grad_norm": 0.03079284355044365, + "learning_rate": 0.00018247703711489686, + "loss": 1.0062, + "step": 516 + }, + { + "epoch": 0.5840158147415985, + "grad_norm": 0.031534090638160706, + "learning_rate": 0.00018240981564585313, + "loss": 0.949, + "step": 517 + }, + { + "epoch": 0.5851454391414854, + "grad_norm": 0.03137180209159851, + "learning_rate": 0.0001823424779176217, + "loss": 1.0799, + "step": 518 + }, + { + "epoch": 0.5862750635413725, + "grad_norm": 0.0305685643106699, + "learning_rate": 0.00018227502402519893, + "loss": 1.0609, + "step": 519 + }, + { + "epoch": 0.5874046879412596, + "grad_norm": 0.02950458414852619, + "learning_rate": 0.00018220745406374498, + "loss": 0.9671, + "step": 520 + }, + { + "epoch": 0.5885343123411466, + "grad_norm": 0.030199820175766945, + "learning_rate": 0.00018213976812858382, + "loss": 1.0684, + "step": 521 + }, + { + "epoch": 0.5896639367410336, + "grad_norm": 0.031708989292383194, + "learning_rate": 0.00018207196631520297, + "loss": 0.9994, + "step": 522 + }, + { + "epoch": 0.5907935611409206, + "grad_norm": 0.03120891936123371, + "learning_rate": 0.00018200404871925353, + "loss": 1.001, + "step": 523 + }, + { + "epoch": 0.5919231855408077, + "grad_norm": 0.033152077347040176, + "learning_rate": 0.0001819360154365498, + "loss": 1.0489, + "step": 524 + }, + { + "epoch": 0.5930528099406948, + "grad_norm": 0.03135927394032478, + "learning_rate": 0.00018186786656306935, + "loss": 1.1065, + "step": 525 + }, + { + "epoch": 0.5941824343405817, + "grad_norm": 0.030605459585785866, + "learning_rate": 0.0001817996021949529, + "loss": 1.0116, + "step": 526 + }, + { + "epoch": 0.5953120587404688, + "grad_norm": 0.031958550214767456, + "learning_rate": 0.00018173122242850397, + "loss": 1.0113, + "step": 527 + }, + { + "epoch": 0.5964416831403558, + "grad_norm": 0.033079009503126144, + "learning_rate": 0.00018166272736018895, + "loss": 0.9531, + "step": 528 + }, + { + "epoch": 0.5975713075402429, + "grad_norm": 0.0316440686583519, + "learning_rate": 0.00018159411708663684, + "loss": 0.9916, + "step": 529 + }, + { + "epoch": 0.5987009319401299, + "grad_norm": 0.030489858239889145, + "learning_rate": 0.00018152539170463925, + "loss": 0.995, + "step": 530 + }, + { + "epoch": 0.5998305563400169, + "grad_norm": 0.0322355218231678, + "learning_rate": 0.00018145655131115009, + "loss": 1.0784, + "step": 531 + }, + { + "epoch": 0.600960180739904, + "grad_norm": 0.03130833059549332, + "learning_rate": 0.00018138759600328563, + "loss": 1.0537, + "step": 532 + }, + { + "epoch": 0.602089805139791, + "grad_norm": 0.031001951545476913, + "learning_rate": 0.0001813185258783241, + "loss": 0.9956, + "step": 533 + }, + { + "epoch": 0.6032194295396781, + "grad_norm": 0.03067929483950138, + "learning_rate": 0.0001812493410337058, + "loss": 1.0148, + "step": 534 + }, + { + "epoch": 0.6043490539395651, + "grad_norm": 0.03192298486828804, + "learning_rate": 0.00018118004156703296, + "loss": 0.9635, + "step": 535 + }, + { + "epoch": 0.6054786783394521, + "grad_norm": 0.031253885477781296, + "learning_rate": 0.00018111062757606932, + "loss": 0.9987, + "step": 536 + }, + { + "epoch": 0.6066083027393392, + "grad_norm": 0.031125715002417564, + "learning_rate": 0.0001810410991587403, + "loss": 0.9915, + "step": 537 + }, + { + "epoch": 0.6077379271392263, + "grad_norm": 0.03175501897931099, + "learning_rate": 0.00018097145641313272, + "loss": 1.0357, + "step": 538 + }, + { + "epoch": 0.6088675515391132, + "grad_norm": 0.031910236924886703, + "learning_rate": 0.00018090169943749476, + "loss": 1.0679, + "step": 539 + }, + { + "epoch": 0.6099971759390003, + "grad_norm": 0.03214259445667267, + "learning_rate": 0.00018083182833023562, + "loss": 1.0173, + "step": 540 + }, + { + "epoch": 0.6111268003388873, + "grad_norm": 0.03169810026884079, + "learning_rate": 0.00018076184318992558, + "loss": 1.0428, + "step": 541 + }, + { + "epoch": 0.6122564247387744, + "grad_norm": 0.03129338473081589, + "learning_rate": 0.00018069174411529577, + "loss": 1.0236, + "step": 542 + }, + { + "epoch": 0.6133860491386613, + "grad_norm": 0.03245764225721359, + "learning_rate": 0.0001806215312052381, + "loss": 1.0081, + "step": 543 + }, + { + "epoch": 0.6145156735385484, + "grad_norm": 0.030435949563980103, + "learning_rate": 0.0001805512045588051, + "loss": 1.0731, + "step": 544 + }, + { + "epoch": 0.6156452979384355, + "grad_norm": 0.030730856582522392, + "learning_rate": 0.0001804807642752096, + "loss": 1.0793, + "step": 545 + }, + { + "epoch": 0.6167749223383225, + "grad_norm": 0.02937515825033188, + "learning_rate": 0.00018041021045382485, + "loss": 1.0123, + "step": 546 + }, + { + "epoch": 0.6179045467382095, + "grad_norm": 0.03019302524626255, + "learning_rate": 0.0001803395431941843, + "loss": 1.0232, + "step": 547 + }, + { + "epoch": 0.6190341711380966, + "grad_norm": 0.04123188927769661, + "learning_rate": 0.00018026876259598135, + "loss": 1.0309, + "step": 548 + }, + { + "epoch": 0.6201637955379836, + "grad_norm": 0.03046722523868084, + "learning_rate": 0.00018019786875906935, + "loss": 0.9721, + "step": 549 + }, + { + "epoch": 0.6212934199378707, + "grad_norm": 0.033260468393564224, + "learning_rate": 0.00018012686178346142, + "loss": 1.0726, + "step": 550 + }, + { + "epoch": 0.6224230443377577, + "grad_norm": 0.03144606575369835, + "learning_rate": 0.0001800557417693302, + "loss": 0.947, + "step": 551 + }, + { + "epoch": 0.6235526687376447, + "grad_norm": 0.03095083311200142, + "learning_rate": 0.00017998450881700787, + "loss": 0.9937, + "step": 552 + }, + { + "epoch": 0.6246822931375318, + "grad_norm": 0.03133854269981384, + "learning_rate": 0.00017991316302698595, + "loss": 0.9502, + "step": 553 + }, + { + "epoch": 0.6258119175374188, + "grad_norm": 0.03148304298520088, + "learning_rate": 0.00017984170449991506, + "loss": 1.1628, + "step": 554 + }, + { + "epoch": 0.6269415419373059, + "grad_norm": 0.03164827451109886, + "learning_rate": 0.000179770133336605, + "loss": 0.8814, + "step": 555 + }, + { + "epoch": 0.6280711663371928, + "grad_norm": 0.03083074651658535, + "learning_rate": 0.0001796984496380243, + "loss": 0.9999, + "step": 556 + }, + { + "epoch": 0.6292007907370799, + "grad_norm": 0.03223288804292679, + "learning_rate": 0.0001796266535053004, + "loss": 1.0819, + "step": 557 + }, + { + "epoch": 0.630330415136967, + "grad_norm": 0.03053288348019123, + "learning_rate": 0.00017955474503971925, + "loss": 1.1077, + "step": 558 + }, + { + "epoch": 0.631460039536854, + "grad_norm": 0.03127776086330414, + "learning_rate": 0.00017948272434272535, + "loss": 1.041, + "step": 559 + }, + { + "epoch": 0.632589663936741, + "grad_norm": 0.03209880739450455, + "learning_rate": 0.00017941059151592147, + "loss": 1.0081, + "step": 560 + }, + { + "epoch": 0.633719288336628, + "grad_norm": 0.02959609404206276, + "learning_rate": 0.00017933834666106864, + "loss": 0.9875, + "step": 561 + }, + { + "epoch": 0.6348489127365151, + "grad_norm": 0.03344092145562172, + "learning_rate": 0.00017926598988008582, + "loss": 0.9677, + "step": 562 + }, + { + "epoch": 0.6359785371364022, + "grad_norm": 0.03260407596826553, + "learning_rate": 0.00017919352127505, + "loss": 1.0449, + "step": 563 + }, + { + "epoch": 0.6371081615362891, + "grad_norm": 0.031249945983290672, + "learning_rate": 0.0001791209409481958, + "loss": 1.0662, + "step": 564 + }, + { + "epoch": 0.6382377859361762, + "grad_norm": 0.031923823058605194, + "learning_rate": 0.00017904824900191556, + "loss": 1.0379, + "step": 565 + }, + { + "epoch": 0.6393674103360633, + "grad_norm": 0.030242929235100746, + "learning_rate": 0.00017897544553875902, + "loss": 1.0257, + "step": 566 + }, + { + "epoch": 0.6404970347359503, + "grad_norm": 0.032716382294893265, + "learning_rate": 0.00017890253066143324, + "loss": 0.9987, + "step": 567 + }, + { + "epoch": 0.6416266591358374, + "grad_norm": 0.03140626102685928, + "learning_rate": 0.0001788295044728025, + "loss": 1.0162, + "step": 568 + }, + { + "epoch": 0.6427562835357243, + "grad_norm": 0.029912738129496574, + "learning_rate": 0.0001787563670758881, + "loss": 1.0318, + "step": 569 + }, + { + "epoch": 0.6438859079356114, + "grad_norm": 0.03130066394805908, + "learning_rate": 0.0001786831185738682, + "loss": 1.0026, + "step": 570 + }, + { + "epoch": 0.6450155323354985, + "grad_norm": 0.033079057931900024, + "learning_rate": 0.00017860975907007772, + "loss": 1.0262, + "step": 571 + }, + { + "epoch": 0.6461451567353855, + "grad_norm": 0.03027520515024662, + "learning_rate": 0.00017853628866800812, + "loss": 1.0075, + "step": 572 + }, + { + "epoch": 0.6472747811352725, + "grad_norm": 0.03166157007217407, + "learning_rate": 0.00017846270747130742, + "loss": 1.0858, + "step": 573 + }, + { + "epoch": 0.6484044055351595, + "grad_norm": 0.03081650286912918, + "learning_rate": 0.00017838901558377986, + "loss": 1.0215, + "step": 574 + }, + { + "epoch": 0.6495340299350466, + "grad_norm": 0.03256387263536453, + "learning_rate": 0.0001783152131093859, + "loss": 0.973, + "step": 575 + }, + { + "epoch": 0.6506636543349337, + "grad_norm": 0.030604898929595947, + "learning_rate": 0.00017824130015224192, + "loss": 1.057, + "step": 576 + }, + { + "epoch": 0.6517932787348206, + "grad_norm": 0.030695458874106407, + "learning_rate": 0.00017816727681662023, + "loss": 1.0804, + "step": 577 + }, + { + "epoch": 0.6529229031347077, + "grad_norm": 0.031340498477220535, + "learning_rate": 0.0001780931432069488, + "loss": 0.9722, + "step": 578 + }, + { + "epoch": 0.6540525275345948, + "grad_norm": 0.03206819295883179, + "learning_rate": 0.00017801889942781126, + "loss": 1.0593, + "step": 579 + }, + { + "epoch": 0.6551821519344818, + "grad_norm": 0.030380915850400925, + "learning_rate": 0.00017794454558394657, + "loss": 0.9263, + "step": 580 + }, + { + "epoch": 0.6563117763343688, + "grad_norm": 0.03320132568478584, + "learning_rate": 0.00017787008178024905, + "loss": 1.0798, + "step": 581 + }, + { + "epoch": 0.6574414007342558, + "grad_norm": 0.0311865396797657, + "learning_rate": 0.00017779550812176806, + "loss": 0.9205, + "step": 582 + }, + { + "epoch": 0.6585710251341429, + "grad_norm": 0.032210152596235275, + "learning_rate": 0.00017772082471370797, + "loss": 1.0411, + "step": 583 + }, + { + "epoch": 0.65970064953403, + "grad_norm": 0.03178109973669052, + "learning_rate": 0.00017764603166142798, + "loss": 1.0502, + "step": 584 + }, + { + "epoch": 0.660830273933917, + "grad_norm": 0.0323721244931221, + "learning_rate": 0.000177571129070442, + "loss": 1.0372, + "step": 585 + }, + { + "epoch": 0.661959898333804, + "grad_norm": 0.031241752207279205, + "learning_rate": 0.0001774961170464184, + "loss": 0.9741, + "step": 586 + }, + { + "epoch": 0.663089522733691, + "grad_norm": 0.03263148292899132, + "learning_rate": 0.00017742099569518, + "loss": 1.0956, + "step": 587 + }, + { + "epoch": 0.6642191471335781, + "grad_norm": 0.031760070472955704, + "learning_rate": 0.00017734576512270383, + "loss": 0.9795, + "step": 588 + }, + { + "epoch": 0.6653487715334652, + "grad_norm": 0.03184381127357483, + "learning_rate": 0.00017727042543512099, + "loss": 0.9054, + "step": 589 + }, + { + "epoch": 0.6664783959333521, + "grad_norm": 0.03145081177353859, + "learning_rate": 0.00017719497673871653, + "loss": 1.0219, + "step": 590 + }, + { + "epoch": 0.6664783959333521, + "eval_loss": 1.0118999481201172, + "eval_runtime": 547.41, + "eval_samples_per_second": 17.871, + "eval_steps_per_second": 8.937, + "step": 590 + }, + { + "epoch": 0.6676080203332392, + "grad_norm": 0.03253559768199921, + "learning_rate": 0.00017711941913992928, + "loss": 0.9635, + "step": 591 + }, + { + "epoch": 0.6687376447331262, + "grad_norm": 0.03181855380535126, + "learning_rate": 0.00017704375274535167, + "loss": 0.8852, + "step": 592 + }, + { + "epoch": 0.6698672691330133, + "grad_norm": 0.03165988251566887, + "learning_rate": 0.0001769679776617297, + "loss": 1.0201, + "step": 593 + }, + { + "epoch": 0.6709968935329003, + "grad_norm": 0.03077312745153904, + "learning_rate": 0.00017689209399596257, + "loss": 1.0307, + "step": 594 + }, + { + "epoch": 0.6721265179327873, + "grad_norm": 0.032232630997896194, + "learning_rate": 0.00017681610185510285, + "loss": 1.0121, + "step": 595 + }, + { + "epoch": 0.6732561423326744, + "grad_norm": 0.03249699994921684, + "learning_rate": 0.0001767400013463559, + "loss": 0.9288, + "step": 596 + }, + { + "epoch": 0.6743857667325615, + "grad_norm": 0.03216133266687393, + "learning_rate": 0.0001766637925770802, + "loss": 0.9665, + "step": 597 + }, + { + "epoch": 0.6755153911324484, + "grad_norm": 0.03151794150471687, + "learning_rate": 0.00017658747565478677, + "loss": 1.0497, + "step": 598 + }, + { + "epoch": 0.6766450155323355, + "grad_norm": 0.03118024580180645, + "learning_rate": 0.00017651105068713935, + "loss": 0.9403, + "step": 599 + }, + { + "epoch": 0.6777746399322225, + "grad_norm": 0.030804403126239777, + "learning_rate": 0.00017643451778195395, + "loss": 1.0011, + "step": 600 + }, + { + "epoch": 0.6789042643321096, + "grad_norm": 0.03352154418826103, + "learning_rate": 0.000176357877047199, + "loss": 1.0294, + "step": 601 + }, + { + "epoch": 0.6800338887319967, + "grad_norm": 0.03205511346459389, + "learning_rate": 0.00017628112859099498, + "loss": 1.0487, + "step": 602 + }, + { + "epoch": 0.6811635131318836, + "grad_norm": 0.031228026375174522, + "learning_rate": 0.00017620427252161433, + "loss": 0.9319, + "step": 603 + }, + { + "epoch": 0.6822931375317707, + "grad_norm": 0.031973280012607574, + "learning_rate": 0.00017612730894748136, + "loss": 1.0829, + "step": 604 + }, + { + "epoch": 0.6834227619316577, + "grad_norm": 0.03331442177295685, + "learning_rate": 0.00017605023797717195, + "loss": 1.0669, + "step": 605 + }, + { + "epoch": 0.6845523863315448, + "grad_norm": 0.0336139053106308, + "learning_rate": 0.00017597305971941358, + "loss": 1.0722, + "step": 606 + }, + { + "epoch": 0.6856820107314318, + "grad_norm": 0.03086121752858162, + "learning_rate": 0.00017589577428308502, + "loss": 1.092, + "step": 607 + }, + { + "epoch": 0.6868116351313188, + "grad_norm": 0.032204341143369675, + "learning_rate": 0.0001758183817772163, + "loss": 0.9483, + "step": 608 + }, + { + "epoch": 0.6879412595312059, + "grad_norm": 0.03183162584900856, + "learning_rate": 0.00017574088231098843, + "loss": 1.0029, + "step": 609 + }, + { + "epoch": 0.689070883931093, + "grad_norm": 0.031096026301383972, + "learning_rate": 0.00017566327599373338, + "loss": 1.0094, + "step": 610 + }, + { + "epoch": 0.6902005083309799, + "grad_norm": 0.032303981482982635, + "learning_rate": 0.0001755855629349338, + "loss": 0.976, + "step": 611 + }, + { + "epoch": 0.691330132730867, + "grad_norm": 0.03237254545092583, + "learning_rate": 0.00017550774324422296, + "loss": 0.9472, + "step": 612 + }, + { + "epoch": 0.692459757130754, + "grad_norm": 0.03161952272057533, + "learning_rate": 0.0001754298170313846, + "loss": 0.995, + "step": 613 + }, + { + "epoch": 0.6935893815306411, + "grad_norm": 0.032882727682590485, + "learning_rate": 0.00017535178440635264, + "loss": 0.9078, + "step": 614 + }, + { + "epoch": 0.694719005930528, + "grad_norm": 0.030476143583655357, + "learning_rate": 0.0001752736454792112, + "loss": 0.9488, + "step": 615 + }, + { + "epoch": 0.6958486303304151, + "grad_norm": 0.032640308141708374, + "learning_rate": 0.00017519540036019428, + "loss": 0.9968, + "step": 616 + }, + { + "epoch": 0.6969782547303022, + "grad_norm": 0.03207506611943245, + "learning_rate": 0.00017511704915968581, + "loss": 1.0598, + "step": 617 + }, + { + "epoch": 0.6981078791301892, + "grad_norm": 0.04379906877875328, + "learning_rate": 0.0001750385919882193, + "loss": 1.0801, + "step": 618 + }, + { + "epoch": 0.6992375035300763, + "grad_norm": 0.03258811682462692, + "learning_rate": 0.00017496002895647775, + "loss": 1.0901, + "step": 619 + }, + { + "epoch": 0.7003671279299633, + "grad_norm": 0.034346289932727814, + "learning_rate": 0.0001748813601752935, + "loss": 0.9995, + "step": 620 + }, + { + "epoch": 0.7014967523298503, + "grad_norm": 0.0332537442445755, + "learning_rate": 0.0001748025857556481, + "loss": 1.0382, + "step": 621 + }, + { + "epoch": 0.7026263767297374, + "grad_norm": 0.031845249235630035, + "learning_rate": 0.0001747237058086722, + "loss": 1.1217, + "step": 622 + }, + { + "epoch": 0.7037560011296244, + "grad_norm": 0.032574612647295, + "learning_rate": 0.00017464472044564512, + "loss": 0.8765, + "step": 623 + }, + { + "epoch": 0.7048856255295114, + "grad_norm": 0.031227873638272285, + "learning_rate": 0.00017456562977799514, + "loss": 0.9676, + "step": 624 + }, + { + "epoch": 0.7060152499293985, + "grad_norm": 0.032643262296915054, + "learning_rate": 0.00017448643391729888, + "loss": 0.9842, + "step": 625 + }, + { + "epoch": 0.7071448743292855, + "grad_norm": 0.031137650832533836, + "learning_rate": 0.00017440713297528154, + "loss": 0.9877, + "step": 626 + }, + { + "epoch": 0.7082744987291726, + "grad_norm": 0.030961019918322563, + "learning_rate": 0.0001743277270638164, + "loss": 1.017, + "step": 627 + }, + { + "epoch": 0.7094041231290595, + "grad_norm": 0.032677747309207916, + "learning_rate": 0.00017424821629492495, + "loss": 1.0023, + "step": 628 + }, + { + "epoch": 0.7105337475289466, + "grad_norm": 0.032716501504182816, + "learning_rate": 0.00017416860078077657, + "loss": 0.9893, + "step": 629 + }, + { + "epoch": 0.7116633719288337, + "grad_norm": 0.03239135444164276, + "learning_rate": 0.0001740888806336884, + "loss": 0.9949, + "step": 630 + }, + { + "epoch": 0.7127929963287207, + "grad_norm": 0.03276536986231804, + "learning_rate": 0.0001740090559661252, + "loss": 1.0778, + "step": 631 + }, + { + "epoch": 0.7139226207286078, + "grad_norm": 0.030909627676010132, + "learning_rate": 0.00017392912689069917, + "loss": 1.0098, + "step": 632 + }, + { + "epoch": 0.7150522451284947, + "grad_norm": 0.032094262540340424, + "learning_rate": 0.00017384909352016975, + "loss": 0.9703, + "step": 633 + }, + { + "epoch": 0.7161818695283818, + "grad_norm": 0.03388513997197151, + "learning_rate": 0.00017376895596744367, + "loss": 1.0014, + "step": 634 + }, + { + "epoch": 0.7173114939282689, + "grad_norm": 0.03186871111392975, + "learning_rate": 0.00017368871434557447, + "loss": 1.0076, + "step": 635 + }, + { + "epoch": 0.7184411183281559, + "grad_norm": 0.03189585730433464, + "learning_rate": 0.00017360836876776256, + "loss": 0.9721, + "step": 636 + }, + { + "epoch": 0.7195707427280429, + "grad_norm": 0.033433668315410614, + "learning_rate": 0.0001735279193473551, + "loss": 0.9798, + "step": 637 + }, + { + "epoch": 0.72070036712793, + "grad_norm": 0.031073307618498802, + "learning_rate": 0.00017344736619784553, + "loss": 0.9629, + "step": 638 + }, + { + "epoch": 0.721829991527817, + "grad_norm": 0.030326619744300842, + "learning_rate": 0.00017336670943287388, + "loss": 1.0727, + "step": 639 + }, + { + "epoch": 0.7229596159277041, + "grad_norm": 0.03189557045698166, + "learning_rate": 0.00017328594916622616, + "loss": 1.0175, + "step": 640 + }, + { + "epoch": 0.724089240327591, + "grad_norm": 0.03241390734910965, + "learning_rate": 0.00017320508551183446, + "loss": 1.1313, + "step": 641 + }, + { + "epoch": 0.7252188647274781, + "grad_norm": 0.0323265865445137, + "learning_rate": 0.0001731241185837768, + "loss": 1.0418, + "step": 642 + }, + { + "epoch": 0.7263484891273652, + "grad_norm": 0.032344575971364975, + "learning_rate": 0.00017304304849627677, + "loss": 1.0882, + "step": 643 + }, + { + "epoch": 0.7274781135272522, + "grad_norm": 0.032916001975536346, + "learning_rate": 0.00017296187536370355, + "loss": 0.9596, + "step": 644 + }, + { + "epoch": 0.7286077379271392, + "grad_norm": 0.031346168369054794, + "learning_rate": 0.00017288059930057166, + "loss": 0.9729, + "step": 645 + }, + { + "epoch": 0.7297373623270262, + "grad_norm": 0.032094355672597885, + "learning_rate": 0.00017279922042154092, + "loss": 1.0331, + "step": 646 + }, + { + "epoch": 0.7308669867269133, + "grad_norm": 0.03289850801229477, + "learning_rate": 0.00017271773884141607, + "loss": 1.0411, + "step": 647 + }, + { + "epoch": 0.7319966111268004, + "grad_norm": 0.03297988697886467, + "learning_rate": 0.0001726361546751468, + "loss": 0.9847, + "step": 648 + }, + { + "epoch": 0.7331262355266874, + "grad_norm": 0.03368350863456726, + "learning_rate": 0.00017255446803782754, + "loss": 0.9978, + "step": 649 + }, + { + "epoch": 0.7342558599265744, + "grad_norm": 0.03280177712440491, + "learning_rate": 0.00017247267904469725, + "loss": 1.0363, + "step": 650 + }, + { + "epoch": 0.7353854843264614, + "grad_norm": 0.031202217563986778, + "learning_rate": 0.00017239078781113926, + "loss": 1.025, + "step": 651 + }, + { + "epoch": 0.7365151087263485, + "grad_norm": 0.03219619765877724, + "learning_rate": 0.00017230879445268124, + "loss": 0.9878, + "step": 652 + }, + { + "epoch": 0.7376447331262356, + "grad_norm": 0.033005617558956146, + "learning_rate": 0.00017222669908499482, + "loss": 1.0223, + "step": 653 + }, + { + "epoch": 0.7387743575261225, + "grad_norm": 0.03350326791405678, + "learning_rate": 0.00017214450182389559, + "loss": 1.0173, + "step": 654 + }, + { + "epoch": 0.7399039819260096, + "grad_norm": 0.031389541923999786, + "learning_rate": 0.00017206220278534286, + "loss": 1.0458, + "step": 655 + }, + { + "epoch": 0.7410336063258967, + "grad_norm": 0.031541019678115845, + "learning_rate": 0.00017197980208543954, + "loss": 0.9489, + "step": 656 + }, + { + "epoch": 0.7421632307257837, + "grad_norm": 0.03202977776527405, + "learning_rate": 0.00017189729984043204, + "loss": 1.0364, + "step": 657 + }, + { + "epoch": 0.7432928551256707, + "grad_norm": 0.03152487054467201, + "learning_rate": 0.00017181469616670984, + "loss": 0.9827, + "step": 658 + }, + { + "epoch": 0.7444224795255577, + "grad_norm": 0.03225429356098175, + "learning_rate": 0.00017173199118080564, + "loss": 1.0996, + "step": 659 + }, + { + "epoch": 0.7455521039254448, + "grad_norm": 0.03217494860291481, + "learning_rate": 0.00017164918499939504, + "loss": 0.9355, + "step": 660 + }, + { + "epoch": 0.7466817283253319, + "grad_norm": 0.032104648649692535, + "learning_rate": 0.00017156627773929644, + "loss": 1.0552, + "step": 661 + }, + { + "epoch": 0.7478113527252188, + "grad_norm": 0.03186746686697006, + "learning_rate": 0.0001714832695174707, + "loss": 1.071, + "step": 662 + }, + { + "epoch": 0.7489409771251059, + "grad_norm": 0.03182530775666237, + "learning_rate": 0.00017140016045102133, + "loss": 1.0688, + "step": 663 + }, + { + "epoch": 0.7500706015249929, + "grad_norm": 0.03153397887945175, + "learning_rate": 0.00017131695065719386, + "loss": 0.9624, + "step": 664 + }, + { + "epoch": 0.75120022592488, + "grad_norm": 0.03226126730442047, + "learning_rate": 0.0001712336402533761, + "loss": 1.1134, + "step": 665 + }, + { + "epoch": 0.7523298503247671, + "grad_norm": 0.031511638313531876, + "learning_rate": 0.00017115022935709778, + "loss": 1.0753, + "step": 666 + }, + { + "epoch": 0.753459474724654, + "grad_norm": 0.03331499546766281, + "learning_rate": 0.00017106671808603027, + "loss": 0.9709, + "step": 667 + }, + { + "epoch": 0.7545890991245411, + "grad_norm": 0.032829850912094116, + "learning_rate": 0.0001709831065579867, + "loss": 0.9839, + "step": 668 + }, + { + "epoch": 0.7557187235244281, + "grad_norm": 0.032828208059072495, + "learning_rate": 0.00017089939489092152, + "loss": 0.9924, + "step": 669 + }, + { + "epoch": 0.7568483479243152, + "grad_norm": 0.0320126973092556, + "learning_rate": 0.00017081558320293055, + "loss": 0.9649, + "step": 670 + }, + { + "epoch": 0.7579779723242022, + "grad_norm": 0.03252957761287689, + "learning_rate": 0.0001707316716122506, + "loss": 0.9643, + "step": 671 + }, + { + "epoch": 0.7591075967240892, + "grad_norm": 0.032323576509952545, + "learning_rate": 0.00017064766023725948, + "loss": 0.9962, + "step": 672 + }, + { + "epoch": 0.7602372211239763, + "grad_norm": 0.03305547684431076, + "learning_rate": 0.00017056354919647583, + "loss": 1.0864, + "step": 673 + }, + { + "epoch": 0.7613668455238634, + "grad_norm": 0.0321505106985569, + "learning_rate": 0.0001704793386085588, + "loss": 1.01, + "step": 674 + }, + { + "epoch": 0.7624964699237503, + "grad_norm": 0.03243474289774895, + "learning_rate": 0.000170395028592308, + "loss": 1.1192, + "step": 675 + }, + { + "epoch": 0.7636260943236374, + "grad_norm": 0.03369235247373581, + "learning_rate": 0.00017031061926666333, + "loss": 0.9846, + "step": 676 + }, + { + "epoch": 0.7647557187235244, + "grad_norm": 0.03172389790415764, + "learning_rate": 0.00017022611075070474, + "loss": 1.0406, + "step": 677 + }, + { + "epoch": 0.7658853431234115, + "grad_norm": 0.03241589665412903, + "learning_rate": 0.00017014150316365216, + "loss": 0.9235, + "step": 678 + }, + { + "epoch": 0.7670149675232985, + "grad_norm": 0.03271762281656265, + "learning_rate": 0.0001700567966248653, + "loss": 0.9516, + "step": 679 + }, + { + "epoch": 0.7681445919231855, + "grad_norm": 0.032931018620729446, + "learning_rate": 0.00016997199125384343, + "loss": 1.0315, + "step": 680 + }, + { + "epoch": 0.7692742163230726, + "grad_norm": 0.032814498990774155, + "learning_rate": 0.00016988708717022522, + "loss": 0.941, + "step": 681 + }, + { + "epoch": 0.7704038407229596, + "grad_norm": 0.031445201486349106, + "learning_rate": 0.00016980208449378866, + "loss": 1.0588, + "step": 682 + }, + { + "epoch": 0.7715334651228467, + "grad_norm": 0.033274564892053604, + "learning_rate": 0.0001697169833444508, + "loss": 0.9968, + "step": 683 + }, + { + "epoch": 0.7726630895227337, + "grad_norm": 0.03313668072223663, + "learning_rate": 0.00016963178384226763, + "loss": 1.0308, + "step": 684 + }, + { + "epoch": 0.7737927139226207, + "grad_norm": 0.032828278839588165, + "learning_rate": 0.00016954648610743384, + "loss": 1.0245, + "step": 685 + }, + { + "epoch": 0.7749223383225078, + "grad_norm": 0.03268923610448837, + "learning_rate": 0.00016946109026028274, + "loss": 1.0515, + "step": 686 + }, + { + "epoch": 0.7760519627223949, + "grad_norm": 0.03162987902760506, + "learning_rate": 0.00016937559642128604, + "loss": 0.9649, + "step": 687 + }, + { + "epoch": 0.7771815871222818, + "grad_norm": 0.03206837549805641, + "learning_rate": 0.0001692900047110537, + "loss": 1.0174, + "step": 688 + }, + { + "epoch": 0.7783112115221689, + "grad_norm": 0.03194599226117134, + "learning_rate": 0.0001692043152503338, + "loss": 0.9872, + "step": 689 + }, + { + "epoch": 0.7794408359220559, + "grad_norm": 0.032261595129966736, + "learning_rate": 0.0001691185281600122, + "loss": 1.0046, + "step": 690 + }, + { + "epoch": 0.780570460321943, + "grad_norm": 0.032003022730350494, + "learning_rate": 0.00016903264356111258, + "loss": 1.0223, + "step": 691 + }, + { + "epoch": 0.78170008472183, + "grad_norm": 0.03204648569226265, + "learning_rate": 0.00016894666157479614, + "loss": 0.9402, + "step": 692 + }, + { + "epoch": 0.782829709121717, + "grad_norm": 0.03307194262742996, + "learning_rate": 0.00016886058232236156, + "loss": 0.977, + "step": 693 + }, + { + "epoch": 0.7839593335216041, + "grad_norm": 0.03305744007229805, + "learning_rate": 0.00016877440592524457, + "loss": 1.0158, + "step": 694 + }, + { + "epoch": 0.7850889579214911, + "grad_norm": 0.03376347944140434, + "learning_rate": 0.0001686881325050181, + "loss": 0.9266, + "step": 695 + }, + { + "epoch": 0.7862185823213781, + "grad_norm": 0.031977638602256775, + "learning_rate": 0.0001686017621833919, + "loss": 0.9966, + "step": 696 + }, + { + "epoch": 0.7873482067212652, + "grad_norm": 0.02983999252319336, + "learning_rate": 0.00016851529508221235, + "loss": 1.0418, + "step": 697 + }, + { + "epoch": 0.7884778311211522, + "grad_norm": 0.032275013625621796, + "learning_rate": 0.00016842873132346252, + "loss": 1.0745, + "step": 698 + }, + { + "epoch": 0.7896074555210393, + "grad_norm": 0.031730227172374725, + "learning_rate": 0.0001683420710292617, + "loss": 1.0505, + "step": 699 + }, + { + "epoch": 0.7907370799209263, + "grad_norm": 0.03272555023431778, + "learning_rate": 0.00016825531432186543, + "loss": 0.9826, + "step": 700 + }, + { + "epoch": 0.7918667043208133, + "grad_norm": 0.03270925581455231, + "learning_rate": 0.00016816846132366523, + "loss": 0.9954, + "step": 701 + }, + { + "epoch": 0.7929963287207004, + "grad_norm": 0.03216206654906273, + "learning_rate": 0.00016808151215718853, + "loss": 1.0266, + "step": 702 + }, + { + "epoch": 0.7941259531205874, + "grad_norm": 0.035289812833070755, + "learning_rate": 0.00016799446694509834, + "loss": 0.9776, + "step": 703 + }, + { + "epoch": 0.7952555775204745, + "grad_norm": 0.03199274092912674, + "learning_rate": 0.00016790732581019321, + "loss": 1.1088, + "step": 704 + }, + { + "epoch": 0.7963852019203614, + "grad_norm": 0.032748714089393616, + "learning_rate": 0.00016782008887540704, + "loss": 1.0957, + "step": 705 + }, + { + "epoch": 0.7975148263202485, + "grad_norm": 0.03328438848257065, + "learning_rate": 0.00016773275626380882, + "loss": 1.033, + "step": 706 + }, + { + "epoch": 0.7986444507201356, + "grad_norm": 0.031454868614673615, + "learning_rate": 0.00016764532809860255, + "loss": 0.9854, + "step": 707 + }, + { + "epoch": 0.7997740751200226, + "grad_norm": 0.032634053379297256, + "learning_rate": 0.00016755780450312705, + "loss": 0.9914, + "step": 708 + }, + { + "epoch": 0.8009036995199096, + "grad_norm": 0.03341520577669144, + "learning_rate": 0.00016747018560085572, + "loss": 0.9696, + "step": 709 + }, + { + "epoch": 0.8020333239197966, + "grad_norm": 0.03382014483213425, + "learning_rate": 0.00016738247151539643, + "loss": 1.0987, + "step": 710 + }, + { + "epoch": 0.8031629483196837, + "grad_norm": 0.03288474678993225, + "learning_rate": 0.00016729466237049137, + "loss": 1.0378, + "step": 711 + }, + { + "epoch": 0.8042925727195708, + "grad_norm": 0.03332820534706116, + "learning_rate": 0.00016720675829001675, + "loss": 1.0544, + "step": 712 + }, + { + "epoch": 0.8054221971194577, + "grad_norm": 0.033210329711437225, + "learning_rate": 0.0001671187593979828, + "loss": 0.9805, + "step": 713 + }, + { + "epoch": 0.8065518215193448, + "grad_norm": 0.03217016160488129, + "learning_rate": 0.00016703066581853345, + "loss": 1.0576, + "step": 714 + }, + { + "epoch": 0.8076814459192319, + "grad_norm": 0.03476932644844055, + "learning_rate": 0.00016694247767594624, + "loss": 1.0224, + "step": 715 + }, + { + "epoch": 0.8088110703191189, + "grad_norm": 0.03346959874033928, + "learning_rate": 0.00016685419509463213, + "loss": 1.0332, + "step": 716 + }, + { + "epoch": 0.809940694719006, + "grad_norm": 0.03398541361093521, + "learning_rate": 0.00016676581819913516, + "loss": 0.8649, + "step": 717 + }, + { + "epoch": 0.8110703191188929, + "grad_norm": 0.033196430653333664, + "learning_rate": 0.0001666773471141327, + "loss": 0.9434, + "step": 718 + }, + { + "epoch": 0.81219994351878, + "grad_norm": 0.03290561959147453, + "learning_rate": 0.00016658878196443476, + "loss": 0.9993, + "step": 719 + }, + { + "epoch": 0.8133295679186671, + "grad_norm": 0.03241690620779991, + "learning_rate": 0.00016650012287498412, + "loss": 1.0136, + "step": 720 + }, + { + "epoch": 0.8144591923185541, + "grad_norm": 0.03312429040670395, + "learning_rate": 0.00016641136997085608, + "loss": 1.0292, + "step": 721 + }, + { + "epoch": 0.8155888167184411, + "grad_norm": 0.03105839341878891, + "learning_rate": 0.0001663225233772584, + "loss": 0.9277, + "step": 722 + }, + { + "epoch": 0.8167184411183281, + "grad_norm": 0.0325755774974823, + "learning_rate": 0.00016623358321953078, + "loss": 1.0722, + "step": 723 + }, + { + "epoch": 0.8178480655182152, + "grad_norm": 0.033952005207538605, + "learning_rate": 0.00016614454962314516, + "loss": 1.0253, + "step": 724 + }, + { + "epoch": 0.8189776899181023, + "grad_norm": 0.0334470197558403, + "learning_rate": 0.00016605542271370513, + "loss": 1.0267, + "step": 725 + }, + { + "epoch": 0.8201073143179892, + "grad_norm": 0.03237008675932884, + "learning_rate": 0.00016596620261694604, + "loss": 1.0669, + "step": 726 + }, + { + "epoch": 0.8212369387178763, + "grad_norm": 0.03195658326148987, + "learning_rate": 0.00016587688945873458, + "loss": 0.9879, + "step": 727 + }, + { + "epoch": 0.8223665631177633, + "grad_norm": 0.03366916999220848, + "learning_rate": 0.0001657874833650688, + "loss": 0.9801, + "step": 728 + }, + { + "epoch": 0.8234961875176504, + "grad_norm": 0.03287327662110329, + "learning_rate": 0.0001656979844620779, + "loss": 0.9283, + "step": 729 + }, + { + "epoch": 0.8246258119175374, + "grad_norm": 0.03366275876760483, + "learning_rate": 0.00016560839287602192, + "loss": 1.0678, + "step": 730 + }, + { + "epoch": 0.8257554363174244, + "grad_norm": 0.03446445241570473, + "learning_rate": 0.00016551870873329167, + "loss": 0.9899, + "step": 731 + }, + { + "epoch": 0.8268850607173115, + "grad_norm": 0.03468972072005272, + "learning_rate": 0.0001654289321604086, + "loss": 1.0614, + "step": 732 + }, + { + "epoch": 0.8280146851171986, + "grad_norm": 0.03443734720349312, + "learning_rate": 0.00016533906328402448, + "loss": 1.0321, + "step": 733 + }, + { + "epoch": 0.8291443095170856, + "grad_norm": 0.03276367112994194, + "learning_rate": 0.0001652491022309213, + "loss": 0.9848, + "step": 734 + }, + { + "epoch": 0.8302739339169726, + "grad_norm": 0.03289159759879112, + "learning_rate": 0.00016515904912801118, + "loss": 1.0121, + "step": 735 + }, + { + "epoch": 0.8314035583168596, + "grad_norm": 0.034025318920612335, + "learning_rate": 0.000165068904102336, + "loss": 1.0589, + "step": 736 + }, + { + "epoch": 0.8325331827167467, + "grad_norm": 0.03421149030327797, + "learning_rate": 0.00016497866728106735, + "loss": 1.0138, + "step": 737 + }, + { + "epoch": 0.8336628071166338, + "grad_norm": 0.03334156796336174, + "learning_rate": 0.0001648883387915063, + "loss": 1.0337, + "step": 738 + }, + { + "epoch": 0.8347924315165207, + "grad_norm": 0.03213927149772644, + "learning_rate": 0.0001647979187610833, + "loss": 1.0248, + "step": 739 + }, + { + "epoch": 0.8359220559164078, + "grad_norm": 0.03407248482108116, + "learning_rate": 0.00016470740731735787, + "loss": 0.9995, + "step": 740 + }, + { + "epoch": 0.8370516803162948, + "grad_norm": 0.03234965354204178, + "learning_rate": 0.00016461680458801858, + "loss": 1.0526, + "step": 741 + }, + { + "epoch": 0.8381813047161819, + "grad_norm": 0.03325793519616127, + "learning_rate": 0.0001645261107008827, + "loss": 0.9461, + "step": 742 + }, + { + "epoch": 0.8393109291160689, + "grad_norm": 0.034206606447696686, + "learning_rate": 0.00016443532578389606, + "loss": 0.9095, + "step": 743 + }, + { + "epoch": 0.8404405535159559, + "grad_norm": 0.03346103057265282, + "learning_rate": 0.00016434444996513305, + "loss": 1.0337, + "step": 744 + }, + { + "epoch": 0.841570177915843, + "grad_norm": 0.03360540792346001, + "learning_rate": 0.0001642534833727962, + "loss": 0.9532, + "step": 745 + }, + { + "epoch": 0.84269980231573, + "grad_norm": 0.03263968229293823, + "learning_rate": 0.0001641624261352161, + "loss": 1.0579, + "step": 746 + }, + { + "epoch": 0.843829426715617, + "grad_norm": 0.033077508211135864, + "learning_rate": 0.0001640712783808513, + "loss": 0.9993, + "step": 747 + }, + { + "epoch": 0.8449590511155041, + "grad_norm": 0.03186168894171715, + "learning_rate": 0.00016398004023828797, + "loss": 0.9576, + "step": 748 + }, + { + "epoch": 0.8460886755153911, + "grad_norm": 0.032343216240406036, + "learning_rate": 0.00016388871183623977, + "loss": 1.0693, + "step": 749 + }, + { + "epoch": 0.8472182999152782, + "grad_norm": 0.03365077078342438, + "learning_rate": 0.00016379729330354774, + "loss": 0.9867, + "step": 750 + }, + { + "epoch": 0.8483479243151653, + "grad_norm": 0.03302355110645294, + "learning_rate": 0.00016370578476918008, + "loss": 1.002, + "step": 751 + }, + { + "epoch": 0.8494775487150522, + "grad_norm": 0.034719739109277725, + "learning_rate": 0.00016361418636223198, + "loss": 0.9621, + "step": 752 + }, + { + "epoch": 0.8506071731149393, + "grad_norm": 0.03225456923246384, + "learning_rate": 0.0001635224982119253, + "loss": 1.0285, + "step": 753 + }, + { + "epoch": 0.8517367975148263, + "grad_norm": 0.03379584476351738, + "learning_rate": 0.0001634307204476087, + "loss": 1.0787, + "step": 754 + }, + { + "epoch": 0.8528664219147134, + "grad_norm": 0.03374066203832626, + "learning_rate": 0.00016333885319875702, + "loss": 1.0322, + "step": 755 + }, + { + "epoch": 0.8539960463146004, + "grad_norm": 0.03438407927751541, + "learning_rate": 0.00016324689659497155, + "loss": 1.0204, + "step": 756 + }, + { + "epoch": 0.8551256707144874, + "grad_norm": 0.03300711140036583, + "learning_rate": 0.00016315485076597957, + "loss": 1.0088, + "step": 757 + }, + { + "epoch": 0.8562552951143745, + "grad_norm": 0.032439880073070526, + "learning_rate": 0.00016306271584163416, + "loss": 1.0198, + "step": 758 + }, + { + "epoch": 0.8573849195142615, + "grad_norm": 0.0341016985476017, + "learning_rate": 0.00016297049195191415, + "loss": 1.0242, + "step": 759 + }, + { + "epoch": 0.8585145439141485, + "grad_norm": 0.032917320728302, + "learning_rate": 0.00016287817922692395, + "loss": 1.0012, + "step": 760 + }, + { + "epoch": 0.8596441683140356, + "grad_norm": 0.03229722008109093, + "learning_rate": 0.00016278577779689314, + "loss": 0.9944, + "step": 761 + }, + { + "epoch": 0.8607737927139226, + "grad_norm": 0.0344838984310627, + "learning_rate": 0.0001626932877921766, + "loss": 0.9813, + "step": 762 + }, + { + "epoch": 0.8619034171138097, + "grad_norm": 0.033522870391607285, + "learning_rate": 0.00016260070934325402, + "loss": 1.0256, + "step": 763 + }, + { + "epoch": 0.8630330415136966, + "grad_norm": 0.03514671325683594, + "learning_rate": 0.00016250804258072997, + "loss": 0.9543, + "step": 764 + }, + { + "epoch": 0.8641626659135837, + "grad_norm": 0.03211130201816559, + "learning_rate": 0.00016241528763533353, + "loss": 1.0009, + "step": 765 + }, + { + "epoch": 0.8652922903134708, + "grad_norm": 0.033048368990421295, + "learning_rate": 0.00016232244463791826, + "loss": 1.0042, + "step": 766 + }, + { + "epoch": 0.8664219147133578, + "grad_norm": 0.031953100115060806, + "learning_rate": 0.00016222951371946192, + "loss": 1.0096, + "step": 767 + }, + { + "epoch": 0.8675515391132449, + "grad_norm": 0.03293442353606224, + "learning_rate": 0.00016213649501106622, + "loss": 0.9987, + "step": 768 + }, + { + "epoch": 0.8686811635131318, + "grad_norm": 0.033335424959659576, + "learning_rate": 0.00016204338864395684, + "loss": 1.0035, + "step": 769 + }, + { + "epoch": 0.8698107879130189, + "grad_norm": 0.04050195962190628, + "learning_rate": 0.00016195019474948299, + "loss": 1.0326, + "step": 770 + }, + { + "epoch": 0.870940412312906, + "grad_norm": 0.03311360627412796, + "learning_rate": 0.00016185691345911755, + "loss": 1.0184, + "step": 771 + }, + { + "epoch": 0.872070036712793, + "grad_norm": 0.03323720395565033, + "learning_rate": 0.0001617635449044565, + "loss": 0.9625, + "step": 772 + }, + { + "epoch": 0.87319966111268, + "grad_norm": 0.03422234579920769, + "learning_rate": 0.00016167008921721902, + "loss": 1.0654, + "step": 773 + }, + { + "epoch": 0.8743292855125671, + "grad_norm": 0.034163184463977814, + "learning_rate": 0.00016157654652924723, + "loss": 0.9953, + "step": 774 + }, + { + "epoch": 0.8754589099124541, + "grad_norm": 0.03320545703172684, + "learning_rate": 0.00016148291697250594, + "loss": 0.9766, + "step": 775 + }, + { + "epoch": 0.8765885343123412, + "grad_norm": 0.03346817195415497, + "learning_rate": 0.0001613892006790825, + "loss": 0.9201, + "step": 776 + }, + { + "epoch": 0.8777181587122281, + "grad_norm": 0.03284529596567154, + "learning_rate": 0.00016129539778118667, + "loss": 0.9284, + "step": 777 + }, + { + "epoch": 0.8788477831121152, + "grad_norm": 0.032990384846925735, + "learning_rate": 0.00016120150841115037, + "loss": 1.0058, + "step": 778 + }, + { + "epoch": 0.8799774075120023, + "grad_norm": 0.03396923094987869, + "learning_rate": 0.0001611075327014275, + "loss": 1.0831, + "step": 779 + }, + { + "epoch": 0.8811070319118893, + "grad_norm": 0.03224315121769905, + "learning_rate": 0.00016101347078459373, + "loss": 0.9318, + "step": 780 + }, + { + "epoch": 0.8822366563117763, + "grad_norm": 0.031812380999326706, + "learning_rate": 0.00016091932279334645, + "loss": 1.0566, + "step": 781 + }, + { + "epoch": 0.8833662807116633, + "grad_norm": 0.033773597329854965, + "learning_rate": 0.00016082508886050437, + "loss": 0.9349, + "step": 782 + }, + { + "epoch": 0.8844959051115504, + "grad_norm": 0.03375870734453201, + "learning_rate": 0.00016073076911900754, + "loss": 0.9875, + "step": 783 + }, + { + "epoch": 0.8856255295114375, + "grad_norm": 0.03443336486816406, + "learning_rate": 0.00016063636370191692, + "loss": 1.0604, + "step": 784 + }, + { + "epoch": 0.8867551539113245, + "grad_norm": 0.032187797129154205, + "learning_rate": 0.0001605418727424145, + "loss": 0.986, + "step": 785 + }, + { + "epoch": 0.8878847783112115, + "grad_norm": 0.03341427072882652, + "learning_rate": 0.00016044729637380284, + "loss": 0.9184, + "step": 786 + }, + { + "epoch": 0.8890144027110986, + "grad_norm": 0.03245866298675537, + "learning_rate": 0.000160352634729505, + "loss": 1.1511, + "step": 787 + }, + { + "epoch": 0.8901440271109856, + "grad_norm": 0.032569848001003265, + "learning_rate": 0.00016025788794306442, + "loss": 1.0948, + "step": 788 + }, + { + "epoch": 0.8912736515108727, + "grad_norm": 0.03429558128118515, + "learning_rate": 0.0001601630561481446, + "loss": 0.9379, + "step": 789 + }, + { + "epoch": 0.8924032759107596, + "grad_norm": 0.033720601350069046, + "learning_rate": 0.00016006813947852893, + "loss": 0.9845, + "step": 790 + }, + { + "epoch": 0.8935329003106467, + "grad_norm": 0.033525846898555756, + "learning_rate": 0.00015997313806812057, + "loss": 1.0279, + "step": 791 + }, + { + "epoch": 0.8946625247105338, + "grad_norm": 0.03577594459056854, + "learning_rate": 0.00015987805205094227, + "loss": 0.9654, + "step": 792 + }, + { + "epoch": 0.8957921491104208, + "grad_norm": 0.03572090342640877, + "learning_rate": 0.00015978288156113604, + "loss": 1.0292, + "step": 793 + }, + { + "epoch": 0.8969217735103078, + "grad_norm": 0.0330742709338665, + "learning_rate": 0.00015968762673296318, + "loss": 1.0898, + "step": 794 + }, + { + "epoch": 0.8980513979101948, + "grad_norm": 0.03374762088060379, + "learning_rate": 0.0001595922877008039, + "loss": 1.0368, + "step": 795 + }, + { + "epoch": 0.8991810223100819, + "grad_norm": 0.035000476986169815, + "learning_rate": 0.00015949686459915715, + "loss": 1.0531, + "step": 796 + }, + { + "epoch": 0.900310646709969, + "grad_norm": 0.03325015306472778, + "learning_rate": 0.00015940135756264062, + "loss": 1.0199, + "step": 797 + }, + { + "epoch": 0.901440271109856, + "grad_norm": 0.03547768294811249, + "learning_rate": 0.0001593057667259902, + "loss": 0.9988, + "step": 798 + }, + { + "epoch": 0.902569895509743, + "grad_norm": 0.03294992819428444, + "learning_rate": 0.0001592100922240603, + "loss": 0.9943, + "step": 799 + }, + { + "epoch": 0.90369951990963, + "grad_norm": 0.03369821235537529, + "learning_rate": 0.00015911433419182305, + "loss": 1.0186, + "step": 800 + }, + { + "epoch": 0.9048291443095171, + "grad_norm": 0.03317281976342201, + "learning_rate": 0.00015901849276436862, + "loss": 0.9601, + "step": 801 + }, + { + "epoch": 0.9059587687094042, + "grad_norm": 0.036661259829998016, + "learning_rate": 0.00015892256807690478, + "loss": 1.0847, + "step": 802 + }, + { + "epoch": 0.9070883931092911, + "grad_norm": 0.0334974080324173, + "learning_rate": 0.00015882656026475672, + "loss": 1.0264, + "step": 803 + }, + { + "epoch": 0.9082180175091782, + "grad_norm": 0.03364727646112442, + "learning_rate": 0.00015873046946336694, + "loss": 0.9768, + "step": 804 + }, + { + "epoch": 0.9093476419090653, + "grad_norm": 0.03534623235464096, + "learning_rate": 0.000158634295808295, + "loss": 1.0705, + "step": 805 + }, + { + "epoch": 0.9104772663089523, + "grad_norm": 0.032764844596385956, + "learning_rate": 0.00015853803943521733, + "loss": 0.9543, + "step": 806 + }, + { + "epoch": 0.9116068907088393, + "grad_norm": 0.03310185670852661, + "learning_rate": 0.00015844170047992712, + "loss": 1.0077, + "step": 807 + }, + { + "epoch": 0.9127365151087263, + "grad_norm": 0.0327795036137104, + "learning_rate": 0.00015834527907833396, + "loss": 0.9765, + "step": 808 + }, + { + "epoch": 0.9138661395086134, + "grad_norm": 0.03351445123553276, + "learning_rate": 0.00015824877536646382, + "loss": 1.0634, + "step": 809 + }, + { + "epoch": 0.9149957639085005, + "grad_norm": 0.03497536852955818, + "learning_rate": 0.00015815218948045878, + "loss": 0.9211, + "step": 810 + }, + { + "epoch": 0.9161253883083874, + "grad_norm": 0.03262564167380333, + "learning_rate": 0.00015805552155657683, + "loss": 0.9841, + "step": 811 + }, + { + "epoch": 0.9172550127082745, + "grad_norm": 0.03305838629603386, + "learning_rate": 0.00015795877173119176, + "loss": 0.9968, + "step": 812 + }, + { + "epoch": 0.9183846371081615, + "grad_norm": 0.03393985703587532, + "learning_rate": 0.00015786194014079274, + "loss": 1.0257, + "step": 813 + }, + { + "epoch": 0.9195142615080486, + "grad_norm": 0.03377285972237587, + "learning_rate": 0.00015776502692198448, + "loss": 0.979, + "step": 814 + }, + { + "epoch": 0.9206438859079357, + "grad_norm": 0.03390325978398323, + "learning_rate": 0.00015766803221148673, + "loss": 1.0935, + "step": 815 + }, + { + "epoch": 0.9217735103078226, + "grad_norm": 0.034586288034915924, + "learning_rate": 0.00015757095614613427, + "loss": 1.0286, + "step": 816 + }, + { + "epoch": 0.9229031347077097, + "grad_norm": 0.034462425857782364, + "learning_rate": 0.00015747379886287655, + "loss": 0.9826, + "step": 817 + }, + { + "epoch": 0.9240327591075967, + "grad_norm": 0.03412788733839989, + "learning_rate": 0.0001573765604987777, + "loss": 1.0391, + "step": 818 + }, + { + "epoch": 0.9251623835074838, + "grad_norm": 0.03411950543522835, + "learning_rate": 0.0001572792411910162, + "loss": 1.014, + "step": 819 + }, + { + "epoch": 0.9262920079073708, + "grad_norm": 0.03366335481405258, + "learning_rate": 0.0001571818410768848, + "loss": 1.0191, + "step": 820 + }, + { + "epoch": 0.9274216323072578, + "grad_norm": 0.033515483140945435, + "learning_rate": 0.00015708436029379004, + "loss": 1.0072, + "step": 821 + }, + { + "epoch": 0.9285512567071449, + "grad_norm": 0.033421795815229416, + "learning_rate": 0.0001569867989792525, + "loss": 1.0311, + "step": 822 + }, + { + "epoch": 0.929680881107032, + "grad_norm": 0.032961517572402954, + "learning_rate": 0.00015688915727090613, + "loss": 1.0476, + "step": 823 + }, + { + "epoch": 0.9308105055069189, + "grad_norm": 0.03382313251495361, + "learning_rate": 0.00015679143530649854, + "loss": 0.9863, + "step": 824 + }, + { + "epoch": 0.931940129906806, + "grad_norm": 0.03453601896762848, + "learning_rate": 0.0001566936332238904, + "loss": 0.981, + "step": 825 + }, + { + "epoch": 0.933069754306693, + "grad_norm": 0.03426108881831169, + "learning_rate": 0.00015659575116105544, + "loss": 1.0615, + "step": 826 + }, + { + "epoch": 0.9341993787065801, + "grad_norm": 0.03343765065073967, + "learning_rate": 0.0001564977892560803, + "loss": 1.0745, + "step": 827 + }, + { + "epoch": 0.935329003106467, + "grad_norm": 0.03495456650853157, + "learning_rate": 0.00015639974764716414, + "loss": 0.9985, + "step": 828 + }, + { + "epoch": 0.9364586275063541, + "grad_norm": 0.033679116517305374, + "learning_rate": 0.0001563016264726186, + "loss": 1.0216, + "step": 829 + }, + { + "epoch": 0.9375882519062412, + "grad_norm": 0.03362250700592995, + "learning_rate": 0.0001562034258708676, + "loss": 1.0337, + "step": 830 + }, + { + "epoch": 0.9387178763061282, + "grad_norm": 0.034377310425043106, + "learning_rate": 0.00015610514598044707, + "loss": 1.0583, + "step": 831 + }, + { + "epoch": 0.9398475007060153, + "grad_norm": 0.033647313714027405, + "learning_rate": 0.00015600678694000487, + "loss": 1.0126, + "step": 832 + }, + { + "epoch": 0.9409771251059023, + "grad_norm": 0.03457539901137352, + "learning_rate": 0.0001559083488883004, + "loss": 1.1528, + "step": 833 + }, + { + "epoch": 0.9421067495057893, + "grad_norm": 0.03426367789506912, + "learning_rate": 0.00015580983196420464, + "loss": 0.9055, + "step": 834 + }, + { + "epoch": 0.9432363739056764, + "grad_norm": 0.03347745165228844, + "learning_rate": 0.0001557112363066998, + "loss": 0.9978, + "step": 835 + }, + { + "epoch": 0.9443659983055634, + "grad_norm": 0.03355059772729874, + "learning_rate": 0.00015561256205487908, + "loss": 0.9844, + "step": 836 + }, + { + "epoch": 0.9454956227054504, + "grad_norm": 0.032837532460689545, + "learning_rate": 0.0001555138093479467, + "loss": 0.932, + "step": 837 + }, + { + "epoch": 0.9466252471053375, + "grad_norm": 0.03441225364804268, + "learning_rate": 0.0001554149783252175, + "loss": 0.9975, + "step": 838 + }, + { + "epoch": 0.9477548715052245, + "grad_norm": 0.033451907336711884, + "learning_rate": 0.00015531606912611674, + "loss": 0.9707, + "step": 839 + }, + { + "epoch": 0.9488844959051116, + "grad_norm": 0.03538847342133522, + "learning_rate": 0.00015521708189018005, + "loss": 1.0129, + "step": 840 + }, + { + "epoch": 0.9500141203049985, + "grad_norm": 0.033600080758333206, + "learning_rate": 0.00015511801675705312, + "loss": 1.0403, + "step": 841 + }, + { + "epoch": 0.9511437447048856, + "grad_norm": 0.03426308557391167, + "learning_rate": 0.00015501887386649155, + "loss": 0.9879, + "step": 842 + }, + { + "epoch": 0.9522733691047727, + "grad_norm": 0.033120229840278625, + "learning_rate": 0.00015491965335836055, + "loss": 1.0627, + "step": 843 + }, + { + "epoch": 0.9534029935046597, + "grad_norm": 0.0343567430973053, + "learning_rate": 0.00015482035537263498, + "loss": 1.0308, + "step": 844 + }, + { + "epoch": 0.9545326179045467, + "grad_norm": 0.033301327377557755, + "learning_rate": 0.00015472098004939888, + "loss": 1.0106, + "step": 845 + }, + { + "epoch": 0.9556622423044338, + "grad_norm": 0.03342900052666664, + "learning_rate": 0.00015462152752884544, + "loss": 1.0261, + "step": 846 + }, + { + "epoch": 0.9567918667043208, + "grad_norm": 0.032714009284973145, + "learning_rate": 0.00015452199795127678, + "loss": 0.8953, + "step": 847 + }, + { + "epoch": 0.9579214911042079, + "grad_norm": 0.0333135612308979, + "learning_rate": 0.00015442239145710364, + "loss": 1.0105, + "step": 848 + }, + { + "epoch": 0.9590511155040949, + "grad_norm": 0.03534407541155815, + "learning_rate": 0.00015432270818684532, + "loss": 0.9325, + "step": 849 + }, + { + "epoch": 0.9601807399039819, + "grad_norm": 0.03319082036614418, + "learning_rate": 0.00015422294828112954, + "loss": 0.9187, + "step": 850 + }, + { + "epoch": 0.961310364303869, + "grad_norm": 0.03402223438024521, + "learning_rate": 0.00015412311188069193, + "loss": 0.9523, + "step": 851 + }, + { + "epoch": 0.962439988703756, + "grad_norm": 0.038419678807258606, + "learning_rate": 0.00015402319912637613, + "loss": 1.0135, + "step": 852 + }, + { + "epoch": 0.9635696131036431, + "grad_norm": 0.03462392836809158, + "learning_rate": 0.00015392321015913357, + "loss": 1.0811, + "step": 853 + }, + { + "epoch": 0.96469923750353, + "grad_norm": 0.033567875623703, + "learning_rate": 0.0001538231451200231, + "loss": 1.0052, + "step": 854 + }, + { + "epoch": 0.9658288619034171, + "grad_norm": 0.03398734703660011, + "learning_rate": 0.00015372300415021091, + "loss": 0.9939, + "step": 855 + }, + { + "epoch": 0.9669584863033042, + "grad_norm": 0.03315124288201332, + "learning_rate": 0.00015362278739097026, + "loss": 1.0515, + "step": 856 + }, + { + "epoch": 0.9680881107031912, + "grad_norm": 0.03387816995382309, + "learning_rate": 0.0001535224949836815, + "loss": 1.0906, + "step": 857 + }, + { + "epoch": 0.9692177351030782, + "grad_norm": 0.033208638429641724, + "learning_rate": 0.00015342212706983153, + "loss": 0.9542, + "step": 858 + }, + { + "epoch": 0.9703473595029652, + "grad_norm": 0.0338163860142231, + "learning_rate": 0.00015332168379101377, + "loss": 0.9892, + "step": 859 + }, + { + "epoch": 0.9714769839028523, + "grad_norm": 0.033496033400297165, + "learning_rate": 0.00015322116528892807, + "loss": 1.0253, + "step": 860 + }, + { + "epoch": 0.9726066083027394, + "grad_norm": 0.034597545862197876, + "learning_rate": 0.00015312057170538035, + "loss": 1.0102, + "step": 861 + }, + { + "epoch": 0.9737362327026263, + "grad_norm": 0.03476065397262573, + "learning_rate": 0.00015301990318228244, + "loss": 0.938, + "step": 862 + }, + { + "epoch": 0.9748658571025134, + "grad_norm": 0.036271460354328156, + "learning_rate": 0.00015291915986165186, + "loss": 0.9072, + "step": 863 + }, + { + "epoch": 0.9759954815024005, + "grad_norm": 0.032739460468292236, + "learning_rate": 0.00015281834188561174, + "loss": 0.9955, + "step": 864 + }, + { + "epoch": 0.9771251059022875, + "grad_norm": 0.03603595495223999, + "learning_rate": 0.0001527174493963905, + "loss": 0.978, + "step": 865 + }, + { + "epoch": 0.9782547303021746, + "grad_norm": 0.03469686582684517, + "learning_rate": 0.00015261648253632156, + "loss": 1.0928, + "step": 866 + }, + { + "epoch": 0.9793843547020615, + "grad_norm": 0.03487220034003258, + "learning_rate": 0.0001525154414478434, + "loss": 1.0144, + "step": 867 + }, + { + "epoch": 0.9805139791019486, + "grad_norm": 0.03308931365609169, + "learning_rate": 0.00015241432627349918, + "loss": 0.9912, + "step": 868 + }, + { + "epoch": 0.9816436035018357, + "grad_norm": 0.0350349023938179, + "learning_rate": 0.00015231313715593662, + "loss": 1.0209, + "step": 869 + }, + { + "epoch": 0.9827732279017227, + "grad_norm": 0.034897249191999435, + "learning_rate": 0.0001522118742379076, + "loss": 0.9873, + "step": 870 + }, + { + "epoch": 0.9839028523016097, + "grad_norm": 0.03427942842245102, + "learning_rate": 0.00015211053766226828, + "loss": 0.9497, + "step": 871 + }, + { + "epoch": 0.9850324767014967, + "grad_norm": 0.0339798741042614, + "learning_rate": 0.00015200912757197868, + "loss": 0.9741, + "step": 872 + }, + { + "epoch": 0.9861621011013838, + "grad_norm": 0.03557536378502846, + "learning_rate": 0.00015190764411010247, + "loss": 0.9747, + "step": 873 + }, + { + "epoch": 0.9872917255012709, + "grad_norm": 0.036786146461963654, + "learning_rate": 0.00015180608741980692, + "loss": 1.0296, + "step": 874 + }, + { + "epoch": 0.9884213499011578, + "grad_norm": 0.03306087478995323, + "learning_rate": 0.00015170445764436252, + "loss": 1.0559, + "step": 875 + }, + { + "epoch": 0.9895509743010449, + "grad_norm": 0.03436678647994995, + "learning_rate": 0.00015160275492714296, + "loss": 0.9572, + "step": 876 + }, + { + "epoch": 0.990680598700932, + "grad_norm": 0.03426647186279297, + "learning_rate": 0.00015150097941162474, + "loss": 0.999, + "step": 877 + }, + { + "epoch": 0.991810223100819, + "grad_norm": 0.03366367891430855, + "learning_rate": 0.00015139913124138715, + "loss": 1.0365, + "step": 878 + }, + { + "epoch": 0.992939847500706, + "grad_norm": 0.034058500081300735, + "learning_rate": 0.00015129721056011185, + "loss": 0.9835, + "step": 879 + }, + { + "epoch": 0.994069471900593, + "grad_norm": 0.03479884937405586, + "learning_rate": 0.00015119521751158296, + "loss": 1.0604, + "step": 880 + }, + { + "epoch": 0.9951990963004801, + "grad_norm": 0.03426951542496681, + "learning_rate": 0.00015109315223968655, + "loss": 1.0344, + "step": 881 + }, + { + "epoch": 0.9963287207003672, + "grad_norm": 0.034726936370134354, + "learning_rate": 0.0001509910148884106, + "loss": 0.927, + "step": 882 + }, + { + "epoch": 0.9974583451002542, + "grad_norm": 0.03522869199514389, + "learning_rate": 0.00015088880560184493, + "loss": 1.035, + "step": 883 + }, + { + "epoch": 0.9985879695001412, + "grad_norm": 0.03507549315690994, + "learning_rate": 0.00015078652452418063, + "loss": 0.952, + "step": 884 + }, + { + "epoch": 0.9997175939000282, + "grad_norm": 0.03401617333292961, + "learning_rate": 0.00015068417179971014, + "loss": 1.0006, + "step": 885 + }, + { + "epoch": 0.9997175939000282, + "eval_loss": 1.0020042657852173, + "eval_runtime": 552.3244, + "eval_samples_per_second": 17.712, + "eval_steps_per_second": 8.857, + "step": 885 + }, + { + "epoch": 1.0008472182999153, + "grad_norm": 0.03515418618917465, + "learning_rate": 0.00015058174757282705, + "loss": 0.9925, + "step": 886 + }, + { + "epoch": 1.0019768426998024, + "grad_norm": 0.03516097739338875, + "learning_rate": 0.00015047925198802574, + "loss": 0.9803, + "step": 887 + }, + { + "epoch": 1.0008472182999153, + "grad_norm": 0.03710052743554115, + "learning_rate": 0.0001503766851899013, + "loss": 0.9624, + "step": 888 + }, + { + "epoch": 1.0019768426998024, + "grad_norm": 0.03626672551035881, + "learning_rate": 0.00015027404732314922, + "loss": 1.0015, + "step": 889 + }, + { + "epoch": 1.0031064670996894, + "grad_norm": 0.03808250650763512, + "learning_rate": 0.00015017133853256537, + "loss": 0.935, + "step": 890 + }, + { + "epoch": 1.0042360914995765, + "grad_norm": 0.035625848919153214, + "learning_rate": 0.00015006855896304558, + "loss": 0.9196, + "step": 891 + }, + { + "epoch": 1.0053657158994633, + "grad_norm": 0.03489432483911514, + "learning_rate": 0.00014996570875958553, + "loss": 0.9544, + "step": 892 + }, + { + "epoch": 1.0064953402993504, + "grad_norm": 0.034372955560684204, + "learning_rate": 0.0001498627880672806, + "loss": 1.011, + "step": 893 + }, + { + "epoch": 1.0076249646992375, + "grad_norm": 0.034961700439453125, + "learning_rate": 0.00014975979703132556, + "loss": 0.9769, + "step": 894 + }, + { + "epoch": 1.0087545890991245, + "grad_norm": 0.03485623002052307, + "learning_rate": 0.00014965673579701445, + "loss": 0.9856, + "step": 895 + }, + { + "epoch": 1.0098842134990116, + "grad_norm": 0.03519703075289726, + "learning_rate": 0.0001495536045097403, + "loss": 0.9178, + "step": 896 + }, + { + "epoch": 1.0110138378988986, + "grad_norm": 0.03748798742890358, + "learning_rate": 0.00014945040331499504, + "loss": 1.0493, + "step": 897 + }, + { + "epoch": 1.0121434622987857, + "grad_norm": 0.03519289940595627, + "learning_rate": 0.0001493471323583692, + "loss": 0.9539, + "step": 898 + }, + { + "epoch": 1.0132730866986728, + "grad_norm": 0.035702627152204514, + "learning_rate": 0.00014924379178555167, + "loss": 1.016, + "step": 899 + }, + { + "epoch": 1.0144027110985596, + "grad_norm": 0.03649812936782837, + "learning_rate": 0.00014914038174232956, + "loss": 1.0289, + "step": 900 + }, + { + "epoch": 1.0155323354984467, + "grad_norm": 0.03464585542678833, + "learning_rate": 0.00014903690237458802, + "loss": 1.0565, + "step": 901 + }, + { + "epoch": 1.0166619598983337, + "grad_norm": 0.035987745970487595, + "learning_rate": 0.00014893335382831007, + "loss": 1.0472, + "step": 902 + }, + { + "epoch": 1.0177915842982208, + "grad_norm": 0.035208627581596375, + "learning_rate": 0.00014882973624957615, + "loss": 1.0795, + "step": 903 + }, + { + "epoch": 1.0189212086981079, + "grad_norm": 0.035057902336120605, + "learning_rate": 0.0001487260497845642, + "loss": 1.0426, + "step": 904 + }, + { + "epoch": 1.020050833097995, + "grad_norm": 0.035428449511528015, + "learning_rate": 0.0001486222945795494, + "loss": 0.9672, + "step": 905 + }, + { + "epoch": 1.021180457497882, + "grad_norm": 0.034387003630399704, + "learning_rate": 0.0001485184707809037, + "loss": 0.9816, + "step": 906 + }, + { + "epoch": 1.022310081897769, + "grad_norm": 0.03481113165616989, + "learning_rate": 0.00014841457853509606, + "loss": 0.998, + "step": 907 + }, + { + "epoch": 1.0234397062976561, + "grad_norm": 0.03739047050476074, + "learning_rate": 0.00014831061798869182, + "loss": 1.0395, + "step": 908 + }, + { + "epoch": 1.024569330697543, + "grad_norm": 0.03472413867712021, + "learning_rate": 0.00014820658928835277, + "loss": 1.0398, + "step": 909 + }, + { + "epoch": 1.02569895509743, + "grad_norm": 0.03586706519126892, + "learning_rate": 0.00014810249258083677, + "loss": 1.0135, + "step": 910 + }, + { + "epoch": 1.026828579497317, + "grad_norm": 0.037143245339393616, + "learning_rate": 0.00014799832801299775, + "loss": 0.965, + "step": 911 + }, + { + "epoch": 1.0279582038972042, + "grad_norm": 0.03444663807749748, + "learning_rate": 0.00014789409573178521, + "loss": 1.001, + "step": 912 + }, + { + "epoch": 1.0290878282970912, + "grad_norm": 0.0362403504550457, + "learning_rate": 0.00014778979588424428, + "loss": 1.03, + "step": 913 + }, + { + "epoch": 1.0302174526969783, + "grad_norm": 0.03418262302875519, + "learning_rate": 0.0001476854286175155, + "loss": 0.9705, + "step": 914 + }, + { + "epoch": 1.0313470770968654, + "grad_norm": 0.0341293029487133, + "learning_rate": 0.00014758099407883422, + "loss": 1.0715, + "step": 915 + }, + { + "epoch": 1.0324767014967524, + "grad_norm": 0.03373926132917404, + "learning_rate": 0.00014747649241553102, + "loss": 0.9609, + "step": 916 + }, + { + "epoch": 1.0336063258966393, + "grad_norm": 0.0360686220228672, + "learning_rate": 0.00014737192377503098, + "loss": 1.0252, + "step": 917 + }, + { + "epoch": 1.0347359502965263, + "grad_norm": 0.036107342690229416, + "learning_rate": 0.00014726728830485376, + "loss": 0.8894, + "step": 918 + }, + { + "epoch": 1.0358655746964134, + "grad_norm": 0.03535093739628792, + "learning_rate": 0.00014716258615261323, + "loss": 0.9827, + "step": 919 + }, + { + "epoch": 1.0369951990963004, + "grad_norm": 0.03566374629735947, + "learning_rate": 0.0001470578174660174, + "loss": 0.9728, + "step": 920 + }, + { + "epoch": 1.0381248234961875, + "grad_norm": 0.035979773849248886, + "learning_rate": 0.0001469529823928681, + "loss": 0.9051, + "step": 921 + }, + { + "epoch": 1.0392544478960746, + "grad_norm": 0.03736504539847374, + "learning_rate": 0.0001468480810810608, + "loss": 1.0889, + "step": 922 + }, + { + "epoch": 1.0403840722959616, + "grad_norm": 0.0376417301595211, + "learning_rate": 0.0001467431136785845, + "loss": 1.0897, + "step": 923 + }, + { + "epoch": 1.0415136966958487, + "grad_norm": 0.034662578254938126, + "learning_rate": 0.00014663808033352132, + "loss": 0.9442, + "step": 924 + }, + { + "epoch": 1.0426433210957358, + "grad_norm": 0.03605816140770912, + "learning_rate": 0.00014653298119404645, + "loss": 0.9179, + "step": 925 + }, + { + "epoch": 1.0437729454956226, + "grad_norm": 0.03609084710478783, + "learning_rate": 0.000146427816408428, + "loss": 0.9713, + "step": 926 + }, + { + "epoch": 1.0449025698955097, + "grad_norm": 0.036889102309942245, + "learning_rate": 0.0001463225861250265, + "loss": 1.049, + "step": 927 + }, + { + "epoch": 1.0460321942953967, + "grad_norm": 0.035482168197631836, + "learning_rate": 0.00014621729049229507, + "loss": 1.0385, + "step": 928 + }, + { + "epoch": 1.0471618186952838, + "grad_norm": 0.035426728427410126, + "learning_rate": 0.00014611192965877892, + "loss": 0.9582, + "step": 929 + }, + { + "epoch": 1.0482914430951709, + "grad_norm": 0.037651337683200836, + "learning_rate": 0.00014600650377311522, + "loss": 0.9866, + "step": 930 + }, + { + "epoch": 1.049421067495058, + "grad_norm": 0.037454381585121155, + "learning_rate": 0.00014590101298403297, + "loss": 1.0368, + "step": 931 + }, + { + "epoch": 1.050550691894945, + "grad_norm": 0.037140969187021255, + "learning_rate": 0.0001457954574403527, + "loss": 1.0504, + "step": 932 + }, + { + "epoch": 1.051680316294832, + "grad_norm": 0.037180185317993164, + "learning_rate": 0.0001456898372909864, + "loss": 0.9409, + "step": 933 + }, + { + "epoch": 1.052809940694719, + "grad_norm": 0.03617197647690773, + "learning_rate": 0.00014558415268493694, + "loss": 1.0344, + "step": 934 + }, + { + "epoch": 1.053939565094606, + "grad_norm": 0.037174291908741, + "learning_rate": 0.00014547840377129842, + "loss": 1.0079, + "step": 935 + }, + { + "epoch": 1.055069189494493, + "grad_norm": 0.03483375906944275, + "learning_rate": 0.0001453725906992555, + "loss": 0.961, + "step": 936 + }, + { + "epoch": 1.05619881389438, + "grad_norm": 0.03393753245472908, + "learning_rate": 0.00014526671361808331, + "loss": 0.9776, + "step": 937 + }, + { + "epoch": 1.0573284382942671, + "grad_norm": 0.03698374330997467, + "learning_rate": 0.00014516077267714744, + "loss": 1.0631, + "step": 938 + }, + { + "epoch": 1.0584580626941542, + "grad_norm": 0.03589322045445442, + "learning_rate": 0.00014505476802590344, + "loss": 1.0516, + "step": 939 + }, + { + "epoch": 1.0595876870940413, + "grad_norm": 0.03666504845023155, + "learning_rate": 0.0001449486998138968, + "loss": 1.0051, + "step": 940 + }, + { + "epoch": 1.0607173114939283, + "grad_norm": 0.03527549281716347, + "learning_rate": 0.00014484256819076265, + "loss": 0.9629, + "step": 941 + }, + { + "epoch": 1.0618469358938154, + "grad_norm": 0.03527344390749931, + "learning_rate": 0.00014473637330622558, + "loss": 1.0052, + "step": 942 + }, + { + "epoch": 1.0629765602937022, + "grad_norm": 0.03708739951252937, + "learning_rate": 0.00014463011531009942, + "loss": 0.9415, + "step": 943 + }, + { + "epoch": 1.0641061846935893, + "grad_norm": 0.03615740314126015, + "learning_rate": 0.0001445237943522871, + "loss": 1.0204, + "step": 944 + }, + { + "epoch": 1.0652358090934764, + "grad_norm": 0.036477502435445786, + "learning_rate": 0.00014441741058278024, + "loss": 0.9572, + "step": 945 + }, + { + "epoch": 1.0663654334933634, + "grad_norm": 0.03599262610077858, + "learning_rate": 0.00014431096415165918, + "loss": 0.9827, + "step": 946 + }, + { + "epoch": 1.0674950578932505, + "grad_norm": 0.03469396010041237, + "learning_rate": 0.00014420445520909266, + "loss": 0.9757, + "step": 947 + }, + { + "epoch": 1.0686246822931376, + "grad_norm": 0.03587264567613602, + "learning_rate": 0.00014409788390533753, + "loss": 0.8875, + "step": 948 + }, + { + "epoch": 1.0697543066930246, + "grad_norm": 0.03640183061361313, + "learning_rate": 0.0001439912503907387, + "loss": 1.0261, + "step": 949 + }, + { + "epoch": 1.0708839310929117, + "grad_norm": 0.03576480597257614, + "learning_rate": 0.0001438845548157288, + "loss": 1.0378, + "step": 950 + }, + { + "epoch": 1.0720135554927985, + "grad_norm": 0.03689443692564964, + "learning_rate": 0.00014377779733082798, + "loss": 0.9868, + "step": 951 + }, + { + "epoch": 1.0731431798926856, + "grad_norm": 0.036124926060438156, + "learning_rate": 0.00014367097808664383, + "loss": 1.0565, + "step": 952 + }, + { + "epoch": 1.0742728042925727, + "grad_norm": 0.036695219576358795, + "learning_rate": 0.0001435640972338709, + "loss": 1.0738, + "step": 953 + }, + { + "epoch": 1.0754024286924597, + "grad_norm": 0.03690825775265694, + "learning_rate": 0.0001434571549232909, + "loss": 0.9761, + "step": 954 + }, + { + "epoch": 1.0765320530923468, + "grad_norm": 0.03635778650641441, + "learning_rate": 0.000143350151305772, + "loss": 0.947, + "step": 955 + }, + { + "epoch": 1.0776616774922338, + "grad_norm": 0.036799509078264236, + "learning_rate": 0.00014324308653226896, + "loss": 0.9861, + "step": 956 + }, + { + "epoch": 1.078791301892121, + "grad_norm": 0.03723785653710365, + "learning_rate": 0.00014313596075382284, + "loss": 1.0015, + "step": 957 + }, + { + "epoch": 1.079920926292008, + "grad_norm": 0.03606920316815376, + "learning_rate": 0.0001430287741215607, + "loss": 1.0458, + "step": 958 + }, + { + "epoch": 1.081050550691895, + "grad_norm": 0.03719672933220863, + "learning_rate": 0.00014292152678669557, + "loss": 0.9969, + "step": 959 + }, + { + "epoch": 1.0821801750917819, + "grad_norm": 0.03698967769742012, + "learning_rate": 0.0001428142189005259, + "loss": 1.0023, + "step": 960 + }, + { + "epoch": 1.083309799491669, + "grad_norm": 0.035385068506002426, + "learning_rate": 0.0001427068506144358, + "loss": 1.0768, + "step": 961 + }, + { + "epoch": 1.084439423891556, + "grad_norm": 0.03606635332107544, + "learning_rate": 0.00014259942207989443, + "loss": 1.0263, + "step": 962 + }, + { + "epoch": 1.085569048291443, + "grad_norm": 0.03499152511358261, + "learning_rate": 0.00014249193344845602, + "loss": 1.0146, + "step": 963 + }, + { + "epoch": 1.0866986726913301, + "grad_norm": 0.0371355339884758, + "learning_rate": 0.0001423843848717595, + "loss": 0.9832, + "step": 964 + }, + { + "epoch": 1.0878282970912172, + "grad_norm": 0.036203138530254364, + "learning_rate": 0.0001422767765015285, + "loss": 1.042, + "step": 965 + }, + { + "epoch": 1.0889579214911043, + "grad_norm": 0.03647124394774437, + "learning_rate": 0.00014216910848957088, + "loss": 0.9835, + "step": 966 + }, + { + "epoch": 1.0900875458909913, + "grad_norm": 0.03794392943382263, + "learning_rate": 0.0001420613809877787, + "loss": 0.974, + "step": 967 + }, + { + "epoch": 1.0912171702908782, + "grad_norm": 0.03726119548082352, + "learning_rate": 0.00014195359414812797, + "loss": 0.9381, + "step": 968 + }, + { + "epoch": 1.0923467946907652, + "grad_norm": 0.03955681249499321, + "learning_rate": 0.0001418457481226783, + "loss": 1.0461, + "step": 969 + }, + { + "epoch": 1.0934764190906523, + "grad_norm": 0.03713424876332283, + "learning_rate": 0.0001417378430635729, + "loss": 0.9963, + "step": 970 + }, + { + "epoch": 1.0946060434905394, + "grad_norm": 0.035701628774404526, + "learning_rate": 0.00014162987912303828, + "loss": 1.023, + "step": 971 + }, + { + "epoch": 1.0957356678904264, + "grad_norm": 0.037078987807035446, + "learning_rate": 0.00014152185645338388, + "loss": 0.9829, + "step": 972 + }, + { + "epoch": 1.0968652922903135, + "grad_norm": 0.037014082074165344, + "learning_rate": 0.00014141377520700214, + "loss": 1.1029, + "step": 973 + }, + { + "epoch": 1.0979949166902006, + "grad_norm": 0.03914770111441612, + "learning_rate": 0.00014130563553636807, + "loss": 0.9921, + "step": 974 + }, + { + "epoch": 1.0991245410900876, + "grad_norm": 0.03586632385849953, + "learning_rate": 0.00014119743759403907, + "loss": 0.9454, + "step": 975 + }, + { + "epoch": 1.1002541654899747, + "grad_norm": 0.03491047024726868, + "learning_rate": 0.00014108918153265485, + "loss": 0.8762, + "step": 976 + }, + { + "epoch": 1.1013837898898615, + "grad_norm": 0.036289557814598083, + "learning_rate": 0.00014098086750493694, + "loss": 1.0473, + "step": 977 + }, + { + "epoch": 1.1025134142897486, + "grad_norm": 0.036362823098897934, + "learning_rate": 0.0001408724956636889, + "loss": 0.9767, + "step": 978 + }, + { + "epoch": 1.1036430386896356, + "grad_norm": 0.03629835322499275, + "learning_rate": 0.0001407640661617955, + "loss": 0.9525, + "step": 979 + }, + { + "epoch": 1.1047726630895227, + "grad_norm": 0.03653542697429657, + "learning_rate": 0.00014065557915222322, + "loss": 0.9991, + "step": 980 + }, + { + "epoch": 1.1059022874894098, + "grad_norm": 0.03811173886060715, + "learning_rate": 0.00014054703478801948, + "loss": 0.9329, + "step": 981 + }, + { + "epoch": 1.1070319118892968, + "grad_norm": 0.037317369133234024, + "learning_rate": 0.00014043843322231257, + "loss": 1.022, + "step": 982 + }, + { + "epoch": 1.108161536289184, + "grad_norm": 0.03802327439188957, + "learning_rate": 0.00014032977460831162, + "loss": 0.9009, + "step": 983 + }, + { + "epoch": 1.109291160689071, + "grad_norm": 0.03663121536374092, + "learning_rate": 0.0001402210590993061, + "loss": 1.0042, + "step": 984 + }, + { + "epoch": 1.1104207850889578, + "grad_norm": 0.03598729893565178, + "learning_rate": 0.00014011228684866582, + "loss": 0.9048, + "step": 985 + }, + { + "epoch": 1.1115504094888449, + "grad_norm": 0.036852069199085236, + "learning_rate": 0.00014000345800984065, + "loss": 1.0355, + "step": 986 + }, + { + "epoch": 1.112680033888732, + "grad_norm": 0.03889515623450279, + "learning_rate": 0.00013989457273636026, + "loss": 1.0336, + "step": 987 + }, + { + "epoch": 1.113809658288619, + "grad_norm": 0.037266094237565994, + "learning_rate": 0.00013978563118183395, + "loss": 0.9659, + "step": 988 + }, + { + "epoch": 1.114939282688506, + "grad_norm": 0.035660672932863235, + "learning_rate": 0.00013967663349995037, + "loss": 1.0061, + "step": 989 + }, + { + "epoch": 1.1160689070883931, + "grad_norm": 0.035659607499837875, + "learning_rate": 0.00013956757984447745, + "loss": 0.9296, + "step": 990 + }, + { + "epoch": 1.1171985314882802, + "grad_norm": 0.03629400581121445, + "learning_rate": 0.00013945847036926195, + "loss": 1.0935, + "step": 991 + }, + { + "epoch": 1.1183281558881673, + "grad_norm": 0.03713002800941467, + "learning_rate": 0.00013934930522822953, + "loss": 0.9994, + "step": 992 + }, + { + "epoch": 1.1194577802880543, + "grad_norm": 0.03690033778548241, + "learning_rate": 0.0001392400845753843, + "loss": 0.9517, + "step": 993 + }, + { + "epoch": 1.1205874046879412, + "grad_norm": 0.03637135028839111, + "learning_rate": 0.0001391308085648086, + "loss": 0.9312, + "step": 994 + }, + { + "epoch": 1.1217170290878282, + "grad_norm": 0.03516946732997894, + "learning_rate": 0.00013902147735066306, + "loss": 0.8822, + "step": 995 + }, + { + "epoch": 1.1228466534877153, + "grad_norm": 0.03652889281511307, + "learning_rate": 0.00013891209108718599, + "loss": 0.9571, + "step": 996 + }, + { + "epoch": 1.1239762778876023, + "grad_norm": 0.036057133227586746, + "learning_rate": 0.00013880264992869354, + "loss": 0.9589, + "step": 997 + }, + { + "epoch": 1.1251059022874894, + "grad_norm": 0.03780772536993027, + "learning_rate": 0.00013869315402957914, + "loss": 1.0094, + "step": 998 + }, + { + "epoch": 1.1262355266873765, + "grad_norm": 0.03670530021190643, + "learning_rate": 0.00013858360354431355, + "loss": 0.9969, + "step": 999 + }, + { + "epoch": 1.1273651510872635, + "grad_norm": 0.037929147481918335, + "learning_rate": 0.0001384739986274445, + "loss": 0.9614, + "step": 1000 + }, + { + "epoch": 1.1284947754871506, + "grad_norm": 0.037020258605480194, + "learning_rate": 0.00013836433943359652, + "loss": 0.9567, + "step": 1001 + }, + { + "epoch": 1.1296243998870374, + "grad_norm": 0.036838166415691376, + "learning_rate": 0.0001382546261174707, + "loss": 0.9537, + "step": 1002 + }, + { + "epoch": 1.1307540242869245, + "grad_norm": 0.036922816187143326, + "learning_rate": 0.0001381448588338445, + "loss": 1.0113, + "step": 1003 + }, + { + "epoch": 1.1318836486868116, + "grad_norm": 0.036879848688840866, + "learning_rate": 0.00013803503773757148, + "loss": 1.0202, + "step": 1004 + }, + { + "epoch": 1.1330132730866986, + "grad_norm": 0.03841286897659302, + "learning_rate": 0.00013792516298358114, + "loss": 1.0836, + "step": 1005 + }, + { + "epoch": 1.1341428974865857, + "grad_norm": 0.03743935376405716, + "learning_rate": 0.00013781523472687873, + "loss": 1.006, + "step": 1006 + }, + { + "epoch": 1.1352725218864728, + "grad_norm": 0.038434430956840515, + "learning_rate": 0.0001377052531225449, + "loss": 0.9719, + "step": 1007 + }, + { + "epoch": 1.1364021462863598, + "grad_norm": 0.03810707479715347, + "learning_rate": 0.0001375952183257355, + "loss": 0.9137, + "step": 1008 + }, + { + "epoch": 1.137531770686247, + "grad_norm": 0.03748295083642006, + "learning_rate": 0.00013748513049168163, + "loss": 0.9542, + "step": 1009 + }, + { + "epoch": 1.138661395086134, + "grad_norm": 0.03579265996813774, + "learning_rate": 0.000137374989775689, + "loss": 0.9137, + "step": 1010 + }, + { + "epoch": 1.139791019486021, + "grad_norm": 0.037356823682785034, + "learning_rate": 0.00013726479633313808, + "loss": 1.037, + "step": 1011 + }, + { + "epoch": 1.1409206438859079, + "grad_norm": 0.03657485544681549, + "learning_rate": 0.0001371545503194836, + "loss": 0.9773, + "step": 1012 + }, + { + "epoch": 1.142050268285795, + "grad_norm": 0.03649429604411125, + "learning_rate": 0.00013704425189025452, + "loss": 0.9728, + "step": 1013 + }, + { + "epoch": 1.143179892685682, + "grad_norm": 0.036418452858924866, + "learning_rate": 0.00013693390120105375, + "loss": 0.9354, + "step": 1014 + }, + { + "epoch": 1.144309517085569, + "grad_norm": 0.036211349070072174, + "learning_rate": 0.00013682349840755785, + "loss": 0.9981, + "step": 1015 + }, + { + "epoch": 1.1454391414854561, + "grad_norm": 0.037094857543706894, + "learning_rate": 0.00013671304366551706, + "loss": 1.0344, + "step": 1016 + }, + { + "epoch": 1.1465687658853432, + "grad_norm": 0.03698652982711792, + "learning_rate": 0.00013660253713075471, + "loss": 1.0336, + "step": 1017 + }, + { + "epoch": 1.1476983902852302, + "grad_norm": 0.038060273975133896, + "learning_rate": 0.0001364919789591673, + "loss": 1.0393, + "step": 1018 + }, + { + "epoch": 1.148828014685117, + "grad_norm": 0.03657998517155647, + "learning_rate": 0.0001363813693067241, + "loss": 0.9779, + "step": 1019 + }, + { + "epoch": 1.1499576390850041, + "grad_norm": 0.037362392991781235, + "learning_rate": 0.00013627070832946718, + "loss": 1.0074, + "step": 1020 + }, + { + "epoch": 1.1510872634848912, + "grad_norm": 0.03672691062092781, + "learning_rate": 0.00013615999618351077, + "loss": 0.9936, + "step": 1021 + }, + { + "epoch": 1.1522168878847783, + "grad_norm": 0.03584654629230499, + "learning_rate": 0.00013604923302504147, + "loss": 1.0992, + "step": 1022 + }, + { + "epoch": 1.1533465122846653, + "grad_norm": 0.03621530905365944, + "learning_rate": 0.00013593841901031781, + "loss": 0.9189, + "step": 1023 + }, + { + "epoch": 1.1544761366845524, + "grad_norm": 0.03851611912250519, + "learning_rate": 0.00013582755429567, + "loss": 1.0137, + "step": 1024 + }, + { + "epoch": 1.1556057610844395, + "grad_norm": 0.03611522167921066, + "learning_rate": 0.00013571663903749984, + "loss": 0.9942, + "step": 1025 + }, + { + "epoch": 1.1567353854843265, + "grad_norm": 0.03798006474971771, + "learning_rate": 0.00013560567339228037, + "loss": 1.0745, + "step": 1026 + }, + { + "epoch": 1.1578650098842136, + "grad_norm": 0.03875018656253815, + "learning_rate": 0.00013549465751655585, + "loss": 0.9921, + "step": 1027 + }, + { + "epoch": 1.1589946342841007, + "grad_norm": 0.03791335970163345, + "learning_rate": 0.00013538359156694125, + "loss": 1.0061, + "step": 1028 + }, + { + "epoch": 1.1601242586839875, + "grad_norm": 0.03835611790418625, + "learning_rate": 0.00013527247570012217, + "loss": 0.922, + "step": 1029 + }, + { + "epoch": 1.1612538830838746, + "grad_norm": 0.03751290589570999, + "learning_rate": 0.00013516131007285483, + "loss": 0.9814, + "step": 1030 + }, + { + "epoch": 1.1623835074837616, + "grad_norm": 0.037721749395132065, + "learning_rate": 0.00013505009484196542, + "loss": 0.921, + "step": 1031 + }, + { + "epoch": 1.1635131318836487, + "grad_norm": 0.03777645528316498, + "learning_rate": 0.00013493883016435024, + "loss": 0.9686, + "step": 1032 + }, + { + "epoch": 1.1646427562835358, + "grad_norm": 0.037747886031866074, + "learning_rate": 0.00013482751619697534, + "loss": 1.0293, + "step": 1033 + }, + { + "epoch": 1.1657723806834228, + "grad_norm": 0.035930532962083817, + "learning_rate": 0.00013471615309687624, + "loss": 0.9538, + "step": 1034 + }, + { + "epoch": 1.1669020050833099, + "grad_norm": 0.036953311413526535, + "learning_rate": 0.00013460474102115785, + "loss": 0.9935, + "step": 1035 + }, + { + "epoch": 1.1680316294831967, + "grad_norm": 0.03994593396782875, + "learning_rate": 0.00013449328012699407, + "loss": 1.1051, + "step": 1036 + }, + { + "epoch": 1.1691612538830838, + "grad_norm": 0.03813205286860466, + "learning_rate": 0.0001343817705716278, + "loss": 0.9401, + "step": 1037 + }, + { + "epoch": 1.1702908782829708, + "grad_norm": 0.03704039007425308, + "learning_rate": 0.0001342702125123705, + "loss": 0.9015, + "step": 1038 + }, + { + "epoch": 1.171420502682858, + "grad_norm": 0.037067439407110214, + "learning_rate": 0.00013415860610660208, + "loss": 0.9702, + "step": 1039 + }, + { + "epoch": 1.172550127082745, + "grad_norm": 0.03761277720332146, + "learning_rate": 0.0001340469515117706, + "loss": 0.9302, + "step": 1040 + }, + { + "epoch": 1.173679751482632, + "grad_norm": 0.0372072272002697, + "learning_rate": 0.00013393524888539226, + "loss": 0.9595, + "step": 1041 + }, + { + "epoch": 1.174809375882519, + "grad_norm": 0.037263937294483185, + "learning_rate": 0.00013382349838505088, + "loss": 0.9506, + "step": 1042 + }, + { + "epoch": 1.1759390002824062, + "grad_norm": 0.03690359368920326, + "learning_rate": 0.0001337117001683978, + "loss": 0.9424, + "step": 1043 + }, + { + "epoch": 1.1770686246822932, + "grad_norm": 0.03702644258737564, + "learning_rate": 0.0001335998543931518, + "loss": 0.9763, + "step": 1044 + }, + { + "epoch": 1.1781982490821803, + "grad_norm": 0.03696833550930023, + "learning_rate": 0.00013348796121709862, + "loss": 0.9366, + "step": 1045 + }, + { + "epoch": 1.1793278734820671, + "grad_norm": 0.03843120485544205, + "learning_rate": 0.00013337602079809102, + "loss": 1.0288, + "step": 1046 + }, + { + "epoch": 1.1804574978819542, + "grad_norm": 0.03919968008995056, + "learning_rate": 0.0001332640332940482, + "loss": 1.0994, + "step": 1047 + }, + { + "epoch": 1.1815871222818413, + "grad_norm": 0.038000430911779404, + "learning_rate": 0.00013315199886295602, + "loss": 0.9932, + "step": 1048 + }, + { + "epoch": 1.1827167466817283, + "grad_norm": 0.037957947701215744, + "learning_rate": 0.0001330399176628664, + "loss": 1.0122, + "step": 1049 + }, + { + "epoch": 1.1838463710816154, + "grad_norm": 0.037756092846393585, + "learning_rate": 0.00013292778985189724, + "loss": 0.9397, + "step": 1050 + }, + { + "epoch": 1.1849759954815025, + "grad_norm": 0.03866475820541382, + "learning_rate": 0.00013281561558823224, + "loss": 0.9598, + "step": 1051 + }, + { + "epoch": 1.1861056198813895, + "grad_norm": 0.037318069487810135, + "learning_rate": 0.0001327033950301206, + "loss": 0.9921, + "step": 1052 + }, + { + "epoch": 1.1872352442812764, + "grad_norm": 0.039401739835739136, + "learning_rate": 0.00013259112833587686, + "loss": 1.0142, + "step": 1053 + }, + { + "epoch": 1.1883648686811634, + "grad_norm": 0.039969827979803085, + "learning_rate": 0.00013247881566388062, + "loss": 1.0763, + "step": 1054 + }, + { + "epoch": 1.1894944930810505, + "grad_norm": 0.038948189467191696, + "learning_rate": 0.0001323664571725764, + "loss": 0.9663, + "step": 1055 + }, + { + "epoch": 1.1906241174809375, + "grad_norm": 0.03894684091210365, + "learning_rate": 0.00013225405302047326, + "loss": 1.0565, + "step": 1056 + }, + { + "epoch": 1.1917537418808246, + "grad_norm": 0.03931012749671936, + "learning_rate": 0.00013214160336614476, + "loss": 0.9607, + "step": 1057 + }, + { + "epoch": 1.1928833662807117, + "grad_norm": 0.03936886042356491, + "learning_rate": 0.00013202910836822865, + "loss": 0.9651, + "step": 1058 + }, + { + "epoch": 1.1940129906805987, + "grad_norm": 0.03898691013455391, + "learning_rate": 0.00013191656818542654, + "loss": 0.9504, + "step": 1059 + }, + { + "epoch": 1.1951426150804858, + "grad_norm": 0.03756885230541229, + "learning_rate": 0.00013180398297650393, + "loss": 0.9724, + "step": 1060 + }, + { + "epoch": 1.1962722394803729, + "grad_norm": 0.03658022731542587, + "learning_rate": 0.00013169135290028974, + "loss": 1.0255, + "step": 1061 + }, + { + "epoch": 1.19740186388026, + "grad_norm": 0.03740183636546135, + "learning_rate": 0.00013157867811567628, + "loss": 1.0126, + "step": 1062 + }, + { + "epoch": 1.1985314882801468, + "grad_norm": 0.03823421150445938, + "learning_rate": 0.0001314659587816188, + "loss": 0.8985, + "step": 1063 + }, + { + "epoch": 1.1996611126800338, + "grad_norm": 0.039322640746831894, + "learning_rate": 0.0001313531950571355, + "loss": 1.0385, + "step": 1064 + }, + { + "epoch": 1.200790737079921, + "grad_norm": 0.03806902840733528, + "learning_rate": 0.00013124038710130722, + "loss": 1.0221, + "step": 1065 + }, + { + "epoch": 1.201920361479808, + "grad_norm": 0.037562061101198196, + "learning_rate": 0.00013112753507327706, + "loss": 0.9412, + "step": 1066 + }, + { + "epoch": 1.203049985879695, + "grad_norm": 0.03723952919244766, + "learning_rate": 0.00013101463913225048, + "loss": 1.0065, + "step": 1067 + }, + { + "epoch": 1.204179610279582, + "grad_norm": 0.03826143220067024, + "learning_rate": 0.00013090169943749476, + "loss": 0.9315, + "step": 1068 + }, + { + "epoch": 1.2053092346794692, + "grad_norm": 0.03751199692487717, + "learning_rate": 0.00013078871614833894, + "loss": 0.9418, + "step": 1069 + }, + { + "epoch": 1.206438859079356, + "grad_norm": 0.03741021826863289, + "learning_rate": 0.00013067568942417356, + "loss": 1.0147, + "step": 1070 + }, + { + "epoch": 1.207568483479243, + "grad_norm": 0.0373196117579937, + "learning_rate": 0.00013056261942445045, + "loss": 0.9944, + "step": 1071 + }, + { + "epoch": 1.2086981078791301, + "grad_norm": 0.038884326815605164, + "learning_rate": 0.0001304495063086825, + "loss": 1.064, + "step": 1072 + }, + { + "epoch": 1.2098277322790172, + "grad_norm": 0.037447500973939896, + "learning_rate": 0.00013033635023644338, + "loss": 1.0027, + "step": 1073 + }, + { + "epoch": 1.2109573566789043, + "grad_norm": 0.0381641611456871, + "learning_rate": 0.00013022315136736743, + "loss": 1.0136, + "step": 1074 + }, + { + "epoch": 1.2120869810787913, + "grad_norm": 0.03642114996910095, + "learning_rate": 0.00013010990986114926, + "loss": 0.9936, + "step": 1075 + }, + { + "epoch": 1.2132166054786784, + "grad_norm": 0.03828851506114006, + "learning_rate": 0.0001299966258775437, + "loss": 0.9752, + "step": 1076 + }, + { + "epoch": 1.2143462298785654, + "grad_norm": 0.03775739669799805, + "learning_rate": 0.00012988329957636558, + "loss": 1.0207, + "step": 1077 + }, + { + "epoch": 1.2154758542784525, + "grad_norm": 0.036843057721853256, + "learning_rate": 0.00012976993111748922, + "loss": 0.995, + "step": 1078 + }, + { + "epoch": 1.2166054786783396, + "grad_norm": 0.03826352581381798, + "learning_rate": 0.00012965652066084866, + "loss": 0.9746, + "step": 1079 + }, + { + "epoch": 1.2177351030782264, + "grad_norm": 0.03939265385270119, + "learning_rate": 0.00012954306836643703, + "loss": 0.9897, + "step": 1080 + }, + { + "epoch": 1.2188647274781135, + "grad_norm": 0.038330599665641785, + "learning_rate": 0.00012942957439430648, + "loss": 1.0044, + "step": 1081 + }, + { + "epoch": 1.2199943518780005, + "grad_norm": 0.03781147673726082, + "learning_rate": 0.00012931603890456805, + "loss": 1.0505, + "step": 1082 + }, + { + "epoch": 1.2211239762778876, + "grad_norm": 0.03792247548699379, + "learning_rate": 0.00012920246205739124, + "loss": 1.0327, + "step": 1083 + }, + { + "epoch": 1.2222536006777747, + "grad_norm": 0.03832760825753212, + "learning_rate": 0.00012908884401300404, + "loss": 1.0185, + "step": 1084 + }, + { + "epoch": 1.2233832250776617, + "grad_norm": 0.038666147738695145, + "learning_rate": 0.0001289751849316924, + "loss": 0.9639, + "step": 1085 + }, + { + "epoch": 1.2245128494775488, + "grad_norm": 0.037469346076250076, + "learning_rate": 0.00012886148497380024, + "loss": 0.9962, + "step": 1086 + }, + { + "epoch": 1.2256424738774356, + "grad_norm": 0.0382404662668705, + "learning_rate": 0.00012874774429972918, + "loss": 1.0073, + "step": 1087 + }, + { + "epoch": 1.2267720982773227, + "grad_norm": 0.037142958492040634, + "learning_rate": 0.0001286339630699382, + "loss": 0.9954, + "step": 1088 + }, + { + "epoch": 1.2279017226772098, + "grad_norm": 0.03799648955464363, + "learning_rate": 0.0001285201414449436, + "loss": 0.9685, + "step": 1089 + }, + { + "epoch": 1.2290313470770968, + "grad_norm": 0.03858242928981781, + "learning_rate": 0.0001284062795853185, + "loss": 0.9869, + "step": 1090 + }, + { + "epoch": 1.2301609714769839, + "grad_norm": 0.03654414415359497, + "learning_rate": 0.000128292377651693, + "loss": 1.0773, + "step": 1091 + }, + { + "epoch": 1.231290595876871, + "grad_norm": 0.037349898368120193, + "learning_rate": 0.00012817843580475354, + "loss": 1.0794, + "step": 1092 + }, + { + "epoch": 1.232420220276758, + "grad_norm": 0.03809039667248726, + "learning_rate": 0.00012806445420524297, + "loss": 1.0409, + "step": 1093 + }, + { + "epoch": 1.233549844676645, + "grad_norm": 0.03639010712504387, + "learning_rate": 0.00012795043301396016, + "loss": 1.0052, + "step": 1094 + }, + { + "epoch": 1.2346794690765321, + "grad_norm": 0.03701767325401306, + "learning_rate": 0.00012783637239175994, + "loss": 0.9447, + "step": 1095 + }, + { + "epoch": 1.2358090934764192, + "grad_norm": 0.03904819115996361, + "learning_rate": 0.00012772227249955263, + "loss": 0.9801, + "step": 1096 + }, + { + "epoch": 1.236938717876306, + "grad_norm": 0.037295203655958176, + "learning_rate": 0.00012760813349830405, + "loss": 0.9683, + "step": 1097 + }, + { + "epoch": 1.2380683422761931, + "grad_norm": 0.037878576666116714, + "learning_rate": 0.00012749395554903514, + "loss": 1.0504, + "step": 1098 + }, + { + "epoch": 1.2391979666760802, + "grad_norm": 0.03729383647441864, + "learning_rate": 0.00012737973881282185, + "loss": 0.9637, + "step": 1099 + }, + { + "epoch": 1.2403275910759672, + "grad_norm": 0.038625191897153854, + "learning_rate": 0.00012726548345079475, + "loss": 1.0177, + "step": 1100 + }, + { + "epoch": 1.2414572154758543, + "grad_norm": 0.03700273111462593, + "learning_rate": 0.00012715118962413894, + "loss": 1.064, + "step": 1101 + }, + { + "epoch": 1.2425868398757414, + "grad_norm": 0.03757200762629509, + "learning_rate": 0.00012703685749409387, + "loss": 0.9652, + "step": 1102 + }, + { + "epoch": 1.2437164642756284, + "grad_norm": 0.037905555218458176, + "learning_rate": 0.00012692248722195293, + "loss": 1.0696, + "step": 1103 + }, + { + "epoch": 1.2448460886755153, + "grad_norm": 0.03769846633076668, + "learning_rate": 0.00012680807896906329, + "loss": 1.0299, + "step": 1104 + }, + { + "epoch": 1.2459757130754023, + "grad_norm": 0.0391731932759285, + "learning_rate": 0.00012669363289682584, + "loss": 0.9584, + "step": 1105 + }, + { + "epoch": 1.2471053374752894, + "grad_norm": 0.03786834701895714, + "learning_rate": 0.00012657914916669466, + "loss": 0.9974, + "step": 1106 + }, + { + "epoch": 1.2482349618751765, + "grad_norm": 0.037806760519742966, + "learning_rate": 0.00012646462794017708, + "loss": 0.9201, + "step": 1107 + }, + { + "epoch": 1.2493645862750635, + "grad_norm": 0.04048847034573555, + "learning_rate": 0.00012635006937883328, + "loss": 0.9848, + "step": 1108 + }, + { + "epoch": 1.2504942106749506, + "grad_norm": 0.03775718808174133, + "learning_rate": 0.00012623547364427605, + "loss": 0.9275, + "step": 1109 + }, + { + "epoch": 1.2516238350748377, + "grad_norm": 0.03765375167131424, + "learning_rate": 0.0001261208408981708, + "loss": 0.9887, + "step": 1110 + }, + { + "epoch": 1.2527534594747247, + "grad_norm": 0.03869464248418808, + "learning_rate": 0.00012600617130223494, + "loss": 0.9137, + "step": 1111 + }, + { + "epoch": 1.2538830838746118, + "grad_norm": 0.038646209985017776, + "learning_rate": 0.000125891465018238, + "loss": 0.9535, + "step": 1112 + }, + { + "epoch": 1.2550127082744988, + "grad_norm": 0.038947030901908875, + "learning_rate": 0.00012577672220800122, + "loss": 0.988, + "step": 1113 + }, + { + "epoch": 1.2561423326743857, + "grad_norm": 0.039047569036483765, + "learning_rate": 0.00012566194303339739, + "loss": 1.0917, + "step": 1114 + }, + { + "epoch": 1.2572719570742728, + "grad_norm": 0.03860335052013397, + "learning_rate": 0.00012554712765635057, + "loss": 1.0186, + "step": 1115 + }, + { + "epoch": 1.2584015814741598, + "grad_norm": 0.03737378120422363, + "learning_rate": 0.0001254322762388359, + "loss": 0.9183, + "step": 1116 + }, + { + "epoch": 1.2595312058740469, + "grad_norm": 0.038941189646720886, + "learning_rate": 0.00012531738894287946, + "loss": 1.0043, + "step": 1117 + }, + { + "epoch": 1.260660830273934, + "grad_norm": 0.038606077432632446, + "learning_rate": 0.00012520246593055777, + "loss": 0.9232, + "step": 1118 + }, + { + "epoch": 1.261790454673821, + "grad_norm": 0.03935806453227997, + "learning_rate": 0.0001250875073639979, + "loss": 1.0401, + "step": 1119 + }, + { + "epoch": 1.262920079073708, + "grad_norm": 0.03954112157225609, + "learning_rate": 0.0001249725134053769, + "loss": 1.0234, + "step": 1120 + }, + { + "epoch": 1.264049703473595, + "grad_norm": 0.03769481182098389, + "learning_rate": 0.00012485748421692196, + "loss": 1.1324, + "step": 1121 + }, + { + "epoch": 1.265179327873482, + "grad_norm": 0.037308190017938614, + "learning_rate": 0.00012474241996090986, + "loss": 0.983, + "step": 1122 + }, + { + "epoch": 1.266308952273369, + "grad_norm": 0.03750931844115257, + "learning_rate": 0.00012462732079966676, + "loss": 0.9654, + "step": 1123 + }, + { + "epoch": 1.267438576673256, + "grad_norm": 0.03773000091314316, + "learning_rate": 0.0001245121868955683, + "loss": 0.9537, + "step": 1124 + }, + { + "epoch": 1.2685682010731432, + "grad_norm": 0.03808850795030594, + "learning_rate": 0.00012439701841103888, + "loss": 1.0058, + "step": 1125 + }, + { + "epoch": 1.2696978254730302, + "grad_norm": 0.03719565272331238, + "learning_rate": 0.00012428181550855184, + "loss": 1.047, + "step": 1126 + }, + { + "epoch": 1.2708274498729173, + "grad_norm": 0.03920544311404228, + "learning_rate": 0.00012416657835062907, + "loss": 0.9937, + "step": 1127 + }, + { + "epoch": 1.2719570742728044, + "grad_norm": 0.0384603887796402, + "learning_rate": 0.00012405130709984064, + "loss": 1.0042, + "step": 1128 + }, + { + "epoch": 1.2730866986726914, + "grad_norm": 0.039956919848918915, + "learning_rate": 0.00012393600191880493, + "loss": 1.0264, + "step": 1129 + }, + { + "epoch": 1.2742163230725785, + "grad_norm": 0.03926371783018112, + "learning_rate": 0.00012382066297018804, + "loss": 1.0665, + "step": 1130 + }, + { + "epoch": 1.2753459474724653, + "grad_norm": 0.03931189700961113, + "learning_rate": 0.00012370529041670376, + "loss": 1.004, + "step": 1131 + }, + { + "epoch": 1.2764755718723524, + "grad_norm": 0.038920726627111435, + "learning_rate": 0.00012358988442111326, + "loss": 0.9131, + "step": 1132 + }, + { + "epoch": 1.2776051962722395, + "grad_norm": 0.039931900799274445, + "learning_rate": 0.0001234744451462249, + "loss": 0.9556, + "step": 1133 + }, + { + "epoch": 1.2787348206721265, + "grad_norm": 0.03920763358473778, + "learning_rate": 0.000123358972754894, + "loss": 1.0534, + "step": 1134 + }, + { + "epoch": 1.2798644450720136, + "grad_norm": 0.03894329071044922, + "learning_rate": 0.0001232434674100226, + "loss": 0.9777, + "step": 1135 + }, + { + "epoch": 1.2809940694719006, + "grad_norm": 0.039821792393922806, + "learning_rate": 0.00012312792927455924, + "loss": 1.0461, + "step": 1136 + }, + { + "epoch": 1.2821236938717877, + "grad_norm": 0.041238732635974884, + "learning_rate": 0.00012301235851149865, + "loss": 0.9593, + "step": 1137 + }, + { + "epoch": 1.2832533182716745, + "grad_norm": 0.039833102375268936, + "learning_rate": 0.00012289675528388169, + "loss": 1.0378, + "step": 1138 + }, + { + "epoch": 1.2843829426715616, + "grad_norm": 0.03923288360238075, + "learning_rate": 0.00012278111975479492, + "loss": 0.9528, + "step": 1139 + }, + { + "epoch": 1.2855125670714487, + "grad_norm": 0.03867553547024727, + "learning_rate": 0.00012266545208737054, + "loss": 1.0494, + "step": 1140 + }, + { + "epoch": 1.2866421914713357, + "grad_norm": 0.03829865902662277, + "learning_rate": 0.0001225497524447861, + "loss": 0.889, + "step": 1141 + }, + { + "epoch": 1.2877718158712228, + "grad_norm": 0.039233699440956116, + "learning_rate": 0.00012243402099026417, + "loss": 0.8935, + "step": 1142 + }, + { + "epoch": 1.2889014402711099, + "grad_norm": 0.03830961883068085, + "learning_rate": 0.0001223182578870723, + "loss": 0.9697, + "step": 1143 + }, + { + "epoch": 1.290031064670997, + "grad_norm": 0.03787538781762123, + "learning_rate": 0.0001222024632985226, + "loss": 0.9645, + "step": 1144 + }, + { + "epoch": 1.291160689070884, + "grad_norm": 0.03876281529664993, + "learning_rate": 0.00012208663738797165, + "loss": 0.9705, + "step": 1145 + }, + { + "epoch": 1.292290313470771, + "grad_norm": 0.03762456402182579, + "learning_rate": 0.00012197078031882026, + "loss": 1.012, + "step": 1146 + }, + { + "epoch": 1.2934199378706581, + "grad_norm": 0.037920914590358734, + "learning_rate": 0.00012185489225451305, + "loss": 0.8949, + "step": 1147 + }, + { + "epoch": 1.294549562270545, + "grad_norm": 0.03920237347483635, + "learning_rate": 0.00012173897335853856, + "loss": 0.959, + "step": 1148 + }, + { + "epoch": 1.295679186670432, + "grad_norm": 0.03853004053235054, + "learning_rate": 0.00012162302379442865, + "loss": 1.011, + "step": 1149 + }, + { + "epoch": 1.296808811070319, + "grad_norm": 0.039866186678409576, + "learning_rate": 0.00012150704372575854, + "loss": 1.0368, + "step": 1150 + }, + { + "epoch": 1.2979384354702062, + "grad_norm": 0.040773652493953705, + "learning_rate": 0.00012139103331614649, + "loss": 1.0744, + "step": 1151 + }, + { + "epoch": 1.2990680598700932, + "grad_norm": 0.038420483469963074, + "learning_rate": 0.00012127499272925352, + "loss": 0.993, + "step": 1152 + }, + { + "epoch": 1.3001976842699803, + "grad_norm": 0.038056086748838425, + "learning_rate": 0.00012115892212878324, + "loss": 0.9368, + "step": 1153 + }, + { + "epoch": 1.3013273086698673, + "grad_norm": 0.04063399136066437, + "learning_rate": 0.00012104282167848157, + "loss": 0.8962, + "step": 1154 + }, + { + "epoch": 1.3024569330697542, + "grad_norm": 0.038728486746549606, + "learning_rate": 0.00012092669154213665, + "loss": 0.9737, + "step": 1155 + }, + { + "epoch": 1.3035865574696412, + "grad_norm": 0.03924306482076645, + "learning_rate": 0.00012081053188357838, + "loss": 1.0224, + "step": 1156 + }, + { + "epoch": 1.3047161818695283, + "grad_norm": 0.03962641581892967, + "learning_rate": 0.00012069434286667833, + "loss": 0.9779, + "step": 1157 + }, + { + "epoch": 1.3058458062694154, + "grad_norm": 0.03946220874786377, + "learning_rate": 0.00012057812465534953, + "loss": 1.0043, + "step": 1158 + }, + { + "epoch": 1.3069754306693024, + "grad_norm": 0.038344915956258774, + "learning_rate": 0.00012046187741354618, + "loss": 1.0668, + "step": 1159 + }, + { + "epoch": 1.3081050550691895, + "grad_norm": 0.03802265226840973, + "learning_rate": 0.0001203456013052634, + "loss": 0.9928, + "step": 1160 + }, + { + "epoch": 1.3092346794690766, + "grad_norm": 0.03755849227309227, + "learning_rate": 0.00012022929649453707, + "loss": 0.9103, + "step": 1161 + }, + { + "epoch": 1.3103643038689636, + "grad_norm": 0.03990541025996208, + "learning_rate": 0.00012011296314544359, + "loss": 0.9837, + "step": 1162 + }, + { + "epoch": 1.3114939282688507, + "grad_norm": 0.03864932060241699, + "learning_rate": 0.00011999660142209953, + "loss": 0.99, + "step": 1163 + }, + { + "epoch": 1.3126235526687378, + "grad_norm": 0.03857451304793358, + "learning_rate": 0.00011988021148866158, + "loss": 1.0028, + "step": 1164 + }, + { + "epoch": 1.3137531770686246, + "grad_norm": 0.03803737089037895, + "learning_rate": 0.00011976379350932618, + "loss": 0.9621, + "step": 1165 + }, + { + "epoch": 1.3148828014685117, + "grad_norm": 0.037970781326293945, + "learning_rate": 0.00011964734764832933, + "loss": 0.942, + "step": 1166 + }, + { + "epoch": 1.3160124258683987, + "grad_norm": 0.03878822922706604, + "learning_rate": 0.0001195308740699464, + "loss": 0.9238, + "step": 1167 + }, + { + "epoch": 1.3171420502682858, + "grad_norm": 0.03999705985188484, + "learning_rate": 0.00011941437293849181, + "loss": 1.0493, + "step": 1168 + }, + { + "epoch": 1.3182716746681729, + "grad_norm": 0.04126083478331566, + "learning_rate": 0.00011929784441831893, + "loss": 0.9518, + "step": 1169 + }, + { + "epoch": 1.31940129906806, + "grad_norm": 0.03893827274441719, + "learning_rate": 0.00011918128867381965, + "loss": 0.8951, + "step": 1170 + }, + { + "epoch": 1.320530923467947, + "grad_norm": 0.039558008313179016, + "learning_rate": 0.00011906470586942445, + "loss": 0.9446, + "step": 1171 + }, + { + "epoch": 1.3216605478678338, + "grad_norm": 0.03813496232032776, + "learning_rate": 0.00011894809616960174, + "loss": 0.9837, + "step": 1172 + }, + { + "epoch": 1.3227901722677209, + "grad_norm": 0.03912735357880592, + "learning_rate": 0.00011883145973885805, + "loss": 1.0211, + "step": 1173 + }, + { + "epoch": 1.323919796667608, + "grad_norm": 0.0382121279835701, + "learning_rate": 0.00011871479674173762, + "loss": 1.0054, + "step": 1174 + }, + { + "epoch": 1.325049421067495, + "grad_norm": 0.03812730684876442, + "learning_rate": 0.00011859810734282208, + "loss": 0.9412, + "step": 1175 + }, + { + "epoch": 1.326179045467382, + "grad_norm": 0.04326564446091652, + "learning_rate": 0.00011848139170673036, + "loss": 1.0046, + "step": 1176 + }, + { + "epoch": 1.3273086698672691, + "grad_norm": 0.038262803107500076, + "learning_rate": 0.00011836464999811838, + "loss": 0.936, + "step": 1177 + }, + { + "epoch": 1.3284382942671562, + "grad_norm": 0.03915071487426758, + "learning_rate": 0.0001182478823816789, + "loss": 0.9397, + "step": 1178 + }, + { + "epoch": 1.3295679186670433, + "grad_norm": 0.0392167903482914, + "learning_rate": 0.00011813108902214111, + "loss": 1.0413, + "step": 1179 + }, + { + "epoch": 1.3306975430669303, + "grad_norm": 0.04144560545682907, + "learning_rate": 0.00011801427008427063, + "loss": 1.0385, + "step": 1180 + }, + { + "epoch": 1.3306975430669303, + "eval_loss": 0.995391845703125, + "eval_runtime": 559.0238, + "eval_samples_per_second": 17.5, + "eval_steps_per_second": 8.751, + "step": 1180 + }, + { + "epoch": 1.3318271674668174, + "grad_norm": 0.039459336549043655, + "learning_rate": 0.00011789742573286915, + "loss": 0.9662, + "step": 1181 + }, + { + "epoch": 1.3329567918667042, + "grad_norm": 0.03919503092765808, + "learning_rate": 0.00011778055613277415, + "loss": 0.9673, + "step": 1182 + }, + { + "epoch": 1.3340864162665913, + "grad_norm": 0.03915826976299286, + "learning_rate": 0.00011766366144885877, + "loss": 0.9992, + "step": 1183 + }, + { + "epoch": 1.3352160406664784, + "grad_norm": 0.038948994129896164, + "learning_rate": 0.00011754674184603149, + "loss": 0.959, + "step": 1184 + }, + { + "epoch": 1.3363456650663654, + "grad_norm": 0.03835888206958771, + "learning_rate": 0.00011742979748923611, + "loss": 1.0011, + "step": 1185 + }, + { + "epoch": 1.3374752894662525, + "grad_norm": 0.03993493691086769, + "learning_rate": 0.00011731282854345114, + "loss": 1.0694, + "step": 1186 + }, + { + "epoch": 1.3386049138661396, + "grad_norm": 0.03888334706425667, + "learning_rate": 0.00011719583517368985, + "loss": 1.0382, + "step": 1187 + }, + { + "epoch": 1.3397345382660266, + "grad_norm": 0.03841853514313698, + "learning_rate": 0.00011707881754500005, + "loss": 0.9161, + "step": 1188 + }, + { + "epoch": 1.3408641626659135, + "grad_norm": 0.038549747318029404, + "learning_rate": 0.00011696177582246367, + "loss": 0.9674, + "step": 1189 + }, + { + "epoch": 1.3419937870658005, + "grad_norm": 0.04010612517595291, + "learning_rate": 0.00011684471017119667, + "loss": 1.0048, + "step": 1190 + }, + { + "epoch": 1.3431234114656876, + "grad_norm": 0.03912900388240814, + "learning_rate": 0.00011672762075634873, + "loss": 0.9314, + "step": 1191 + }, + { + "epoch": 1.3442530358655747, + "grad_norm": 0.041035935282707214, + "learning_rate": 0.00011661050774310319, + "loss": 0.9267, + "step": 1192 + }, + { + "epoch": 1.3453826602654617, + "grad_norm": 0.03944677114486694, + "learning_rate": 0.00011649337129667648, + "loss": 0.9811, + "step": 1193 + }, + { + "epoch": 1.3465122846653488, + "grad_norm": 0.040055934339761734, + "learning_rate": 0.00011637621158231822, + "loss": 0.9618, + "step": 1194 + }, + { + "epoch": 1.3476419090652358, + "grad_norm": 0.038965675979852676, + "learning_rate": 0.00011625902876531082, + "loss": 0.9501, + "step": 1195 + }, + { + "epoch": 1.348771533465123, + "grad_norm": 0.038453359156847, + "learning_rate": 0.00011614182301096927, + "loss": 0.9526, + "step": 1196 + }, + { + "epoch": 1.34990115786501, + "grad_norm": 0.038837965577840805, + "learning_rate": 0.00011602459448464102, + "loss": 0.9861, + "step": 1197 + }, + { + "epoch": 1.351030782264897, + "grad_norm": 0.03882889077067375, + "learning_rate": 0.00011590734335170545, + "loss": 0.9874, + "step": 1198 + }, + { + "epoch": 1.352160406664784, + "grad_norm": 0.0396343395113945, + "learning_rate": 0.000115790069777574, + "loss": 0.9412, + "step": 1199 + }, + { + "epoch": 1.353290031064671, + "grad_norm": 0.03892532363533974, + "learning_rate": 0.00011567277392768972, + "loss": 0.8851, + "step": 1200 + }, + { + "epoch": 1.354419655464558, + "grad_norm": 0.0408422015607357, + "learning_rate": 0.00011555545596752701, + "loss": 0.9849, + "step": 1201 + }, + { + "epoch": 1.355549279864445, + "grad_norm": 0.040196966379880905, + "learning_rate": 0.0001154381160625916, + "loss": 0.9962, + "step": 1202 + }, + { + "epoch": 1.3566789042643321, + "grad_norm": 0.04056261479854584, + "learning_rate": 0.00011532075437842004, + "loss": 1.0322, + "step": 1203 + }, + { + "epoch": 1.3578085286642192, + "grad_norm": 0.03838193044066429, + "learning_rate": 0.00011520337108057974, + "loss": 1.061, + "step": 1204 + }, + { + "epoch": 1.3589381530641063, + "grad_norm": 0.03917081281542778, + "learning_rate": 0.00011508596633466852, + "loss": 1.0052, + "step": 1205 + }, + { + "epoch": 1.360067777463993, + "grad_norm": 0.040417592972517014, + "learning_rate": 0.00011496854030631443, + "loss": 0.9492, + "step": 1206 + }, + { + "epoch": 1.3611974018638802, + "grad_norm": 0.03877858445048332, + "learning_rate": 0.00011485109316117562, + "loss": 0.9382, + "step": 1207 + }, + { + "epoch": 1.3623270262637672, + "grad_norm": 0.03815083205699921, + "learning_rate": 0.00011473362506493997, + "loss": 0.9688, + "step": 1208 + }, + { + "epoch": 1.3634566506636543, + "grad_norm": 0.038873858749866486, + "learning_rate": 0.00011461613618332497, + "loss": 0.8748, + "step": 1209 + }, + { + "epoch": 1.3645862750635414, + "grad_norm": 0.03795653581619263, + "learning_rate": 0.00011449862668207734, + "loss": 0.9784, + "step": 1210 + }, + { + "epoch": 1.3657158994634284, + "grad_norm": 0.04054610803723335, + "learning_rate": 0.00011438109672697303, + "loss": 0.9856, + "step": 1211 + }, + { + "epoch": 1.3668455238633155, + "grad_norm": 0.039193637669086456, + "learning_rate": 0.00011426354648381674, + "loss": 0.9638, + "step": 1212 + }, + { + "epoch": 1.3679751482632025, + "grad_norm": 0.038261231034994125, + "learning_rate": 0.00011414597611844183, + "loss": 1.0003, + "step": 1213 + }, + { + "epoch": 1.3691047726630896, + "grad_norm": 0.039814114570617676, + "learning_rate": 0.00011402838579671001, + "loss": 0.9288, + "step": 1214 + }, + { + "epoch": 1.3702343970629767, + "grad_norm": 0.04093167930841446, + "learning_rate": 0.00011391077568451116, + "loss": 1.0528, + "step": 1215 + }, + { + "epoch": 1.3713640214628637, + "grad_norm": 0.040623877197504044, + "learning_rate": 0.00011379314594776315, + "loss": 0.9221, + "step": 1216 + }, + { + "epoch": 1.3724936458627506, + "grad_norm": 0.039268527179956436, + "learning_rate": 0.00011367549675241142, + "loss": 0.9739, + "step": 1217 + }, + { + "epoch": 1.3736232702626376, + "grad_norm": 0.038862016052007675, + "learning_rate": 0.00011355782826442895, + "loss": 0.9871, + "step": 1218 + }, + { + "epoch": 1.3747528946625247, + "grad_norm": 0.03837895020842552, + "learning_rate": 0.0001134401406498159, + "loss": 0.9517, + "step": 1219 + }, + { + "epoch": 1.3758825190624118, + "grad_norm": 0.0400700680911541, + "learning_rate": 0.0001133224340745994, + "loss": 1.0339, + "step": 1220 + }, + { + "epoch": 1.3770121434622988, + "grad_norm": 0.040071625262498856, + "learning_rate": 0.00011320470870483335, + "loss": 0.9464, + "step": 1221 + }, + { + "epoch": 1.378141767862186, + "grad_norm": 0.039051372557878494, + "learning_rate": 0.00011308696470659814, + "loss": 0.9558, + "step": 1222 + }, + { + "epoch": 1.3792713922620727, + "grad_norm": 0.04017410799860954, + "learning_rate": 0.0001129692022460005, + "loss": 0.9779, + "step": 1223 + }, + { + "epoch": 1.3804010166619598, + "grad_norm": 0.039144422858953476, + "learning_rate": 0.00011285142148917312, + "loss": 1.0111, + "step": 1224 + }, + { + "epoch": 1.3815306410618469, + "grad_norm": 0.04037921130657196, + "learning_rate": 0.00011273362260227458, + "loss": 0.9653, + "step": 1225 + }, + { + "epoch": 1.382660265461734, + "grad_norm": 0.03725403547286987, + "learning_rate": 0.000112615805751489, + "loss": 1.0813, + "step": 1226 + }, + { + "epoch": 1.383789889861621, + "grad_norm": 0.03872475028038025, + "learning_rate": 0.00011249797110302582, + "loss": 0.9006, + "step": 1227 + }, + { + "epoch": 1.384919514261508, + "grad_norm": 0.03926214575767517, + "learning_rate": 0.00011238011882311963, + "loss": 1.0214, + "step": 1228 + }, + { + "epoch": 1.3860491386613951, + "grad_norm": 0.03871994838118553, + "learning_rate": 0.00011226224907802985, + "loss": 1.0218, + "step": 1229 + }, + { + "epoch": 1.3871787630612822, + "grad_norm": 0.03772478923201561, + "learning_rate": 0.00011214436203404062, + "loss": 0.918, + "step": 1230 + }, + { + "epoch": 1.3883083874611692, + "grad_norm": 0.038930702954530716, + "learning_rate": 0.0001120264578574604, + "loss": 0.9678, + "step": 1231 + }, + { + "epoch": 1.3894380118610563, + "grad_norm": 0.03879974037408829, + "learning_rate": 0.00011190853671462183, + "loss": 0.9241, + "step": 1232 + }, + { + "epoch": 1.3905676362609434, + "grad_norm": 0.04013784974813461, + "learning_rate": 0.00011179059877188154, + "loss": 1.0302, + "step": 1233 + }, + { + "epoch": 1.3916972606608302, + "grad_norm": 0.039637017995119095, + "learning_rate": 0.00011167264419561978, + "loss": 1.0615, + "step": 1234 + }, + { + "epoch": 1.3928268850607173, + "grad_norm": 0.03899321332573891, + "learning_rate": 0.00011155467315224038, + "loss": 0.963, + "step": 1235 + }, + { + "epoch": 1.3939565094606043, + "grad_norm": 0.03954652324318886, + "learning_rate": 0.00011143668580817026, + "loss": 1.081, + "step": 1236 + }, + { + "epoch": 1.3950861338604914, + "grad_norm": 0.03980173170566559, + "learning_rate": 0.00011131868232985945, + "loss": 1.0521, + "step": 1237 + }, + { + "epoch": 1.3962157582603785, + "grad_norm": 0.04101087152957916, + "learning_rate": 0.00011120066288378066, + "loss": 1.0194, + "step": 1238 + }, + { + "epoch": 1.3973453826602655, + "grad_norm": 0.03934963047504425, + "learning_rate": 0.00011108262763642923, + "loss": 1.0247, + "step": 1239 + }, + { + "epoch": 1.3984750070601524, + "grad_norm": 0.04077817499637604, + "learning_rate": 0.00011096457675432266, + "loss": 1.0232, + "step": 1240 + }, + { + "epoch": 1.3996046314600394, + "grad_norm": 0.040299609303474426, + "learning_rate": 0.00011084651040400057, + "loss": 1.0593, + "step": 1241 + }, + { + "epoch": 1.4007342558599265, + "grad_norm": 0.03928808122873306, + "learning_rate": 0.00011072842875202451, + "loss": 1.0006, + "step": 1242 + }, + { + "epoch": 1.4018638802598136, + "grad_norm": 0.03891727328300476, + "learning_rate": 0.00011061033196497738, + "loss": 0.9564, + "step": 1243 + }, + { + "epoch": 1.4029935046597006, + "grad_norm": 0.03931816667318344, + "learning_rate": 0.00011049222020946366, + "loss": 0.9873, + "step": 1244 + }, + { + "epoch": 1.4041231290595877, + "grad_norm": 0.039088763296604156, + "learning_rate": 0.0001103740936521088, + "loss": 1.0766, + "step": 1245 + }, + { + "epoch": 1.4052527534594748, + "grad_norm": 0.03867960348725319, + "learning_rate": 0.00011025595245955917, + "loss": 0.9637, + "step": 1246 + }, + { + "epoch": 1.4063823778593618, + "grad_norm": 0.03779779002070427, + "learning_rate": 0.00011013779679848184, + "loss": 0.927, + "step": 1247 + }, + { + "epoch": 1.4075120022592489, + "grad_norm": 0.04010738059878349, + "learning_rate": 0.00011001962683556421, + "loss": 0.8963, + "step": 1248 + }, + { + "epoch": 1.408641626659136, + "grad_norm": 0.03821851313114166, + "learning_rate": 0.00010990144273751394, + "loss": 0.9803, + "step": 1249 + }, + { + "epoch": 1.409771251059023, + "grad_norm": 0.03944707289338112, + "learning_rate": 0.00010978324467105858, + "loss": 0.9464, + "step": 1250 + }, + { + "epoch": 1.4109008754589099, + "grad_norm": 0.04029305279254913, + "learning_rate": 0.00010966503280294535, + "loss": 1.007, + "step": 1251 + }, + { + "epoch": 1.412030499858797, + "grad_norm": 0.039815753698349, + "learning_rate": 0.00010954680729994102, + "loss": 0.9242, + "step": 1252 + }, + { + "epoch": 1.413160124258684, + "grad_norm": 0.03888542577624321, + "learning_rate": 0.00010942856832883153, + "loss": 1.0182, + "step": 1253 + }, + { + "epoch": 1.414289748658571, + "grad_norm": 0.040413666516542435, + "learning_rate": 0.00010931031605642193, + "loss": 0.9702, + "step": 1254 + }, + { + "epoch": 1.415419373058458, + "grad_norm": 0.04115651547908783, + "learning_rate": 0.00010919205064953582, + "loss": 1.0048, + "step": 1255 + }, + { + "epoch": 1.4165489974583452, + "grad_norm": 0.03888091444969177, + "learning_rate": 0.00010907377227501556, + "loss": 1.0556, + "step": 1256 + }, + { + "epoch": 1.417678621858232, + "grad_norm": 0.03853550925850868, + "learning_rate": 0.0001089554810997217, + "loss": 0.9906, + "step": 1257 + }, + { + "epoch": 1.418808246258119, + "grad_norm": 0.04010673239827156, + "learning_rate": 0.0001088371772905328, + "loss": 0.9924, + "step": 1258 + }, + { + "epoch": 1.4199378706580061, + "grad_norm": 0.04074999690055847, + "learning_rate": 0.00010871886101434536, + "loss": 0.9516, + "step": 1259 + }, + { + "epoch": 1.4210674950578932, + "grad_norm": 0.03997982293367386, + "learning_rate": 0.00010860053243807338, + "loss": 1.0788, + "step": 1260 + }, + { + "epoch": 1.4221971194577803, + "grad_norm": 0.040215082466602325, + "learning_rate": 0.00010848219172864827, + "loss": 0.9652, + "step": 1261 + }, + { + "epoch": 1.4233267438576673, + "grad_norm": 0.04047797992825508, + "learning_rate": 0.00010836383905301844, + "loss": 0.9426, + "step": 1262 + }, + { + "epoch": 1.4244563682575544, + "grad_norm": 0.039171457290649414, + "learning_rate": 0.00010824547457814935, + "loss": 0.9483, + "step": 1263 + }, + { + "epoch": 1.4255859926574415, + "grad_norm": 0.039281874895095825, + "learning_rate": 0.000108127098471023, + "loss": 0.9718, + "step": 1264 + }, + { + "epoch": 1.4267156170573285, + "grad_norm": 0.03949813172221184, + "learning_rate": 0.00010800871089863785, + "loss": 1.0681, + "step": 1265 + }, + { + "epoch": 1.4278452414572156, + "grad_norm": 0.047373417764902115, + "learning_rate": 0.00010789031202800844, + "loss": 1.0289, + "step": 1266 + }, + { + "epoch": 1.4289748658571026, + "grad_norm": 0.04009164124727249, + "learning_rate": 0.00010777190202616535, + "loss": 1.0028, + "step": 1267 + }, + { + "epoch": 1.4301044902569895, + "grad_norm": 0.039989493787288666, + "learning_rate": 0.00010765348106015483, + "loss": 0.9449, + "step": 1268 + }, + { + "epoch": 1.4312341146568766, + "grad_norm": 0.039491135627031326, + "learning_rate": 0.00010753504929703861, + "loss": 1.0107, + "step": 1269 + }, + { + "epoch": 1.4323637390567636, + "grad_norm": 0.04091602563858032, + "learning_rate": 0.00010741660690389365, + "loss": 0.9778, + "step": 1270 + }, + { + "epoch": 1.4334933634566507, + "grad_norm": 0.03951510414481163, + "learning_rate": 0.00010729815404781188, + "loss": 0.9379, + "step": 1271 + }, + { + "epoch": 1.4346229878565377, + "grad_norm": 0.04040463641285896, + "learning_rate": 0.00010717969089589994, + "loss": 0.9273, + "step": 1272 + }, + { + "epoch": 1.4357526122564248, + "grad_norm": 0.038381997495889664, + "learning_rate": 0.00010706121761527919, + "loss": 1.1048, + "step": 1273 + }, + { + "epoch": 1.4368822366563117, + "grad_norm": 0.04138065129518509, + "learning_rate": 0.00010694273437308501, + "loss": 0.9641, + "step": 1274 + }, + { + "epoch": 1.4380118610561987, + "grad_norm": 0.03996184840798378, + "learning_rate": 0.0001068242413364671, + "loss": 0.9644, + "step": 1275 + }, + { + "epoch": 1.4391414854560858, + "grad_norm": 0.040266938507556915, + "learning_rate": 0.0001067057386725888, + "loss": 0.9741, + "step": 1276 + }, + { + "epoch": 1.4402711098559728, + "grad_norm": 0.03877119719982147, + "learning_rate": 0.00010658722654862705, + "loss": 1.0218, + "step": 1277 + }, + { + "epoch": 1.44140073425586, + "grad_norm": 0.041445109993219376, + "learning_rate": 0.00010646870513177221, + "loss": 1.0361, + "step": 1278 + }, + { + "epoch": 1.442530358655747, + "grad_norm": 0.03916264697909355, + "learning_rate": 0.00010635017458922765, + "loss": 0.9704, + "step": 1279 + }, + { + "epoch": 1.443659983055634, + "grad_norm": 0.03879700228571892, + "learning_rate": 0.00010623163508820977, + "loss": 0.9553, + "step": 1280 + }, + { + "epoch": 1.444789607455521, + "grad_norm": 0.0390193834900856, + "learning_rate": 0.0001061130867959474, + "loss": 0.9379, + "step": 1281 + }, + { + "epoch": 1.4459192318554082, + "grad_norm": 0.040195297449827194, + "learning_rate": 0.00010599452987968196, + "loss": 0.9409, + "step": 1282 + }, + { + "epoch": 1.4470488562552952, + "grad_norm": 0.040100473910570145, + "learning_rate": 0.00010587596450666694, + "loss": 1.0221, + "step": 1283 + }, + { + "epoch": 1.4481784806551823, + "grad_norm": 0.03950706124305725, + "learning_rate": 0.00010575739084416778, + "loss": 0.9796, + "step": 1284 + }, + { + "epoch": 1.4493081050550691, + "grad_norm": 0.03975612670183182, + "learning_rate": 0.00010563880905946159, + "loss": 0.9528, + "step": 1285 + }, + { + "epoch": 1.4504377294549562, + "grad_norm": 0.039496973156929016, + "learning_rate": 0.00010552021931983696, + "loss": 0.9168, + "step": 1286 + }, + { + "epoch": 1.4515673538548433, + "grad_norm": 0.039589397609233856, + "learning_rate": 0.00010540162179259373, + "loss": 0.9511, + "step": 1287 + }, + { + "epoch": 1.4526969782547303, + "grad_norm": 0.03889545425772667, + "learning_rate": 0.00010528301664504273, + "loss": 0.9859, + "step": 1288 + }, + { + "epoch": 1.4538266026546174, + "grad_norm": 0.04123200848698616, + "learning_rate": 0.00010516440404450543, + "loss": 1.0383, + "step": 1289 + }, + { + "epoch": 1.4549562270545044, + "grad_norm": 0.03866191580891609, + "learning_rate": 0.00010504578415831395, + "loss": 0.9974, + "step": 1290 + }, + { + "epoch": 1.4560858514543913, + "grad_norm": 0.03936515003442764, + "learning_rate": 0.00010492715715381059, + "loss": 0.9545, + "step": 1291 + }, + { + "epoch": 1.4572154758542784, + "grad_norm": 0.04141482710838318, + "learning_rate": 0.0001048085231983478, + "loss": 1.035, + "step": 1292 + }, + { + "epoch": 1.4583451002541654, + "grad_norm": 0.03921012952923775, + "learning_rate": 0.00010468988245928768, + "loss": 0.8901, + "step": 1293 + }, + { + "epoch": 1.4594747246540525, + "grad_norm": 0.038768839091062546, + "learning_rate": 0.00010457123510400209, + "loss": 0.9591, + "step": 1294 + }, + { + "epoch": 1.4606043490539395, + "grad_norm": 0.04044971987605095, + "learning_rate": 0.00010445258129987206, + "loss": 0.9327, + "step": 1295 + }, + { + "epoch": 1.4617339734538266, + "grad_norm": 0.04001708701252937, + "learning_rate": 0.0001043339212142878, + "loss": 0.9784, + "step": 1296 + }, + { + "epoch": 1.4628635978537137, + "grad_norm": 0.03915560990571976, + "learning_rate": 0.00010421525501464837, + "loss": 0.9788, + "step": 1297 + }, + { + "epoch": 1.4639932222536007, + "grad_norm": 0.04014548286795616, + "learning_rate": 0.00010409658286836143, + "loss": 0.95, + "step": 1298 + }, + { + "epoch": 1.4651228466534878, + "grad_norm": 0.039338886737823486, + "learning_rate": 0.0001039779049428431, + "loss": 1.0238, + "step": 1299 + }, + { + "epoch": 1.4662524710533749, + "grad_norm": 0.0389542393386364, + "learning_rate": 0.00010385922140551752, + "loss": 1.0861, + "step": 1300 + }, + { + "epoch": 1.467382095453262, + "grad_norm": 0.038751065731048584, + "learning_rate": 0.00010374053242381687, + "loss": 1.0137, + "step": 1301 + }, + { + "epoch": 1.4685117198531488, + "grad_norm": 0.041603147983551025, + "learning_rate": 0.00010362183816518101, + "loss": 1.0291, + "step": 1302 + }, + { + "epoch": 1.4696413442530358, + "grad_norm": 0.04023897647857666, + "learning_rate": 0.00010350313879705713, + "loss": 0.9787, + "step": 1303 + }, + { + "epoch": 1.470770968652923, + "grad_norm": 0.041538018733263016, + "learning_rate": 0.00010338443448689976, + "loss": 1.0501, + "step": 1304 + }, + { + "epoch": 1.47190059305281, + "grad_norm": 0.039176035672426224, + "learning_rate": 0.00010326572540217028, + "loss": 1.012, + "step": 1305 + }, + { + "epoch": 1.473030217452697, + "grad_norm": 0.03936734050512314, + "learning_rate": 0.00010314701171033694, + "loss": 1.0412, + "step": 1306 + }, + { + "epoch": 1.474159841852584, + "grad_norm": 0.04016166925430298, + "learning_rate": 0.00010302829357887438, + "loss": 0.9524, + "step": 1307 + }, + { + "epoch": 1.475289466252471, + "grad_norm": 0.04301619902253151, + "learning_rate": 0.00010290957117526354, + "loss": 1.0209, + "step": 1308 + }, + { + "epoch": 1.476419090652358, + "grad_norm": 0.03983006253838539, + "learning_rate": 0.0001027908446669914, + "loss": 0.9807, + "step": 1309 + }, + { + "epoch": 1.477548715052245, + "grad_norm": 0.0387161448597908, + "learning_rate": 0.00010267211422155072, + "loss": 1.0256, + "step": 1310 + }, + { + "epoch": 1.4786783394521321, + "grad_norm": 0.040488190948963165, + "learning_rate": 0.00010255338000643979, + "loss": 0.9816, + "step": 1311 + }, + { + "epoch": 1.4798079638520192, + "grad_norm": 0.04232305288314819, + "learning_rate": 0.00010243464218916226, + "loss": 1.0266, + "step": 1312 + }, + { + "epoch": 1.4809375882519062, + "grad_norm": 0.039844729006290436, + "learning_rate": 0.00010231590093722687, + "loss": 0.9535, + "step": 1313 + }, + { + "epoch": 1.4820672126517933, + "grad_norm": 0.0414462611079216, + "learning_rate": 0.00010219715641814714, + "loss": 1.0776, + "step": 1314 + }, + { + "epoch": 1.4831968370516804, + "grad_norm": 0.04016523063182831, + "learning_rate": 0.00010207840879944123, + "loss": 0.9812, + "step": 1315 + }, + { + "epoch": 1.4843264614515674, + "grad_norm": 0.039963267743587494, + "learning_rate": 0.00010195965824863174, + "loss": 0.92, + "step": 1316 + }, + { + "epoch": 1.4854560858514545, + "grad_norm": 0.040194012224674225, + "learning_rate": 0.00010184090493324527, + "loss": 0.9259, + "step": 1317 + }, + { + "epoch": 1.4865857102513416, + "grad_norm": 0.03941455855965614, + "learning_rate": 0.00010172214902081247, + "loss": 1.0022, + "step": 1318 + }, + { + "epoch": 1.4877153346512284, + "grad_norm": 0.03942098468542099, + "learning_rate": 0.00010160339067886751, + "loss": 0.9878, + "step": 1319 + }, + { + "epoch": 1.4888449590511155, + "grad_norm": 0.04082900658249855, + "learning_rate": 0.0001014846300749481, + "loss": 1.0775, + "step": 1320 + }, + { + "epoch": 1.4899745834510025, + "grad_norm": 0.040975864976644516, + "learning_rate": 0.0001013658673765951, + "loss": 0.884, + "step": 1321 + }, + { + "epoch": 1.4911042078508896, + "grad_norm": 0.037688951939344406, + "learning_rate": 0.00010124710275135236, + "loss": 0.9211, + "step": 1322 + }, + { + "epoch": 1.4922338322507767, + "grad_norm": 0.038889314979314804, + "learning_rate": 0.00010112833636676633, + "loss": 0.9202, + "step": 1323 + }, + { + "epoch": 1.4933634566506637, + "grad_norm": 0.040478792041540146, + "learning_rate": 0.00010100956839038605, + "loss": 0.9496, + "step": 1324 + }, + { + "epoch": 1.4944930810505506, + "grad_norm": 0.040962960571050644, + "learning_rate": 0.00010089079898976284, + "loss": 0.9801, + "step": 1325 + }, + { + "epoch": 1.4956227054504376, + "grad_norm": 0.04065919294953346, + "learning_rate": 0.00010077202833244987, + "loss": 0.9985, + "step": 1326 + }, + { + "epoch": 1.4967523298503247, + "grad_norm": 0.039901912212371826, + "learning_rate": 0.00010065325658600228, + "loss": 0.9965, + "step": 1327 + }, + { + "epoch": 1.4978819542502118, + "grad_norm": 0.040795568376779556, + "learning_rate": 0.00010053448391797657, + "loss": 0.9618, + "step": 1328 + }, + { + "epoch": 1.4990115786500988, + "grad_norm": 0.03914114087820053, + "learning_rate": 0.0001004157104959307, + "loss": 0.9579, + "step": 1329 + }, + { + "epoch": 1.5001412030499859, + "grad_norm": 0.0392233207821846, + "learning_rate": 0.00010029693648742355, + "loss": 1.0439, + "step": 1330 + }, + { + "epoch": 1.501270827449873, + "grad_norm": 0.04035951942205429, + "learning_rate": 0.00010017816206001489, + "loss": 0.9729, + "step": 1331 + }, + { + "epoch": 1.50240045184976, + "grad_norm": 0.04052640497684479, + "learning_rate": 0.0001000593873812651, + "loss": 1.0254, + "step": 1332 + }, + { + "epoch": 1.503530076249647, + "grad_norm": 0.03904925286769867, + "learning_rate": 9.994061261873491e-05, + "loss": 1.017, + "step": 1333 + }, + { + "epoch": 1.5046597006495341, + "grad_norm": 0.04079846292734146, + "learning_rate": 9.98218379399851e-05, + "loss": 0.9723, + "step": 1334 + }, + { + "epoch": 1.5057893250494212, + "grad_norm": 0.04014943167567253, + "learning_rate": 9.970306351257647e-05, + "loss": 1.0047, + "step": 1335 + }, + { + "epoch": 1.5069189494493083, + "grad_norm": 0.040499597787857056, + "learning_rate": 9.958428950406935e-05, + "loss": 0.981, + "step": 1336 + }, + { + "epoch": 1.508048573849195, + "grad_norm": 0.039357781410217285, + "learning_rate": 9.946551608202343e-05, + "loss": 1.0248, + "step": 1337 + }, + { + "epoch": 1.5091781982490822, + "grad_norm": 0.0400700643658638, + "learning_rate": 9.934674341399773e-05, + "loss": 0.8825, + "step": 1338 + }, + { + "epoch": 1.5103078226489692, + "grad_norm": 0.04014676809310913, + "learning_rate": 9.922797166755016e-05, + "loss": 0.9863, + "step": 1339 + }, + { + "epoch": 1.5114374470488563, + "grad_norm": 0.03986269235610962, + "learning_rate": 9.910920101023717e-05, + "loss": 0.9944, + "step": 1340 + }, + { + "epoch": 1.5125670714487431, + "grad_norm": 0.04043012112379074, + "learning_rate": 9.899043160961396e-05, + "loss": 1.0018, + "step": 1341 + }, + { + "epoch": 1.5136966958486302, + "grad_norm": 0.04049236699938774, + "learning_rate": 9.887166363323372e-05, + "loss": 1.0049, + "step": 1342 + }, + { + "epoch": 1.5148263202485173, + "grad_norm": 0.03971569612622261, + "learning_rate": 9.875289724864767e-05, + "loss": 0.9351, + "step": 1343 + }, + { + "epoch": 1.5159559446484043, + "grad_norm": 0.039951734244823456, + "learning_rate": 9.863413262340491e-05, + "loss": 0.8937, + "step": 1344 + }, + { + "epoch": 1.5170855690482914, + "grad_norm": 0.04067753627896309, + "learning_rate": 9.851536992505188e-05, + "loss": 0.9653, + "step": 1345 + }, + { + "epoch": 1.5182151934481785, + "grad_norm": 0.03986869379878044, + "learning_rate": 9.839660932113252e-05, + "loss": 0.9987, + "step": 1346 + }, + { + "epoch": 1.5193448178480655, + "grad_norm": 0.039932139217853546, + "learning_rate": 9.827785097918757e-05, + "loss": 0.9733, + "step": 1347 + }, + { + "epoch": 1.5204744422479526, + "grad_norm": 0.039806436747312546, + "learning_rate": 9.815909506675474e-05, + "loss": 0.9847, + "step": 1348 + }, + { + "epoch": 1.5216040666478396, + "grad_norm": 0.039268314838409424, + "learning_rate": 9.804034175136828e-05, + "loss": 1.0022, + "step": 1349 + }, + { + "epoch": 1.5227336910477267, + "grad_norm": 0.04103934392333031, + "learning_rate": 9.79215912005588e-05, + "loss": 0.9476, + "step": 1350 + }, + { + "epoch": 1.5238633154476138, + "grad_norm": 0.040989138185977936, + "learning_rate": 9.780284358185289e-05, + "loss": 0.9352, + "step": 1351 + }, + { + "epoch": 1.5249929398475008, + "grad_norm": 0.04021565616130829, + "learning_rate": 9.768409906277316e-05, + "loss": 0.9293, + "step": 1352 + }, + { + "epoch": 1.526122564247388, + "grad_norm": 0.040516503155231476, + "learning_rate": 9.756535781083773e-05, + "loss": 0.9935, + "step": 1353 + }, + { + "epoch": 1.5272521886472747, + "grad_norm": 0.03917067497968674, + "learning_rate": 9.744661999356022e-05, + "loss": 0.9802, + "step": 1354 + }, + { + "epoch": 1.5283818130471618, + "grad_norm": 0.04264604300260544, + "learning_rate": 9.732788577844934e-05, + "loss": 0.9984, + "step": 1355 + }, + { + "epoch": 1.5295114374470489, + "grad_norm": 0.04073306545615196, + "learning_rate": 9.720915533300863e-05, + "loss": 0.987, + "step": 1356 + }, + { + "epoch": 1.530641061846936, + "grad_norm": 0.040012288838624954, + "learning_rate": 9.70904288247365e-05, + "loss": 0.9858, + "step": 1357 + }, + { + "epoch": 1.5317706862468228, + "grad_norm": 0.03969816863536835, + "learning_rate": 9.697170642112567e-05, + "loss": 0.8985, + "step": 1358 + }, + { + "epoch": 1.5329003106467098, + "grad_norm": 0.04010999575257301, + "learning_rate": 9.685298828966307e-05, + "loss": 1.04, + "step": 1359 + }, + { + "epoch": 1.534029935046597, + "grad_norm": 0.0400669127702713, + "learning_rate": 9.673427459782974e-05, + "loss": 1.0496, + "step": 1360 + }, + { + "epoch": 1.535159559446484, + "grad_norm": 0.03961464762687683, + "learning_rate": 9.661556551310029e-05, + "loss": 1.0093, + "step": 1361 + }, + { + "epoch": 1.536289183846371, + "grad_norm": 0.04031101614236832, + "learning_rate": 9.64968612029429e-05, + "loss": 0.9953, + "step": 1362 + }, + { + "epoch": 1.537418808246258, + "grad_norm": 0.0386832058429718, + "learning_rate": 9.637816183481903e-05, + "loss": 0.9974, + "step": 1363 + }, + { + "epoch": 1.5385484326461452, + "grad_norm": 0.04059210419654846, + "learning_rate": 9.625946757618311e-05, + "loss": 1.0859, + "step": 1364 + }, + { + "epoch": 1.5396780570460322, + "grad_norm": 0.039874449372291565, + "learning_rate": 9.61407785944825e-05, + "loss": 0.9252, + "step": 1365 + }, + { + "epoch": 1.5408076814459193, + "grad_norm": 0.040245506912469864, + "learning_rate": 9.602209505715695e-05, + "loss": 0.9719, + "step": 1366 + }, + { + "epoch": 1.5419373058458063, + "grad_norm": 0.0418454185128212, + "learning_rate": 9.590341713163858e-05, + "loss": 0.9523, + "step": 1367 + }, + { + "epoch": 1.5430669302456934, + "grad_norm": 0.041850898414850235, + "learning_rate": 9.578474498535165e-05, + "loss": 0.9707, + "step": 1368 + }, + { + "epoch": 1.5441965546455805, + "grad_norm": 0.04082093760371208, + "learning_rate": 9.566607878571225e-05, + "loss": 0.9638, + "step": 1369 + }, + { + "epoch": 1.5453261790454675, + "grad_norm": 0.040829937905073166, + "learning_rate": 9.554741870012797e-05, + "loss": 0.9991, + "step": 1370 + }, + { + "epoch": 1.5464558034453544, + "grad_norm": 0.04339444637298584, + "learning_rate": 9.542876489599795e-05, + "loss": 1.0947, + "step": 1371 + }, + { + "epoch": 1.5475854278452414, + "grad_norm": 0.040259361267089844, + "learning_rate": 9.531011754071232e-05, + "loss": 0.9616, + "step": 1372 + }, + { + "epoch": 1.5487150522451285, + "grad_norm": 0.04023364931344986, + "learning_rate": 9.519147680165224e-05, + "loss": 0.9364, + "step": 1373 + }, + { + "epoch": 1.5498446766450156, + "grad_norm": 0.03999504819512367, + "learning_rate": 9.507284284618945e-05, + "loss": 0.9235, + "step": 1374 + }, + { + "epoch": 1.5509743010449024, + "grad_norm": 0.03931557759642601, + "learning_rate": 9.495421584168609e-05, + "loss": 0.9417, + "step": 1375 + }, + { + "epoch": 1.5521039254447895, + "grad_norm": 0.040559954941272736, + "learning_rate": 9.483559595549461e-05, + "loss": 1.0114, + "step": 1376 + }, + { + "epoch": 1.5532335498446765, + "grad_norm": 0.040581028908491135, + "learning_rate": 9.471698335495731e-05, + "loss": 1.0169, + "step": 1377 + }, + { + "epoch": 1.5543631742445636, + "grad_norm": 0.04158001020550728, + "learning_rate": 9.459837820740626e-05, + "loss": 0.9635, + "step": 1378 + }, + { + "epoch": 1.5554927986444507, + "grad_norm": 0.0392904132604599, + "learning_rate": 9.447978068016307e-05, + "loss": 0.9105, + "step": 1379 + }, + { + "epoch": 1.5566224230443377, + "grad_norm": 0.04135749489068985, + "learning_rate": 9.436119094053846e-05, + "loss": 1.004, + "step": 1380 + }, + { + "epoch": 1.5577520474442248, + "grad_norm": 0.040026694536209106, + "learning_rate": 9.424260915583225e-05, + "loss": 1.0355, + "step": 1381 + }, + { + "epoch": 1.5588816718441119, + "grad_norm": 0.0401214100420475, + "learning_rate": 9.412403549333307e-05, + "loss": 0.9886, + "step": 1382 + }, + { + "epoch": 1.560011296243999, + "grad_norm": 0.039697665721178055, + "learning_rate": 9.400547012031803e-05, + "loss": 0.9499, + "step": 1383 + }, + { + "epoch": 1.561140920643886, + "grad_norm": 0.03938675299286842, + "learning_rate": 9.388691320405262e-05, + "loss": 0.9735, + "step": 1384 + }, + { + "epoch": 1.562270545043773, + "grad_norm": 0.041321761906147, + "learning_rate": 9.376836491179028e-05, + "loss": 1.0457, + "step": 1385 + }, + { + "epoch": 1.5634001694436601, + "grad_norm": 0.04004902020096779, + "learning_rate": 9.364982541077235e-05, + "loss": 0.9871, + "step": 1386 + }, + { + "epoch": 1.5645297938435472, + "grad_norm": 0.04067892208695412, + "learning_rate": 9.353129486822783e-05, + "loss": 0.9761, + "step": 1387 + }, + { + "epoch": 1.565659418243434, + "grad_norm": 0.04096290096640587, + "learning_rate": 9.341277345137301e-05, + "loss": 1.023, + "step": 1388 + }, + { + "epoch": 1.566789042643321, + "grad_norm": 0.04284733161330223, + "learning_rate": 9.329426132741124e-05, + "loss": 1.0182, + "step": 1389 + }, + { + "epoch": 1.5679186670432081, + "grad_norm": 0.04189145565032959, + "learning_rate": 9.317575866353292e-05, + "loss": 1.044, + "step": 1390 + }, + { + "epoch": 1.5690482914430952, + "grad_norm": 0.04177414998412132, + "learning_rate": 9.3057265626915e-05, + "loss": 1.0317, + "step": 1391 + }, + { + "epoch": 1.570177915842982, + "grad_norm": 0.04260309785604477, + "learning_rate": 9.293878238472084e-05, + "loss": 0.9835, + "step": 1392 + }, + { + "epoch": 1.5713075402428691, + "grad_norm": 0.04053134843707085, + "learning_rate": 9.282030910410007e-05, + "loss": 1.0568, + "step": 1393 + }, + { + "epoch": 1.5724371646427562, + "grad_norm": 0.039543889462947845, + "learning_rate": 9.270184595218816e-05, + "loss": 0.9351, + "step": 1394 + }, + { + "epoch": 1.5735667890426432, + "grad_norm": 0.04078560322523117, + "learning_rate": 9.258339309610637e-05, + "loss": 0.9573, + "step": 1395 + }, + { + "epoch": 1.5746964134425303, + "grad_norm": 0.04066834971308708, + "learning_rate": 9.24649507029614e-05, + "loss": 0.9305, + "step": 1396 + }, + { + "epoch": 1.5758260378424174, + "grad_norm": 0.039552509784698486, + "learning_rate": 9.234651893984516e-05, + "loss": 0.8974, + "step": 1397 + }, + { + "epoch": 1.5769556622423044, + "grad_norm": 0.04099886491894722, + "learning_rate": 9.222809797383468e-05, + "loss": 0.9938, + "step": 1398 + }, + { + "epoch": 1.5780852866421915, + "grad_norm": 0.041135065257549286, + "learning_rate": 9.210968797199161e-05, + "loss": 0.9801, + "step": 1399 + }, + { + "epoch": 1.5792149110420786, + "grad_norm": 0.041974954307079315, + "learning_rate": 9.199128910136219e-05, + "loss": 0.9009, + "step": 1400 + }, + { + "epoch": 1.5803445354419656, + "grad_norm": 0.04135981947183609, + "learning_rate": 9.1872901528977e-05, + "loss": 0.9265, + "step": 1401 + }, + { + "epoch": 1.5814741598418527, + "grad_norm": 0.041727129369974136, + "learning_rate": 9.175452542185064e-05, + "loss": 1.0046, + "step": 1402 + }, + { + "epoch": 1.5826037842417398, + "grad_norm": 0.040945183485746384, + "learning_rate": 9.163616094698159e-05, + "loss": 1.0032, + "step": 1403 + }, + { + "epoch": 1.5837334086416268, + "grad_norm": 0.04023808240890503, + "learning_rate": 9.151780827135178e-05, + "loss": 1.1407, + "step": 1404 + }, + { + "epoch": 1.5848630330415137, + "grad_norm": 0.04090253263711929, + "learning_rate": 9.139946756192663e-05, + "loss": 0.9573, + "step": 1405 + }, + { + "epoch": 1.5859926574414007, + "grad_norm": 0.04100187495350838, + "learning_rate": 9.128113898565465e-05, + "loss": 0.9929, + "step": 1406 + }, + { + "epoch": 1.5871222818412878, + "grad_norm": 0.039680980145931244, + "learning_rate": 9.116282270946724e-05, + "loss": 1.127, + "step": 1407 + }, + { + "epoch": 1.5882519062411748, + "grad_norm": 0.0428316704928875, + "learning_rate": 9.104451890027834e-05, + "loss": 0.9507, + "step": 1408 + }, + { + "epoch": 1.5893815306410617, + "grad_norm": 0.03888211399316788, + "learning_rate": 9.092622772498448e-05, + "loss": 0.9702, + "step": 1409 + }, + { + "epoch": 1.5905111550409488, + "grad_norm": 0.04164845868945122, + "learning_rate": 9.080794935046421e-05, + "loss": 0.9594, + "step": 1410 + }, + { + "epoch": 1.5916407794408358, + "grad_norm": 0.04320342466235161, + "learning_rate": 9.068968394357811e-05, + "loss": 1.0014, + "step": 1411 + }, + { + "epoch": 1.5927704038407229, + "grad_norm": 0.039522089064121246, + "learning_rate": 9.057143167116848e-05, + "loss": 0.9515, + "step": 1412 + }, + { + "epoch": 1.59390002824061, + "grad_norm": 0.03963286057114601, + "learning_rate": 9.0453192700059e-05, + "loss": 0.9353, + "step": 1413 + }, + { + "epoch": 1.595029652640497, + "grad_norm": 0.03974980115890503, + "learning_rate": 9.033496719705467e-05, + "loss": 0.9351, + "step": 1414 + }, + { + "epoch": 1.596159277040384, + "grad_norm": 0.03890387713909149, + "learning_rate": 9.021675532894145e-05, + "loss": 1.0577, + "step": 1415 + }, + { + "epoch": 1.5972889014402711, + "grad_norm": 0.04113437607884407, + "learning_rate": 9.009855726248605e-05, + "loss": 1.0971, + "step": 1416 + }, + { + "epoch": 1.5984185258401582, + "grad_norm": 0.04086640477180481, + "learning_rate": 8.99803731644358e-05, + "loss": 0.979, + "step": 1417 + }, + { + "epoch": 1.5995481502400453, + "grad_norm": 0.04131541773676872, + "learning_rate": 8.98622032015182e-05, + "loss": 0.9165, + "step": 1418 + }, + { + "epoch": 1.6006777746399323, + "grad_norm": 0.04054603353142738, + "learning_rate": 8.974404754044085e-05, + "loss": 1.0683, + "step": 1419 + }, + { + "epoch": 1.6018073990398194, + "grad_norm": 0.039694368839263916, + "learning_rate": 8.962590634789123e-05, + "loss": 0.9606, + "step": 1420 + }, + { + "epoch": 1.6029370234397065, + "grad_norm": 0.0408555269241333, + "learning_rate": 8.950777979053635e-05, + "loss": 0.8758, + "step": 1421 + }, + { + "epoch": 1.6040666478395933, + "grad_norm": 0.04044231399893761, + "learning_rate": 8.938966803502264e-05, + "loss": 1.0119, + "step": 1422 + }, + { + "epoch": 1.6051962722394804, + "grad_norm": 0.03972948342561722, + "learning_rate": 8.927157124797552e-05, + "loss": 1.0015, + "step": 1423 + }, + { + "epoch": 1.6063258966393674, + "grad_norm": 0.04121416062116623, + "learning_rate": 8.915348959599941e-05, + "loss": 0.93, + "step": 1424 + }, + { + "epoch": 1.6074555210392545, + "grad_norm": 0.03931145742535591, + "learning_rate": 8.903542324567736e-05, + "loss": 1.0265, + "step": 1425 + }, + { + "epoch": 1.6085851454391413, + "grad_norm": 0.03970059007406235, + "learning_rate": 8.891737236357083e-05, + "loss": 1.034, + "step": 1426 + }, + { + "epoch": 1.6097147698390284, + "grad_norm": 0.039576053619384766, + "learning_rate": 8.879933711621935e-05, + "loss": 0.9432, + "step": 1427 + }, + { + "epoch": 1.6108443942389155, + "grad_norm": 0.04142745956778526, + "learning_rate": 8.868131767014059e-05, + "loss": 0.9162, + "step": 1428 + }, + { + "epoch": 1.6119740186388025, + "grad_norm": 0.03949185460805893, + "learning_rate": 8.856331419182977e-05, + "loss": 1.0063, + "step": 1429 + }, + { + "epoch": 1.6131036430386896, + "grad_norm": 0.04371139034628868, + "learning_rate": 8.844532684775963e-05, + "loss": 1.0026, + "step": 1430 + }, + { + "epoch": 1.6142332674385766, + "grad_norm": 0.039744243025779724, + "learning_rate": 8.832735580438025e-05, + "loss": 0.9801, + "step": 1431 + }, + { + "epoch": 1.6153628918384637, + "grad_norm": 0.041075777262449265, + "learning_rate": 8.820940122811849e-05, + "loss": 1.0475, + "step": 1432 + }, + { + "epoch": 1.6164925162383508, + "grad_norm": 0.039307381957769394, + "learning_rate": 8.809146328537818e-05, + "loss": 0.9662, + "step": 1433 + }, + { + "epoch": 1.6176221406382378, + "grad_norm": 0.04150167852640152, + "learning_rate": 8.797354214253963e-05, + "loss": 0.941, + "step": 1434 + }, + { + "epoch": 1.618751765038125, + "grad_norm": 0.04133110120892525, + "learning_rate": 8.785563796595938e-05, + "loss": 0.9079, + "step": 1435 + }, + { + "epoch": 1.619881389438012, + "grad_norm": 0.04027198255062103, + "learning_rate": 8.773775092197017e-05, + "loss": 0.9626, + "step": 1436 + }, + { + "epoch": 1.621011013837899, + "grad_norm": 0.04010196402668953, + "learning_rate": 8.761988117688041e-05, + "loss": 0.9184, + "step": 1437 + }, + { + "epoch": 1.622140638237786, + "grad_norm": 0.042807456105947495, + "learning_rate": 8.750202889697421e-05, + "loss": 1.0578, + "step": 1438 + }, + { + "epoch": 1.623270262637673, + "grad_norm": 0.03901375085115433, + "learning_rate": 8.738419424851104e-05, + "loss": 0.9621, + "step": 1439 + }, + { + "epoch": 1.62439988703756, + "grad_norm": 0.04136718809604645, + "learning_rate": 8.726637739772542e-05, + "loss": 1.0693, + "step": 1440 + }, + { + "epoch": 1.625529511437447, + "grad_norm": 0.04196292906999588, + "learning_rate": 8.71485785108269e-05, + "loss": 1.0302, + "step": 1441 + }, + { + "epoch": 1.6266591358373341, + "grad_norm": 0.03940456733107567, + "learning_rate": 8.703079775399954e-05, + "loss": 0.9193, + "step": 1442 + }, + { + "epoch": 1.627788760237221, + "grad_norm": 0.04063562676310539, + "learning_rate": 8.691303529340187e-05, + "loss": 1.0028, + "step": 1443 + }, + { + "epoch": 1.628918384637108, + "grad_norm": 0.041786015033721924, + "learning_rate": 8.679529129516667e-05, + "loss": 0.9887, + "step": 1444 + }, + { + "epoch": 1.630048009036995, + "grad_norm": 0.04099738225340843, + "learning_rate": 8.667756592540064e-05, + "loss": 0.9782, + "step": 1445 + }, + { + "epoch": 1.6311776334368822, + "grad_norm": 0.040208104997873306, + "learning_rate": 8.655985935018411e-05, + "loss": 0.8863, + "step": 1446 + }, + { + "epoch": 1.6323072578367692, + "grad_norm": 0.0400582030415535, + "learning_rate": 8.644217173557106e-05, + "loss": 1.0338, + "step": 1447 + }, + { + "epoch": 1.6334368822366563, + "grad_norm": 0.04026160016655922, + "learning_rate": 8.632450324758859e-05, + "loss": 0.924, + "step": 1448 + }, + { + "epoch": 1.6345665066365433, + "grad_norm": 0.04190199449658394, + "learning_rate": 8.620685405223686e-05, + "loss": 0.9588, + "step": 1449 + }, + { + "epoch": 1.6356961310364304, + "grad_norm": 0.040600717067718506, + "learning_rate": 8.608922431548887e-05, + "loss": 0.988, + "step": 1450 + }, + { + "epoch": 1.6368257554363175, + "grad_norm": 0.03968328237533569, + "learning_rate": 8.597161420329e-05, + "loss": 1.0247, + "step": 1451 + }, + { + "epoch": 1.6379553798362045, + "grad_norm": 0.03925072401762009, + "learning_rate": 8.585402388155821e-05, + "loss": 0.9461, + "step": 1452 + }, + { + "epoch": 1.6390850042360916, + "grad_norm": 0.04031050205230713, + "learning_rate": 8.573645351618329e-05, + "loss": 0.9915, + "step": 1453 + }, + { + "epoch": 1.6402146286359787, + "grad_norm": 0.04113445058465004, + "learning_rate": 8.561890327302698e-05, + "loss": 0.9443, + "step": 1454 + }, + { + "epoch": 1.6413442530358657, + "grad_norm": 0.04077060893177986, + "learning_rate": 8.55013733179227e-05, + "loss": 1.0212, + "step": 1455 + }, + { + "epoch": 1.6424738774357526, + "grad_norm": 0.04069150611758232, + "learning_rate": 8.53838638166751e-05, + "loss": 1.0852, + "step": 1456 + }, + { + "epoch": 1.6436035018356396, + "grad_norm": 0.04216321185231209, + "learning_rate": 8.526637493506006e-05, + "loss": 1.0119, + "step": 1457 + }, + { + "epoch": 1.6447331262355267, + "grad_norm": 0.04014064744114876, + "learning_rate": 8.51489068388244e-05, + "loss": 1.0884, + "step": 1458 + }, + { + "epoch": 1.6458627506354138, + "grad_norm": 0.04238475114107132, + "learning_rate": 8.503145969368562e-05, + "loss": 1.0737, + "step": 1459 + }, + { + "epoch": 1.6469923750353006, + "grad_norm": 0.04255363717675209, + "learning_rate": 8.49140336653315e-05, + "loss": 0.9817, + "step": 1460 + }, + { + "epoch": 1.6481219994351877, + "grad_norm": 0.0400969423353672, + "learning_rate": 8.479662891942026e-05, + "loss": 0.9089, + "step": 1461 + }, + { + "epoch": 1.6492516238350747, + "grad_norm": 0.03944752737879753, + "learning_rate": 8.467924562157994e-05, + "loss": 1.0311, + "step": 1462 + }, + { + "epoch": 1.6503812482349618, + "grad_norm": 0.04170246049761772, + "learning_rate": 8.456188393740841e-05, + "loss": 1.0078, + "step": 1463 + }, + { + "epoch": 1.6515108726348489, + "grad_norm": 0.04236338660120964, + "learning_rate": 8.444454403247302e-05, + "loss": 0.9292, + "step": 1464 + }, + { + "epoch": 1.652640497034736, + "grad_norm": 0.04006924107670784, + "learning_rate": 8.43272260723103e-05, + "loss": 0.9691, + "step": 1465 + }, + { + "epoch": 1.653770121434623, + "grad_norm": 0.03962864354252815, + "learning_rate": 8.420993022242602e-05, + "loss": 1.025, + "step": 1466 + }, + { + "epoch": 1.65489974583451, + "grad_norm": 0.04008259251713753, + "learning_rate": 8.409265664829457e-05, + "loss": 0.9579, + "step": 1467 + }, + { + "epoch": 1.6560293702343971, + "grad_norm": 0.04024090990424156, + "learning_rate": 8.3975405515359e-05, + "loss": 1.0279, + "step": 1468 + }, + { + "epoch": 1.6571589946342842, + "grad_norm": 0.042322732508182526, + "learning_rate": 8.385817698903074e-05, + "loss": 0.9587, + "step": 1469 + }, + { + "epoch": 1.6582886190341712, + "grad_norm": 0.04091969132423401, + "learning_rate": 8.374097123468918e-05, + "loss": 1.0587, + "step": 1470 + }, + { + "epoch": 1.6594182434340583, + "grad_norm": 0.04016717150807381, + "learning_rate": 8.362378841768182e-05, + "loss": 0.9552, + "step": 1471 + }, + { + "epoch": 1.6605478678339454, + "grad_norm": 0.04016115888953209, + "learning_rate": 8.350662870332356e-05, + "loss": 1.0046, + "step": 1472 + }, + { + "epoch": 1.6616774922338322, + "grad_norm": 0.04118018224835396, + "learning_rate": 8.338949225689682e-05, + "loss": 0.9951, + "step": 1473 + }, + { + "epoch": 1.6628071166337193, + "grad_norm": 0.041833024471998215, + "learning_rate": 8.327237924365128e-05, + "loss": 0.9437, + "step": 1474 + }, + { + "epoch": 1.6639367410336063, + "grad_norm": 0.03964128717780113, + "learning_rate": 8.315528982880337e-05, + "loss": 0.9405, + "step": 1475 + }, + { + "epoch": 1.6639367410336063, + "eval_loss": 0.9902071356773376, + "eval_runtime": 554.2501, + "eval_samples_per_second": 17.651, + "eval_steps_per_second": 8.826, + "step": 1475 + }, + { + "epoch": 1.6650663654334934, + "grad_norm": 0.04184534028172493, + "learning_rate": 8.303822417753634e-05, + "loss": 0.9693, + "step": 1476 + }, + { + "epoch": 1.6661959898333802, + "grad_norm": 0.040639445185661316, + "learning_rate": 8.292118245499996e-05, + "loss": 1.026, + "step": 1477 + }, + { + "epoch": 1.6673256142332673, + "grad_norm": 0.041504424065351486, + "learning_rate": 8.280416482631018e-05, + "loss": 1.0356, + "step": 1478 + }, + { + "epoch": 1.6684552386331544, + "grad_norm": 0.041965845972299576, + "learning_rate": 8.268717145654887e-05, + "loss": 0.9386, + "step": 1479 + }, + { + "epoch": 1.6695848630330414, + "grad_norm": 0.04126819968223572, + "learning_rate": 8.257020251076393e-05, + "loss": 0.9731, + "step": 1480 + }, + { + "epoch": 1.6707144874329285, + "grad_norm": 0.03976297006011009, + "learning_rate": 8.245325815396849e-05, + "loss": 0.9571, + "step": 1481 + }, + { + "epoch": 1.6718441118328156, + "grad_norm": 0.0409400649368763, + "learning_rate": 8.233633855114127e-05, + "loss": 0.9836, + "step": 1482 + }, + { + "epoch": 1.6729737362327026, + "grad_norm": 0.04177910462021828, + "learning_rate": 8.221944386722591e-05, + "loss": 0.9716, + "step": 1483 + }, + { + "epoch": 1.6741033606325897, + "grad_norm": 0.041480690240859985, + "learning_rate": 8.210257426713086e-05, + "loss": 0.9171, + "step": 1484 + }, + { + "epoch": 1.6752329850324768, + "grad_norm": 0.0406128354370594, + "learning_rate": 8.198572991572939e-05, + "loss": 0.9579, + "step": 1485 + }, + { + "epoch": 1.6763626094323638, + "grad_norm": 0.04042517766356468, + "learning_rate": 8.186891097785891e-05, + "loss": 0.9316, + "step": 1486 + }, + { + "epoch": 1.6774922338322509, + "grad_norm": 0.042964909225702286, + "learning_rate": 8.175211761832113e-05, + "loss": 0.9642, + "step": 1487 + }, + { + "epoch": 1.678621858232138, + "grad_norm": 0.04327230900526047, + "learning_rate": 8.163535000188164e-05, + "loss": 1.0398, + "step": 1488 + }, + { + "epoch": 1.679751482632025, + "grad_norm": 0.04038830101490021, + "learning_rate": 8.151860829326962e-05, + "loss": 0.9255, + "step": 1489 + }, + { + "epoch": 1.6808811070319118, + "grad_norm": 0.04081222414970398, + "learning_rate": 8.140189265717794e-05, + "loss": 0.9471, + "step": 1490 + }, + { + "epoch": 1.682010731431799, + "grad_norm": 0.04002978280186653, + "learning_rate": 8.12852032582624e-05, + "loss": 0.9539, + "step": 1491 + }, + { + "epoch": 1.683140355831686, + "grad_norm": 0.04034026339650154, + "learning_rate": 8.116854026114194e-05, + "loss": 1.0661, + "step": 1492 + }, + { + "epoch": 1.684269980231573, + "grad_norm": 0.0421736054122448, + "learning_rate": 8.105190383039828e-05, + "loss": 0.9198, + "step": 1493 + }, + { + "epoch": 1.6853996046314599, + "grad_norm": 0.04237702116370201, + "learning_rate": 8.09352941305756e-05, + "loss": 0.9406, + "step": 1494 + }, + { + "epoch": 1.686529229031347, + "grad_norm": 0.04037948697805405, + "learning_rate": 8.081871132618036e-05, + "loss": 0.9974, + "step": 1495 + }, + { + "epoch": 1.687658853431234, + "grad_norm": 0.04098232835531235, + "learning_rate": 8.070215558168111e-05, + "loss": 0.9271, + "step": 1496 + }, + { + "epoch": 1.688788477831121, + "grad_norm": 0.04053972661495209, + "learning_rate": 8.058562706150823e-05, + "loss": 0.9823, + "step": 1497 + }, + { + "epoch": 1.6899181022310081, + "grad_norm": 0.041380010545253754, + "learning_rate": 8.046912593005361e-05, + "loss": 1.078, + "step": 1498 + }, + { + "epoch": 1.6910477266308952, + "grad_norm": 0.04158872738480568, + "learning_rate": 8.035265235167071e-05, + "loss": 0.9958, + "step": 1499 + }, + { + "epoch": 1.6921773510307823, + "grad_norm": 0.04147128760814667, + "learning_rate": 8.023620649067384e-05, + "loss": 0.9634, + "step": 1500 + }, + { + "epoch": 1.6933069754306693, + "grad_norm": 0.04079499468207359, + "learning_rate": 8.011978851133843e-05, + "loss": 0.9581, + "step": 1501 + }, + { + "epoch": 1.6944365998305564, + "grad_norm": 0.04090631753206253, + "learning_rate": 8.000339857790052e-05, + "loss": 0.9927, + "step": 1502 + }, + { + "epoch": 1.6955662242304435, + "grad_norm": 0.03968152403831482, + "learning_rate": 7.988703685455642e-05, + "loss": 0.8988, + "step": 1503 + }, + { + "epoch": 1.6966958486303305, + "grad_norm": 0.04115234315395355, + "learning_rate": 7.977070350546295e-05, + "loss": 1.0064, + "step": 1504 + }, + { + "epoch": 1.6978254730302176, + "grad_norm": 0.04148809239268303, + "learning_rate": 7.965439869473664e-05, + "loss": 1.0416, + "step": 1505 + }, + { + "epoch": 1.6989550974301046, + "grad_norm": 0.040575575083494186, + "learning_rate": 7.953812258645384e-05, + "loss": 1.0512, + "step": 1506 + }, + { + "epoch": 1.7000847218299915, + "grad_norm": 0.042429015040397644, + "learning_rate": 7.942187534465048e-05, + "loss": 0.9684, + "step": 1507 + }, + { + "epoch": 1.7012143462298785, + "grad_norm": 0.04083799198269844, + "learning_rate": 7.930565713332172e-05, + "loss": 0.9619, + "step": 1508 + }, + { + "epoch": 1.7023439706297656, + "grad_norm": 0.041108790785074234, + "learning_rate": 7.918946811642165e-05, + "loss": 1.0179, + "step": 1509 + }, + { + "epoch": 1.7034735950296527, + "grad_norm": 0.040565114468336105, + "learning_rate": 7.907330845786337e-05, + "loss": 1.0207, + "step": 1510 + }, + { + "epoch": 1.7046032194295395, + "grad_norm": 0.04313211888074875, + "learning_rate": 7.895717832151842e-05, + "loss": 0.9691, + "step": 1511 + }, + { + "epoch": 1.7057328438294266, + "grad_norm": 0.04209309443831444, + "learning_rate": 7.884107787121678e-05, + "loss": 1.0266, + "step": 1512 + }, + { + "epoch": 1.7068624682293136, + "grad_norm": 0.04235268756747246, + "learning_rate": 7.872500727074652e-05, + "loss": 1.0672, + "step": 1513 + }, + { + "epoch": 1.7079920926292007, + "grad_norm": 0.04129282012581825, + "learning_rate": 7.860896668385353e-05, + "loss": 1.0046, + "step": 1514 + }, + { + "epoch": 1.7091217170290878, + "grad_norm": 0.040298108011484146, + "learning_rate": 7.849295627424148e-05, + "loss": 1.0131, + "step": 1515 + }, + { + "epoch": 1.7102513414289748, + "grad_norm": 0.041110679507255554, + "learning_rate": 7.837697620557141e-05, + "loss": 0.8495, + "step": 1516 + }, + { + "epoch": 1.711380965828862, + "grad_norm": 0.04193076491355896, + "learning_rate": 7.826102664146146e-05, + "loss": 1.0323, + "step": 1517 + }, + { + "epoch": 1.712510590228749, + "grad_norm": 0.041734110563993454, + "learning_rate": 7.814510774548696e-05, + "loss": 0.9111, + "step": 1518 + }, + { + "epoch": 1.713640214628636, + "grad_norm": 0.04231907054781914, + "learning_rate": 7.802921968117976e-05, + "loss": 1.1051, + "step": 1519 + }, + { + "epoch": 1.714769839028523, + "grad_norm": 0.04157987982034683, + "learning_rate": 7.791336261202835e-05, + "loss": 0.9876, + "step": 1520 + }, + { + "epoch": 1.7158994634284102, + "grad_norm": 0.04158562049269676, + "learning_rate": 7.779753670147742e-05, + "loss": 1.0402, + "step": 1521 + }, + { + "epoch": 1.7170290878282972, + "grad_norm": 0.042495615780353546, + "learning_rate": 7.768174211292771e-05, + "loss": 0.9308, + "step": 1522 + }, + { + "epoch": 1.7181587122281843, + "grad_norm": 0.041278909891843796, + "learning_rate": 7.756597900973586e-05, + "loss": 1.0045, + "step": 1523 + }, + { + "epoch": 1.7192883366280711, + "grad_norm": 0.04124930873513222, + "learning_rate": 7.745024755521393e-05, + "loss": 0.9869, + "step": 1524 + }, + { + "epoch": 1.7204179610279582, + "grad_norm": 0.04180390387773514, + "learning_rate": 7.733454791262947e-05, + "loss": 0.9802, + "step": 1525 + }, + { + "epoch": 1.7215475854278453, + "grad_norm": 0.0410812608897686, + "learning_rate": 7.72188802452051e-05, + "loss": 1.0069, + "step": 1526 + }, + { + "epoch": 1.7226772098277323, + "grad_norm": 0.04038868471980095, + "learning_rate": 7.710324471611837e-05, + "loss": 0.92, + "step": 1527 + }, + { + "epoch": 1.7238068342276192, + "grad_norm": 0.04037635400891304, + "learning_rate": 7.698764148850137e-05, + "loss": 0.9254, + "step": 1528 + }, + { + "epoch": 1.7249364586275062, + "grad_norm": 0.041685521602630615, + "learning_rate": 7.68720707254408e-05, + "loss": 0.9573, + "step": 1529 + }, + { + "epoch": 1.7260660830273933, + "grad_norm": 0.040962155908346176, + "learning_rate": 7.67565325899774e-05, + "loss": 1.0302, + "step": 1530 + }, + { + "epoch": 1.7271957074272803, + "grad_norm": 0.04144284501671791, + "learning_rate": 7.6641027245106e-05, + "loss": 0.9257, + "step": 1531 + }, + { + "epoch": 1.7283253318271674, + "grad_norm": 0.04156067594885826, + "learning_rate": 7.652555485377515e-05, + "loss": 1.0532, + "step": 1532 + }, + { + "epoch": 1.7294549562270545, + "grad_norm": 0.041655533015728, + "learning_rate": 7.641011557888677e-05, + "loss": 1.0718, + "step": 1533 + }, + { + "epoch": 1.7305845806269415, + "grad_norm": 0.042078595608472824, + "learning_rate": 7.629470958329628e-05, + "loss": 0.9245, + "step": 1534 + }, + { + "epoch": 1.7317142050268286, + "grad_norm": 0.04095086082816124, + "learning_rate": 7.617933702981198e-05, + "loss": 1.0378, + "step": 1535 + }, + { + "epoch": 1.7328438294267157, + "grad_norm": 0.0403096005320549, + "learning_rate": 7.606399808119506e-05, + "loss": 1.0323, + "step": 1536 + }, + { + "epoch": 1.7339734538266027, + "grad_norm": 0.04328161105513573, + "learning_rate": 7.594869290015938e-05, + "loss": 0.9129, + "step": 1537 + }, + { + "epoch": 1.7351030782264898, + "grad_norm": 0.043508730828762054, + "learning_rate": 7.583342164937097e-05, + "loss": 1.0068, + "step": 1538 + }, + { + "epoch": 1.7362327026263769, + "grad_norm": 0.040654052048921585, + "learning_rate": 7.571818449144817e-05, + "loss": 1.0229, + "step": 1539 + }, + { + "epoch": 1.737362327026264, + "grad_norm": 0.04089539498090744, + "learning_rate": 7.560298158896114e-05, + "loss": 0.9388, + "step": 1540 + }, + { + "epoch": 1.7384919514261508, + "grad_norm": 0.04065493494272232, + "learning_rate": 7.548781310443172e-05, + "loss": 1.0679, + "step": 1541 + }, + { + "epoch": 1.7396215758260378, + "grad_norm": 0.041994400322437286, + "learning_rate": 7.537267920033325e-05, + "loss": 0.8963, + "step": 1542 + }, + { + "epoch": 1.7407512002259249, + "grad_norm": 0.04161880537867546, + "learning_rate": 7.525758003909019e-05, + "loss": 0.893, + "step": 1543 + }, + { + "epoch": 1.741880824625812, + "grad_norm": 0.04373447597026825, + "learning_rate": 7.514251578307805e-05, + "loss": 1.0542, + "step": 1544 + }, + { + "epoch": 1.7430104490256988, + "grad_norm": 0.04112675040960312, + "learning_rate": 7.502748659462311e-05, + "loss": 0.9247, + "step": 1545 + }, + { + "epoch": 1.7441400734255859, + "grad_norm": 0.04262956976890564, + "learning_rate": 7.491249263600217e-05, + "loss": 0.9339, + "step": 1546 + }, + { + "epoch": 1.745269697825473, + "grad_norm": 0.04103526473045349, + "learning_rate": 7.479753406944226e-05, + "loss": 0.9179, + "step": 1547 + }, + { + "epoch": 1.74639932222536, + "grad_norm": 0.040054481476545334, + "learning_rate": 7.468261105712058e-05, + "loss": 0.9594, + "step": 1548 + }, + { + "epoch": 1.747528946625247, + "grad_norm": 0.04210900515317917, + "learning_rate": 7.456772376116408e-05, + "loss": 0.996, + "step": 1549 + }, + { + "epoch": 1.7486585710251341, + "grad_norm": 0.040741559118032455, + "learning_rate": 7.445287234364946e-05, + "loss": 1.0583, + "step": 1550 + }, + { + "epoch": 1.7497881954250212, + "grad_norm": 0.04023589938879013, + "learning_rate": 7.433805696660266e-05, + "loss": 1.0283, + "step": 1551 + }, + { + "epoch": 1.7509178198249082, + "grad_norm": 0.04167277738451958, + "learning_rate": 7.42232777919988e-05, + "loss": 0.9618, + "step": 1552 + }, + { + "epoch": 1.7520474442247953, + "grad_norm": 0.0415283665060997, + "learning_rate": 7.410853498176202e-05, + "loss": 0.9279, + "step": 1553 + }, + { + "epoch": 1.7531770686246824, + "grad_norm": 0.04055981710553169, + "learning_rate": 7.399382869776508e-05, + "loss": 0.9431, + "step": 1554 + }, + { + "epoch": 1.7543066930245694, + "grad_norm": 0.04017677158117294, + "learning_rate": 7.38791591018292e-05, + "loss": 0.9168, + "step": 1555 + }, + { + "epoch": 1.7554363174244565, + "grad_norm": 0.04040129482746124, + "learning_rate": 7.376452635572395e-05, + "loss": 1.0269, + "step": 1556 + }, + { + "epoch": 1.7565659418243436, + "grad_norm": 0.04052029550075531, + "learning_rate": 7.364993062116674e-05, + "loss": 0.9945, + "step": 1557 + }, + { + "epoch": 1.7576955662242304, + "grad_norm": 0.04180474951863289, + "learning_rate": 7.353537205982294e-05, + "loss": 0.9983, + "step": 1558 + }, + { + "epoch": 1.7588251906241175, + "grad_norm": 0.04247574508190155, + "learning_rate": 7.342085083330537e-05, + "loss": 1.051, + "step": 1559 + }, + { + "epoch": 1.7599548150240045, + "grad_norm": 0.04135637730360031, + "learning_rate": 7.330636710317417e-05, + "loss": 0.9702, + "step": 1560 + }, + { + "epoch": 1.7610844394238916, + "grad_norm": 0.04159768298268318, + "learning_rate": 7.319192103093672e-05, + "loss": 1.0098, + "step": 1561 + }, + { + "epoch": 1.7622140638237784, + "grad_norm": 0.04054262861609459, + "learning_rate": 7.307751277804711e-05, + "loss": 0.9221, + "step": 1562 + }, + { + "epoch": 1.7633436882236655, + "grad_norm": 0.042798902839422226, + "learning_rate": 7.296314250590615e-05, + "loss": 0.9744, + "step": 1563 + }, + { + "epoch": 1.7644733126235526, + "grad_norm": 0.04235600307583809, + "learning_rate": 7.284881037586107e-05, + "loss": 0.9744, + "step": 1564 + }, + { + "epoch": 1.7656029370234396, + "grad_norm": 0.040483228862285614, + "learning_rate": 7.273451654920532e-05, + "loss": 0.9765, + "step": 1565 + }, + { + "epoch": 1.7667325614233267, + "grad_norm": 0.04099193215370178, + "learning_rate": 7.26202611871782e-05, + "loss": 1.0124, + "step": 1566 + }, + { + "epoch": 1.7678621858232137, + "grad_norm": 0.04022025689482689, + "learning_rate": 7.250604445096487e-05, + "loss": 0.9559, + "step": 1567 + }, + { + "epoch": 1.7689918102231008, + "grad_norm": 0.040389448404312134, + "learning_rate": 7.239186650169596e-05, + "loss": 0.9435, + "step": 1568 + }, + { + "epoch": 1.7701214346229879, + "grad_norm": 0.04212417080998421, + "learning_rate": 7.227772750044739e-05, + "loss": 0.9733, + "step": 1569 + }, + { + "epoch": 1.771251059022875, + "grad_norm": 0.03963673487305641, + "learning_rate": 7.21636276082401e-05, + "loss": 1.0231, + "step": 1570 + }, + { + "epoch": 1.772380683422762, + "grad_norm": 0.04181777685880661, + "learning_rate": 7.204956698603984e-05, + "loss": 0.9407, + "step": 1571 + }, + { + "epoch": 1.773510307822649, + "grad_norm": 0.04021856188774109, + "learning_rate": 7.193554579475705e-05, + "loss": 0.9671, + "step": 1572 + }, + { + "epoch": 1.7746399322225361, + "grad_norm": 0.044537365436553955, + "learning_rate": 7.182156419524649e-05, + "loss": 1.0034, + "step": 1573 + }, + { + "epoch": 1.7757695566224232, + "grad_norm": 0.04181935638189316, + "learning_rate": 7.170762234830699e-05, + "loss": 0.9827, + "step": 1574 + }, + { + "epoch": 1.77689918102231, + "grad_norm": 0.04236932843923569, + "learning_rate": 7.15937204146815e-05, + "loss": 0.9793, + "step": 1575 + }, + { + "epoch": 1.778028805422197, + "grad_norm": 0.041638486087322235, + "learning_rate": 7.147985855505643e-05, + "loss": 0.9354, + "step": 1576 + }, + { + "epoch": 1.7791584298220842, + "grad_norm": 0.04149574786424637, + "learning_rate": 7.136603693006179e-05, + "loss": 0.9866, + "step": 1577 + }, + { + "epoch": 1.7802880542219712, + "grad_norm": 0.04303409904241562, + "learning_rate": 7.125225570027083e-05, + "loss": 0.9862, + "step": 1578 + }, + { + "epoch": 1.781417678621858, + "grad_norm": 0.04238145053386688, + "learning_rate": 7.113851502619974e-05, + "loss": 0.9455, + "step": 1579 + }, + { + "epoch": 1.7825473030217451, + "grad_norm": 0.04182044416666031, + "learning_rate": 7.102481506830764e-05, + "loss": 0.9714, + "step": 1580 + }, + { + "epoch": 1.7836769274216322, + "grad_norm": 0.04323654994368553, + "learning_rate": 7.0911155986996e-05, + "loss": 1.0505, + "step": 1581 + }, + { + "epoch": 1.7848065518215193, + "grad_norm": 0.04246308654546738, + "learning_rate": 7.079753794260876e-05, + "loss": 0.9668, + "step": 1582 + }, + { + "epoch": 1.7859361762214063, + "grad_norm": 0.042159292846918106, + "learning_rate": 7.068396109543199e-05, + "loss": 1.0844, + "step": 1583 + }, + { + "epoch": 1.7870658006212934, + "grad_norm": 0.04247213155031204, + "learning_rate": 7.057042560569356e-05, + "loss": 1.0371, + "step": 1584 + }, + { + "epoch": 1.7881954250211805, + "grad_norm": 0.04189344495534897, + "learning_rate": 7.0456931633563e-05, + "loss": 1.0026, + "step": 1585 + }, + { + "epoch": 1.7893250494210675, + "grad_norm": 0.040689561516046524, + "learning_rate": 7.034347933915135e-05, + "loss": 0.9506, + "step": 1586 + }, + { + "epoch": 1.7904546738209546, + "grad_norm": 0.04134576767683029, + "learning_rate": 7.023006888251076e-05, + "loss": 0.96, + "step": 1587 + }, + { + "epoch": 1.7915842982208416, + "grad_norm": 0.03989570587873459, + "learning_rate": 7.011670042363445e-05, + "loss": 0.9603, + "step": 1588 + }, + { + "epoch": 1.7927139226207287, + "grad_norm": 0.04224357753992081, + "learning_rate": 7.000337412245632e-05, + "loss": 0.9133, + "step": 1589 + }, + { + "epoch": 1.7938435470206158, + "grad_norm": 0.04070102795958519, + "learning_rate": 6.989009013885077e-05, + "loss": 1.0841, + "step": 1590 + }, + { + "epoch": 1.7949731714205028, + "grad_norm": 0.04164205491542816, + "learning_rate": 6.977684863263261e-05, + "loss": 0.9859, + "step": 1591 + }, + { + "epoch": 1.7961027958203897, + "grad_norm": 0.040442369878292084, + "learning_rate": 6.966364976355664e-05, + "loss": 1.0549, + "step": 1592 + }, + { + "epoch": 1.7972324202202767, + "grad_norm": 0.04000772163271904, + "learning_rate": 6.955049369131749e-05, + "loss": 1.0889, + "step": 1593 + }, + { + "epoch": 1.7983620446201638, + "grad_norm": 0.0411187969148159, + "learning_rate": 6.943738057554957e-05, + "loss": 1.0049, + "step": 1594 + }, + { + "epoch": 1.7994916690200509, + "grad_norm": 0.041261740028858185, + "learning_rate": 6.932431057582647e-05, + "loss": 0.9334, + "step": 1595 + }, + { + "epoch": 1.8006212934199377, + "grad_norm": 0.041718561202287674, + "learning_rate": 6.921128385166109e-05, + "loss": 1.0152, + "step": 1596 + }, + { + "epoch": 1.8017509178198248, + "grad_norm": 0.042475439608097076, + "learning_rate": 6.909830056250527e-05, + "loss": 0.923, + "step": 1597 + }, + { + "epoch": 1.8028805422197118, + "grad_norm": 0.040802422910928726, + "learning_rate": 6.898536086774952e-05, + "loss": 0.9905, + "step": 1598 + }, + { + "epoch": 1.804010166619599, + "grad_norm": 0.04284251853823662, + "learning_rate": 6.887246492672296e-05, + "loss": 0.9836, + "step": 1599 + }, + { + "epoch": 1.805139791019486, + "grad_norm": 0.04458894580602646, + "learning_rate": 6.875961289869283e-05, + "loss": 0.9887, + "step": 1600 + }, + { + "epoch": 1.806269415419373, + "grad_norm": 0.04009930416941643, + "learning_rate": 6.864680494286451e-05, + "loss": 0.9395, + "step": 1601 + }, + { + "epoch": 1.80739903981926, + "grad_norm": 0.0409211665391922, + "learning_rate": 6.853404121838121e-05, + "loss": 0.954, + "step": 1602 + }, + { + "epoch": 1.8085286642191472, + "grad_norm": 0.04242323338985443, + "learning_rate": 6.842132188432376e-05, + "loss": 0.9966, + "step": 1603 + }, + { + "epoch": 1.8096582886190342, + "grad_norm": 0.04248444736003876, + "learning_rate": 6.830864709971027e-05, + "loss": 0.9439, + "step": 1604 + }, + { + "epoch": 1.8107879130189213, + "grad_norm": 0.041173093020915985, + "learning_rate": 6.819601702349609e-05, + "loss": 0.8787, + "step": 1605 + }, + { + "epoch": 1.8119175374188083, + "grad_norm": 0.042021602392196655, + "learning_rate": 6.808343181457346e-05, + "loss": 1.0753, + "step": 1606 + }, + { + "epoch": 1.8130471618186954, + "grad_norm": 0.04070396348834038, + "learning_rate": 6.797089163177137e-05, + "loss": 0.9971, + "step": 1607 + }, + { + "epoch": 1.8141767862185825, + "grad_norm": 0.04318329691886902, + "learning_rate": 6.785839663385526e-05, + "loss": 1.0268, + "step": 1608 + }, + { + "epoch": 1.8153064106184693, + "grad_norm": 0.04180993512272835, + "learning_rate": 6.774594697952672e-05, + "loss": 0.9524, + "step": 1609 + }, + { + "epoch": 1.8164360350183564, + "grad_norm": 0.042706914246082306, + "learning_rate": 6.763354282742363e-05, + "loss": 1.0231, + "step": 1610 + }, + { + "epoch": 1.8175656594182434, + "grad_norm": 0.03994883596897125, + "learning_rate": 6.752118433611939e-05, + "loss": 0.9749, + "step": 1611 + }, + { + "epoch": 1.8186952838181305, + "grad_norm": 0.042355459183454514, + "learning_rate": 6.740887166412315e-05, + "loss": 1.0237, + "step": 1612 + }, + { + "epoch": 1.8198249082180173, + "grad_norm": 0.04314768686890602, + "learning_rate": 6.729660496987944e-05, + "loss": 1.0333, + "step": 1613 + }, + { + "epoch": 1.8209545326179044, + "grad_norm": 0.043471548706293106, + "learning_rate": 6.718438441176781e-05, + "loss": 1.0369, + "step": 1614 + }, + { + "epoch": 1.8220841570177915, + "grad_norm": 0.040395159274339676, + "learning_rate": 6.707221014810279e-05, + "loss": 0.9598, + "step": 1615 + }, + { + "epoch": 1.8232137814176785, + "grad_norm": 0.04183076322078705, + "learning_rate": 6.696008233713362e-05, + "loss": 1.0045, + "step": 1616 + }, + { + "epoch": 1.8243434058175656, + "grad_norm": 0.04115899279713631, + "learning_rate": 6.684800113704397e-05, + "loss": 0.9235, + "step": 1617 + }, + { + "epoch": 1.8254730302174527, + "grad_norm": 0.04163803160190582, + "learning_rate": 6.673596670595181e-05, + "loss": 1.0222, + "step": 1618 + }, + { + "epoch": 1.8266026546173397, + "grad_norm": 0.042601440101861954, + "learning_rate": 6.662397920190904e-05, + "loss": 1.0039, + "step": 1619 + }, + { + "epoch": 1.8277322790172268, + "grad_norm": 0.04220454394817352, + "learning_rate": 6.651203878290139e-05, + "loss": 0.9655, + "step": 1620 + }, + { + "epoch": 1.8288619034171139, + "grad_norm": 0.04085693880915642, + "learning_rate": 6.640014560684824e-05, + "loss": 1.0043, + "step": 1621 + }, + { + "epoch": 1.829991527817001, + "grad_norm": 0.04273395612835884, + "learning_rate": 6.628829983160225e-05, + "loss": 0.9698, + "step": 1622 + }, + { + "epoch": 1.831121152216888, + "grad_norm": 0.04279292747378349, + "learning_rate": 6.617650161494915e-05, + "loss": 0.8682, + "step": 1623 + }, + { + "epoch": 1.832250776616775, + "grad_norm": 0.04293496534228325, + "learning_rate": 6.606475111460776e-05, + "loss": 1.0443, + "step": 1624 + }, + { + "epoch": 1.833380401016662, + "grad_norm": 0.042040567845106125, + "learning_rate": 6.59530484882294e-05, + "loss": 0.9817, + "step": 1625 + }, + { + "epoch": 1.834510025416549, + "grad_norm": 0.04191702604293823, + "learning_rate": 6.584139389339796e-05, + "loss": 1.0594, + "step": 1626 + }, + { + "epoch": 1.835639649816436, + "grad_norm": 0.0418466292321682, + "learning_rate": 6.572978748762954e-05, + "loss": 0.9879, + "step": 1627 + }, + { + "epoch": 1.836769274216323, + "grad_norm": 0.0425787977874279, + "learning_rate": 6.56182294283722e-05, + "loss": 0.9546, + "step": 1628 + }, + { + "epoch": 1.8378988986162101, + "grad_norm": 0.041348233819007874, + "learning_rate": 6.550671987300594e-05, + "loss": 0.964, + "step": 1629 + }, + { + "epoch": 1.839028523016097, + "grad_norm": 0.04128558188676834, + "learning_rate": 6.539525897884219e-05, + "loss": 0.9743, + "step": 1630 + }, + { + "epoch": 1.840158147415984, + "grad_norm": 0.042008284479379654, + "learning_rate": 6.528384690312375e-05, + "loss": 0.9439, + "step": 1631 + }, + { + "epoch": 1.841287771815871, + "grad_norm": 0.042078789323568344, + "learning_rate": 6.517248380302469e-05, + "loss": 0.9519, + "step": 1632 + }, + { + "epoch": 1.8424173962157582, + "grad_norm": 0.04227921739220619, + "learning_rate": 6.506116983564978e-05, + "loss": 0.992, + "step": 1633 + }, + { + "epoch": 1.8435470206156452, + "grad_norm": 0.04202863574028015, + "learning_rate": 6.49499051580346e-05, + "loss": 0.9948, + "step": 1634 + }, + { + "epoch": 1.8446766450155323, + "grad_norm": 0.04132537916302681, + "learning_rate": 6.48386899271452e-05, + "loss": 1.0068, + "step": 1635 + }, + { + "epoch": 1.8458062694154194, + "grad_norm": 0.04138810560107231, + "learning_rate": 6.472752429987782e-05, + "loss": 0.9869, + "step": 1636 + }, + { + "epoch": 1.8469358938153064, + "grad_norm": 0.04405840113759041, + "learning_rate": 6.461640843305878e-05, + "loss": 0.9432, + "step": 1637 + }, + { + "epoch": 1.8480655182151935, + "grad_norm": 0.04127390310168266, + "learning_rate": 6.450534248344417e-05, + "loss": 0.9554, + "step": 1638 + }, + { + "epoch": 1.8491951426150806, + "grad_norm": 0.04151296615600586, + "learning_rate": 6.439432660771962e-05, + "loss": 0.9998, + "step": 1639 + }, + { + "epoch": 1.8503247670149676, + "grad_norm": 0.04165178909897804, + "learning_rate": 6.428336096250019e-05, + "loss": 0.9282, + "step": 1640 + }, + { + "epoch": 1.8514543914148547, + "grad_norm": 0.04152395576238632, + "learning_rate": 6.417244570433005e-05, + "loss": 0.9607, + "step": 1641 + }, + { + "epoch": 1.8525840158147417, + "grad_norm": 0.041522059589624405, + "learning_rate": 6.406158098968221e-05, + "loss": 1.0155, + "step": 1642 + }, + { + "epoch": 1.8537136402146286, + "grad_norm": 0.041412562131881714, + "learning_rate": 6.395076697495854e-05, + "loss": 0.9229, + "step": 1643 + }, + { + "epoch": 1.8548432646145157, + "grad_norm": 0.042269017547369, + "learning_rate": 6.384000381648926e-05, + "loss": 0.9884, + "step": 1644 + }, + { + "epoch": 1.8559728890144027, + "grad_norm": 0.04206148535013199, + "learning_rate": 6.372929167053286e-05, + "loss": 0.9337, + "step": 1645 + }, + { + "epoch": 1.8571025134142898, + "grad_norm": 0.04192545637488365, + "learning_rate": 6.361863069327591e-05, + "loss": 1.0505, + "step": 1646 + }, + { + "epoch": 1.8582321378141766, + "grad_norm": 0.04114922136068344, + "learning_rate": 6.350802104083271e-05, + "loss": 0.9875, + "step": 1647 + }, + { + "epoch": 1.8593617622140637, + "grad_norm": 0.04190947860479355, + "learning_rate": 6.339746286924531e-05, + "loss": 0.9939, + "step": 1648 + }, + { + "epoch": 1.8604913866139507, + "grad_norm": 0.043601252138614655, + "learning_rate": 6.328695633448296e-05, + "loss": 1.061, + "step": 1649 + }, + { + "epoch": 1.8616210110138378, + "grad_norm": 0.042812932282686234, + "learning_rate": 6.317650159244212e-05, + "loss": 0.9783, + "step": 1650 + }, + { + "epoch": 1.8627506354137249, + "grad_norm": 0.04309513419866562, + "learning_rate": 6.306609879894627e-05, + "loss": 0.9543, + "step": 1651 + }, + { + "epoch": 1.863880259813612, + "grad_norm": 0.04289943352341652, + "learning_rate": 6.295574810974552e-05, + "loss": 0.9788, + "step": 1652 + }, + { + "epoch": 1.865009884213499, + "grad_norm": 0.04318351671099663, + "learning_rate": 6.284544968051643e-05, + "loss": 0.9669, + "step": 1653 + }, + { + "epoch": 1.866139508613386, + "grad_norm": 0.04084068536758423, + "learning_rate": 6.273520366686195e-05, + "loss": 0.905, + "step": 1654 + }, + { + "epoch": 1.8672691330132731, + "grad_norm": 0.04186054319143295, + "learning_rate": 6.2625010224311e-05, + "loss": 0.9222, + "step": 1655 + }, + { + "epoch": 1.8683987574131602, + "grad_norm": 0.04201950505375862, + "learning_rate": 6.251486950831838e-05, + "loss": 0.9293, + "step": 1656 + }, + { + "epoch": 1.8695283818130473, + "grad_norm": 0.043176308274269104, + "learning_rate": 6.240478167426451e-05, + "loss": 1.0731, + "step": 1657 + }, + { + "epoch": 1.8706580062129343, + "grad_norm": 0.042063016444444656, + "learning_rate": 6.229474687745513e-05, + "loss": 1.0344, + "step": 1658 + }, + { + "epoch": 1.8717876306128214, + "grad_norm": 0.041596777737140656, + "learning_rate": 6.218476527312127e-05, + "loss": 0.9725, + "step": 1659 + }, + { + "epoch": 1.8729172550127082, + "grad_norm": 0.04136804863810539, + "learning_rate": 6.207483701641888e-05, + "loss": 0.9089, + "step": 1660 + }, + { + "epoch": 1.8740468794125953, + "grad_norm": 0.04120111092925072, + "learning_rate": 6.196496226242852e-05, + "loss": 1.0282, + "step": 1661 + }, + { + "epoch": 1.8751765038124824, + "grad_norm": 0.04079049080610275, + "learning_rate": 6.185514116615553e-05, + "loss": 0.9121, + "step": 1662 + }, + { + "epoch": 1.8763061282123694, + "grad_norm": 0.041485901921987534, + "learning_rate": 6.174537388252932e-05, + "loss": 1.017, + "step": 1663 + }, + { + "epoch": 1.8774357526122563, + "grad_norm": 0.04155506566166878, + "learning_rate": 6.163566056640349e-05, + "loss": 1.0028, + "step": 1664 + }, + { + "epoch": 1.8785653770121433, + "grad_norm": 0.041246477514505386, + "learning_rate": 6.15260013725555e-05, + "loss": 1.0494, + "step": 1665 + }, + { + "epoch": 1.8796950014120304, + "grad_norm": 0.04152505099773407, + "learning_rate": 6.141639645568646e-05, + "loss": 0.9382, + "step": 1666 + }, + { + "epoch": 1.8808246258119174, + "grad_norm": 0.03967185318470001, + "learning_rate": 6.130684597042088e-05, + "loss": 0.9703, + "step": 1667 + }, + { + "epoch": 1.8819542502118045, + "grad_norm": 0.04061749577522278, + "learning_rate": 6.119735007130649e-05, + "loss": 0.9481, + "step": 1668 + }, + { + "epoch": 1.8830838746116916, + "grad_norm": 0.04188203811645508, + "learning_rate": 6.1087908912814e-05, + "loss": 0.9199, + "step": 1669 + }, + { + "epoch": 1.8842134990115786, + "grad_norm": 0.041013531386852264, + "learning_rate": 6.097852264933697e-05, + "loss": 0.8728, + "step": 1670 + }, + { + "epoch": 1.8853431234114657, + "grad_norm": 0.04213571920990944, + "learning_rate": 6.086919143519143e-05, + "loss": 0.9471, + "step": 1671 + }, + { + "epoch": 1.8864727478113528, + "grad_norm": 0.04296332970261574, + "learning_rate": 6.075991542461574e-05, + "loss": 0.9888, + "step": 1672 + }, + { + "epoch": 1.8876023722112398, + "grad_norm": 0.040672652423381805, + "learning_rate": 6.065069477177048e-05, + "loss": 0.9785, + "step": 1673 + }, + { + "epoch": 1.888731996611127, + "grad_norm": 0.041439056396484375, + "learning_rate": 6.054152963073809e-05, + "loss": 0.9904, + "step": 1674 + }, + { + "epoch": 1.889861621011014, + "grad_norm": 0.04170332849025726, + "learning_rate": 6.043242015552258e-05, + "loss": 0.9502, + "step": 1675 + }, + { + "epoch": 1.890991245410901, + "grad_norm": 0.043845389038324356, + "learning_rate": 6.032336650004966e-05, + "loss": 0.9832, + "step": 1676 + }, + { + "epoch": 1.8921208698107879, + "grad_norm": 0.042602553963661194, + "learning_rate": 6.021436881816608e-05, + "loss": 1.0506, + "step": 1677 + }, + { + "epoch": 1.893250494210675, + "grad_norm": 0.04084152728319168, + "learning_rate": 6.010542726363976e-05, + "loss": 0.9826, + "step": 1678 + }, + { + "epoch": 1.894380118610562, + "grad_norm": 0.04469555988907814, + "learning_rate": 5.9996541990159383e-05, + "loss": 1.0022, + "step": 1679 + }, + { + "epoch": 1.895509743010449, + "grad_norm": 0.04405542463064194, + "learning_rate": 5.988771315133418e-05, + "loss": 0.9563, + "step": 1680 + }, + { + "epoch": 1.896639367410336, + "grad_norm": 0.041179101914167404, + "learning_rate": 5.9778940900693935e-05, + "loss": 0.971, + "step": 1681 + }, + { + "epoch": 1.897768991810223, + "grad_norm": 0.04140183702111244, + "learning_rate": 5.967022539168843e-05, + "loss": 0.8642, + "step": 1682 + }, + { + "epoch": 1.89889861621011, + "grad_norm": 0.04198272526264191, + "learning_rate": 5.9561566777687427e-05, + "loss": 0.9716, + "step": 1683 + }, + { + "epoch": 1.900028240609997, + "grad_norm": 0.042881209403276443, + "learning_rate": 5.945296521198054e-05, + "loss": 0.9638, + "step": 1684 + }, + { + "epoch": 1.9011578650098842, + "grad_norm": 0.042486660182476044, + "learning_rate": 5.934442084777676e-05, + "loss": 0.9335, + "step": 1685 + }, + { + "epoch": 1.9022874894097712, + "grad_norm": 0.04221144691109657, + "learning_rate": 5.9235933838204516e-05, + "loss": 0.9281, + "step": 1686 + }, + { + "epoch": 1.9034171138096583, + "grad_norm": 0.04329012706875801, + "learning_rate": 5.9127504336311176e-05, + "loss": 1.0478, + "step": 1687 + }, + { + "epoch": 1.9045467382095453, + "grad_norm": 0.04217381402850151, + "learning_rate": 5.9019132495063056e-05, + "loss": 1.0038, + "step": 1688 + }, + { + "epoch": 1.9056763626094324, + "grad_norm": 0.042386867105960846, + "learning_rate": 5.8910818467345185e-05, + "loss": 1.0279, + "step": 1689 + }, + { + "epoch": 1.9068059870093195, + "grad_norm": 0.04081982001662254, + "learning_rate": 5.880256240596096e-05, + "loss": 1.0205, + "step": 1690 + }, + { + "epoch": 1.9079356114092065, + "grad_norm": 0.043334148824214935, + "learning_rate": 5.869436446363195e-05, + "loss": 0.9938, + "step": 1691 + }, + { + "epoch": 1.9090652358090936, + "grad_norm": 0.04122162610292435, + "learning_rate": 5.858622479299787e-05, + "loss": 0.9001, + "step": 1692 + }, + { + "epoch": 1.9101948602089807, + "grad_norm": 0.041972994804382324, + "learning_rate": 5.847814354661616e-05, + "loss": 0.9105, + "step": 1693 + }, + { + "epoch": 1.9113244846088675, + "grad_norm": 0.04239688068628311, + "learning_rate": 5.8370120876961745e-05, + "loss": 0.9657, + "step": 1694 + }, + { + "epoch": 1.9124541090087546, + "grad_norm": 0.043491121381521225, + "learning_rate": 5.8262156936427096e-05, + "loss": 1.0711, + "step": 1695 + }, + { + "epoch": 1.9135837334086416, + "grad_norm": 0.04262327775359154, + "learning_rate": 5.8154251877321706e-05, + "loss": 0.8719, + "step": 1696 + }, + { + "epoch": 1.9147133578085287, + "grad_norm": 0.04023859649896622, + "learning_rate": 5.804640585187207e-05, + "loss": 1.0929, + "step": 1697 + }, + { + "epoch": 1.9158429822084158, + "grad_norm": 0.04104539379477501, + "learning_rate": 5.793861901222131e-05, + "loss": 0.8602, + "step": 1698 + }, + { + "epoch": 1.9169726066083026, + "grad_norm": 0.04198300093412399, + "learning_rate": 5.783089151042914e-05, + "loss": 0.9534, + "step": 1699 + }, + { + "epoch": 1.9181022310081897, + "grad_norm": 0.04305735230445862, + "learning_rate": 5.772322349847154e-05, + "loss": 0.948, + "step": 1700 + }, + { + "epoch": 1.9192318554080767, + "grad_norm": 0.04169195145368576, + "learning_rate": 5.761561512824053e-05, + "loss": 0.9201, + "step": 1701 + }, + { + "epoch": 1.9203614798079638, + "grad_norm": 0.041344236582517624, + "learning_rate": 5.750806655154399e-05, + "loss": 0.996, + "step": 1702 + }, + { + "epoch": 1.9214911042078509, + "grad_norm": 0.04161575064063072, + "learning_rate": 5.740057792010562e-05, + "loss": 0.98, + "step": 1703 + }, + { + "epoch": 1.922620728607738, + "grad_norm": 0.04355592653155327, + "learning_rate": 5.72931493855642e-05, + "loss": 0.9855, + "step": 1704 + }, + { + "epoch": 1.923750353007625, + "grad_norm": 0.043292030692100525, + "learning_rate": 5.71857810994741e-05, + "loss": 1.0197, + "step": 1705 + }, + { + "epoch": 1.924879977407512, + "grad_norm": 0.04138379916548729, + "learning_rate": 5.70784732133045e-05, + "loss": 0.9655, + "step": 1706 + }, + { + "epoch": 1.926009601807399, + "grad_norm": 0.04593029245734215, + "learning_rate": 5.6971225878439285e-05, + "loss": 1.0195, + "step": 1707 + }, + { + "epoch": 1.9271392262072862, + "grad_norm": 0.042191557586193085, + "learning_rate": 5.686403924617718e-05, + "loss": 1.0543, + "step": 1708 + }, + { + "epoch": 1.9282688506071732, + "grad_norm": 0.041936252266168594, + "learning_rate": 5.6756913467731066e-05, + "loss": 0.9987, + "step": 1709 + }, + { + "epoch": 1.9293984750070603, + "grad_norm": 0.042339473962783813, + "learning_rate": 5.6649848694228026e-05, + "loss": 0.8652, + "step": 1710 + }, + { + "epoch": 1.9305280994069471, + "grad_norm": 0.04215146601200104, + "learning_rate": 5.6542845076709126e-05, + "loss": 0.9681, + "step": 1711 + }, + { + "epoch": 1.9316577238068342, + "grad_norm": 0.042300038039684296, + "learning_rate": 5.643590276612909e-05, + "loss": 0.9267, + "step": 1712 + }, + { + "epoch": 1.9327873482067213, + "grad_norm": 0.04266273230314255, + "learning_rate": 5.6329021913356216e-05, + "loss": 1.0148, + "step": 1713 + }, + { + "epoch": 1.9339169726066083, + "grad_norm": 0.04163605719804764, + "learning_rate": 5.6222202669172045e-05, + "loss": 1.1034, + "step": 1714 + }, + { + "epoch": 1.9350465970064954, + "grad_norm": 0.044020287692546844, + "learning_rate": 5.611544518427121e-05, + "loss": 1.0142, + "step": 1715 + }, + { + "epoch": 1.9361762214063822, + "grad_norm": 0.041142720729112625, + "learning_rate": 5.6008749609261304e-05, + "loss": 1.0388, + "step": 1716 + }, + { + "epoch": 1.9373058458062693, + "grad_norm": 0.04347783327102661, + "learning_rate": 5.59021160946625e-05, + "loss": 1.0435, + "step": 1717 + }, + { + "epoch": 1.9384354702061564, + "grad_norm": 0.04174618050456047, + "learning_rate": 5.579554479090735e-05, + "loss": 0.9933, + "step": 1718 + }, + { + "epoch": 1.9395650946060434, + "grad_norm": 0.042470984160900116, + "learning_rate": 5.568903584834082e-05, + "loss": 1.0287, + "step": 1719 + }, + { + "epoch": 1.9406947190059305, + "grad_norm": 0.04115034639835358, + "learning_rate": 5.558258941721982e-05, + "loss": 0.9838, + "step": 1720 + }, + { + "epoch": 1.9418243434058176, + "grad_norm": 0.04163762181997299, + "learning_rate": 5.547620564771293e-05, + "loss": 0.9575, + "step": 1721 + }, + { + "epoch": 1.9429539678057046, + "grad_norm": 0.04199086129665375, + "learning_rate": 5.5369884689900586e-05, + "loss": 1.0274, + "step": 1722 + }, + { + "epoch": 1.9440835922055917, + "grad_norm": 0.04397021606564522, + "learning_rate": 5.526362669377444e-05, + "loss": 1.0434, + "step": 1723 + }, + { + "epoch": 1.9452132166054787, + "grad_norm": 0.0407724566757679, + "learning_rate": 5.515743180923737e-05, + "loss": 1.1053, + "step": 1724 + }, + { + "epoch": 1.9463428410053658, + "grad_norm": 0.040555331856012344, + "learning_rate": 5.5051300186103214e-05, + "loss": 0.9554, + "step": 1725 + }, + { + "epoch": 1.9474724654052529, + "grad_norm": 0.043255507946014404, + "learning_rate": 5.494523197409653e-05, + "loss": 0.9669, + "step": 1726 + }, + { + "epoch": 1.94860208980514, + "grad_norm": 0.04308564215898514, + "learning_rate": 5.483922732285258e-05, + "loss": 0.9083, + "step": 1727 + }, + { + "epoch": 1.9497317142050268, + "grad_norm": 0.042100612074136734, + "learning_rate": 5.4733286381916715e-05, + "loss": 0.9747, + "step": 1728 + }, + { + "epoch": 1.9508613386049138, + "grad_norm": 0.042345307767391205, + "learning_rate": 5.4627409300744526e-05, + "loss": 0.9809, + "step": 1729 + }, + { + "epoch": 1.951990963004801, + "grad_norm": 0.041533615440130234, + "learning_rate": 5.452159622870158e-05, + "loss": 0.9315, + "step": 1730 + }, + { + "epoch": 1.953120587404688, + "grad_norm": 0.041634928435087204, + "learning_rate": 5.4415847315063083e-05, + "loss": 0.9254, + "step": 1731 + }, + { + "epoch": 1.954250211804575, + "grad_norm": 0.04313870519399643, + "learning_rate": 5.431016270901362e-05, + "loss": 0.926, + "step": 1732 + }, + { + "epoch": 1.9553798362044619, + "grad_norm": 0.04185828939080238, + "learning_rate": 5.4204542559647266e-05, + "loss": 0.8805, + "step": 1733 + }, + { + "epoch": 1.956509460604349, + "grad_norm": 0.04114462062716484, + "learning_rate": 5.409898701596703e-05, + "loss": 1.0163, + "step": 1734 + }, + { + "epoch": 1.957639085004236, + "grad_norm": 0.04308726638555527, + "learning_rate": 5.399349622688479e-05, + "loss": 1.0412, + "step": 1735 + }, + { + "epoch": 1.958768709404123, + "grad_norm": 0.04099753499031067, + "learning_rate": 5.388807034122111e-05, + "loss": 0.8463, + "step": 1736 + }, + { + "epoch": 1.9598983338040101, + "grad_norm": 0.04242704063653946, + "learning_rate": 5.378270950770494e-05, + "loss": 0.9864, + "step": 1737 + }, + { + "epoch": 1.9610279582038972, + "grad_norm": 0.04200004041194916, + "learning_rate": 5.367741387497351e-05, + "loss": 0.9477, + "step": 1738 + }, + { + "epoch": 1.9621575826037843, + "grad_norm": 0.04250415042042732, + "learning_rate": 5.3572183591572054e-05, + "loss": 0.9758, + "step": 1739 + }, + { + "epoch": 1.9632872070036713, + "grad_norm": 0.04299885779619217, + "learning_rate": 5.346701880595354e-05, + "loss": 1.0595, + "step": 1740 + }, + { + "epoch": 1.9644168314035584, + "grad_norm": 0.04244587942957878, + "learning_rate": 5.336191966647874e-05, + "loss": 1.0024, + "step": 1741 + }, + { + "epoch": 1.9655464558034454, + "grad_norm": 0.04144721105694771, + "learning_rate": 5.325688632141555e-05, + "loss": 0.986, + "step": 1742 + }, + { + "epoch": 1.9666760802033325, + "grad_norm": 0.045342057943344116, + "learning_rate": 5.315191891893919e-05, + "loss": 1.0682, + "step": 1743 + }, + { + "epoch": 1.9678057046032196, + "grad_norm": 0.043371427804231644, + "learning_rate": 5.3047017607131955e-05, + "loss": 0.9772, + "step": 1744 + }, + { + "epoch": 1.9689353290031064, + "grad_norm": 0.04567921906709671, + "learning_rate": 5.29421825339826e-05, + "loss": 0.9572, + "step": 1745 + }, + { + "epoch": 1.9700649534029935, + "grad_norm": 0.040609635412693024, + "learning_rate": 5.2837413847386776e-05, + "loss": 0.9193, + "step": 1746 + }, + { + "epoch": 1.9711945778028805, + "grad_norm": 0.04302290827035904, + "learning_rate": 5.2732711695146266e-05, + "loss": 0.9864, + "step": 1747 + }, + { + "epoch": 1.9723242022027676, + "grad_norm": 0.04100389406085014, + "learning_rate": 5.2628076224969036e-05, + "loss": 0.9455, + "step": 1748 + }, + { + "epoch": 1.9734538266026547, + "grad_norm": 0.04347492754459381, + "learning_rate": 5.2523507584469e-05, + "loss": 0.8589, + "step": 1749 + }, + { + "epoch": 1.9745834510025415, + "grad_norm": 0.041986506432294846, + "learning_rate": 5.24190059211658e-05, + "loss": 0.9477, + "step": 1750 + }, + { + "epoch": 1.9757130754024286, + "grad_norm": 0.041839152574539185, + "learning_rate": 5.231457138248457e-05, + "loss": 1.0711, + "step": 1751 + }, + { + "epoch": 1.9768426998023156, + "grad_norm": 0.04245884716510773, + "learning_rate": 5.221020411575572e-05, + "loss": 1.0331, + "step": 1752 + }, + { + "epoch": 1.9779723242022027, + "grad_norm": 0.04223821312189102, + "learning_rate": 5.210590426821479e-05, + "loss": 1.0202, + "step": 1753 + }, + { + "epoch": 1.9791019486020898, + "grad_norm": 0.0426202192902565, + "learning_rate": 5.200167198700228e-05, + "loss": 0.9441, + "step": 1754 + }, + { + "epoch": 1.9802315730019768, + "grad_norm": 0.04572174325585365, + "learning_rate": 5.189750741916326e-05, + "loss": 0.9966, + "step": 1755 + }, + { + "epoch": 1.981361197401864, + "grad_norm": 0.04126838594675064, + "learning_rate": 5.179341071164725e-05, + "loss": 1.0346, + "step": 1756 + }, + { + "epoch": 1.982490821801751, + "grad_norm": 0.041993822902441025, + "learning_rate": 5.16893820113082e-05, + "loss": 0.8974, + "step": 1757 + }, + { + "epoch": 1.983620446201638, + "grad_norm": 0.042118530720472336, + "learning_rate": 5.1585421464903994e-05, + "loss": 1.0124, + "step": 1758 + }, + { + "epoch": 1.984750070601525, + "grad_norm": 0.04224333539605141, + "learning_rate": 5.1481529219096304e-05, + "loss": 1.0564, + "step": 1759 + }, + { + "epoch": 1.9858796950014121, + "grad_norm": 0.04151271656155586, + "learning_rate": 5.137770542045063e-05, + "loss": 0.9412, + "step": 1760 + }, + { + "epoch": 1.9870093194012992, + "grad_norm": 0.04101775959134102, + "learning_rate": 5.12739502154358e-05, + "loss": 0.9164, + "step": 1761 + }, + { + "epoch": 1.988138943801186, + "grad_norm": 0.04164358228445053, + "learning_rate": 5.117026375042387e-05, + "loss": 0.9745, + "step": 1762 + }, + { + "epoch": 1.9892685682010731, + "grad_norm": 0.04084084928035736, + "learning_rate": 5.106664617168997e-05, + "loss": 0.9126, + "step": 1763 + }, + { + "epoch": 1.9903981926009602, + "grad_norm": 0.042791228741407394, + "learning_rate": 5.096309762541196e-05, + "loss": 0.9558, + "step": 1764 + }, + { + "epoch": 1.9915278170008472, + "grad_norm": 0.042839985340833664, + "learning_rate": 5.085961825767049e-05, + "loss": 0.9901, + "step": 1765 + }, + { + "epoch": 1.9926574414007343, + "grad_norm": 0.043768156319856644, + "learning_rate": 5.075620821444839e-05, + "loss": 0.9564, + "step": 1766 + }, + { + "epoch": 1.9937870658006211, + "grad_norm": 0.042571209371089935, + "learning_rate": 5.06528676416308e-05, + "loss": 0.9687, + "step": 1767 + }, + { + "epoch": 1.9949166902005082, + "grad_norm": 0.04314889758825302, + "learning_rate": 5.054959668500494e-05, + "loss": 1.1016, + "step": 1768 + }, + { + "epoch": 1.9960463146003953, + "grad_norm": 0.04145493358373642, + "learning_rate": 5.0446395490259734e-05, + "loss": 0.8943, + "step": 1769 + }, + { + "epoch": 1.9971759390002823, + "grad_norm": 0.04249756410717964, + "learning_rate": 5.0343264202985575e-05, + "loss": 0.9249, + "step": 1770 + }, + { + "epoch": 1.9971759390002823, + "eval_loss": 0.98674476146698, + "eval_runtime": 550.5161, + "eval_samples_per_second": 17.771, + "eval_steps_per_second": 8.886, + "step": 1770 + }, + { + "epoch": 1.9983055634001694, + "grad_norm": 0.04155099019408226, + "learning_rate": 5.024020296867447e-05, + "loss": 0.9206, + "step": 1771 + }, + { + "epoch": 1.9994351878000565, + "grad_norm": 0.04307400807738304, + "learning_rate": 5.013721193271943e-05, + "loss": 0.9068, + "step": 1772 + }, + { + "epoch": 2.0005648121999435, + "grad_norm": 0.04108371585607529, + "learning_rate": 5.003429124041448e-05, + "loss": 1.0198, + "step": 1773 + }, + { + "epoch": 2.0016944365998306, + "grad_norm": 0.04241110011935234, + "learning_rate": 4.993144103695444e-05, + "loss": 1.0401, + "step": 1774 + }, + { + "epoch": 2.0005649717514125, + "grad_norm": 0.04497067630290985, + "learning_rate": 4.9828661467434644e-05, + "loss": 0.8991, + "step": 1775 + }, + { + "epoch": 2.0016949152542374, + "grad_norm": 0.04190199822187424, + "learning_rate": 4.9725952676850794e-05, + "loss": 1.0002, + "step": 1776 + }, + { + "epoch": 2.0028248587570623, + "grad_norm": 0.04270040616393089, + "learning_rate": 4.9623314810098755e-05, + "loss": 0.9177, + "step": 1777 + }, + { + "epoch": 2.003954802259887, + "grad_norm": 0.042360976338386536, + "learning_rate": 4.952074801197426e-05, + "loss": 0.9497, + "step": 1778 + }, + { + "epoch": 2.0050847457627117, + "grad_norm": 0.04216707870364189, + "learning_rate": 4.9418252427172996e-05, + "loss": 0.9471, + "step": 1779 + }, + { + "epoch": 2.0062146892655366, + "grad_norm": 0.04215894639492035, + "learning_rate": 4.93158282002899e-05, + "loss": 0.956, + "step": 1780 + }, + { + "epoch": 2.0073446327683615, + "grad_norm": 0.041472285985946655, + "learning_rate": 4.921347547581939e-05, + "loss": 1.0184, + "step": 1781 + }, + { + "epoch": 2.0084745762711864, + "grad_norm": 0.041694026440382004, + "learning_rate": 4.911119439815508e-05, + "loss": 0.9715, + "step": 1782 + }, + { + "epoch": 2.0096045197740113, + "grad_norm": 0.041800353676080704, + "learning_rate": 4.900898511158938e-05, + "loss": 0.9482, + "step": 1783 + }, + { + "epoch": 2.0107344632768362, + "grad_norm": 0.04290417581796646, + "learning_rate": 4.890684776031347e-05, + "loss": 0.9799, + "step": 1784 + }, + { + "epoch": 2.011864406779661, + "grad_norm": 0.04424067586660385, + "learning_rate": 4.8804782488417054e-05, + "loss": 0.9554, + "step": 1785 + }, + { + "epoch": 2.012994350282486, + "grad_norm": 0.04423059523105621, + "learning_rate": 4.870278943988815e-05, + "loss": 0.9836, + "step": 1786 + }, + { + "epoch": 2.0141242937853105, + "grad_norm": 0.041760075837373734, + "learning_rate": 4.860086875861288e-05, + "loss": 0.9582, + "step": 1787 + }, + { + "epoch": 2.0152542372881355, + "grad_norm": 0.04206801578402519, + "learning_rate": 4.8499020588375274e-05, + "loss": 1.0098, + "step": 1788 + }, + { + "epoch": 2.0163841807909604, + "grad_norm": 0.041129548102617264, + "learning_rate": 4.8397245072857066e-05, + "loss": 0.999, + "step": 1789 + }, + { + "epoch": 2.0175141242937853, + "grad_norm": 0.042199425399303436, + "learning_rate": 4.82955423556375e-05, + "loss": 1.0089, + "step": 1790 + }, + { + "epoch": 2.01864406779661, + "grad_norm": 0.04368184879422188, + "learning_rate": 4.8193912580193126e-05, + "loss": 0.9612, + "step": 1791 + }, + { + "epoch": 2.019774011299435, + "grad_norm": 0.04354587942361832, + "learning_rate": 4.8092355889897535e-05, + "loss": 0.9033, + "step": 1792 + }, + { + "epoch": 2.02090395480226, + "grad_norm": 0.04332384094595909, + "learning_rate": 4.799087242802137e-05, + "loss": 0.965, + "step": 1793 + }, + { + "epoch": 2.022033898305085, + "grad_norm": 0.04101352021098137, + "learning_rate": 4.788946233773171e-05, + "loss": 0.9246, + "step": 1794 + }, + { + "epoch": 2.0231638418079094, + "grad_norm": 0.0444595068693161, + "learning_rate": 4.778812576209241e-05, + "loss": 0.9641, + "step": 1795 + }, + { + "epoch": 2.0242937853107343, + "grad_norm": 0.04191228747367859, + "learning_rate": 4.768686284406341e-05, + "loss": 0.9286, + "step": 1796 + }, + { + "epoch": 2.0254237288135593, + "grad_norm": 0.042017534375190735, + "learning_rate": 4.758567372650081e-05, + "loss": 1.0086, + "step": 1797 + }, + { + "epoch": 2.026553672316384, + "grad_norm": 0.0429287888109684, + "learning_rate": 4.748455855215661e-05, + "loss": 0.9385, + "step": 1798 + }, + { + "epoch": 2.027683615819209, + "grad_norm": 0.04178991913795471, + "learning_rate": 4.738351746367847e-05, + "loss": 0.8944, + "step": 1799 + }, + { + "epoch": 2.028813559322034, + "grad_norm": 0.04435624182224274, + "learning_rate": 4.728255060360955e-05, + "loss": 0.9517, + "step": 1800 + }, + { + "epoch": 2.029943502824859, + "grad_norm": 0.04260590299963951, + "learning_rate": 4.718165811438827e-05, + "loss": 0.868, + "step": 1801 + }, + { + "epoch": 2.031073446327684, + "grad_norm": 0.04216260462999344, + "learning_rate": 4.708084013834813e-05, + "loss": 1.0083, + "step": 1802 + }, + { + "epoch": 2.0322033898305083, + "grad_norm": 0.04152260348200798, + "learning_rate": 4.69800968177176e-05, + "loss": 1.0892, + "step": 1803 + }, + { + "epoch": 2.033333333333333, + "grad_norm": 0.0426262728869915, + "learning_rate": 4.687942829461969e-05, + "loss": 0.939, + "step": 1804 + }, + { + "epoch": 2.034463276836158, + "grad_norm": 0.042451802641153336, + "learning_rate": 4.677883471107193e-05, + "loss": 0.9649, + "step": 1805 + }, + { + "epoch": 2.035593220338983, + "grad_norm": 0.04238414391875267, + "learning_rate": 4.667831620898624e-05, + "loss": 1.0071, + "step": 1806 + }, + { + "epoch": 2.036723163841808, + "grad_norm": 0.04201498627662659, + "learning_rate": 4.657787293016854e-05, + "loss": 1.0189, + "step": 1807 + }, + { + "epoch": 2.037853107344633, + "grad_norm": 0.042049601674079895, + "learning_rate": 4.64775050163185e-05, + "loss": 0.9891, + "step": 1808 + }, + { + "epoch": 2.038983050847458, + "grad_norm": 0.04302692413330078, + "learning_rate": 4.6377212609029727e-05, + "loss": 0.928, + "step": 1809 + }, + { + "epoch": 2.0401129943502827, + "grad_norm": 0.04353903979063034, + "learning_rate": 4.6276995849789115e-05, + "loss": 0.9651, + "step": 1810 + }, + { + "epoch": 2.041242937853107, + "grad_norm": 0.04473048448562622, + "learning_rate": 4.617685487997693e-05, + "loss": 1.036, + "step": 1811 + }, + { + "epoch": 2.042372881355932, + "grad_norm": 0.043298009783029556, + "learning_rate": 4.607678984086644e-05, + "loss": 0.9935, + "step": 1812 + }, + { + "epoch": 2.043502824858757, + "grad_norm": 0.04168165847659111, + "learning_rate": 4.597680087362388e-05, + "loss": 0.9986, + "step": 1813 + }, + { + "epoch": 2.044632768361582, + "grad_norm": 0.04360107332468033, + "learning_rate": 4.5876888119308116e-05, + "loss": 1.041, + "step": 1814 + }, + { + "epoch": 2.045762711864407, + "grad_norm": 0.0433119460940361, + "learning_rate": 4.57770517188705e-05, + "loss": 0.9559, + "step": 1815 + }, + { + "epoch": 2.0468926553672318, + "grad_norm": 0.04289395362138748, + "learning_rate": 4.567729181315466e-05, + "loss": 0.9049, + "step": 1816 + }, + { + "epoch": 2.0480225988700567, + "grad_norm": 0.04195884242653847, + "learning_rate": 4.5577608542896414e-05, + "loss": 0.9174, + "step": 1817 + }, + { + "epoch": 2.049152542372881, + "grad_norm": 0.045005012303590775, + "learning_rate": 4.547800204872328e-05, + "loss": 1.0006, + "step": 1818 + }, + { + "epoch": 2.050282485875706, + "grad_norm": 0.042177747935056686, + "learning_rate": 4.537847247115455e-05, + "loss": 0.8667, + "step": 1819 + }, + { + "epoch": 2.051412429378531, + "grad_norm": 0.04145647957921028, + "learning_rate": 4.527901995060113e-05, + "loss": 0.9482, + "step": 1820 + }, + { + "epoch": 2.052542372881356, + "grad_norm": 0.04456596449017525, + "learning_rate": 4.517964462736504e-05, + "loss": 1.0383, + "step": 1821 + }, + { + "epoch": 2.053672316384181, + "grad_norm": 0.04184206575155258, + "learning_rate": 4.5080346641639474e-05, + "loss": 0.8904, + "step": 1822 + }, + { + "epoch": 2.0548022598870057, + "grad_norm": 0.04329001158475876, + "learning_rate": 4.49811261335085e-05, + "loss": 1.0087, + "step": 1823 + }, + { + "epoch": 2.0559322033898306, + "grad_norm": 0.043258413672447205, + "learning_rate": 4.488198324294691e-05, + "loss": 0.9363, + "step": 1824 + }, + { + "epoch": 2.0570621468926555, + "grad_norm": 0.04341473430395126, + "learning_rate": 4.478291810981998e-05, + "loss": 0.9583, + "step": 1825 + }, + { + "epoch": 2.05819209039548, + "grad_norm": 0.042815063148736954, + "learning_rate": 4.46839308738833e-05, + "loss": 0.9449, + "step": 1826 + }, + { + "epoch": 2.059322033898305, + "grad_norm": 0.04257015511393547, + "learning_rate": 4.4585021674782534e-05, + "loss": 0.961, + "step": 1827 + }, + { + "epoch": 2.06045197740113, + "grad_norm": 0.043428368866443634, + "learning_rate": 4.4486190652053315e-05, + "loss": 0.9658, + "step": 1828 + }, + { + "epoch": 2.0615819209039548, + "grad_norm": 0.042757805436849594, + "learning_rate": 4.438743794512096e-05, + "loss": 1.0266, + "step": 1829 + }, + { + "epoch": 2.0627118644067797, + "grad_norm": 0.0427665188908577, + "learning_rate": 4.428876369330023e-05, + "loss": 1.0276, + "step": 1830 + }, + { + "epoch": 2.0638418079096046, + "grad_norm": 0.04279656335711479, + "learning_rate": 4.41901680357954e-05, + "loss": 0.8914, + "step": 1831 + }, + { + "epoch": 2.0649717514124295, + "grad_norm": 0.04068029671907425, + "learning_rate": 4.4091651111699606e-05, + "loss": 0.9335, + "step": 1832 + }, + { + "epoch": 2.0661016949152544, + "grad_norm": 0.043668482452631, + "learning_rate": 4.3993213059995154e-05, + "loss": 1.0198, + "step": 1833 + }, + { + "epoch": 2.067231638418079, + "grad_norm": 0.04225129261612892, + "learning_rate": 4.389485401955294e-05, + "loss": 1.0887, + "step": 1834 + }, + { + "epoch": 2.068361581920904, + "grad_norm": 0.042218949645757675, + "learning_rate": 4.379657412913243e-05, + "loss": 0.9673, + "step": 1835 + }, + { + "epoch": 2.0694915254237287, + "grad_norm": 0.044487837702035904, + "learning_rate": 4.369837352738143e-05, + "loss": 1.0627, + "step": 1836 + }, + { + "epoch": 2.0706214689265536, + "grad_norm": 0.043943922966718674, + "learning_rate": 4.360025235283589e-05, + "loss": 1.028, + "step": 1837 + }, + { + "epoch": 2.0717514124293785, + "grad_norm": 0.04223185032606125, + "learning_rate": 4.3502210743919716e-05, + "loss": 0.982, + "step": 1838 + }, + { + "epoch": 2.0728813559322035, + "grad_norm": 0.04249482974410057, + "learning_rate": 4.340424883894456e-05, + "loss": 0.9398, + "step": 1839 + }, + { + "epoch": 2.0740112994350284, + "grad_norm": 0.04417464882135391, + "learning_rate": 4.330636677610962e-05, + "loss": 0.9966, + "step": 1840 + }, + { + "epoch": 2.0751412429378533, + "grad_norm": 0.04311511293053627, + "learning_rate": 4.320856469350148e-05, + "loss": 0.942, + "step": 1841 + }, + { + "epoch": 2.0762711864406778, + "grad_norm": 0.04165413975715637, + "learning_rate": 4.3110842729093905e-05, + "loss": 0.9925, + "step": 1842 + }, + { + "epoch": 2.0774011299435027, + "grad_norm": 0.04295306280255318, + "learning_rate": 4.301320102074753e-05, + "loss": 1.0347, + "step": 1843 + }, + { + "epoch": 2.0785310734463276, + "grad_norm": 0.042987748980522156, + "learning_rate": 4.291563970620995e-05, + "loss": 1.0159, + "step": 1844 + }, + { + "epoch": 2.0796610169491525, + "grad_norm": 0.04220904782414436, + "learning_rate": 4.281815892311525e-05, + "loss": 0.8988, + "step": 1845 + }, + { + "epoch": 2.0807909604519774, + "grad_norm": 0.04398202523589134, + "learning_rate": 4.2720758808983764e-05, + "loss": 0.8488, + "step": 1846 + }, + { + "epoch": 2.0819209039548023, + "grad_norm": 0.042306363582611084, + "learning_rate": 4.262343950122231e-05, + "loss": 0.9823, + "step": 1847 + }, + { + "epoch": 2.0830508474576273, + "grad_norm": 0.04340597242116928, + "learning_rate": 4.2526201137123476e-05, + "loss": 0.9914, + "step": 1848 + }, + { + "epoch": 2.084180790960452, + "grad_norm": 0.04187840595841408, + "learning_rate": 4.242904385386578e-05, + "loss": 0.9613, + "step": 1849 + }, + { + "epoch": 2.0853107344632766, + "grad_norm": 0.04024931788444519, + "learning_rate": 4.2331967788513295e-05, + "loss": 0.8752, + "step": 1850 + }, + { + "epoch": 2.0864406779661016, + "grad_norm": 0.04166257008910179, + "learning_rate": 4.223497307801551e-05, + "loss": 0.9149, + "step": 1851 + }, + { + "epoch": 2.0875706214689265, + "grad_norm": 0.04291348159313202, + "learning_rate": 4.213805985920729e-05, + "loss": 0.9848, + "step": 1852 + }, + { + "epoch": 2.0887005649717514, + "grad_norm": 0.043250638991594315, + "learning_rate": 4.2041228268808294e-05, + "loss": 0.9481, + "step": 1853 + }, + { + "epoch": 2.0898305084745763, + "grad_norm": 0.04172235727310181, + "learning_rate": 4.1944478443423155e-05, + "loss": 0.9443, + "step": 1854 + }, + { + "epoch": 2.090960451977401, + "grad_norm": 0.04241979867219925, + "learning_rate": 4.1847810519541255e-05, + "loss": 0.9697, + "step": 1855 + }, + { + "epoch": 2.092090395480226, + "grad_norm": 0.042877666652202606, + "learning_rate": 4.175122463353621e-05, + "loss": 0.9557, + "step": 1856 + }, + { + "epoch": 2.093220338983051, + "grad_norm": 0.04124724119901657, + "learning_rate": 4.1654720921666044e-05, + "loss": 0.9078, + "step": 1857 + }, + { + "epoch": 2.0943502824858755, + "grad_norm": 0.04259972274303436, + "learning_rate": 4.1558299520072885e-05, + "loss": 0.9561, + "step": 1858 + }, + { + "epoch": 2.0954802259887004, + "grad_norm": 0.04378335177898407, + "learning_rate": 4.14619605647827e-05, + "loss": 0.9466, + "step": 1859 + }, + { + "epoch": 2.0966101694915253, + "grad_norm": 0.04312341660261154, + "learning_rate": 4.136570419170501e-05, + "loss": 0.9631, + "step": 1860 + }, + { + "epoch": 2.0977401129943503, + "grad_norm": 0.042929697781801224, + "learning_rate": 4.126953053663307e-05, + "loss": 0.8175, + "step": 1861 + }, + { + "epoch": 2.098870056497175, + "grad_norm": 0.0430251844227314, + "learning_rate": 4.117343973524329e-05, + "loss": 0.9864, + "step": 1862 + }, + { + "epoch": 2.1, + "grad_norm": 0.04333322122693062, + "learning_rate": 4.1077431923095244e-05, + "loss": 0.9386, + "step": 1863 + }, + { + "epoch": 2.101129943502825, + "grad_norm": 0.04226114600896835, + "learning_rate": 4.09815072356314e-05, + "loss": 1.0237, + "step": 1864 + }, + { + "epoch": 2.10225988700565, + "grad_norm": 0.0435493066906929, + "learning_rate": 4.088566580817694e-05, + "loss": 0.9987, + "step": 1865 + }, + { + "epoch": 2.1033898305084744, + "grad_norm": 0.04193365201354027, + "learning_rate": 4.078990777593975e-05, + "loss": 1.0508, + "step": 1866 + }, + { + "epoch": 2.1045197740112993, + "grad_norm": 0.04185868427157402, + "learning_rate": 4.0694233274009816e-05, + "loss": 0.9143, + "step": 1867 + }, + { + "epoch": 2.105649717514124, + "grad_norm": 0.042803600430488586, + "learning_rate": 4.0598642437359414e-05, + "loss": 0.9395, + "step": 1868 + }, + { + "epoch": 2.106779661016949, + "grad_norm": 0.042586468160152435, + "learning_rate": 4.05031354008429e-05, + "loss": 1.0041, + "step": 1869 + }, + { + "epoch": 2.107909604519774, + "grad_norm": 0.04308640584349632, + "learning_rate": 4.040771229919612e-05, + "loss": 0.95, + "step": 1870 + }, + { + "epoch": 2.109039548022599, + "grad_norm": 0.04416211321949959, + "learning_rate": 4.0312373267036816e-05, + "loss": 0.8787, + "step": 1871 + }, + { + "epoch": 2.110169491525424, + "grad_norm": 0.04484262317419052, + "learning_rate": 4.021711843886395e-05, + "loss": 0.9828, + "step": 1872 + }, + { + "epoch": 2.111299435028249, + "grad_norm": 0.04606883227825165, + "learning_rate": 4.012194794905775e-05, + "loss": 1.0205, + "step": 1873 + }, + { + "epoch": 2.1124293785310733, + "grad_norm": 0.043447453528642654, + "learning_rate": 4.0026861931879446e-05, + "loss": 0.9687, + "step": 1874 + }, + { + "epoch": 2.113559322033898, + "grad_norm": 0.042595911771059036, + "learning_rate": 3.99318605214711e-05, + "loss": 0.9269, + "step": 1875 + }, + { + "epoch": 2.114689265536723, + "grad_norm": 0.04446543753147125, + "learning_rate": 3.983694385185543e-05, + "loss": 0.937, + "step": 1876 + }, + { + "epoch": 2.115819209039548, + "grad_norm": 0.04304026812314987, + "learning_rate": 3.974211205693559e-05, + "loss": 0.8803, + "step": 1877 + }, + { + "epoch": 2.116949152542373, + "grad_norm": 0.041843317449092865, + "learning_rate": 3.964736527049502e-05, + "loss": 0.9684, + "step": 1878 + }, + { + "epoch": 2.118079096045198, + "grad_norm": 0.04251433536410332, + "learning_rate": 3.955270362619717e-05, + "loss": 0.9508, + "step": 1879 + }, + { + "epoch": 2.1192090395480228, + "grad_norm": 0.04271414875984192, + "learning_rate": 3.945812725758554e-05, + "loss": 0.9956, + "step": 1880 + }, + { + "epoch": 2.1203389830508477, + "grad_norm": 0.04318327456712723, + "learning_rate": 3.936363629808309e-05, + "loss": 0.981, + "step": 1881 + }, + { + "epoch": 2.121468926553672, + "grad_norm": 0.04361184686422348, + "learning_rate": 3.926923088099248e-05, + "loss": 0.9366, + "step": 1882 + }, + { + "epoch": 2.122598870056497, + "grad_norm": 0.043581850826740265, + "learning_rate": 3.917491113949566e-05, + "loss": 0.907, + "step": 1883 + }, + { + "epoch": 2.123728813559322, + "grad_norm": 0.04366597160696983, + "learning_rate": 3.908067720665356e-05, + "loss": 0.9216, + "step": 1884 + }, + { + "epoch": 2.124858757062147, + "grad_norm": 0.043010421097278595, + "learning_rate": 3.8986529215406275e-05, + "loss": 0.9093, + "step": 1885 + }, + { + "epoch": 2.125988700564972, + "grad_norm": 0.044093113392591476, + "learning_rate": 3.889246729857253e-05, + "loss": 0.967, + "step": 1886 + }, + { + "epoch": 2.1271186440677967, + "grad_norm": 0.043733999133110046, + "learning_rate": 3.879849158884966e-05, + "loss": 0.9415, + "step": 1887 + }, + { + "epoch": 2.1282485875706216, + "grad_norm": 0.04293812811374664, + "learning_rate": 3.870460221881336e-05, + "loss": 1.055, + "step": 1888 + }, + { + "epoch": 2.1293785310734465, + "grad_norm": 0.04446000978350639, + "learning_rate": 3.861079932091749e-05, + "loss": 0.9815, + "step": 1889 + }, + { + "epoch": 2.130508474576271, + "grad_norm": 0.04384174942970276, + "learning_rate": 3.851708302749409e-05, + "loss": 1.0459, + "step": 1890 + }, + { + "epoch": 2.131638418079096, + "grad_norm": 0.04370885714888573, + "learning_rate": 3.8423453470752805e-05, + "loss": 0.9833, + "step": 1891 + }, + { + "epoch": 2.132768361581921, + "grad_norm": 0.046146828681230545, + "learning_rate": 3.8329910782780966e-05, + "loss": 0.9669, + "step": 1892 + }, + { + "epoch": 2.1338983050847458, + "grad_norm": 0.04334993287920952, + "learning_rate": 3.82364550955435e-05, + "loss": 0.9578, + "step": 1893 + }, + { + "epoch": 2.1350282485875707, + "grad_norm": 0.04254217445850372, + "learning_rate": 3.81430865408825e-05, + "loss": 0.9224, + "step": 1894 + }, + { + "epoch": 2.1361581920903956, + "grad_norm": 0.04299743473529816, + "learning_rate": 3.8049805250517004e-05, + "loss": 0.9902, + "step": 1895 + }, + { + "epoch": 2.1372881355932205, + "grad_norm": 0.04331592097878456, + "learning_rate": 3.795661135604319e-05, + "loss": 0.8947, + "step": 1896 + }, + { + "epoch": 2.138418079096045, + "grad_norm": 0.04248592630028725, + "learning_rate": 3.786350498893384e-05, + "loss": 0.9523, + "step": 1897 + }, + { + "epoch": 2.13954802259887, + "grad_norm": 0.04246117174625397, + "learning_rate": 3.777048628053812e-05, + "loss": 0.9214, + "step": 1898 + }, + { + "epoch": 2.140677966101695, + "grad_norm": 0.043325964361429214, + "learning_rate": 3.7677555362081753e-05, + "loss": 0.9706, + "step": 1899 + }, + { + "epoch": 2.1418079096045197, + "grad_norm": 0.043188970535993576, + "learning_rate": 3.7584712364666494e-05, + "loss": 0.9794, + "step": 1900 + }, + { + "epoch": 2.1429378531073446, + "grad_norm": 0.045290421694517136, + "learning_rate": 3.7491957419270076e-05, + "loss": 0.9523, + "step": 1901 + }, + { + "epoch": 2.1440677966101696, + "grad_norm": 0.042237911373376846, + "learning_rate": 3.7399290656746025e-05, + "loss": 0.9501, + "step": 1902 + }, + { + "epoch": 2.1451977401129945, + "grad_norm": 0.045046236366033554, + "learning_rate": 3.7306712207823416e-05, + "loss": 1.0151, + "step": 1903 + }, + { + "epoch": 2.1463276836158194, + "grad_norm": 0.04322218894958496, + "learning_rate": 3.721422220310689e-05, + "loss": 0.9431, + "step": 1904 + }, + { + "epoch": 2.1474576271186443, + "grad_norm": 0.04495770111680031, + "learning_rate": 3.71218207730761e-05, + "loss": 0.9045, + "step": 1905 + }, + { + "epoch": 2.1485875706214688, + "grad_norm": 0.04479131102561951, + "learning_rate": 3.702950804808585e-05, + "loss": 1.0059, + "step": 1906 + }, + { + "epoch": 2.1497175141242937, + "grad_norm": 0.043446823954582214, + "learning_rate": 3.693728415836586e-05, + "loss": 0.9376, + "step": 1907 + }, + { + "epoch": 2.1508474576271186, + "grad_norm": 0.04486696794629097, + "learning_rate": 3.684514923402049e-05, + "loss": 0.9882, + "step": 1908 + }, + { + "epoch": 2.1519774011299435, + "grad_norm": 0.04472246393561363, + "learning_rate": 3.6753103405028455e-05, + "loss": 1.0832, + "step": 1909 + }, + { + "epoch": 2.1531073446327684, + "grad_norm": 0.04336708039045334, + "learning_rate": 3.666114680124298e-05, + "loss": 1.0151, + "step": 1910 + }, + { + "epoch": 2.1542372881355933, + "grad_norm": 0.04265212640166283, + "learning_rate": 3.6569279552391335e-05, + "loss": 0.9912, + "step": 1911 + }, + { + "epoch": 2.1553672316384183, + "grad_norm": 0.04515647888183594, + "learning_rate": 3.64775017880747e-05, + "loss": 0.9827, + "step": 1912 + }, + { + "epoch": 2.1564971751412427, + "grad_norm": 0.043546319007873535, + "learning_rate": 3.638581363776805e-05, + "loss": 0.9497, + "step": 1913 + }, + { + "epoch": 2.1576271186440676, + "grad_norm": 0.04316361993551254, + "learning_rate": 3.629421523081993e-05, + "loss": 0.9203, + "step": 1914 + }, + { + "epoch": 2.1587570621468926, + "grad_norm": 0.04629300907254219, + "learning_rate": 3.620270669645228e-05, + "loss": 0.9532, + "step": 1915 + }, + { + "epoch": 2.1598870056497175, + "grad_norm": 0.04403712972998619, + "learning_rate": 3.611128816376027e-05, + "loss": 0.9163, + "step": 1916 + }, + { + "epoch": 2.1610169491525424, + "grad_norm": 0.04466381296515465, + "learning_rate": 3.601995976171204e-05, + "loss": 0.9866, + "step": 1917 + }, + { + "epoch": 2.1621468926553673, + "grad_norm": 0.043207425624132156, + "learning_rate": 3.592872161914872e-05, + "loss": 0.984, + "step": 1918 + }, + { + "epoch": 2.163276836158192, + "grad_norm": 0.043047528713941574, + "learning_rate": 3.583757386478389e-05, + "loss": 0.9795, + "step": 1919 + }, + { + "epoch": 2.164406779661017, + "grad_norm": 0.04385092481970787, + "learning_rate": 3.574651662720382e-05, + "loss": 1.0078, + "step": 1920 + }, + { + "epoch": 2.1655367231638416, + "grad_norm": 0.04322254657745361, + "learning_rate": 3.565555003486697e-05, + "loss": 1.0193, + "step": 1921 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.0432460755109787, + "learning_rate": 3.556467421610397e-05, + "loss": 1.0581, + "step": 1922 + }, + { + "epoch": 2.1677966101694914, + "grad_norm": 0.0425468273460865, + "learning_rate": 3.547388929911735e-05, + "loss": 0.9893, + "step": 1923 + }, + { + "epoch": 2.1689265536723163, + "grad_norm": 0.04245666787028313, + "learning_rate": 3.538319541198144e-05, + "loss": 0.949, + "step": 1924 + }, + { + "epoch": 2.1700564971751413, + "grad_norm": 0.04199038818478584, + "learning_rate": 3.5292592682642134e-05, + "loss": 0.9138, + "step": 1925 + }, + { + "epoch": 2.171186440677966, + "grad_norm": 0.04395987465977669, + "learning_rate": 3.5202081238916716e-05, + "loss": 0.9457, + "step": 1926 + }, + { + "epoch": 2.172316384180791, + "grad_norm": 0.044708240777254105, + "learning_rate": 3.511166120849373e-05, + "loss": 1.0507, + "step": 1927 + }, + { + "epoch": 2.173446327683616, + "grad_norm": 0.04187402501702309, + "learning_rate": 3.502133271893269e-05, + "loss": 0.9955, + "step": 1928 + }, + { + "epoch": 2.1745762711864405, + "grad_norm": 0.04379774630069733, + "learning_rate": 3.493109589766403e-05, + "loss": 0.9605, + "step": 1929 + }, + { + "epoch": 2.1757062146892654, + "grad_norm": 0.04420413821935654, + "learning_rate": 3.484095087198881e-05, + "loss": 0.909, + "step": 1930 + }, + { + "epoch": 2.1768361581920903, + "grad_norm": 0.045495763421058655, + "learning_rate": 3.475089776907868e-05, + "loss": 0.9914, + "step": 1931 + }, + { + "epoch": 2.1779661016949152, + "grad_norm": 0.04469761997461319, + "learning_rate": 3.466093671597557e-05, + "loss": 0.9338, + "step": 1932 + }, + { + "epoch": 2.17909604519774, + "grad_norm": 0.0445980429649353, + "learning_rate": 3.457106783959141e-05, + "loss": 0.9165, + "step": 1933 + }, + { + "epoch": 2.180225988700565, + "grad_norm": 0.04591384530067444, + "learning_rate": 3.448129126670834e-05, + "loss": 0.9446, + "step": 1934 + }, + { + "epoch": 2.18135593220339, + "grad_norm": 0.042553581297397614, + "learning_rate": 3.4391607123978095e-05, + "loss": 0.9666, + "step": 1935 + }, + { + "epoch": 2.182485875706215, + "grad_norm": 0.042708031833171844, + "learning_rate": 3.43020155379221e-05, + "loss": 1.0542, + "step": 1936 + }, + { + "epoch": 2.1836158192090394, + "grad_norm": 0.04311813414096832, + "learning_rate": 3.42125166349312e-05, + "loss": 1.0275, + "step": 1937 + }, + { + "epoch": 2.1847457627118643, + "grad_norm": 0.04380349814891815, + "learning_rate": 3.4123110541265443e-05, + "loss": 0.9017, + "step": 1938 + }, + { + "epoch": 2.185875706214689, + "grad_norm": 0.04285889118909836, + "learning_rate": 3.403379738305399e-05, + "loss": 0.966, + "step": 1939 + }, + { + "epoch": 2.187005649717514, + "grad_norm": 0.04314585402607918, + "learning_rate": 3.394457728629489e-05, + "loss": 0.9751, + "step": 1940 + }, + { + "epoch": 2.188135593220339, + "grad_norm": 0.04352666810154915, + "learning_rate": 3.385545037685485e-05, + "loss": 0.8714, + "step": 1941 + }, + { + "epoch": 2.189265536723164, + "grad_norm": 0.04633624106645584, + "learning_rate": 3.3766416780469256e-05, + "loss": 1.0029, + "step": 1942 + }, + { + "epoch": 2.190395480225989, + "grad_norm": 0.043088801205158234, + "learning_rate": 3.3677476622741664e-05, + "loss": 1.0206, + "step": 1943 + }, + { + "epoch": 2.1915254237288138, + "grad_norm": 0.044619057327508926, + "learning_rate": 3.358863002914392e-05, + "loss": 1.0072, + "step": 1944 + }, + { + "epoch": 2.1926553672316382, + "grad_norm": 0.044183578342199326, + "learning_rate": 3.349987712501591e-05, + "loss": 0.9604, + "step": 1945 + }, + { + "epoch": 2.193785310734463, + "grad_norm": 0.04243571683764458, + "learning_rate": 3.341121803556529e-05, + "loss": 0.9624, + "step": 1946 + }, + { + "epoch": 2.194915254237288, + "grad_norm": 0.04398589953780174, + "learning_rate": 3.3322652885867315e-05, + "loss": 1.0083, + "step": 1947 + }, + { + "epoch": 2.196045197740113, + "grad_norm": 0.04456605017185211, + "learning_rate": 3.323418180086483e-05, + "loss": 0.9609, + "step": 1948 + }, + { + "epoch": 2.197175141242938, + "grad_norm": 0.0444977842271328, + "learning_rate": 3.3145804905367904e-05, + "loss": 0.9846, + "step": 1949 + }, + { + "epoch": 2.198305084745763, + "grad_norm": 0.04405605047941208, + "learning_rate": 3.305752232405377e-05, + "loss": 0.9386, + "step": 1950 + }, + { + "epoch": 2.1994350282485877, + "grad_norm": 0.0432204008102417, + "learning_rate": 3.296933418146656e-05, + "loss": 0.959, + "step": 1951 + }, + { + "epoch": 2.2005649717514126, + "grad_norm": 0.04267331585288048, + "learning_rate": 3.2881240602017224e-05, + "loss": 0.9566, + "step": 1952 + }, + { + "epoch": 2.201694915254237, + "grad_norm": 0.0441194623708725, + "learning_rate": 3.279324170998328e-05, + "loss": 0.9098, + "step": 1953 + }, + { + "epoch": 2.202824858757062, + "grad_norm": 0.043350979685783386, + "learning_rate": 3.270533762950868e-05, + "loss": 0.9884, + "step": 1954 + }, + { + "epoch": 2.203954802259887, + "grad_norm": 0.04236207902431488, + "learning_rate": 3.2617528484603576e-05, + "loss": 0.9565, + "step": 1955 + }, + { + "epoch": 2.205084745762712, + "grad_norm": 0.043681688606739044, + "learning_rate": 3.252981439914432e-05, + "loss": 0.9966, + "step": 1956 + }, + { + "epoch": 2.2062146892655368, + "grad_norm": 0.044432997703552246, + "learning_rate": 3.244219549687298e-05, + "loss": 0.9039, + "step": 1957 + }, + { + "epoch": 2.2073446327683617, + "grad_norm": 0.0422402061522007, + "learning_rate": 3.235467190139744e-05, + "loss": 1.036, + "step": 1958 + }, + { + "epoch": 2.2084745762711866, + "grad_norm": 0.044290442019701004, + "learning_rate": 3.226724373619118e-05, + "loss": 1.0285, + "step": 1959 + }, + { + "epoch": 2.209604519774011, + "grad_norm": 0.04244611784815788, + "learning_rate": 3.2179911124592966e-05, + "loss": 0.954, + "step": 1960 + }, + { + "epoch": 2.210734463276836, + "grad_norm": 0.04315735027194023, + "learning_rate": 3.2092674189806796e-05, + "loss": 0.9831, + "step": 1961 + }, + { + "epoch": 2.211864406779661, + "grad_norm": 0.04285259544849396, + "learning_rate": 3.200553305490168e-05, + "loss": 1.0086, + "step": 1962 + }, + { + "epoch": 2.212994350282486, + "grad_norm": 0.043484896421432495, + "learning_rate": 3.19184878428115e-05, + "loss": 0.9422, + "step": 1963 + }, + { + "epoch": 2.2141242937853107, + "grad_norm": 0.0440831333398819, + "learning_rate": 3.183153867633478e-05, + "loss": 0.9518, + "step": 1964 + }, + { + "epoch": 2.2152542372881356, + "grad_norm": 0.044822998344898224, + "learning_rate": 3.174468567813461e-05, + "loss": 0.9831, + "step": 1965 + }, + { + "epoch": 2.2163841807909606, + "grad_norm": 0.04435247182846069, + "learning_rate": 3.165792897073834e-05, + "loss": 0.9201, + "step": 1966 + }, + { + "epoch": 2.2175141242937855, + "grad_norm": 0.04330061003565788, + "learning_rate": 3.157126867653753e-05, + "loss": 0.9692, + "step": 1967 + }, + { + "epoch": 2.2186440677966104, + "grad_norm": 0.043659813702106476, + "learning_rate": 3.1484704917787654e-05, + "loss": 0.9948, + "step": 1968 + }, + { + "epoch": 2.219774011299435, + "grad_norm": 0.044164128601551056, + "learning_rate": 3.1398237816608135e-05, + "loss": 0.9184, + "step": 1969 + }, + { + "epoch": 2.2209039548022598, + "grad_norm": 0.04370899870991707, + "learning_rate": 3.131186749498195e-05, + "loss": 0.9378, + "step": 1970 + }, + { + "epoch": 2.2220338983050847, + "grad_norm": 0.04361080005764961, + "learning_rate": 3.122559407475545e-05, + "loss": 0.9835, + "step": 1971 + }, + { + "epoch": 2.2231638418079096, + "grad_norm": 0.04388871416449547, + "learning_rate": 3.113941767763847e-05, + "loss": 0.9748, + "step": 1972 + }, + { + "epoch": 2.2242937853107345, + "grad_norm": 0.04267182946205139, + "learning_rate": 3.105333842520386e-05, + "loss": 0.9749, + "step": 1973 + }, + { + "epoch": 2.2254237288135594, + "grad_norm": 0.043973181396722794, + "learning_rate": 3.096735643888744e-05, + "loss": 1.0114, + "step": 1974 + }, + { + "epoch": 2.2265536723163843, + "grad_norm": 0.04484979808330536, + "learning_rate": 3.088147183998782e-05, + "loss": 0.96, + "step": 1975 + }, + { + "epoch": 2.227683615819209, + "grad_norm": 0.04428961127996445, + "learning_rate": 3.079568474966622e-05, + "loss": 0.9723, + "step": 1976 + }, + { + "epoch": 2.2288135593220337, + "grad_norm": 0.0446840338408947, + "learning_rate": 3.070999528894629e-05, + "loss": 0.9633, + "step": 1977 + }, + { + "epoch": 2.2299435028248586, + "grad_norm": 0.04326969385147095, + "learning_rate": 3.0624403578713976e-05, + "loss": 0.934, + "step": 1978 + }, + { + "epoch": 2.2310734463276836, + "grad_norm": 0.04401664808392525, + "learning_rate": 3.053890973971726e-05, + "loss": 1.055, + "step": 1979 + }, + { + "epoch": 2.2322033898305085, + "grad_norm": 0.04194195196032524, + "learning_rate": 3.0453513892566197e-05, + "loss": 1.0163, + "step": 1980 + }, + { + "epoch": 2.2333333333333334, + "grad_norm": 0.04289555549621582, + "learning_rate": 3.0368216157732397e-05, + "loss": 1.0094, + "step": 1981 + }, + { + "epoch": 2.2344632768361583, + "grad_norm": 0.04160468652844429, + "learning_rate": 3.028301665554919e-05, + "loss": 0.9903, + "step": 1982 + }, + { + "epoch": 2.2355932203389832, + "grad_norm": 0.04424525424838066, + "learning_rate": 3.0197915506211337e-05, + "loss": 0.8995, + "step": 1983 + }, + { + "epoch": 2.236723163841808, + "grad_norm": 0.04448120296001434, + "learning_rate": 3.011291282977482e-05, + "loss": 0.9845, + "step": 1984 + }, + { + "epoch": 2.2378531073446326, + "grad_norm": 0.04262761026620865, + "learning_rate": 3.0028008746156588e-05, + "loss": 1.0196, + "step": 1985 + }, + { + "epoch": 2.2389830508474575, + "grad_norm": 0.042251285165548325, + "learning_rate": 2.994320337513471e-05, + "loss": 1.0227, + "step": 1986 + }, + { + "epoch": 2.2401129943502824, + "grad_norm": 0.04287414252758026, + "learning_rate": 2.9858496836347848e-05, + "loss": 0.956, + "step": 1987 + }, + { + "epoch": 2.2412429378531074, + "grad_norm": 0.04530943185091019, + "learning_rate": 2.9773889249295294e-05, + "loss": 1.0611, + "step": 1988 + }, + { + "epoch": 2.2423728813559323, + "grad_norm": 0.044923651963472366, + "learning_rate": 2.9689380733336714e-05, + "loss": 0.996, + "step": 1989 + }, + { + "epoch": 2.243502824858757, + "grad_norm": 0.044317737221717834, + "learning_rate": 2.9604971407692027e-05, + "loss": 1.0236, + "step": 1990 + }, + { + "epoch": 2.244632768361582, + "grad_norm": 0.04512808844447136, + "learning_rate": 2.9520661391441216e-05, + "loss": 1.011, + "step": 1991 + }, + { + "epoch": 2.2457627118644066, + "grad_norm": 0.04485659301280975, + "learning_rate": 2.9436450803524183e-05, + "loss": 0.8739, + "step": 1992 + }, + { + "epoch": 2.2468926553672315, + "grad_norm": 0.04554317891597748, + "learning_rate": 2.93523397627405e-05, + "loss": 0.9842, + "step": 1993 + }, + { + "epoch": 2.2480225988700564, + "grad_norm": 0.044335950165987015, + "learning_rate": 2.9268328387749442e-05, + "loss": 0.9293, + "step": 1994 + }, + { + "epoch": 2.2491525423728813, + "grad_norm": 0.0435885488986969, + "learning_rate": 2.918441679706949e-05, + "loss": 0.9581, + "step": 1995 + }, + { + "epoch": 2.2502824858757062, + "grad_norm": 0.0456608422100544, + "learning_rate": 2.910060510907847e-05, + "loss": 0.9955, + "step": 1996 + }, + { + "epoch": 2.251412429378531, + "grad_norm": 0.04341673105955124, + "learning_rate": 2.9016893442013306e-05, + "loss": 1.0348, + "step": 1997 + }, + { + "epoch": 2.252542372881356, + "grad_norm": 0.04355182126164436, + "learning_rate": 2.8933281913969734e-05, + "loss": 0.9885, + "step": 1998 + }, + { + "epoch": 2.253672316384181, + "grad_norm": 0.04326220974326134, + "learning_rate": 2.884977064290224e-05, + "loss": 0.895, + "step": 1999 + }, + { + "epoch": 2.254802259887006, + "grad_norm": 0.044238653033971786, + "learning_rate": 2.8766359746623894e-05, + "loss": 0.8844, + "step": 2000 + }, + { + "epoch": 2.2559322033898304, + "grad_norm": 0.043896935880184174, + "learning_rate": 2.8683049342806157e-05, + "loss": 0.9641, + "step": 2001 + }, + { + "epoch": 2.2570621468926553, + "grad_norm": 0.04326968267560005, + "learning_rate": 2.8599839548978713e-05, + "loss": 0.9907, + "step": 2002 + }, + { + "epoch": 2.25819209039548, + "grad_norm": 0.04462304711341858, + "learning_rate": 2.851673048252931e-05, + "loss": 1.0387, + "step": 2003 + }, + { + "epoch": 2.259322033898305, + "grad_norm": 0.04370822757482529, + "learning_rate": 2.843372226070361e-05, + "loss": 1.0079, + "step": 2004 + }, + { + "epoch": 2.26045197740113, + "grad_norm": 0.04261447489261627, + "learning_rate": 2.835081500060498e-05, + "loss": 0.9493, + "step": 2005 + }, + { + "epoch": 2.261581920903955, + "grad_norm": 0.044322796165943146, + "learning_rate": 2.826800881919437e-05, + "loss": 0.9265, + "step": 2006 + }, + { + "epoch": 2.26271186440678, + "grad_norm": 0.043250150978565216, + "learning_rate": 2.8185303833290177e-05, + "loss": 0.9663, + "step": 2007 + }, + { + "epoch": 2.2638418079096043, + "grad_norm": 0.043140899389982224, + "learning_rate": 2.8102700159568008e-05, + "loss": 0.9805, + "step": 2008 + }, + { + "epoch": 2.2649717514124292, + "grad_norm": 0.044603534042835236, + "learning_rate": 2.802019791456044e-05, + "loss": 1.0215, + "step": 2009 + }, + { + "epoch": 2.266101694915254, + "grad_norm": 0.04410766065120697, + "learning_rate": 2.7937797214657147e-05, + "loss": 1.0059, + "step": 2010 + }, + { + "epoch": 2.267231638418079, + "grad_norm": 0.043483491986989975, + "learning_rate": 2.7855498176104434e-05, + "loss": 0.9487, + "step": 2011 + }, + { + "epoch": 2.268361581920904, + "grad_norm": 0.042907100170850754, + "learning_rate": 2.7773300915005207e-05, + "loss": 0.84, + "step": 2012 + }, + { + "epoch": 2.269491525423729, + "grad_norm": 0.044326696544885635, + "learning_rate": 2.7691205547318776e-05, + "loss": 1.0064, + "step": 2013 + }, + { + "epoch": 2.270621468926554, + "grad_norm": 0.04372712969779968, + "learning_rate": 2.7609212188860757e-05, + "loss": 0.9424, + "step": 2014 + }, + { + "epoch": 2.2717514124293787, + "grad_norm": 0.04335528612136841, + "learning_rate": 2.7527320955302794e-05, + "loss": 0.9816, + "step": 2015 + }, + { + "epoch": 2.272881355932203, + "grad_norm": 0.04416688159108162, + "learning_rate": 2.74455319621725e-05, + "loss": 0.9326, + "step": 2016 + }, + { + "epoch": 2.274011299435028, + "grad_norm": 0.04589290916919708, + "learning_rate": 2.736384532485321e-05, + "loss": 0.9499, + "step": 2017 + }, + { + "epoch": 2.275141242937853, + "grad_norm": 0.0437130331993103, + "learning_rate": 2.7282261158583976e-05, + "loss": 0.9826, + "step": 2018 + }, + { + "epoch": 2.276271186440678, + "grad_norm": 0.04344571754336357, + "learning_rate": 2.7200779578459123e-05, + "loss": 0.9085, + "step": 2019 + }, + { + "epoch": 2.277401129943503, + "grad_norm": 0.04497040435671806, + "learning_rate": 2.7119400699428332e-05, + "loss": 0.937, + "step": 2020 + }, + { + "epoch": 2.2785310734463278, + "grad_norm": 0.04277808219194412, + "learning_rate": 2.703812463629646e-05, + "loss": 0.9305, + "step": 2021 + }, + { + "epoch": 2.2796610169491527, + "grad_norm": 0.04330938309431076, + "learning_rate": 2.6956951503723272e-05, + "loss": 0.9323, + "step": 2022 + }, + { + "epoch": 2.280790960451977, + "grad_norm": 0.04527467489242554, + "learning_rate": 2.6875881416223204e-05, + "loss": 1.0554, + "step": 2023 + }, + { + "epoch": 2.281920903954802, + "grad_norm": 0.04678976908326149, + "learning_rate": 2.6794914488165533e-05, + "loss": 0.9549, + "step": 2024 + }, + { + "epoch": 2.283050847457627, + "grad_norm": 0.04412659630179405, + "learning_rate": 2.671405083377386e-05, + "loss": 1.0194, + "step": 2025 + }, + { + "epoch": 2.284180790960452, + "grad_norm": 0.04408615082502365, + "learning_rate": 2.6633290567126157e-05, + "loss": 0.9893, + "step": 2026 + }, + { + "epoch": 2.285310734463277, + "grad_norm": 0.04321965202689171, + "learning_rate": 2.6552633802154493e-05, + "loss": 0.9805, + "step": 2027 + }, + { + "epoch": 2.2864406779661017, + "grad_norm": 0.04430380091071129, + "learning_rate": 2.6472080652644926e-05, + "loss": 0.9462, + "step": 2028 + }, + { + "epoch": 2.2875706214689266, + "grad_norm": 0.04322722926735878, + "learning_rate": 2.639163123223747e-05, + "loss": 0.9606, + "step": 2029 + }, + { + "epoch": 2.2887005649717516, + "grad_norm": 0.043180808424949646, + "learning_rate": 2.6311285654425575e-05, + "loss": 0.9322, + "step": 2030 + }, + { + "epoch": 2.2898305084745765, + "grad_norm": 0.043794550001621246, + "learning_rate": 2.623104403255634e-05, + "loss": 0.9762, + "step": 2031 + }, + { + "epoch": 2.290960451977401, + "grad_norm": 0.044038672000169754, + "learning_rate": 2.6150906479830274e-05, + "loss": 0.9987, + "step": 2032 + }, + { + "epoch": 2.292090395480226, + "grad_norm": 0.0434204563498497, + "learning_rate": 2.6070873109300885e-05, + "loss": 0.9565, + "step": 2033 + }, + { + "epoch": 2.2932203389830508, + "grad_norm": 0.04463200643658638, + "learning_rate": 2.599094403387481e-05, + "loss": 0.9487, + "step": 2034 + }, + { + "epoch": 2.2943502824858757, + "grad_norm": 0.04487144574522972, + "learning_rate": 2.5911119366311597e-05, + "loss": 0.9826, + "step": 2035 + }, + { + "epoch": 2.2954802259887006, + "grad_norm": 0.044315457344055176, + "learning_rate": 2.5831399219223428e-05, + "loss": 1.0061, + "step": 2036 + }, + { + "epoch": 2.2966101694915255, + "grad_norm": 0.045690327882766724, + "learning_rate": 2.575178370507506e-05, + "loss": 0.9714, + "step": 2037 + }, + { + "epoch": 2.2977401129943504, + "grad_norm": 0.042950958013534546, + "learning_rate": 2.5672272936183627e-05, + "loss": 0.9542, + "step": 2038 + }, + { + "epoch": 2.298870056497175, + "grad_norm": 0.04238516837358475, + "learning_rate": 2.55928670247185e-05, + "loss": 0.9595, + "step": 2039 + }, + { + "epoch": 2.3, + "grad_norm": 0.04378442093729973, + "learning_rate": 2.5513566082701135e-05, + "loss": 0.9148, + "step": 2040 + }, + { + "epoch": 2.3011299435028247, + "grad_norm": 0.043305352330207825, + "learning_rate": 2.543437022200489e-05, + "loss": 0.9211, + "step": 2041 + }, + { + "epoch": 2.3022598870056497, + "grad_norm": 0.04488563537597656, + "learning_rate": 2.535527955435486e-05, + "loss": 0.9947, + "step": 2042 + }, + { + "epoch": 2.3033898305084746, + "grad_norm": 0.04428262263536453, + "learning_rate": 2.527629419132783e-05, + "loss": 1.0371, + "step": 2043 + }, + { + "epoch": 2.3045197740112995, + "grad_norm": 0.04419199004769325, + "learning_rate": 2.5197414244351904e-05, + "loss": 0.927, + "step": 2044 + }, + { + "epoch": 2.3056497175141244, + "grad_norm": 0.04307791590690613, + "learning_rate": 2.51186398247065e-05, + "loss": 1.0249, + "step": 2045 + }, + { + "epoch": 2.3067796610169493, + "grad_norm": 0.04518159106373787, + "learning_rate": 2.5039971043522292e-05, + "loss": 0.9505, + "step": 2046 + }, + { + "epoch": 2.3079096045197742, + "grad_norm": 0.04357128217816353, + "learning_rate": 2.4961408011780707e-05, + "loss": 1.0103, + "step": 2047 + }, + { + "epoch": 2.3090395480225987, + "grad_norm": 0.0436529703438282, + "learning_rate": 2.488295084031419e-05, + "loss": 1.0638, + "step": 2048 + }, + { + "epoch": 2.3101694915254236, + "grad_norm": 0.04202603921294212, + "learning_rate": 2.4804599639805714e-05, + "loss": 0.9998, + "step": 2049 + }, + { + "epoch": 2.3112994350282485, + "grad_norm": 0.04418287053704262, + "learning_rate": 2.472635452078883e-05, + "loss": 0.9245, + "step": 2050 + }, + { + "epoch": 2.3124293785310734, + "grad_norm": 0.04419273883104324, + "learning_rate": 2.464821559364737e-05, + "loss": 0.9414, + "step": 2051 + }, + { + "epoch": 2.3135593220338984, + "grad_norm": 0.043836288154125214, + "learning_rate": 2.4570182968615418e-05, + "loss": 0.9468, + "step": 2052 + }, + { + "epoch": 2.3146892655367233, + "grad_norm": 0.04379745200276375, + "learning_rate": 2.4492256755777042e-05, + "loss": 0.9917, + "step": 2053 + }, + { + "epoch": 2.315819209039548, + "grad_norm": 0.04468725621700287, + "learning_rate": 2.441443706506623e-05, + "loss": 0.9677, + "step": 2054 + }, + { + "epoch": 2.3169491525423727, + "grad_norm": 0.04374300688505173, + "learning_rate": 2.433672400626663e-05, + "loss": 1.0079, + "step": 2055 + }, + { + "epoch": 2.3180790960451976, + "grad_norm": 0.045238886028528214, + "learning_rate": 2.4259117689011578e-05, + "loss": 0.9881, + "step": 2056 + }, + { + "epoch": 2.3192090395480225, + "grad_norm": 0.043223727494478226, + "learning_rate": 2.418161822278374e-05, + "loss": 1.0842, + "step": 2057 + }, + { + "epoch": 2.3203389830508474, + "grad_norm": 0.04424876347184181, + "learning_rate": 2.4104225716914985e-05, + "loss": 0.9624, + "step": 2058 + }, + { + "epoch": 2.3214689265536723, + "grad_norm": 0.04261065274477005, + "learning_rate": 2.402694028058644e-05, + "loss": 0.9402, + "step": 2059 + }, + { + "epoch": 2.3225988700564972, + "grad_norm": 0.042224884033203125, + "learning_rate": 2.3949762022828092e-05, + "loss": 0.9993, + "step": 2060 + }, + { + "epoch": 2.323728813559322, + "grad_norm": 0.0436599999666214, + "learning_rate": 2.387269105251866e-05, + "loss": 1.0189, + "step": 2061 + }, + { + "epoch": 2.324858757062147, + "grad_norm": 0.04297950863838196, + "learning_rate": 2.3795727478385676e-05, + "loss": 1.1014, + "step": 2062 + }, + { + "epoch": 2.325988700564972, + "grad_norm": 0.04437912628054619, + "learning_rate": 2.371887140900504e-05, + "loss": 0.943, + "step": 2063 + }, + { + "epoch": 2.3271186440677964, + "grad_norm": 0.043281424790620804, + "learning_rate": 2.3642122952801015e-05, + "loss": 0.9319, + "step": 2064 + }, + { + "epoch": 2.3282485875706214, + "grad_norm": 0.04431607574224472, + "learning_rate": 2.3565482218046075e-05, + "loss": 0.9951, + "step": 2065 + }, + { + "epoch": 2.3282485875706214, + "eval_loss": 0.9855674505233765, + "eval_runtime": 554.8838, + "eval_samples_per_second": 17.631, + "eval_steps_per_second": 8.816, + "step": 2065 + }, + { + "epoch": 2.3293785310734463, + "grad_norm": 0.044959913939237595, + "learning_rate": 2.3488949312860664e-05, + "loss": 0.8822, + "step": 2066 + }, + { + "epoch": 2.330508474576271, + "grad_norm": 0.044671010226011276, + "learning_rate": 2.341252434521325e-05, + "loss": 0.9366, + "step": 2067 + }, + { + "epoch": 2.331638418079096, + "grad_norm": 0.044197868555784225, + "learning_rate": 2.333620742291983e-05, + "loss": 0.9288, + "step": 2068 + }, + { + "epoch": 2.332768361581921, + "grad_norm": 0.045790866017341614, + "learning_rate": 2.325999865364409e-05, + "loss": 0.953, + "step": 2069 + }, + { + "epoch": 2.333898305084746, + "grad_norm": 0.043006639927625656, + "learning_rate": 2.3183898144897177e-05, + "loss": 0.9384, + "step": 2070 + }, + { + "epoch": 2.3350282485875704, + "grad_norm": 0.04369287192821503, + "learning_rate": 2.310790600403745e-05, + "loss": 0.9972, + "step": 2071 + }, + { + "epoch": 2.3361581920903953, + "grad_norm": 0.044209450483322144, + "learning_rate": 2.303202233827033e-05, + "loss": 0.9869, + "step": 2072 + }, + { + "epoch": 2.3372881355932202, + "grad_norm": 0.04336726665496826, + "learning_rate": 2.2956247254648344e-05, + "loss": 0.9897, + "step": 2073 + }, + { + "epoch": 2.338418079096045, + "grad_norm": 0.04320811852812767, + "learning_rate": 2.2880580860070778e-05, + "loss": 1.0209, + "step": 2074 + }, + { + "epoch": 2.33954802259887, + "grad_norm": 0.04383328929543495, + "learning_rate": 2.2805023261283497e-05, + "loss": 0.9673, + "step": 2075 + }, + { + "epoch": 2.340677966101695, + "grad_norm": 0.04465322569012642, + "learning_rate": 2.272957456487903e-05, + "loss": 1.0069, + "step": 2076 + }, + { + "epoch": 2.34180790960452, + "grad_norm": 0.04262759909033775, + "learning_rate": 2.2654234877296198e-05, + "loss": 0.8755, + "step": 2077 + }, + { + "epoch": 2.342937853107345, + "grad_norm": 0.0429883673787117, + "learning_rate": 2.257900430482003e-05, + "loss": 0.9556, + "step": 2078 + }, + { + "epoch": 2.3440677966101697, + "grad_norm": 0.0438074916601181, + "learning_rate": 2.2503882953581622e-05, + "loss": 0.9628, + "step": 2079 + }, + { + "epoch": 2.345197740112994, + "grad_norm": 0.0434563122689724, + "learning_rate": 2.242887092955801e-05, + "loss": 0.8773, + "step": 2080 + }, + { + "epoch": 2.346327683615819, + "grad_norm": 0.0433909073472023, + "learning_rate": 2.235396833857204e-05, + "loss": 1.0392, + "step": 2081 + }, + { + "epoch": 2.347457627118644, + "grad_norm": 0.04353194683790207, + "learning_rate": 2.2279175286292064e-05, + "loss": 0.9412, + "step": 2082 + }, + { + "epoch": 2.348587570621469, + "grad_norm": 0.04275263100862503, + "learning_rate": 2.2204491878231948e-05, + "loss": 0.9089, + "step": 2083 + }, + { + "epoch": 2.349717514124294, + "grad_norm": 0.0452563501894474, + "learning_rate": 2.2129918219750944e-05, + "loss": 1.0463, + "step": 2084 + }, + { + "epoch": 2.3508474576271188, + "grad_norm": 0.04346321523189545, + "learning_rate": 2.2055454416053422e-05, + "loss": 0.9901, + "step": 2085 + }, + { + "epoch": 2.3519774011299437, + "grad_norm": 0.04487701877951622, + "learning_rate": 2.1981100572188752e-05, + "loss": 0.981, + "step": 2086 + }, + { + "epoch": 2.353107344632768, + "grad_norm": 0.04522349685430527, + "learning_rate": 2.190685679305121e-05, + "loss": 0.9812, + "step": 2087 + }, + { + "epoch": 2.354237288135593, + "grad_norm": 0.04697408527135849, + "learning_rate": 2.1832723183379812e-05, + "loss": 0.9467, + "step": 2088 + }, + { + "epoch": 2.355367231638418, + "grad_norm": 0.042166102677583694, + "learning_rate": 2.1758699847758112e-05, + "loss": 1.0908, + "step": 2089 + }, + { + "epoch": 2.356497175141243, + "grad_norm": 0.04440850764513016, + "learning_rate": 2.168478689061413e-05, + "loss": 0.9584, + "step": 2090 + }, + { + "epoch": 2.357627118644068, + "grad_norm": 0.044397566467523575, + "learning_rate": 2.161098441622014e-05, + "loss": 1.0119, + "step": 2091 + }, + { + "epoch": 2.3587570621468927, + "grad_norm": 0.0443037785589695, + "learning_rate": 2.1537292528692598e-05, + "loss": 0.9441, + "step": 2092 + }, + { + "epoch": 2.3598870056497177, + "grad_norm": 0.043571796268224716, + "learning_rate": 2.146371133199191e-05, + "loss": 0.9358, + "step": 2093 + }, + { + "epoch": 2.3610169491525426, + "grad_norm": 0.045840222388505936, + "learning_rate": 2.13902409299223e-05, + "loss": 0.93, + "step": 2094 + }, + { + "epoch": 2.3621468926553675, + "grad_norm": 0.043795760720968246, + "learning_rate": 2.131688142613183e-05, + "loss": 1.0515, + "step": 2095 + }, + { + "epoch": 2.363276836158192, + "grad_norm": 0.04440205171704292, + "learning_rate": 2.124363292411189e-05, + "loss": 0.9891, + "step": 2096 + }, + { + "epoch": 2.364406779661017, + "grad_norm": 0.04491677135229111, + "learning_rate": 2.1170495527197487e-05, + "loss": 0.9825, + "step": 2097 + }, + { + "epoch": 2.365536723163842, + "grad_norm": 0.0445803739130497, + "learning_rate": 2.1097469338566754e-05, + "loss": 0.9716, + "step": 2098 + }, + { + "epoch": 2.3666666666666667, + "grad_norm": 0.04519984871149063, + "learning_rate": 2.1024554461240986e-05, + "loss": 0.9865, + "step": 2099 + }, + { + "epoch": 2.3677966101694916, + "grad_norm": 0.04491204023361206, + "learning_rate": 2.095175099808444e-05, + "loss": 0.9552, + "step": 2100 + }, + { + "epoch": 2.3689265536723165, + "grad_norm": 0.04308044910430908, + "learning_rate": 2.0879059051804206e-05, + "loss": 1.0304, + "step": 2101 + }, + { + "epoch": 2.370056497175141, + "grad_norm": 0.04669945314526558, + "learning_rate": 2.080647872495002e-05, + "loss": 1.0882, + "step": 2102 + }, + { + "epoch": 2.371186440677966, + "grad_norm": 0.04366466403007507, + "learning_rate": 2.0734010119914192e-05, + "loss": 1.0511, + "step": 2103 + }, + { + "epoch": 2.372316384180791, + "grad_norm": 0.04505183920264244, + "learning_rate": 2.0661653338931364e-05, + "loss": 0.9223, + "step": 2104 + }, + { + "epoch": 2.3734463276836157, + "grad_norm": 0.043640442192554474, + "learning_rate": 2.058940848407854e-05, + "loss": 0.9174, + "step": 2105 + }, + { + "epoch": 2.3745762711864407, + "grad_norm": 0.044932059943675995, + "learning_rate": 2.0517275657274683e-05, + "loss": 0.9584, + "step": 2106 + }, + { + "epoch": 2.3757062146892656, + "grad_norm": 0.04342171177268028, + "learning_rate": 2.0445254960280757e-05, + "loss": 0.9948, + "step": 2107 + }, + { + "epoch": 2.3768361581920905, + "grad_norm": 0.042078595608472824, + "learning_rate": 2.0373346494699618e-05, + "loss": 1.0606, + "step": 2108 + }, + { + "epoch": 2.3779661016949154, + "grad_norm": 0.04417315125465393, + "learning_rate": 2.030155036197574e-05, + "loss": 0.9545, + "step": 2109 + }, + { + "epoch": 2.3790960451977403, + "grad_norm": 0.04296877607703209, + "learning_rate": 2.0229866663395026e-05, + "loss": 0.9642, + "step": 2110 + }, + { + "epoch": 2.380225988700565, + "grad_norm": 0.04403414949774742, + "learning_rate": 2.0158295500084945e-05, + "loss": 0.9322, + "step": 2111 + }, + { + "epoch": 2.3813559322033897, + "grad_norm": 0.044930048286914825, + "learning_rate": 2.008683697301408e-05, + "loss": 0.9121, + "step": 2112 + }, + { + "epoch": 2.3824858757062146, + "grad_norm": 0.04404081776738167, + "learning_rate": 2.001549118299214e-05, + "loss": 0.9026, + "step": 2113 + }, + { + "epoch": 2.3836158192090395, + "grad_norm": 0.044441040605306625, + "learning_rate": 1.9944258230669833e-05, + "loss": 0.9232, + "step": 2114 + }, + { + "epoch": 2.3847457627118644, + "grad_norm": 0.04437883198261261, + "learning_rate": 1.987313821653861e-05, + "loss": 1.0162, + "step": 2115 + }, + { + "epoch": 2.3858757062146894, + "grad_norm": 0.04486614465713501, + "learning_rate": 1.9802131240930664e-05, + "loss": 0.9181, + "step": 2116 + }, + { + "epoch": 2.3870056497175143, + "grad_norm": 0.04392695426940918, + "learning_rate": 1.9731237404018676e-05, + "loss": 0.8908, + "step": 2117 + }, + { + "epoch": 2.3881355932203387, + "grad_norm": 0.047182969748973846, + "learning_rate": 1.966045680581572e-05, + "loss": 0.9931, + "step": 2118 + }, + { + "epoch": 2.3892655367231637, + "grad_norm": 0.04336957260966301, + "learning_rate": 1.9589789546175176e-05, + "loss": 0.9822, + "step": 2119 + }, + { + "epoch": 2.3903954802259886, + "grad_norm": 0.04483437165617943, + "learning_rate": 1.951923572479044e-05, + "loss": 1.024, + "step": 2120 + }, + { + "epoch": 2.3915254237288135, + "grad_norm": 0.045611586421728134, + "learning_rate": 1.944879544119491e-05, + "loss": 0.9401, + "step": 2121 + }, + { + "epoch": 2.3926553672316384, + "grad_norm": 0.04340572655200958, + "learning_rate": 1.9378468794761873e-05, + "loss": 1.0403, + "step": 2122 + }, + { + "epoch": 2.3937853107344633, + "grad_norm": 0.046446606516838074, + "learning_rate": 1.9308255884704253e-05, + "loss": 0.9327, + "step": 2123 + }, + { + "epoch": 2.3949152542372882, + "grad_norm": 0.0440828800201416, + "learning_rate": 1.9238156810074448e-05, + "loss": 0.9262, + "step": 2124 + }, + { + "epoch": 2.396045197740113, + "grad_norm": 0.04405950754880905, + "learning_rate": 1.9168171669764413e-05, + "loss": 0.9528, + "step": 2125 + }, + { + "epoch": 2.397175141242938, + "grad_norm": 0.044946879148483276, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.9574, + "step": 2126 + }, + { + "epoch": 2.3983050847457625, + "grad_norm": 0.04468640685081482, + "learning_rate": 1.9028543586867276e-05, + "loss": 0.9745, + "step": 2127 + }, + { + "epoch": 2.3994350282485875, + "grad_norm": 0.04496829956769943, + "learning_rate": 1.895890084125973e-05, + "loss": 0.99, + "step": 2128 + }, + { + "epoch": 2.4005649717514124, + "grad_norm": 0.04439869895577431, + "learning_rate": 1.888937242393072e-05, + "loss": 0.9164, + "step": 2129 + }, + { + "epoch": 2.4016949152542373, + "grad_norm": 0.04531821236014366, + "learning_rate": 1.881995843296708e-05, + "loss": 1.0018, + "step": 2130 + }, + { + "epoch": 2.402824858757062, + "grad_norm": 0.043409690260887146, + "learning_rate": 1.87506589662942e-05, + "loss": 1.0898, + "step": 2131 + }, + { + "epoch": 2.403954802259887, + "grad_norm": 0.04595141485333443, + "learning_rate": 1.8681474121675913e-05, + "loss": 0.9994, + "step": 2132 + }, + { + "epoch": 2.405084745762712, + "grad_norm": 0.04307853803038597, + "learning_rate": 1.8612403996714413e-05, + "loss": 1.0001, + "step": 2133 + }, + { + "epoch": 2.4062146892655365, + "grad_norm": 0.045866724103689194, + "learning_rate": 1.8543448688849897e-05, + "loss": 0.9808, + "step": 2134 + }, + { + "epoch": 2.4073446327683614, + "grad_norm": 0.04307013005018234, + "learning_rate": 1.847460829536075e-05, + "loss": 0.8761, + "step": 2135 + }, + { + "epoch": 2.4084745762711863, + "grad_norm": 0.04371463879942894, + "learning_rate": 1.840588291336317e-05, + "loss": 0.9833, + "step": 2136 + }, + { + "epoch": 2.4096045197740112, + "grad_norm": 0.04334999993443489, + "learning_rate": 1.8337272639811075e-05, + "loss": 0.936, + "step": 2137 + }, + { + "epoch": 2.410734463276836, + "grad_norm": 0.04375695809721947, + "learning_rate": 1.8268777571496044e-05, + "loss": 0.9321, + "step": 2138 + }, + { + "epoch": 2.411864406779661, + "grad_norm": 0.044900234788656235, + "learning_rate": 1.8200397805047108e-05, + "loss": 0.8979, + "step": 2139 + }, + { + "epoch": 2.412994350282486, + "grad_norm": 0.04461583495140076, + "learning_rate": 1.8132133436930642e-05, + "loss": 1.0154, + "step": 2140 + }, + { + "epoch": 2.414124293785311, + "grad_norm": 0.043774548918008804, + "learning_rate": 1.806398456345023e-05, + "loss": 0.9681, + "step": 2141 + }, + { + "epoch": 2.415254237288136, + "grad_norm": 0.044340088963508606, + "learning_rate": 1.7995951280746503e-05, + "loss": 0.9608, + "step": 2142 + }, + { + "epoch": 2.4163841807909603, + "grad_norm": 0.043895475566387177, + "learning_rate": 1.792803368479704e-05, + "loss": 0.9723, + "step": 2143 + }, + { + "epoch": 2.417514124293785, + "grad_norm": 0.04527180269360542, + "learning_rate": 1.786023187141621e-05, + "loss": 0.9095, + "step": 2144 + }, + { + "epoch": 2.41864406779661, + "grad_norm": 0.04495590180158615, + "learning_rate": 1.7792545936255013e-05, + "loss": 0.9953, + "step": 2145 + }, + { + "epoch": 2.419774011299435, + "grad_norm": 0.04520578309893608, + "learning_rate": 1.7724975974801073e-05, + "loss": 0.9458, + "step": 2146 + }, + { + "epoch": 2.42090395480226, + "grad_norm": 0.04260654002428055, + "learning_rate": 1.765752208237831e-05, + "loss": 0.9465, + "step": 2147 + }, + { + "epoch": 2.422033898305085, + "grad_norm": 0.04469961300492287, + "learning_rate": 1.7590184354146867e-05, + "loss": 0.9411, + "step": 2148 + }, + { + "epoch": 2.42316384180791, + "grad_norm": 0.0442170612514019, + "learning_rate": 1.7522962885103145e-05, + "loss": 0.8851, + "step": 2149 + }, + { + "epoch": 2.4242937853107343, + "grad_norm": 0.04404238983988762, + "learning_rate": 1.745585777007943e-05, + "loss": 0.9593, + "step": 2150 + }, + { + "epoch": 2.425423728813559, + "grad_norm": 0.04603072255849838, + "learning_rate": 1.7388869103743878e-05, + "loss": 1.013, + "step": 2151 + }, + { + "epoch": 2.426553672316384, + "grad_norm": 0.04436330869793892, + "learning_rate": 1.7321996980600387e-05, + "loss": 0.9281, + "step": 2152 + }, + { + "epoch": 2.427683615819209, + "grad_norm": 0.043367523699998856, + "learning_rate": 1.7255241494988405e-05, + "loss": 1.0137, + "step": 2153 + }, + { + "epoch": 2.428813559322034, + "grad_norm": 0.046013541519641876, + "learning_rate": 1.7188602741082938e-05, + "loss": 0.964, + "step": 2154 + }, + { + "epoch": 2.429943502824859, + "grad_norm": 0.04216546192765236, + "learning_rate": 1.7122080812894147e-05, + "loss": 0.8503, + "step": 2155 + }, + { + "epoch": 2.4310734463276837, + "grad_norm": 0.04499104246497154, + "learning_rate": 1.705567580426749e-05, + "loss": 0.9901, + "step": 2156 + }, + { + "epoch": 2.4322033898305087, + "grad_norm": 0.04435711354017258, + "learning_rate": 1.698938780888354e-05, + "loss": 0.9676, + "step": 2157 + }, + { + "epoch": 2.4333333333333336, + "grad_norm": 0.04303577542304993, + "learning_rate": 1.6923216920257612e-05, + "loss": 0.9955, + "step": 2158 + }, + { + "epoch": 2.434463276836158, + "grad_norm": 0.04677002876996994, + "learning_rate": 1.6857163231739948e-05, + "loss": 0.9851, + "step": 2159 + }, + { + "epoch": 2.435593220338983, + "grad_norm": 0.045381397008895874, + "learning_rate": 1.679122683651546e-05, + "loss": 1.0507, + "step": 2160 + }, + { + "epoch": 2.436723163841808, + "grad_norm": 0.044448669999837875, + "learning_rate": 1.6725407827603546e-05, + "loss": 1.0459, + "step": 2161 + }, + { + "epoch": 2.437853107344633, + "grad_norm": 0.0439726784825325, + "learning_rate": 1.6659706297857945e-05, + "loss": 0.9604, + "step": 2162 + }, + { + "epoch": 2.4389830508474577, + "grad_norm": 0.04220819100737572, + "learning_rate": 1.6594122339966778e-05, + "loss": 0.9844, + "step": 2163 + }, + { + "epoch": 2.4401129943502826, + "grad_norm": 0.044570211321115494, + "learning_rate": 1.6528656046452228e-05, + "loss": 0.9174, + "step": 2164 + }, + { + "epoch": 2.4412429378531075, + "grad_norm": 0.04373630881309509, + "learning_rate": 1.6463307509670524e-05, + "loss": 0.9263, + "step": 2165 + }, + { + "epoch": 2.442372881355932, + "grad_norm": 0.043320853263139725, + "learning_rate": 1.6398076821811724e-05, + "loss": 0.9403, + "step": 2166 + }, + { + "epoch": 2.443502824858757, + "grad_norm": 0.04534811154007912, + "learning_rate": 1.6332964074899636e-05, + "loss": 1.05, + "step": 2167 + }, + { + "epoch": 2.444632768361582, + "grad_norm": 0.04526575654745102, + "learning_rate": 1.626796936079179e-05, + "loss": 0.9709, + "step": 2168 + }, + { + "epoch": 2.4457627118644067, + "grad_norm": 0.04434860870242119, + "learning_rate": 1.620309277117904e-05, + "loss": 0.9629, + "step": 2169 + }, + { + "epoch": 2.4468926553672317, + "grad_norm": 0.04492155835032463, + "learning_rate": 1.6138334397585675e-05, + "loss": 0.9906, + "step": 2170 + }, + { + "epoch": 2.4480225988700566, + "grad_norm": 0.04469876363873482, + "learning_rate": 1.6073694331369272e-05, + "loss": 1.0313, + "step": 2171 + }, + { + "epoch": 2.4491525423728815, + "grad_norm": 0.045252226293087006, + "learning_rate": 1.600917266372035e-05, + "loss": 0.908, + "step": 2172 + }, + { + "epoch": 2.4502824858757064, + "grad_norm": 0.044812172651290894, + "learning_rate": 1.5944769485662568e-05, + "loss": 0.9541, + "step": 2173 + }, + { + "epoch": 2.4514124293785313, + "grad_norm": 0.04414849728345871, + "learning_rate": 1.5880484888052328e-05, + "loss": 0.9989, + "step": 2174 + }, + { + "epoch": 2.452542372881356, + "grad_norm": 0.04734036698937416, + "learning_rate": 1.5816318961578757e-05, + "loss": 0.9075, + "step": 2175 + }, + { + "epoch": 2.4536723163841807, + "grad_norm": 0.044426627457141876, + "learning_rate": 1.5752271796763584e-05, + "loss": 0.9566, + "step": 2176 + }, + { + "epoch": 2.4548022598870056, + "grad_norm": 0.04688658565282822, + "learning_rate": 1.5688343483961e-05, + "loss": 1.0372, + "step": 2177 + }, + { + "epoch": 2.4559322033898305, + "grad_norm": 0.04371168091893196, + "learning_rate": 1.5624534113357493e-05, + "loss": 0.9158, + "step": 2178 + }, + { + "epoch": 2.4570621468926555, + "grad_norm": 0.045110173523426056, + "learning_rate": 1.55608437749718e-05, + "loss": 0.9342, + "step": 2179 + }, + { + "epoch": 2.4581920903954804, + "grad_norm": 0.04536379873752594, + "learning_rate": 1.5497272558654697e-05, + "loss": 0.9582, + "step": 2180 + }, + { + "epoch": 2.459322033898305, + "grad_norm": 0.043405305594205856, + "learning_rate": 1.5433820554088895e-05, + "loss": 0.9627, + "step": 2181 + }, + { + "epoch": 2.4604519774011298, + "grad_norm": 0.0462508387863636, + "learning_rate": 1.537048785078905e-05, + "loss": 0.9083, + "step": 2182 + }, + { + "epoch": 2.4615819209039547, + "grad_norm": 0.04306486248970032, + "learning_rate": 1.5307274538101292e-05, + "loss": 0.9715, + "step": 2183 + }, + { + "epoch": 2.4627118644067796, + "grad_norm": 0.04309242591261864, + "learning_rate": 1.5244180705203547e-05, + "loss": 0.946, + "step": 2184 + }, + { + "epoch": 2.4638418079096045, + "grad_norm": 0.04370222985744476, + "learning_rate": 1.5181206441105078e-05, + "loss": 0.9582, + "step": 2185 + }, + { + "epoch": 2.4649717514124294, + "grad_norm": 0.04389223828911781, + "learning_rate": 1.5118351834646405e-05, + "loss": 0.916, + "step": 2186 + }, + { + "epoch": 2.4661016949152543, + "grad_norm": 0.04450360685586929, + "learning_rate": 1.5055616974499375e-05, + "loss": 0.9326, + "step": 2187 + }, + { + "epoch": 2.4672316384180792, + "grad_norm": 0.04638772830367088, + "learning_rate": 1.499300194916684e-05, + "loss": 1.0533, + "step": 2188 + }, + { + "epoch": 2.468361581920904, + "grad_norm": 0.04407142847776413, + "learning_rate": 1.4930506846982585e-05, + "loss": 0.9369, + "step": 2189 + }, + { + "epoch": 2.4694915254237286, + "grad_norm": 0.04272819682955742, + "learning_rate": 1.4868131756111225e-05, + "loss": 1.03, + "step": 2190 + }, + { + "epoch": 2.4706214689265535, + "grad_norm": 0.044162772595882416, + "learning_rate": 1.4805876764548077e-05, + "loss": 0.9255, + "step": 2191 + }, + { + "epoch": 2.4717514124293785, + "grad_norm": 0.04563014954328537, + "learning_rate": 1.4743741960119018e-05, + "loss": 0.9713, + "step": 2192 + }, + { + "epoch": 2.4728813559322034, + "grad_norm": 0.04250924661755562, + "learning_rate": 1.4681727430480375e-05, + "loss": 1.0386, + "step": 2193 + }, + { + "epoch": 2.4740112994350283, + "grad_norm": 0.044044073671102524, + "learning_rate": 1.4619833263118788e-05, + "loss": 0.9318, + "step": 2194 + }, + { + "epoch": 2.475141242937853, + "grad_norm": 0.04529934003949165, + "learning_rate": 1.4558059545351143e-05, + "loss": 1.1511, + "step": 2195 + }, + { + "epoch": 2.476271186440678, + "grad_norm": 0.04386807605624199, + "learning_rate": 1.4496406364324367e-05, + "loss": 0.9201, + "step": 2196 + }, + { + "epoch": 2.4774011299435026, + "grad_norm": 0.04460158571600914, + "learning_rate": 1.4434873807015281e-05, + "loss": 1.0432, + "step": 2197 + }, + { + "epoch": 2.4785310734463275, + "grad_norm": 0.04447031766176224, + "learning_rate": 1.4373461960230671e-05, + "loss": 0.872, + "step": 2198 + }, + { + "epoch": 2.4796610169491524, + "grad_norm": 0.04573468118906021, + "learning_rate": 1.4312170910606937e-05, + "loss": 0.9578, + "step": 2199 + }, + { + "epoch": 2.4807909604519773, + "grad_norm": 0.04439990594983101, + "learning_rate": 1.4251000744610033e-05, + "loss": 0.9185, + "step": 2200 + }, + { + "epoch": 2.4819209039548022, + "grad_norm": 0.045020122081041336, + "learning_rate": 1.4189951548535496e-05, + "loss": 0.9954, + "step": 2201 + }, + { + "epoch": 2.483050847457627, + "grad_norm": 0.04263998568058014, + "learning_rate": 1.4129023408508101e-05, + "loss": 0.9367, + "step": 2202 + }, + { + "epoch": 2.484180790960452, + "grad_norm": 0.04334124177694321, + "learning_rate": 1.4068216410481904e-05, + "loss": 0.9856, + "step": 2203 + }, + { + "epoch": 2.485310734463277, + "grad_norm": 0.04389631003141403, + "learning_rate": 1.4007530640240029e-05, + "loss": 0.8721, + "step": 2204 + }, + { + "epoch": 2.486440677966102, + "grad_norm": 0.046165741980075836, + "learning_rate": 1.394696618339456e-05, + "loss": 1.0324, + "step": 2205 + }, + { + "epoch": 2.4875706214689264, + "grad_norm": 0.04491398110985756, + "learning_rate": 1.388652312538653e-05, + "loss": 0.9486, + "step": 2206 + }, + { + "epoch": 2.4887005649717513, + "grad_norm": 0.043603088706731796, + "learning_rate": 1.3826201551485596e-05, + "loss": 0.9196, + "step": 2207 + }, + { + "epoch": 2.489830508474576, + "grad_norm": 0.04453226923942566, + "learning_rate": 1.3766001546790075e-05, + "loss": 0.9218, + "step": 2208 + }, + { + "epoch": 2.490960451977401, + "grad_norm": 0.04467688500881195, + "learning_rate": 1.3705923196226832e-05, + "loss": 0.8989, + "step": 2209 + }, + { + "epoch": 2.492090395480226, + "grad_norm": 0.04468687251210213, + "learning_rate": 1.364596658455105e-05, + "loss": 0.894, + "step": 2210 + }, + { + "epoch": 2.493220338983051, + "grad_norm": 0.043691717088222504, + "learning_rate": 1.3586131796346147e-05, + "loss": 0.9544, + "step": 2211 + }, + { + "epoch": 2.494350282485876, + "grad_norm": 0.04409053549170494, + "learning_rate": 1.3526418916023753e-05, + "loss": 0.9463, + "step": 2212 + }, + { + "epoch": 2.4954802259887003, + "grad_norm": 0.0438973568379879, + "learning_rate": 1.3466828027823475e-05, + "loss": 1.0117, + "step": 2213 + }, + { + "epoch": 2.4966101694915253, + "grad_norm": 0.04333541914820671, + "learning_rate": 1.3407359215812832e-05, + "loss": 1.0121, + "step": 2214 + }, + { + "epoch": 2.49774011299435, + "grad_norm": 0.04311882331967354, + "learning_rate": 1.3348012563887102e-05, + "loss": 0.9785, + "step": 2215 + }, + { + "epoch": 2.498870056497175, + "grad_norm": 0.044175051152706146, + "learning_rate": 1.328878815576926e-05, + "loss": 1.0375, + "step": 2216 + }, + { + "epoch": 2.5, + "grad_norm": 0.04385006055235863, + "learning_rate": 1.3229686075009795e-05, + "loss": 0.9311, + "step": 2217 + }, + { + "epoch": 2.501129943502825, + "grad_norm": 0.044461339712142944, + "learning_rate": 1.3170706404986644e-05, + "loss": 0.9383, + "step": 2218 + }, + { + "epoch": 2.50225988700565, + "grad_norm": 0.042694512754678726, + "learning_rate": 1.3111849228905026e-05, + "loss": 0.9092, + "step": 2219 + }, + { + "epoch": 2.5033898305084747, + "grad_norm": 0.04385200887918472, + "learning_rate": 1.3053114629797437e-05, + "loss": 0.9969, + "step": 2220 + }, + { + "epoch": 2.5045197740112997, + "grad_norm": 0.04622391611337662, + "learning_rate": 1.2994502690523303e-05, + "loss": 0.9167, + "step": 2221 + }, + { + "epoch": 2.505649717514124, + "grad_norm": 0.043433066457509995, + "learning_rate": 1.293601349376915e-05, + "loss": 1.0352, + "step": 2222 + }, + { + "epoch": 2.506779661016949, + "grad_norm": 0.04414517059922218, + "learning_rate": 1.2877647122048265e-05, + "loss": 1.0007, + "step": 2223 + }, + { + "epoch": 2.507909604519774, + "grad_norm": 0.04432892054319382, + "learning_rate": 1.281940365770068e-05, + "loss": 0.9849, + "step": 2224 + }, + { + "epoch": 2.509039548022599, + "grad_norm": 0.04392955079674721, + "learning_rate": 1.2761283182893047e-05, + "loss": 0.9475, + "step": 2225 + }, + { + "epoch": 2.510169491525424, + "grad_norm": 0.045272983610630035, + "learning_rate": 1.2703285779618491e-05, + "loss": 0.9326, + "step": 2226 + }, + { + "epoch": 2.5112994350282487, + "grad_norm": 0.04357200860977173, + "learning_rate": 1.2645411529696527e-05, + "loss": 1.004, + "step": 2227 + }, + { + "epoch": 2.512429378531073, + "grad_norm": 0.04428897425532341, + "learning_rate": 1.2587660514772936e-05, + "loss": 1.0016, + "step": 2228 + }, + { + "epoch": 2.513559322033898, + "grad_norm": 0.042956896126270294, + "learning_rate": 1.2530032816319637e-05, + "loss": 0.9741, + "step": 2229 + }, + { + "epoch": 2.514689265536723, + "grad_norm": 0.04535180702805519, + "learning_rate": 1.2472528515634584e-05, + "loss": 0.9554, + "step": 2230 + }, + { + "epoch": 2.515819209039548, + "grad_norm": 0.044575612992048264, + "learning_rate": 1.2415147693841644e-05, + "loss": 1.0013, + "step": 2231 + }, + { + "epoch": 2.516949152542373, + "grad_norm": 0.04531654715538025, + "learning_rate": 1.2357890431890473e-05, + "loss": 0.9653, + "step": 2232 + }, + { + "epoch": 2.5180790960451978, + "grad_norm": 0.044301070272922516, + "learning_rate": 1.230075681055648e-05, + "loss": 0.9754, + "step": 2233 + }, + { + "epoch": 2.5192090395480227, + "grad_norm": 0.04337549954652786, + "learning_rate": 1.22437469104406e-05, + "loss": 0.9641, + "step": 2234 + }, + { + "epoch": 2.5203389830508476, + "grad_norm": 0.044852569699287415, + "learning_rate": 1.218686081196917e-05, + "loss": 0.9933, + "step": 2235 + }, + { + "epoch": 2.5214689265536725, + "grad_norm": 0.0434139259159565, + "learning_rate": 1.213009859539399e-05, + "loss": 0.8976, + "step": 2236 + }, + { + "epoch": 2.5225988700564974, + "grad_norm": 0.043631184846162796, + "learning_rate": 1.207346034079203e-05, + "loss": 0.9398, + "step": 2237 + }, + { + "epoch": 2.523728813559322, + "grad_norm": 0.0428677536547184, + "learning_rate": 1.2016946128065387e-05, + "loss": 0.9076, + "step": 2238 + }, + { + "epoch": 2.524858757062147, + "grad_norm": 0.04406916722655296, + "learning_rate": 1.1960556036941174e-05, + "loss": 0.9515, + "step": 2239 + }, + { + "epoch": 2.5259887005649717, + "grad_norm": 0.04295162111520767, + "learning_rate": 1.1904290146971397e-05, + "loss": 0.9814, + "step": 2240 + }, + { + "epoch": 2.5271186440677966, + "grad_norm": 0.043710824102163315, + "learning_rate": 1.1848148537532843e-05, + "loss": 0.9396, + "step": 2241 + }, + { + "epoch": 2.5282485875706215, + "grad_norm": 0.044418223202228546, + "learning_rate": 1.179213128782698e-05, + "loss": 0.9687, + "step": 2242 + }, + { + "epoch": 2.5293785310734465, + "grad_norm": 0.043215930461883545, + "learning_rate": 1.1736238476879802e-05, + "loss": 0.9498, + "step": 2243 + }, + { + "epoch": 2.530508474576271, + "grad_norm": 0.04229172319173813, + "learning_rate": 1.1680470183541847e-05, + "loss": 0.9091, + "step": 2244 + }, + { + "epoch": 2.531638418079096, + "grad_norm": 0.04341178387403488, + "learning_rate": 1.1624826486487871e-05, + "loss": 1.0278, + "step": 2245 + }, + { + "epoch": 2.5327683615819208, + "grad_norm": 0.046501412987709045, + "learning_rate": 1.1569307464216895e-05, + "loss": 1.0282, + "step": 2246 + }, + { + "epoch": 2.5338983050847457, + "grad_norm": 0.04544202238321304, + "learning_rate": 1.1513913195052107e-05, + "loss": 0.9748, + "step": 2247 + }, + { + "epoch": 2.5350282485875706, + "grad_norm": 0.045019011944532394, + "learning_rate": 1.1458643757140674e-05, + "loss": 0.9622, + "step": 2248 + }, + { + "epoch": 2.5361581920903955, + "grad_norm": 0.04544651135802269, + "learning_rate": 1.1403499228453596e-05, + "loss": 0.9654, + "step": 2249 + }, + { + "epoch": 2.5372881355932204, + "grad_norm": 0.04545415937900543, + "learning_rate": 1.1348479686785751e-05, + "loss": 0.8964, + "step": 2250 + }, + { + "epoch": 2.5384180790960453, + "grad_norm": 0.0433647446334362, + "learning_rate": 1.1293585209755631e-05, + "loss": 0.8951, + "step": 2251 + }, + { + "epoch": 2.5395480225988702, + "grad_norm": 0.045199718326330185, + "learning_rate": 1.123881587480533e-05, + "loss": 0.9357, + "step": 2252 + }, + { + "epoch": 2.540677966101695, + "grad_norm": 0.04812634736299515, + "learning_rate": 1.1184171759200368e-05, + "loss": 0.8831, + "step": 2253 + }, + { + "epoch": 2.5418079096045196, + "grad_norm": 0.04364898055791855, + "learning_rate": 1.1129652940029623e-05, + "loss": 0.9079, + "step": 2254 + }, + { + "epoch": 2.5429378531073445, + "grad_norm": 0.044137731194496155, + "learning_rate": 1.1075259494205225e-05, + "loss": 1.0158, + "step": 2255 + }, + { + "epoch": 2.5440677966101695, + "grad_norm": 0.044673506170511246, + "learning_rate": 1.102099149846243e-05, + "loss": 0.9106, + "step": 2256 + }, + { + "epoch": 2.5451977401129944, + "grad_norm": 0.04392934590578079, + "learning_rate": 1.0966849029359472e-05, + "loss": 0.9462, + "step": 2257 + }, + { + "epoch": 2.5463276836158193, + "grad_norm": 0.044603992253541946, + "learning_rate": 1.0912832163277609e-05, + "loss": 1.0413, + "step": 2258 + }, + { + "epoch": 2.547457627118644, + "grad_norm": 0.043722860515117645, + "learning_rate": 1.0858940976420772e-05, + "loss": 1.0259, + "step": 2259 + }, + { + "epoch": 2.5485875706214687, + "grad_norm": 0.04358895495533943, + "learning_rate": 1.0805175544815648e-05, + "loss": 0.907, + "step": 2260 + }, + { + "epoch": 2.5497175141242936, + "grad_norm": 0.0456516295671463, + "learning_rate": 1.0751535944311564e-05, + "loss": 0.9535, + "step": 2261 + }, + { + "epoch": 2.5508474576271185, + "grad_norm": 0.046546079218387604, + "learning_rate": 1.0698022250580264e-05, + "loss": 1.042, + "step": 2262 + }, + { + "epoch": 2.5519774011299434, + "grad_norm": 0.043882519006729126, + "learning_rate": 1.0644634539115906e-05, + "loss": 0.883, + "step": 2263 + }, + { + "epoch": 2.5531073446327683, + "grad_norm": 0.04427448660135269, + "learning_rate": 1.0591372885234885e-05, + "loss": 0.9493, + "step": 2264 + }, + { + "epoch": 2.5542372881355933, + "grad_norm": 0.04398886859416962, + "learning_rate": 1.0538237364075787e-05, + "loss": 1.0124, + "step": 2265 + }, + { + "epoch": 2.555367231638418, + "grad_norm": 0.04535575583577156, + "learning_rate": 1.0485228050599272e-05, + "loss": 0.9521, + "step": 2266 + }, + { + "epoch": 2.556497175141243, + "grad_norm": 0.04494888707995415, + "learning_rate": 1.0432345019587908e-05, + "loss": 0.9673, + "step": 2267 + }, + { + "epoch": 2.557627118644068, + "grad_norm": 0.04512500762939453, + "learning_rate": 1.0379588345646152e-05, + "loss": 0.9744, + "step": 2268 + }, + { + "epoch": 2.558757062146893, + "grad_norm": 0.045277807861566544, + "learning_rate": 1.032695810320018e-05, + "loss": 0.9401, + "step": 2269 + }, + { + "epoch": 2.5598870056497174, + "grad_norm": 0.044249020516872406, + "learning_rate": 1.0274454366497787e-05, + "loss": 1.0551, + "step": 2270 + }, + { + "epoch": 2.5610169491525423, + "grad_norm": 0.04531550407409668, + "learning_rate": 1.0222077209608383e-05, + "loss": 0.9485, + "step": 2271 + }, + { + "epoch": 2.562146892655367, + "grad_norm": 0.044957417994737625, + "learning_rate": 1.0169826706422735e-05, + "loss": 1.0354, + "step": 2272 + }, + { + "epoch": 2.563276836158192, + "grad_norm": 0.04343345761299133, + "learning_rate": 1.0117702930652906e-05, + "loss": 0.9992, + "step": 2273 + }, + { + "epoch": 2.564406779661017, + "grad_norm": 0.0441376157104969, + "learning_rate": 1.006570595583226e-05, + "loss": 0.8785, + "step": 2274 + }, + { + "epoch": 2.565536723163842, + "grad_norm": 0.04558190703392029, + "learning_rate": 1.0013835855315235e-05, + "loss": 1.0488, + "step": 2275 + }, + { + "epoch": 2.5666666666666664, + "grad_norm": 0.04628458246588707, + "learning_rate": 9.962092702277282e-06, + "loss": 0.9738, + "step": 2276 + }, + { + "epoch": 2.5677966101694913, + "grad_norm": 0.046162836253643036, + "learning_rate": 9.910476569714766e-06, + "loss": 1.0152, + "step": 2277 + }, + { + "epoch": 2.5689265536723163, + "grad_norm": 0.0438779816031456, + "learning_rate": 9.858987530444852e-06, + "loss": 0.9672, + "step": 2278 + }, + { + "epoch": 2.570056497175141, + "grad_norm": 0.0449378527700901, + "learning_rate": 9.807625657105424e-06, + "loss": 0.8919, + "step": 2279 + }, + { + "epoch": 2.571186440677966, + "grad_norm": 0.04468364268541336, + "learning_rate": 9.756391022154954e-06, + "loss": 0.9767, + "step": 2280 + }, + { + "epoch": 2.572316384180791, + "grad_norm": 0.05266987159848213, + "learning_rate": 9.705283697872402e-06, + "loss": 1.0026, + "step": 2281 + }, + { + "epoch": 2.573446327683616, + "grad_norm": 0.044424451887607574, + "learning_rate": 9.654303756357186e-06, + "loss": 0.9913, + "step": 2282 + }, + { + "epoch": 2.574576271186441, + "grad_norm": 0.045492034405469894, + "learning_rate": 9.60345126952893e-06, + "loss": 0.9934, + "step": 2283 + }, + { + "epoch": 2.5757062146892657, + "grad_norm": 0.04521951451897621, + "learning_rate": 9.552726309127492e-06, + "loss": 0.9527, + "step": 2284 + }, + { + "epoch": 2.5768361581920907, + "grad_norm": 0.04271269962191582, + "learning_rate": 9.502128946712862e-06, + "loss": 0.9101, + "step": 2285 + }, + { + "epoch": 2.577966101694915, + "grad_norm": 0.043572571128606796, + "learning_rate": 9.45165925366498e-06, + "loss": 0.9369, + "step": 2286 + }, + { + "epoch": 2.57909604519774, + "grad_norm": 0.04333474859595299, + "learning_rate": 9.401317301183655e-06, + "loss": 1.0792, + "step": 2287 + }, + { + "epoch": 2.580225988700565, + "grad_norm": 0.044597212225198746, + "learning_rate": 9.351103160288533e-06, + "loss": 0.9374, + "step": 2288 + }, + { + "epoch": 2.58135593220339, + "grad_norm": 0.04439055919647217, + "learning_rate": 9.301016901818948e-06, + "loss": 0.869, + "step": 2289 + }, + { + "epoch": 2.582485875706215, + "grad_norm": 0.04525582492351532, + "learning_rate": 9.251058596433793e-06, + "loss": 1.0324, + "step": 2290 + }, + { + "epoch": 2.5836158192090397, + "grad_norm": 0.04282711073756218, + "learning_rate": 9.201228314611477e-06, + "loss": 0.9317, + "step": 2291 + }, + { + "epoch": 2.584745762711864, + "grad_norm": 0.044124454259872437, + "learning_rate": 9.151526126649779e-06, + "loss": 1.0026, + "step": 2292 + }, + { + "epoch": 2.585875706214689, + "grad_norm": 0.043038588017225266, + "learning_rate": 9.10195210266579e-06, + "loss": 0.9802, + "step": 2293 + }, + { + "epoch": 2.587005649717514, + "grad_norm": 0.04550214856863022, + "learning_rate": 9.052506312595798e-06, + "loss": 1.0226, + "step": 2294 + }, + { + "epoch": 2.588135593220339, + "grad_norm": 0.0450584851205349, + "learning_rate": 9.003188826195142e-06, + "loss": 1.0131, + "step": 2295 + }, + { + "epoch": 2.589265536723164, + "grad_norm": 0.0439411997795105, + "learning_rate": 8.953999713038252e-06, + "loss": 1.002, + "step": 2296 + }, + { + "epoch": 2.5903954802259888, + "grad_norm": 0.04387841746211052, + "learning_rate": 8.904939042518345e-06, + "loss": 0.993, + "step": 2297 + }, + { + "epoch": 2.5915254237288137, + "grad_norm": 0.046317506581544876, + "learning_rate": 8.856006883847479e-06, + "loss": 0.9878, + "step": 2298 + }, + { + "epoch": 2.5926553672316386, + "grad_norm": 0.04398587346076965, + "learning_rate": 8.807203306056466e-06, + "loss": 0.9364, + "step": 2299 + }, + { + "epoch": 2.5937853107344635, + "grad_norm": 0.044241469353437424, + "learning_rate": 8.758528377994667e-06, + "loss": 0.9447, + "step": 2300 + }, + { + "epoch": 2.594915254237288, + "grad_norm": 0.045419175177812576, + "learning_rate": 8.709982168329955e-06, + "loss": 0.9441, + "step": 2301 + }, + { + "epoch": 2.596045197740113, + "grad_norm": 0.04503266513347626, + "learning_rate": 8.661564745548634e-06, + "loss": 0.9268, + "step": 2302 + }, + { + "epoch": 2.597175141242938, + "grad_norm": 0.04366137087345123, + "learning_rate": 8.613276177955309e-06, + "loss": 0.9875, + "step": 2303 + }, + { + "epoch": 2.5983050847457627, + "grad_norm": 0.045485030859708786, + "learning_rate": 8.565116533672801e-06, + "loss": 0.9896, + "step": 2304 + }, + { + "epoch": 2.5994350282485876, + "grad_norm": 0.04749319702386856, + "learning_rate": 8.517085880642062e-06, + "loss": 0.9509, + "step": 2305 + }, + { + "epoch": 2.6005649717514125, + "grad_norm": 0.044025685638189316, + "learning_rate": 8.469184286622078e-06, + "loss": 0.9259, + "step": 2306 + }, + { + "epoch": 2.601694915254237, + "grad_norm": 0.04714653640985489, + "learning_rate": 8.42141181918975e-06, + "loss": 0.986, + "step": 2307 + }, + { + "epoch": 2.602824858757062, + "grad_norm": 0.04400424286723137, + "learning_rate": 8.373768545739813e-06, + "loss": 0.9512, + "step": 2308 + }, + { + "epoch": 2.603954802259887, + "grad_norm": 0.0437365286052227, + "learning_rate": 8.326254533484745e-06, + "loss": 1.0059, + "step": 2309 + }, + { + "epoch": 2.6050847457627118, + "grad_norm": 0.045271988958120346, + "learning_rate": 8.278869849454718e-06, + "loss": 1.0108, + "step": 2310 + }, + { + "epoch": 2.6062146892655367, + "grad_norm": 0.04391276836395264, + "learning_rate": 8.231614560497337e-06, + "loss": 0.9316, + "step": 2311 + }, + { + "epoch": 2.6073446327683616, + "grad_norm": 0.04322647675871849, + "learning_rate": 8.184488733277796e-06, + "loss": 0.954, + "step": 2312 + }, + { + "epoch": 2.6084745762711865, + "grad_norm": 0.04381094500422478, + "learning_rate": 8.13749243427857e-06, + "loss": 0.996, + "step": 2313 + }, + { + "epoch": 2.6096045197740114, + "grad_norm": 0.04461012780666351, + "learning_rate": 8.090625729799429e-06, + "loss": 0.9191, + "step": 2314 + }, + { + "epoch": 2.6107344632768363, + "grad_norm": 0.04649262875318527, + "learning_rate": 8.043888685957313e-06, + "loss": 0.9661, + "step": 2315 + }, + { + "epoch": 2.6118644067796613, + "grad_norm": 0.043844398111104965, + "learning_rate": 7.99728136868626e-06, + "loss": 1.0126, + "step": 2316 + }, + { + "epoch": 2.6129943502824857, + "grad_norm": 0.04373704642057419, + "learning_rate": 7.950803843737264e-06, + "loss": 1.0559, + "step": 2317 + }, + { + "epoch": 2.6141242937853106, + "grad_norm": 0.04947759956121445, + "learning_rate": 7.904456176678232e-06, + "loss": 0.8689, + "step": 2318 + }, + { + "epoch": 2.6152542372881356, + "grad_norm": 0.04527619481086731, + "learning_rate": 7.858238432893883e-06, + "loss": 0.9564, + "step": 2319 + }, + { + "epoch": 2.6163841807909605, + "grad_norm": 0.04394813999533653, + "learning_rate": 7.812150677585673e-06, + "loss": 0.8771, + "step": 2320 + }, + { + "epoch": 2.6175141242937854, + "grad_norm": 0.04304569587111473, + "learning_rate": 7.766192975771591e-06, + "loss": 0.9332, + "step": 2321 + }, + { + "epoch": 2.6186440677966103, + "grad_norm": 0.04365582764148712, + "learning_rate": 7.720365392286222e-06, + "loss": 1.0163, + "step": 2322 + }, + { + "epoch": 2.6197740112994348, + "grad_norm": 0.045340195298194885, + "learning_rate": 7.674667991780604e-06, + "loss": 0.9353, + "step": 2323 + }, + { + "epoch": 2.6209039548022597, + "grad_norm": 0.045364297926425934, + "learning_rate": 7.6291008387220916e-06, + "loss": 0.9099, + "step": 2324 + }, + { + "epoch": 2.6220338983050846, + "grad_norm": 0.04610748961567879, + "learning_rate": 7.583663997394241e-06, + "loss": 0.9208, + "step": 2325 + }, + { + "epoch": 2.6231638418079095, + "grad_norm": 0.0443345345556736, + "learning_rate": 7.538357531896856e-06, + "loss": 0.9391, + "step": 2326 + }, + { + "epoch": 2.6242937853107344, + "grad_norm": 0.04437210038304329, + "learning_rate": 7.4931815061457855e-06, + "loss": 1.0321, + "step": 2327 + }, + { + "epoch": 2.6254237288135593, + "grad_norm": 0.04455529898405075, + "learning_rate": 7.44813598387285e-06, + "loss": 0.9334, + "step": 2328 + }, + { + "epoch": 2.6265536723163843, + "grad_norm": 0.04353608191013336, + "learning_rate": 7.403221028625762e-06, + "loss": 0.9266, + "step": 2329 + }, + { + "epoch": 2.627683615819209, + "grad_norm": 0.04302187263965607, + "learning_rate": 7.358436703768035e-06, + "loss": 0.9863, + "step": 2330 + }, + { + "epoch": 2.628813559322034, + "grad_norm": 0.04428388550877571, + "learning_rate": 7.313783072478953e-06, + "loss": 1.0419, + "step": 2331 + }, + { + "epoch": 2.629943502824859, + "grad_norm": 0.04457690566778183, + "learning_rate": 7.269260197753325e-06, + "loss": 0.9549, + "step": 2332 + }, + { + "epoch": 2.6310734463276835, + "grad_norm": 0.04523680359125137, + "learning_rate": 7.224868142401542e-06, + "loss": 1.0456, + "step": 2333 + }, + { + "epoch": 2.6322033898305084, + "grad_norm": 0.04431130737066269, + "learning_rate": 7.180606969049519e-06, + "loss": 0.8987, + "step": 2334 + }, + { + "epoch": 2.6333333333333333, + "grad_norm": 0.045462481677532196, + "learning_rate": 7.136476740138387e-06, + "loss": 0.9716, + "step": 2335 + }, + { + "epoch": 2.634463276836158, + "grad_norm": 0.044277604669332504, + "learning_rate": 7.092477517924634e-06, + "loss": 0.9404, + "step": 2336 + }, + { + "epoch": 2.635593220338983, + "grad_norm": 0.045289479196071625, + "learning_rate": 7.048609364479941e-06, + "loss": 0.9956, + "step": 2337 + }, + { + "epoch": 2.636723163841808, + "grad_norm": 0.04256889224052429, + "learning_rate": 7.00487234169106e-06, + "loss": 0.9515, + "step": 2338 + }, + { + "epoch": 2.6378531073446325, + "grad_norm": 0.04596678167581558, + "learning_rate": 6.961266511259734e-06, + "loss": 0.8839, + "step": 2339 + }, + { + "epoch": 2.6389830508474574, + "grad_norm": 0.04532042145729065, + "learning_rate": 6.917791934702655e-06, + "loss": 1.0426, + "step": 2340 + }, + { + "epoch": 2.6401129943502823, + "grad_norm": 0.04318855702877045, + "learning_rate": 6.87444867335133e-06, + "loss": 1.0885, + "step": 2341 + }, + { + "epoch": 2.6412429378531073, + "grad_norm": 0.0447513610124588, + "learning_rate": 6.831236788352035e-06, + "loss": 0.9046, + "step": 2342 + }, + { + "epoch": 2.642372881355932, + "grad_norm": 0.04536350071430206, + "learning_rate": 6.788156340665697e-06, + "loss": 0.9783, + "step": 2343 + }, + { + "epoch": 2.643502824858757, + "grad_norm": 0.04435262084007263, + "learning_rate": 6.745207391067787e-06, + "loss": 0.9362, + "step": 2344 + }, + { + "epoch": 2.644632768361582, + "grad_norm": 0.04777863621711731, + "learning_rate": 6.702390000148351e-06, + "loss": 0.9676, + "step": 2345 + }, + { + "epoch": 2.645762711864407, + "grad_norm": 0.04534154385328293, + "learning_rate": 6.6597042283117365e-06, + "loss": 0.9768, + "step": 2346 + }, + { + "epoch": 2.646892655367232, + "grad_norm": 0.046897709369659424, + "learning_rate": 6.617150135776662e-06, + "loss": 0.9542, + "step": 2347 + }, + { + "epoch": 2.6480225988700568, + "grad_norm": 0.04535379633307457, + "learning_rate": 6.574727782576129e-06, + "loss": 0.9005, + "step": 2348 + }, + { + "epoch": 2.6491525423728812, + "grad_norm": 0.04338109865784645, + "learning_rate": 6.532437228557153e-06, + "loss": 0.9573, + "step": 2349 + }, + { + "epoch": 2.650282485875706, + "grad_norm": 0.043659161776304245, + "learning_rate": 6.490278533380956e-06, + "loss": 1.0269, + "step": 2350 + }, + { + "epoch": 2.651412429378531, + "grad_norm": 0.04302853345870972, + "learning_rate": 6.4482517565226715e-06, + "loss": 0.8879, + "step": 2351 + }, + { + "epoch": 2.652542372881356, + "grad_norm": 0.04258056730031967, + "learning_rate": 6.406356957271331e-06, + "loss": 0.8815, + "step": 2352 + }, + { + "epoch": 2.653672316384181, + "grad_norm": 0.04558559134602547, + "learning_rate": 6.364594194729789e-06, + "loss": 0.9809, + "step": 2353 + }, + { + "epoch": 2.654802259887006, + "grad_norm": 0.04610477015376091, + "learning_rate": 6.3229635278146295e-06, + "loss": 1.0469, + "step": 2354 + }, + { + "epoch": 2.6559322033898303, + "grad_norm": 0.044572293758392334, + "learning_rate": 6.281465015256094e-06, + "loss": 1.0038, + "step": 2355 + }, + { + "epoch": 2.657062146892655, + "grad_norm": 0.043487418442964554, + "learning_rate": 6.240098715597975e-06, + "loss": 0.9886, + "step": 2356 + }, + { + "epoch": 2.65819209039548, + "grad_norm": 0.044853370636701584, + "learning_rate": 6.198864687197536e-06, + "loss": 1.0096, + "step": 2357 + }, + { + "epoch": 2.659322033898305, + "grad_norm": 0.044973455369472504, + "learning_rate": 6.157762988225457e-06, + "loss": 0.9626, + "step": 2358 + }, + { + "epoch": 2.66045197740113, + "grad_norm": 0.04432562738656998, + "learning_rate": 6.116793676665755e-06, + "loss": 1.0168, + "step": 2359 + }, + { + "epoch": 2.661581920903955, + "grad_norm": 0.046326398849487305, + "learning_rate": 6.0759568103156195e-06, + "loss": 0.9713, + "step": 2360 + }, + { + "epoch": 2.661581920903955, + "eval_loss": 0.9848312735557556, + "eval_runtime": 556.2884, + "eval_samples_per_second": 17.586, + "eval_steps_per_second": 8.794, + "step": 2360 + }, + { + "epoch": 2.6627118644067798, + "grad_norm": 0.04329252615571022, + "learning_rate": 6.0352524467854555e-06, + "loss": 0.9532, + "step": 2361 + }, + { + "epoch": 2.6638418079096047, + "grad_norm": 0.04437597095966339, + "learning_rate": 5.994680643498729e-06, + "loss": 0.8682, + "step": 2362 + }, + { + "epoch": 2.6649717514124296, + "grad_norm": 0.04312526434659958, + "learning_rate": 5.954241457691834e-06, + "loss": 1.0423, + "step": 2363 + }, + { + "epoch": 2.6661016949152545, + "grad_norm": 0.04437880218029022, + "learning_rate": 5.913934946414179e-06, + "loss": 0.9964, + "step": 2364 + }, + { + "epoch": 2.667231638418079, + "grad_norm": 0.04453466460108757, + "learning_rate": 5.873761166527936e-06, + "loss": 0.9702, + "step": 2365 + }, + { + "epoch": 2.668361581920904, + "grad_norm": 0.04452778771519661, + "learning_rate": 5.833720174708024e-06, + "loss": 0.9637, + "step": 2366 + }, + { + "epoch": 2.669491525423729, + "grad_norm": 0.04603665694594383, + "learning_rate": 5.793812027442069e-06, + "loss": 1.0437, + "step": 2367 + }, + { + "epoch": 2.6706214689265537, + "grad_norm": 0.04555994272232056, + "learning_rate": 5.754036781030247e-06, + "loss": 0.9752, + "step": 2368 + }, + { + "epoch": 2.6717514124293786, + "grad_norm": 0.04573677107691765, + "learning_rate": 5.714394491585295e-06, + "loss": 0.94, + "step": 2369 + }, + { + "epoch": 2.6728813559322036, + "grad_norm": 0.04386160522699356, + "learning_rate": 5.674885215032322e-06, + "loss": 0.9949, + "step": 2370 + }, + { + "epoch": 2.674011299435028, + "grad_norm": 0.04481654241681099, + "learning_rate": 5.635509007108819e-06, + "loss": 1.0066, + "step": 2371 + }, + { + "epoch": 2.675141242937853, + "grad_norm": 0.04394556209445, + "learning_rate": 5.596265923364585e-06, + "loss": 0.9715, + "step": 2372 + }, + { + "epoch": 2.676271186440678, + "grad_norm": 0.0456613190472126, + "learning_rate": 5.557156019161558e-06, + "loss": 0.9375, + "step": 2373 + }, + { + "epoch": 2.6774011299435028, + "grad_norm": 0.04373668506741524, + "learning_rate": 5.518179349673802e-06, + "loss": 0.9866, + "step": 2374 + }, + { + "epoch": 2.6785310734463277, + "grad_norm": 0.04538121819496155, + "learning_rate": 5.479335969887467e-06, + "loss": 0.9699, + "step": 2375 + }, + { + "epoch": 2.6796610169491526, + "grad_norm": 0.04715288430452347, + "learning_rate": 5.44062593460063e-06, + "loss": 0.9896, + "step": 2376 + }, + { + "epoch": 2.6807909604519775, + "grad_norm": 0.044993508607149124, + "learning_rate": 5.4020492984232175e-06, + "loss": 0.9059, + "step": 2377 + }, + { + "epoch": 2.6819209039548024, + "grad_norm": 0.04353228956460953, + "learning_rate": 5.363606115777054e-06, + "loss": 0.9714, + "step": 2378 + }, + { + "epoch": 2.6830508474576273, + "grad_norm": 0.04626986011862755, + "learning_rate": 5.325296440895622e-06, + "loss": 1.0475, + "step": 2379 + }, + { + "epoch": 2.684180790960452, + "grad_norm": 0.04400772973895073, + "learning_rate": 5.287120327824091e-06, + "loss": 0.9536, + "step": 2380 + }, + { + "epoch": 2.6853107344632767, + "grad_norm": 0.04464118182659149, + "learning_rate": 5.249077830419191e-06, + "loss": 0.9327, + "step": 2381 + }, + { + "epoch": 2.6864406779661016, + "grad_norm": 0.044131822884082794, + "learning_rate": 5.211169002349148e-06, + "loss": 0.8624, + "step": 2382 + }, + { + "epoch": 2.6875706214689266, + "grad_norm": 0.04314563795924187, + "learning_rate": 5.1733938970936705e-06, + "loss": 1.057, + "step": 2383 + }, + { + "epoch": 2.6887005649717515, + "grad_norm": 0.0449894517660141, + "learning_rate": 5.135752567943753e-06, + "loss": 1.0021, + "step": 2384 + }, + { + "epoch": 2.6898305084745764, + "grad_norm": 0.04238582029938698, + "learning_rate": 5.098245068001661e-06, + "loss": 0.9477, + "step": 2385 + }, + { + "epoch": 2.690960451977401, + "grad_norm": 0.04517286270856857, + "learning_rate": 5.060871450180949e-06, + "loss": 0.9401, + "step": 2386 + }, + { + "epoch": 2.6920903954802258, + "grad_norm": 0.0464588962495327, + "learning_rate": 5.023631767206205e-06, + "loss": 0.9651, + "step": 2387 + }, + { + "epoch": 2.6932203389830507, + "grad_norm": 0.04352294281125069, + "learning_rate": 4.986526071613118e-06, + "loss": 0.9318, + "step": 2388 + }, + { + "epoch": 2.6943502824858756, + "grad_norm": 0.04457921162247658, + "learning_rate": 4.949554415748325e-06, + "loss": 0.9933, + "step": 2389 + }, + { + "epoch": 2.6954802259887005, + "grad_norm": 0.04590234532952309, + "learning_rate": 4.9127168517693946e-06, + "loss": 0.9513, + "step": 2390 + }, + { + "epoch": 2.6966101694915254, + "grad_norm": 0.04296498000621796, + "learning_rate": 4.876013431644721e-06, + "loss": 0.9084, + "step": 2391 + }, + { + "epoch": 2.6977401129943503, + "grad_norm": 0.044914163649082184, + "learning_rate": 4.839444207153432e-06, + "loss": 0.9736, + "step": 2392 + }, + { + "epoch": 2.6988700564971753, + "grad_norm": 0.04357539862394333, + "learning_rate": 4.803009229885369e-06, + "loss": 1.0393, + "step": 2393 + }, + { + "epoch": 2.7, + "grad_norm": 0.04416859522461891, + "learning_rate": 4.766708551240972e-06, + "loss": 1.0616, + "step": 2394 + }, + { + "epoch": 2.701129943502825, + "grad_norm": 0.046145226806402206, + "learning_rate": 4.730542222431223e-06, + "loss": 0.9901, + "step": 2395 + }, + { + "epoch": 2.7022598870056496, + "grad_norm": 0.04451952874660492, + "learning_rate": 4.6945102944775385e-06, + "loss": 0.9736, + "step": 2396 + }, + { + "epoch": 2.7033898305084745, + "grad_norm": 0.04450220614671707, + "learning_rate": 4.658612818211827e-06, + "loss": 0.8922, + "step": 2397 + }, + { + "epoch": 2.7045197740112994, + "grad_norm": 0.043102215975522995, + "learning_rate": 4.6228498442761785e-06, + "loss": 1.0498, + "step": 2398 + }, + { + "epoch": 2.7056497175141243, + "grad_norm": 0.043673399835824966, + "learning_rate": 4.587221423123056e-06, + "loss": 1.0152, + "step": 2399 + }, + { + "epoch": 2.7067796610169492, + "grad_norm": 0.04494861513376236, + "learning_rate": 4.551727605015032e-06, + "loss": 0.9538, + "step": 2400 + }, + { + "epoch": 2.707909604519774, + "grad_norm": 0.04306402429938316, + "learning_rate": 4.516368440024832e-06, + "loss": 0.9531, + "step": 2401 + }, + { + "epoch": 2.7090395480225986, + "grad_norm": 0.04380949214100838, + "learning_rate": 4.481143978035196e-06, + "loss": 0.9118, + "step": 2402 + }, + { + "epoch": 2.7101694915254235, + "grad_norm": 0.04429548606276512, + "learning_rate": 4.44605426873882e-06, + "loss": 0.9699, + "step": 2403 + }, + { + "epoch": 2.7112994350282484, + "grad_norm": 0.04391038790345192, + "learning_rate": 4.411099361638338e-06, + "loss": 0.9136, + "step": 2404 + }, + { + "epoch": 2.7124293785310734, + "grad_norm": 0.04415955767035484, + "learning_rate": 4.376279306046183e-06, + "loss": 0.9088, + "step": 2405 + }, + { + "epoch": 2.7135593220338983, + "grad_norm": 0.04572036862373352, + "learning_rate": 4.341594151084538e-06, + "loss": 0.9603, + "step": 2406 + }, + { + "epoch": 2.714689265536723, + "grad_norm": 0.044412463903427124, + "learning_rate": 4.307043945685318e-06, + "loss": 0.8958, + "step": 2407 + }, + { + "epoch": 2.715819209039548, + "grad_norm": 0.044557929039001465, + "learning_rate": 4.272628738590012e-06, + "loss": 0.9025, + "step": 2408 + }, + { + "epoch": 2.716949152542373, + "grad_norm": 0.044220615178346634, + "learning_rate": 4.238348578349683e-06, + "loss": 0.9579, + "step": 2409 + }, + { + "epoch": 2.718079096045198, + "grad_norm": 0.04436076059937477, + "learning_rate": 4.2042035133248895e-06, + "loss": 0.9139, + "step": 2410 + }, + { + "epoch": 2.719209039548023, + "grad_norm": 0.04490151256322861, + "learning_rate": 4.170193591685601e-06, + "loss": 0.9488, + "step": 2411 + }, + { + "epoch": 2.7203389830508473, + "grad_norm": 0.0449664331972599, + "learning_rate": 4.136318861411081e-06, + "loss": 0.9821, + "step": 2412 + }, + { + "epoch": 2.7214689265536722, + "grad_norm": 0.04529043659567833, + "learning_rate": 4.10257937028996e-06, + "loss": 0.9616, + "step": 2413 + }, + { + "epoch": 2.722598870056497, + "grad_norm": 0.04390520602464676, + "learning_rate": 4.068975165920008e-06, + "loss": 0.9357, + "step": 2414 + }, + { + "epoch": 2.723728813559322, + "grad_norm": 0.04567769914865494, + "learning_rate": 4.035506295708191e-06, + "loss": 0.9597, + "step": 2415 + }, + { + "epoch": 2.724858757062147, + "grad_norm": 0.044814545661211014, + "learning_rate": 4.002172806870519e-06, + "loss": 0.9892, + "step": 2416 + }, + { + "epoch": 2.725988700564972, + "grad_norm": 0.044856686145067215, + "learning_rate": 3.96897474643203e-06, + "loss": 1.0405, + "step": 2417 + }, + { + "epoch": 2.7271186440677964, + "grad_norm": 0.04615604504942894, + "learning_rate": 3.935912161226696e-06, + "loss": 1.0903, + "step": 2418 + }, + { + "epoch": 2.7282485875706213, + "grad_norm": 0.043409042060375214, + "learning_rate": 3.9029850978973715e-06, + "loss": 0.9417, + "step": 2419 + }, + { + "epoch": 2.729378531073446, + "grad_norm": 0.04470163583755493, + "learning_rate": 3.870193602895733e-06, + "loss": 0.945, + "step": 2420 + }, + { + "epoch": 2.730508474576271, + "grad_norm": 0.04446692019701004, + "learning_rate": 3.8375377224822204e-06, + "loss": 1.0123, + "step": 2421 + }, + { + "epoch": 2.731638418079096, + "grad_norm": 0.04441095516085625, + "learning_rate": 3.805017502725905e-06, + "loss": 1.002, + "step": 2422 + }, + { + "epoch": 2.732768361581921, + "grad_norm": 0.04440312832593918, + "learning_rate": 3.7726329895044986e-06, + "loss": 0.9104, + "step": 2423 + }, + { + "epoch": 2.733898305084746, + "grad_norm": 0.04501642286777496, + "learning_rate": 3.740384228504312e-06, + "loss": 0.9695, + "step": 2424 + }, + { + "epoch": 2.7350282485875708, + "grad_norm": 0.04526914656162262, + "learning_rate": 3.7082712652200867e-06, + "loss": 0.9805, + "step": 2425 + }, + { + "epoch": 2.7361581920903957, + "grad_norm": 0.04381269961595535, + "learning_rate": 3.6762941449549727e-06, + "loss": 0.9831, + "step": 2426 + }, + { + "epoch": 2.7372881355932206, + "grad_norm": 0.04696391150355339, + "learning_rate": 3.6444529128205618e-06, + "loss": 0.9726, + "step": 2427 + }, + { + "epoch": 2.738418079096045, + "grad_norm": 0.04461664706468582, + "learning_rate": 3.6127476137366777e-06, + "loss": 1.0091, + "step": 2428 + }, + { + "epoch": 2.73954802259887, + "grad_norm": 0.04383271187543869, + "learning_rate": 3.5811782924313864e-06, + "loss": 1.0026, + "step": 2429 + }, + { + "epoch": 2.740677966101695, + "grad_norm": 0.04371367394924164, + "learning_rate": 3.54974499344094e-06, + "loss": 0.9416, + "step": 2430 + }, + { + "epoch": 2.74180790960452, + "grad_norm": 0.044015347957611084, + "learning_rate": 3.518447761109689e-06, + "loss": 0.9705, + "step": 2431 + }, + { + "epoch": 2.7429378531073447, + "grad_norm": 0.04504389315843582, + "learning_rate": 3.487286639590026e-06, + "loss": 0.9769, + "step": 2432 + }, + { + "epoch": 2.7440677966101696, + "grad_norm": 0.04418431222438812, + "learning_rate": 3.4562616728423426e-06, + "loss": 0.8872, + "step": 2433 + }, + { + "epoch": 2.745197740112994, + "grad_norm": 0.042971495538949966, + "learning_rate": 3.425372904634905e-06, + "loss": 0.9208, + "step": 2434 + }, + { + "epoch": 2.746327683615819, + "grad_norm": 0.04686740040779114, + "learning_rate": 3.3946203785439113e-06, + "loss": 1.0176, + "step": 2435 + }, + { + "epoch": 2.747457627118644, + "grad_norm": 0.04372727870941162, + "learning_rate": 3.3640041379532805e-06, + "loss": 0.8971, + "step": 2436 + }, + { + "epoch": 2.748587570621469, + "grad_norm": 0.04424417391419411, + "learning_rate": 3.3335242260547293e-06, + "loss": 1.0166, + "step": 2437 + }, + { + "epoch": 2.7497175141242938, + "grad_norm": 0.04451939836144447, + "learning_rate": 3.3031806858476065e-06, + "loss": 1.0402, + "step": 2438 + }, + { + "epoch": 2.7508474576271187, + "grad_norm": 0.04643287882208824, + "learning_rate": 3.2729735601389143e-06, + "loss": 0.8732, + "step": 2439 + }, + { + "epoch": 2.7519774011299436, + "grad_norm": 0.043419696390628815, + "learning_rate": 3.2429028915431534e-06, + "loss": 1.0057, + "step": 2440 + }, + { + "epoch": 2.7531073446327685, + "grad_norm": 0.04414771869778633, + "learning_rate": 3.21296872248239e-06, + "loss": 1.0273, + "step": 2441 + }, + { + "epoch": 2.7542372881355934, + "grad_norm": 0.04677413031458855, + "learning_rate": 3.1831710951860547e-06, + "loss": 0.8554, + "step": 2442 + }, + { + "epoch": 2.7553672316384183, + "grad_norm": 0.044694963842630386, + "learning_rate": 3.153510051690989e-06, + "loss": 1.0362, + "step": 2443 + }, + { + "epoch": 2.756497175141243, + "grad_norm": 0.043344851583242416, + "learning_rate": 3.123985633841364e-06, + "loss": 0.9214, + "step": 2444 + }, + { + "epoch": 2.7576271186440677, + "grad_norm": 0.04490336403250694, + "learning_rate": 3.094597883288575e-06, + "loss": 0.9133, + "step": 2445 + }, + { + "epoch": 2.7587570621468926, + "grad_norm": 0.045138705521821976, + "learning_rate": 3.0653468414912123e-06, + "loss": 0.9534, + "step": 2446 + }, + { + "epoch": 2.7598870056497176, + "grad_norm": 0.04454415664076805, + "learning_rate": 3.0362325497150348e-06, + "loss": 1.0111, + "step": 2447 + }, + { + "epoch": 2.7610169491525425, + "grad_norm": 0.04441482201218605, + "learning_rate": 3.0072550490328753e-06, + "loss": 0.9419, + "step": 2448 + }, + { + "epoch": 2.7621468926553674, + "grad_norm": 0.0444534532725811, + "learning_rate": 2.978414380324579e-06, + "loss": 0.8905, + "step": 2449 + }, + { + "epoch": 2.763276836158192, + "grad_norm": 0.045210570096969604, + "learning_rate": 2.9497105842769435e-06, + "loss": 0.9332, + "step": 2450 + }, + { + "epoch": 2.7644067796610168, + "grad_norm": 0.04406363144516945, + "learning_rate": 2.9211437013836995e-06, + "loss": 0.8547, + "step": 2451 + }, + { + "epoch": 2.7655367231638417, + "grad_norm": 0.043664876371622086, + "learning_rate": 2.8927137719454103e-06, + "loss": 0.8963, + "step": 2452 + }, + { + "epoch": 2.7666666666666666, + "grad_norm": 0.04488592594861984, + "learning_rate": 2.864420836069459e-06, + "loss": 0.9805, + "step": 2453 + }, + { + "epoch": 2.7677966101694915, + "grad_norm": 0.04462852701544762, + "learning_rate": 2.836264933669919e-06, + "loss": 1.0324, + "step": 2454 + }, + { + "epoch": 2.7689265536723164, + "grad_norm": 0.04454018175601959, + "learning_rate": 2.808246104467582e-06, + "loss": 0.9059, + "step": 2455 + }, + { + "epoch": 2.7700564971751414, + "grad_norm": 0.04549877718091011, + "learning_rate": 2.7803643879898865e-06, + "loss": 0.8686, + "step": 2456 + }, + { + "epoch": 2.7711864406779663, + "grad_norm": 0.04532479867339134, + "learning_rate": 2.7526198235707678e-06, + "loss": 1.0447, + "step": 2457 + }, + { + "epoch": 2.772316384180791, + "grad_norm": 0.04685727506875992, + "learning_rate": 2.7250124503507168e-06, + "loss": 0.9719, + "step": 2458 + }, + { + "epoch": 2.7734463276836157, + "grad_norm": 0.04455072060227394, + "learning_rate": 2.697542307276724e-06, + "loss": 1.0226, + "step": 2459 + }, + { + "epoch": 2.7745762711864406, + "grad_norm": 0.04388142749667168, + "learning_rate": 2.6702094331020887e-06, + "loss": 1.0189, + "step": 2460 + }, + { + "epoch": 2.7757062146892655, + "grad_norm": 0.04537327587604523, + "learning_rate": 2.6430138663865346e-06, + "loss": 0.9486, + "step": 2461 + }, + { + "epoch": 2.7768361581920904, + "grad_norm": 0.04427479952573776, + "learning_rate": 2.6159556454960487e-06, + "loss": 0.9353, + "step": 2462 + }, + { + "epoch": 2.7779661016949153, + "grad_norm": 0.046056341379880905, + "learning_rate": 2.589034808602897e-06, + "loss": 0.9175, + "step": 2463 + }, + { + "epoch": 2.7790960451977402, + "grad_norm": 0.043814852833747864, + "learning_rate": 2.5622513936854442e-06, + "loss": 0.9208, + "step": 2464 + }, + { + "epoch": 2.7802259887005647, + "grad_norm": 0.04527292400598526, + "learning_rate": 2.5356054385282766e-06, + "loss": 0.975, + "step": 2465 + }, + { + "epoch": 2.7813559322033896, + "grad_norm": 0.04537815973162651, + "learning_rate": 2.5090969807220366e-06, + "loss": 0.8724, + "step": 2466 + }, + { + "epoch": 2.7824858757062145, + "grad_norm": 0.04719525948166847, + "learning_rate": 2.482726057663365e-06, + "loss": 0.8909, + "step": 2467 + }, + { + "epoch": 2.7836158192090394, + "grad_norm": 0.04599473252892494, + "learning_rate": 2.4564927065548914e-06, + "loss": 0.9732, + "step": 2468 + }, + { + "epoch": 2.7847457627118644, + "grad_norm": 0.04588894918560982, + "learning_rate": 2.430396964405168e-06, + "loss": 1.0697, + "step": 2469 + }, + { + "epoch": 2.7858757062146893, + "grad_norm": 0.045891888439655304, + "learning_rate": 2.404438868028658e-06, + "loss": 0.9133, + "step": 2470 + }, + { + "epoch": 2.787005649717514, + "grad_norm": 0.04246148839592934, + "learning_rate": 2.3786184540455448e-06, + "loss": 0.9665, + "step": 2471 + }, + { + "epoch": 2.788135593220339, + "grad_norm": 0.042736880481243134, + "learning_rate": 2.3529357588818577e-06, + "loss": 0.8699, + "step": 2472 + }, + { + "epoch": 2.789265536723164, + "grad_norm": 0.04353400319814682, + "learning_rate": 2.327390818769337e-06, + "loss": 0.984, + "step": 2473 + }, + { + "epoch": 2.790395480225989, + "grad_norm": 0.04404514282941818, + "learning_rate": 2.301983669745322e-06, + "loss": 0.8818, + "step": 2474 + }, + { + "epoch": 2.7915254237288134, + "grad_norm": 0.044174037873744965, + "learning_rate": 2.276714347652831e-06, + "loss": 1.0032, + "step": 2475 + }, + { + "epoch": 2.7926553672316383, + "grad_norm": 0.044073160737752914, + "learning_rate": 2.2515828881404134e-06, + "loss": 0.9107, + "step": 2476 + }, + { + "epoch": 2.7937853107344632, + "grad_norm": 0.0445500873029232, + "learning_rate": 2.226589326662143e-06, + "loss": 0.9533, + "step": 2477 + }, + { + "epoch": 2.794915254237288, + "grad_norm": 0.044462572783231735, + "learning_rate": 2.2017336984775484e-06, + "loss": 0.9722, + "step": 2478 + }, + { + "epoch": 2.796045197740113, + "grad_norm": 0.04642799124121666, + "learning_rate": 2.177016038651558e-06, + "loss": 1.0028, + "step": 2479 + }, + { + "epoch": 2.797175141242938, + "grad_norm": 0.04265080764889717, + "learning_rate": 2.152436382054479e-06, + "loss": 0.9624, + "step": 2480 + }, + { + "epoch": 2.7983050847457624, + "grad_norm": 0.04329387843608856, + "learning_rate": 2.127994763361918e-06, + "loss": 0.934, + "step": 2481 + }, + { + "epoch": 2.7994350282485874, + "grad_norm": 0.04579446464776993, + "learning_rate": 2.1036912170547595e-06, + "loss": 0.8815, + "step": 2482 + }, + { + "epoch": 2.8005649717514123, + "grad_norm": 0.04465530812740326, + "learning_rate": 2.079525777419078e-06, + "loss": 0.9744, + "step": 2483 + }, + { + "epoch": 2.801694915254237, + "grad_norm": 0.04475056752562523, + "learning_rate": 2.0554984785461806e-06, + "loss": 0.9676, + "step": 2484 + }, + { + "epoch": 2.802824858757062, + "grad_norm": 0.045168641954660416, + "learning_rate": 2.0316093543323757e-06, + "loss": 0.9999, + "step": 2485 + }, + { + "epoch": 2.803954802259887, + "grad_norm": 0.04566594585776329, + "learning_rate": 2.007858438479171e-06, + "loss": 1.0069, + "step": 2486 + }, + { + "epoch": 2.805084745762712, + "grad_norm": 0.04546283185482025, + "learning_rate": 1.98424576449302e-06, + "loss": 0.9726, + "step": 2487 + }, + { + "epoch": 2.806214689265537, + "grad_norm": 0.043535858392715454, + "learning_rate": 1.9607713656853544e-06, + "loss": 0.9872, + "step": 2488 + }, + { + "epoch": 2.8073446327683618, + "grad_norm": 0.04416786879301071, + "learning_rate": 1.937435275172572e-06, + "loss": 0.9428, + "step": 2489 + }, + { + "epoch": 2.8084745762711867, + "grad_norm": 0.04551788419485092, + "learning_rate": 1.914237525875917e-06, + "loss": 0.9504, + "step": 2490 + }, + { + "epoch": 2.809604519774011, + "grad_norm": 0.046153049916028976, + "learning_rate": 1.8911781505215e-06, + "loss": 0.9752, + "step": 2491 + }, + { + "epoch": 2.810734463276836, + "grad_norm": 0.04438695311546326, + "learning_rate": 1.8682571816401983e-06, + "loss": 0.9908, + "step": 2492 + }, + { + "epoch": 2.811864406779661, + "grad_norm": 0.04513789713382721, + "learning_rate": 1.8454746515676357e-06, + "loss": 0.9173, + "step": 2493 + }, + { + "epoch": 2.812994350282486, + "grad_norm": 0.04486611858010292, + "learning_rate": 1.822830592444147e-06, + "loss": 0.986, + "step": 2494 + }, + { + "epoch": 2.814124293785311, + "grad_norm": 0.04494575038552284, + "learning_rate": 1.8003250362147005e-06, + "loss": 0.9892, + "step": 2495 + }, + { + "epoch": 2.8152542372881357, + "grad_norm": 0.04305053502321243, + "learning_rate": 1.7779580146288999e-06, + "loss": 0.8947, + "step": 2496 + }, + { + "epoch": 2.81638418079096, + "grad_norm": 0.041880443692207336, + "learning_rate": 1.755729559240893e-06, + "loss": 0.919, + "step": 2497 + }, + { + "epoch": 2.817514124293785, + "grad_norm": 0.0441063791513443, + "learning_rate": 1.7336397014093508e-06, + "loss": 0.9521, + "step": 2498 + }, + { + "epoch": 2.81864406779661, + "grad_norm": 0.043986279517412186, + "learning_rate": 1.7116884722974013e-06, + "loss": 1.0188, + "step": 2499 + }, + { + "epoch": 2.819774011299435, + "grad_norm": 0.04386138543486595, + "learning_rate": 1.6898759028726285e-06, + "loss": 0.9757, + "step": 2500 + }, + { + "epoch": 2.82090395480226, + "grad_norm": 0.04467868059873581, + "learning_rate": 1.6682020239070173e-06, + "loss": 1.0435, + "step": 2501 + }, + { + "epoch": 2.8220338983050848, + "grad_norm": 0.04562905803322792, + "learning_rate": 1.6466668659768202e-06, + "loss": 1.0047, + "step": 2502 + }, + { + "epoch": 2.8231638418079097, + "grad_norm": 0.04512863978743553, + "learning_rate": 1.62527045946268e-06, + "loss": 1.0126, + "step": 2503 + }, + { + "epoch": 2.8242937853107346, + "grad_norm": 0.045594554394483566, + "learning_rate": 1.6040128345494398e-06, + "loss": 0.9089, + "step": 2504 + }, + { + "epoch": 2.8254237288135595, + "grad_norm": 0.044841185212135315, + "learning_rate": 1.5828940212261889e-06, + "loss": 0.9067, + "step": 2505 + }, + { + "epoch": 2.8265536723163844, + "grad_norm": 0.043986570090055466, + "learning_rate": 1.561914049286173e-06, + "loss": 0.9603, + "step": 2506 + }, + { + "epoch": 2.827683615819209, + "grad_norm": 0.044366851449012756, + "learning_rate": 1.5410729483267606e-06, + "loss": 1.0132, + "step": 2507 + }, + { + "epoch": 2.828813559322034, + "grad_norm": 0.04770446941256523, + "learning_rate": 1.5203707477494333e-06, + "loss": 0.9823, + "step": 2508 + }, + { + "epoch": 2.8299435028248587, + "grad_norm": 0.04479978233575821, + "learning_rate": 1.4998074767596848e-06, + "loss": 0.9858, + "step": 2509 + }, + { + "epoch": 2.8310734463276837, + "grad_norm": 0.044626832008361816, + "learning_rate": 1.479383164367043e-06, + "loss": 0.9863, + "step": 2510 + }, + { + "epoch": 2.8322033898305086, + "grad_norm": 0.04503661394119263, + "learning_rate": 1.459097839385004e-06, + "loss": 0.9606, + "step": 2511 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.043242525309324265, + "learning_rate": 1.4389515304309763e-06, + "loss": 0.9008, + "step": 2512 + }, + { + "epoch": 2.834463276836158, + "grad_norm": 0.04532679542899132, + "learning_rate": 1.4189442659262365e-06, + "loss": 0.9146, + "step": 2513 + }, + { + "epoch": 2.835593220338983, + "grad_norm": 0.04340112954378128, + "learning_rate": 1.3990760740959285e-06, + "loss": 1.0163, + "step": 2514 + }, + { + "epoch": 2.836723163841808, + "grad_norm": 0.045547567307949066, + "learning_rate": 1.3793469829689987e-06, + "loss": 0.9454, + "step": 2515 + }, + { + "epoch": 2.8378531073446327, + "grad_norm": 0.04489200562238693, + "learning_rate": 1.3597570203781496e-06, + "loss": 0.9677, + "step": 2516 + }, + { + "epoch": 2.8389830508474576, + "grad_norm": 0.0449584536254406, + "learning_rate": 1.3403062139598076e-06, + "loss": 0.8906, + "step": 2517 + }, + { + "epoch": 2.8401129943502825, + "grad_norm": 0.04351430386304855, + "learning_rate": 1.32099459115409e-06, + "loss": 0.9033, + "step": 2518 + }, + { + "epoch": 2.8412429378531074, + "grad_norm": 0.04647156223654747, + "learning_rate": 1.3018221792047703e-06, + "loss": 1.0343, + "step": 2519 + }, + { + "epoch": 2.8423728813559324, + "grad_norm": 0.04549736902117729, + "learning_rate": 1.2827890051592128e-06, + "loss": 0.9813, + "step": 2520 + }, + { + "epoch": 2.8435028248587573, + "grad_norm": 0.04418494924902916, + "learning_rate": 1.2638950958683504e-06, + "loss": 0.9703, + "step": 2521 + }, + { + "epoch": 2.844632768361582, + "grad_norm": 0.04414811357855797, + "learning_rate": 1.2451404779866948e-06, + "loss": 1.0038, + "step": 2522 + }, + { + "epoch": 2.8457627118644067, + "grad_norm": 0.04323131591081619, + "learning_rate": 1.226525177972182e-06, + "loss": 1.0242, + "step": 2523 + }, + { + "epoch": 2.8468926553672316, + "grad_norm": 0.04595525190234184, + "learning_rate": 1.2080492220862605e-06, + "loss": 0.9779, + "step": 2524 + }, + { + "epoch": 2.8480225988700565, + "grad_norm": 0.04375181347131729, + "learning_rate": 1.1897126363937804e-06, + "loss": 0.9557, + "step": 2525 + }, + { + "epoch": 2.8491525423728814, + "grad_norm": 0.04520711675286293, + "learning_rate": 1.171515446762972e-06, + "loss": 0.9467, + "step": 2526 + }, + { + "epoch": 2.8502824858757063, + "grad_norm": 0.04446513205766678, + "learning_rate": 1.1534576788654327e-06, + "loss": 0.8768, + "step": 2527 + }, + { + "epoch": 2.8514124293785312, + "grad_norm": 0.044064104557037354, + "learning_rate": 1.135539358176041e-06, + "loss": 0.9144, + "step": 2528 + }, + { + "epoch": 2.8525423728813557, + "grad_norm": 0.04525630176067352, + "learning_rate": 1.117760509972987e-06, + "loss": 0.9452, + "step": 2529 + }, + { + "epoch": 2.8536723163841806, + "grad_norm": 0.042912956327199936, + "learning_rate": 1.1001211593376526e-06, + "loss": 1.0189, + "step": 2530 + }, + { + "epoch": 2.8548022598870055, + "grad_norm": 0.04408475011587143, + "learning_rate": 1.0826213311546873e-06, + "loss": 0.9009, + "step": 2531 + }, + { + "epoch": 2.8559322033898304, + "grad_norm": 0.04350871592760086, + "learning_rate": 1.0652610501118542e-06, + "loss": 1.0334, + "step": 2532 + }, + { + "epoch": 2.8570621468926554, + "grad_norm": 0.04493604972958565, + "learning_rate": 1.0480403407000738e-06, + "loss": 0.945, + "step": 2533 + }, + { + "epoch": 2.8581920903954803, + "grad_norm": 0.04436785727739334, + "learning_rate": 1.0309592272133684e-06, + "loss": 1.0529, + "step": 2534 + }, + { + "epoch": 2.859322033898305, + "grad_norm": 0.04460444301366806, + "learning_rate": 1.0140177337488288e-06, + "loss": 0.9559, + "step": 2535 + }, + { + "epoch": 2.86045197740113, + "grad_norm": 0.04335474595427513, + "learning_rate": 9.972158842065816e-07, + "loss": 0.9376, + "step": 2536 + }, + { + "epoch": 2.861581920903955, + "grad_norm": 0.04430084675550461, + "learning_rate": 9.805537022897104e-07, + "loss": 0.9212, + "step": 2537 + }, + { + "epoch": 2.86271186440678, + "grad_norm": 0.0432843454182148, + "learning_rate": 9.640312115043237e-07, + "loss": 0.9804, + "step": 2538 + }, + { + "epoch": 2.8638418079096044, + "grad_norm": 0.04469630494713783, + "learning_rate": 9.476484351594317e-07, + "loss": 0.9615, + "step": 2539 + }, + { + "epoch": 2.8649717514124293, + "grad_norm": 0.046468090265989304, + "learning_rate": 9.314053963669245e-07, + "loss": 0.9386, + "step": 2540 + }, + { + "epoch": 2.8661016949152542, + "grad_norm": 0.044023171067237854, + "learning_rate": 9.153021180415944e-07, + "loss": 0.9795, + "step": 2541 + }, + { + "epoch": 2.867231638418079, + "grad_norm": 0.045041874051094055, + "learning_rate": 8.993386229010581e-07, + "loss": 0.9741, + "step": 2542 + }, + { + "epoch": 2.868361581920904, + "grad_norm": 0.04462430626153946, + "learning_rate": 8.835149334657122e-07, + "loss": 0.9663, + "step": 2543 + }, + { + "epoch": 2.8694915254237285, + "grad_norm": 0.04438344016671181, + "learning_rate": 8.678310720587335e-07, + "loss": 0.9934, + "step": 2544 + }, + { + "epoch": 2.8706214689265535, + "grad_norm": 0.0462101474404335, + "learning_rate": 8.522870608060563e-07, + "loss": 1.1079, + "step": 2545 + }, + { + "epoch": 2.8717514124293784, + "grad_norm": 0.04502633213996887, + "learning_rate": 8.368829216363172e-07, + "loss": 0.9364, + "step": 2546 + }, + { + "epoch": 2.8728813559322033, + "grad_norm": 0.04489148035645485, + "learning_rate": 8.216186762807998e-07, + "loss": 1.0012, + "step": 2547 + }, + { + "epoch": 2.874011299435028, + "grad_norm": 0.04346054419875145, + "learning_rate": 8.064943462734454e-07, + "loss": 0.9731, + "step": 2548 + }, + { + "epoch": 2.875141242937853, + "grad_norm": 0.04357984662055969, + "learning_rate": 7.91509952950853e-07, + "loss": 1.0232, + "step": 2549 + }, + { + "epoch": 2.876271186440678, + "grad_norm": 0.04210647568106651, + "learning_rate": 7.766655174521465e-07, + "loss": 1.039, + "step": 2550 + }, + { + "epoch": 2.877401129943503, + "grad_norm": 0.044638440012931824, + "learning_rate": 7.619610607190186e-07, + "loss": 0.8978, + "step": 2551 + }, + { + "epoch": 2.878531073446328, + "grad_norm": 0.04344021528959274, + "learning_rate": 7.473966034957313e-07, + "loss": 0.9152, + "step": 2552 + }, + { + "epoch": 2.8796610169491528, + "grad_norm": 0.04684427008032799, + "learning_rate": 7.329721663289935e-07, + "loss": 0.9515, + "step": 2553 + }, + { + "epoch": 2.8807909604519772, + "grad_norm": 0.04457107558846474, + "learning_rate": 7.186877695679939e-07, + "loss": 0.9952, + "step": 2554 + }, + { + "epoch": 2.881920903954802, + "grad_norm": 0.043536800891160965, + "learning_rate": 7.045434333643797e-07, + "loss": 0.9389, + "step": 2555 + }, + { + "epoch": 2.883050847457627, + "grad_norm": 0.04445602744817734, + "learning_rate": 6.905391776721893e-07, + "loss": 0.9033, + "step": 2556 + }, + { + "epoch": 2.884180790960452, + "grad_norm": 0.042980775237083435, + "learning_rate": 6.766750222478524e-07, + "loss": 0.849, + "step": 2557 + }, + { + "epoch": 2.885310734463277, + "grad_norm": 0.045031286776065826, + "learning_rate": 6.629509866501349e-07, + "loss": 0.9813, + "step": 2558 + }, + { + "epoch": 2.886440677966102, + "grad_norm": 0.0436282679438591, + "learning_rate": 6.493670902401494e-07, + "loss": 0.9479, + "step": 2559 + }, + { + "epoch": 2.8875706214689263, + "grad_norm": 0.045009516179561615, + "learning_rate": 6.359233521813224e-07, + "loss": 0.8999, + "step": 2560 + }, + { + "epoch": 2.888700564971751, + "grad_norm": 0.04531530290842056, + "learning_rate": 6.22619791439305e-07, + "loss": 1.0032, + "step": 2561 + }, + { + "epoch": 2.889830508474576, + "grad_norm": 0.04437597468495369, + "learning_rate": 6.094564267820291e-07, + "loss": 1.0318, + "step": 2562 + }, + { + "epoch": 2.890960451977401, + "grad_norm": 0.04388117045164108, + "learning_rate": 5.964332767796399e-07, + "loss": 0.9455, + "step": 2563 + }, + { + "epoch": 2.892090395480226, + "grad_norm": 0.04577458277344704, + "learning_rate": 5.835503598044745e-07, + "loss": 1.0662, + "step": 2564 + }, + { + "epoch": 2.893220338983051, + "grad_norm": 0.04454333335161209, + "learning_rate": 5.70807694031028e-07, + "loss": 1.0782, + "step": 2565 + }, + { + "epoch": 2.894350282485876, + "grad_norm": 0.045315299183130264, + "learning_rate": 5.582052974359653e-07, + "loss": 0.9799, + "step": 2566 + }, + { + "epoch": 2.8954802259887007, + "grad_norm": 0.044649627059698105, + "learning_rate": 5.457431877980312e-07, + "loss": 1.0305, + "step": 2567 + }, + { + "epoch": 2.8966101694915256, + "grad_norm": 0.044768646359443665, + "learning_rate": 5.334213826980738e-07, + "loss": 1.0419, + "step": 2568 + }, + { + "epoch": 2.8977401129943505, + "grad_norm": 0.04542625695466995, + "learning_rate": 5.212398995190215e-07, + "loss": 0.9791, + "step": 2569 + }, + { + "epoch": 2.898870056497175, + "grad_norm": 0.04401678591966629, + "learning_rate": 5.091987554458388e-07, + "loss": 0.9531, + "step": 2570 + }, + { + "epoch": 2.9, + "grad_norm": 0.04501966014504433, + "learning_rate": 4.972979674654821e-07, + "loss": 0.9026, + "step": 2571 + }, + { + "epoch": 2.901129943502825, + "grad_norm": 0.04327463358640671, + "learning_rate": 4.855375523669326e-07, + "loss": 0.957, + "step": 2572 + }, + { + "epoch": 2.9022598870056497, + "grad_norm": 0.043885570019483566, + "learning_rate": 4.7391752674113e-07, + "loss": 0.9422, + "step": 2573 + }, + { + "epoch": 2.9033898305084747, + "grad_norm": 0.043965715914964676, + "learning_rate": 4.6243790698097255e-07, + "loss": 0.9477, + "step": 2574 + }, + { + "epoch": 2.9045197740112996, + "grad_norm": 0.04610178992152214, + "learning_rate": 4.510987092812502e-07, + "loss": 1.0274, + "step": 2575 + }, + { + "epoch": 2.905649717514124, + "grad_norm": 0.04622510448098183, + "learning_rate": 4.398999496386891e-07, + "loss": 0.9179, + "step": 2576 + }, + { + "epoch": 2.906779661016949, + "grad_norm": 0.04459752142429352, + "learning_rate": 4.288416438518628e-07, + "loss": 1.0092, + "step": 2577 + }, + { + "epoch": 2.907909604519774, + "grad_norm": 0.043733566999435425, + "learning_rate": 4.179238075212144e-07, + "loss": 0.9825, + "step": 2578 + }, + { + "epoch": 2.909039548022599, + "grad_norm": 0.04439612850546837, + "learning_rate": 4.071464560490346e-07, + "loss": 1.0026, + "step": 2579 + }, + { + "epoch": 2.9101694915254237, + "grad_norm": 0.044625185430049896, + "learning_rate": 3.965096046394057e-07, + "loss": 0.9278, + "step": 2580 + }, + { + "epoch": 2.9112994350282486, + "grad_norm": 0.04455350339412689, + "learning_rate": 3.860132682982021e-07, + "loss": 1.0179, + "step": 2581 + }, + { + "epoch": 2.9124293785310735, + "grad_norm": 0.043388571590185165, + "learning_rate": 3.7565746183307884e-07, + "loss": 0.9418, + "step": 2582 + }, + { + "epoch": 2.9135593220338984, + "grad_norm": 0.04536639526486397, + "learning_rate": 3.654421998534163e-07, + "loss": 1.0083, + "step": 2583 + }, + { + "epoch": 2.9146892655367234, + "grad_norm": 0.04366553574800491, + "learning_rate": 3.553674967703646e-07, + "loss": 1.0002, + "step": 2584 + }, + { + "epoch": 2.9158192090395483, + "grad_norm": 0.04244455695152283, + "learning_rate": 3.4543336679673245e-07, + "loss": 1.0485, + "step": 2585 + }, + { + "epoch": 2.9169491525423727, + "grad_norm": 0.04259354993700981, + "learning_rate": 3.3563982394704266e-07, + "loss": 0.9429, + "step": 2586 + }, + { + "epoch": 2.9180790960451977, + "grad_norm": 0.04567457735538483, + "learning_rate": 3.259868820374878e-07, + "loss": 1.0307, + "step": 2587 + }, + { + "epoch": 2.9192090395480226, + "grad_norm": 0.044698674231767654, + "learning_rate": 3.1647455468590825e-07, + "loss": 0.9137, + "step": 2588 + }, + { + "epoch": 2.9203389830508475, + "grad_norm": 0.04482744634151459, + "learning_rate": 3.071028553117472e-07, + "loss": 0.9424, + "step": 2589 + }, + { + "epoch": 2.9214689265536724, + "grad_norm": 0.043734245002269745, + "learning_rate": 2.978717971360956e-07, + "loss": 0.8577, + "step": 2590 + }, + { + "epoch": 2.9225988700564973, + "grad_norm": 0.04413614049553871, + "learning_rate": 2.88781393181603e-07, + "loss": 0.9462, + "step": 2591 + }, + { + "epoch": 2.923728813559322, + "grad_norm": 0.044499874114990234, + "learning_rate": 2.7983165627251116e-07, + "loss": 1.0218, + "step": 2592 + }, + { + "epoch": 2.9248587570621467, + "grad_norm": 0.042965903878211975, + "learning_rate": 2.7102259903460935e-07, + "loss": 0.9094, + "step": 2593 + }, + { + "epoch": 2.9259887005649716, + "grad_norm": 0.04347725212574005, + "learning_rate": 2.623542338952345e-07, + "loss": 1.0104, + "step": 2594 + }, + { + "epoch": 2.9271186440677965, + "grad_norm": 0.044730983674526215, + "learning_rate": 2.5382657308322676e-07, + "loss": 0.943, + "step": 2595 + }, + { + "epoch": 2.9282485875706215, + "grad_norm": 0.04504537582397461, + "learning_rate": 2.4543962862894063e-07, + "loss": 0.9281, + "step": 2596 + }, + { + "epoch": 2.9293785310734464, + "grad_norm": 0.04293621703982353, + "learning_rate": 2.3719341236420057e-07, + "loss": 0.9524, + "step": 2597 + }, + { + "epoch": 2.9305084745762713, + "grad_norm": 0.04390338063240051, + "learning_rate": 2.2908793592232303e-07, + "loss": 0.9315, + "step": 2598 + }, + { + "epoch": 2.931638418079096, + "grad_norm": 0.04320131614804268, + "learning_rate": 2.211232107380612e-07, + "loss": 0.9744, + "step": 2599 + }, + { + "epoch": 2.932768361581921, + "grad_norm": 0.04456019774079323, + "learning_rate": 2.1329924804760482e-07, + "loss": 1.0404, + "step": 2600 + }, + { + "epoch": 2.933898305084746, + "grad_norm": 0.04545271396636963, + "learning_rate": 2.0561605888855805e-07, + "loss": 0.9803, + "step": 2601 + }, + { + "epoch": 2.9350282485875705, + "grad_norm": 0.04348572716116905, + "learning_rate": 1.980736540999506e-07, + "loss": 0.9718, + "step": 2602 + }, + { + "epoch": 2.9361581920903954, + "grad_norm": 0.04441867396235466, + "learning_rate": 1.906720443221821e-07, + "loss": 0.988, + "step": 2603 + }, + { + "epoch": 2.9372881355932203, + "grad_norm": 0.04416580870747566, + "learning_rate": 1.8341123999703337e-07, + "loss": 1.0091, + "step": 2604 + }, + { + "epoch": 2.9384180790960452, + "grad_norm": 0.04366505518555641, + "learning_rate": 1.7629125136764402e-07, + "loss": 0.9855, + "step": 2605 + }, + { + "epoch": 2.93954802259887, + "grad_norm": 0.04411231353878975, + "learning_rate": 1.6931208847849045e-07, + "loss": 0.9712, + "step": 2606 + }, + { + "epoch": 2.940677966101695, + "grad_norm": 0.04340221732854843, + "learning_rate": 1.6247376117539682e-07, + "loss": 1.0107, + "step": 2607 + }, + { + "epoch": 2.9418079096045195, + "grad_norm": 0.044296473264694214, + "learning_rate": 1.5577627910549063e-07, + "loss": 1.0654, + "step": 2608 + }, + { + "epoch": 2.9429378531073445, + "grad_norm": 0.04423755407333374, + "learning_rate": 1.4921965171720287e-07, + "loss": 0.872, + "step": 2609 + }, + { + "epoch": 2.9440677966101694, + "grad_norm": 0.044309426099061966, + "learning_rate": 1.4280388826026782e-07, + "loss": 0.8893, + "step": 2610 + }, + { + "epoch": 2.9451977401129943, + "grad_norm": 0.04572597146034241, + "learning_rate": 1.3652899778568985e-07, + "loss": 1.0088, + "step": 2611 + }, + { + "epoch": 2.946327683615819, + "grad_norm": 0.04595278576016426, + "learning_rate": 1.3039498914573235e-07, + "loss": 0.9908, + "step": 2612 + }, + { + "epoch": 2.947457627118644, + "grad_norm": 0.04614327847957611, + "learning_rate": 1.2440187099390654e-07, + "loss": 0.9492, + "step": 2613 + }, + { + "epoch": 2.948587570621469, + "grad_norm": 0.04578055068850517, + "learning_rate": 1.1854965178497158e-07, + "loss": 0.996, + "step": 2614 + }, + { + "epoch": 2.949717514124294, + "grad_norm": 0.04251642897725105, + "learning_rate": 1.128383397749233e-07, + "loss": 1.0487, + "step": 2615 + }, + { + "epoch": 2.950847457627119, + "grad_norm": 0.0450914241373539, + "learning_rate": 1.072679430209611e-07, + "loss": 0.8909, + "step": 2616 + }, + { + "epoch": 2.951977401129944, + "grad_norm": 0.04336519166827202, + "learning_rate": 1.0183846938148778e-07, + "loss": 1.069, + "step": 2617 + }, + { + "epoch": 2.9531073446327682, + "grad_norm": 0.045954346656799316, + "learning_rate": 9.654992651609851e-08, + "loss": 0.9546, + "step": 2618 + }, + { + "epoch": 2.954237288135593, + "grad_norm": 0.04354023560881615, + "learning_rate": 9.140232188558085e-08, + "loss": 0.9622, + "step": 2619 + }, + { + "epoch": 2.955367231638418, + "grad_norm": 0.044812485575675964, + "learning_rate": 8.639566275189248e-08, + "loss": 0.905, + "step": 2620 + }, + { + "epoch": 2.956497175141243, + "grad_norm": 0.044489793479442596, + "learning_rate": 8.152995617815018e-08, + "loss": 1.0048, + "step": 2621 + }, + { + "epoch": 2.957627118644068, + "grad_norm": 0.0438714399933815, + "learning_rate": 7.680520902860755e-08, + "loss": 0.9843, + "step": 2622 + }, + { + "epoch": 2.9587570621468924, + "grad_norm": 0.04459919407963753, + "learning_rate": 7.222142796868835e-08, + "loss": 0.9293, + "step": 2623 + }, + { + "epoch": 2.9598870056497173, + "grad_norm": 0.0430004857480526, + "learning_rate": 6.777861946493102e-08, + "loss": 0.9283, + "step": 2624 + }, + { + "epoch": 2.961016949152542, + "grad_norm": 0.04521413892507553, + "learning_rate": 6.347678978501082e-08, + "loss": 0.9789, + "step": 2625 + }, + { + "epoch": 2.962146892655367, + "grad_norm": 0.04452219605445862, + "learning_rate": 5.931594499770654e-08, + "loss": 0.9629, + "step": 2626 + }, + { + "epoch": 2.963276836158192, + "grad_norm": 0.04484087601304054, + "learning_rate": 5.529609097290056e-08, + "loss": 0.9989, + "step": 2627 + }, + { + "epoch": 2.964406779661017, + "grad_norm": 0.04512220248579979, + "learning_rate": 5.1417233381578775e-08, + "loss": 0.9498, + "step": 2628 + }, + { + "epoch": 2.965536723163842, + "grad_norm": 0.04488738253712654, + "learning_rate": 4.767937769583064e-08, + "loss": 0.9256, + "step": 2629 + }, + { + "epoch": 2.966666666666667, + "grad_norm": 0.04371628910303116, + "learning_rate": 4.408252918880473e-08, + "loss": 1.0065, + "step": 2630 + }, + { + "epoch": 2.9677966101694917, + "grad_norm": 0.04563480243086815, + "learning_rate": 4.062669293474208e-08, + "loss": 0.9899, + "step": 2631 + }, + { + "epoch": 2.9689265536723166, + "grad_norm": 0.04385736212134361, + "learning_rate": 3.731187380893175e-08, + "loss": 0.9837, + "step": 2632 + }, + { + "epoch": 2.970056497175141, + "grad_norm": 0.04431382194161415, + "learning_rate": 3.413807648775524e-08, + "loss": 0.9099, + "step": 2633 + }, + { + "epoch": 2.971186440677966, + "grad_norm": 0.045138463377952576, + "learning_rate": 3.110530544860879e-08, + "loss": 0.9418, + "step": 2634 + }, + { + "epoch": 2.972316384180791, + "grad_norm": 0.04519420862197876, + "learning_rate": 2.8213564969969963e-08, + "loss": 0.9792, + "step": 2635 + }, + { + "epoch": 2.973446327683616, + "grad_norm": 0.04272979870438576, + "learning_rate": 2.5462859131353266e-08, + "loss": 0.9315, + "step": 2636 + }, + { + "epoch": 2.9745762711864407, + "grad_norm": 0.043719951063394547, + "learning_rate": 2.2853191813276832e-08, + "loss": 0.8662, + "step": 2637 + }, + { + "epoch": 2.9757062146892657, + "grad_norm": 0.0462685190141201, + "learning_rate": 2.0384566697329023e-08, + "loss": 0.9797, + "step": 2638 + }, + { + "epoch": 2.97683615819209, + "grad_norm": 0.04535675793886185, + "learning_rate": 1.8056987266112933e-08, + "loss": 1.0174, + "step": 2639 + }, + { + "epoch": 2.977966101694915, + "grad_norm": 0.042910125106573105, + "learning_rate": 1.5870456803246392e-08, + "loss": 0.927, + "step": 2640 + }, + { + "epoch": 2.97909604519774, + "grad_norm": 0.044517967849969864, + "learning_rate": 1.3824978393361943e-08, + "loss": 0.9859, + "step": 2641 + }, + { + "epoch": 2.980225988700565, + "grad_norm": 0.044496312737464905, + "learning_rate": 1.1920554922106864e-08, + "loss": 1.0117, + "step": 2642 + }, + { + "epoch": 2.98135593220339, + "grad_norm": 0.044253502041101456, + "learning_rate": 1.0157189076132056e-08, + "loss": 0.957, + "step": 2643 + }, + { + "epoch": 2.9824858757062147, + "grad_norm": 0.04357994347810745, + "learning_rate": 8.53488334310315e-09, + "loss": 0.8748, + "step": 2644 + }, + { + "epoch": 2.9836158192090396, + "grad_norm": 0.043963946402072906, + "learning_rate": 7.053640011678297e-09, + "loss": 0.9425, + "step": 2645 + }, + { + "epoch": 2.9847457627118645, + "grad_norm": 0.04307138919830322, + "learning_rate": 5.713461171508172e-09, + "loss": 1.022, + "step": 2646 + }, + { + "epoch": 2.9858757062146895, + "grad_norm": 0.04411528259515762, + "learning_rate": 4.514348713247074e-09, + "loss": 0.9617, + "step": 2647 + }, + { + "epoch": 2.9870056497175144, + "grad_norm": 0.044936712831258774, + "learning_rate": 3.4563043285418264e-09, + "loss": 0.9227, + "step": 2648 + }, + { + "epoch": 2.988135593220339, + "grad_norm": 0.0439041331410408, + "learning_rate": 2.5393295100095695e-09, + "loss": 1.0105, + "step": 2649 + }, + { + "epoch": 2.9892655367231638, + "grad_norm": 0.04542997479438782, + "learning_rate": 1.7634255512710695e-09, + "loss": 0.983, + "step": 2650 + }, + { + "epoch": 2.9903954802259887, + "grad_norm": 0.044694844633340836, + "learning_rate": 1.1285935469285136e-09, + "loss": 0.8849, + "step": 2651 + }, + { + "epoch": 2.9915254237288136, + "grad_norm": 0.043964337557554245, + "learning_rate": 6.348343925766109e-10, + "loss": 1.0035, + "step": 2652 + }, + { + "epoch": 2.9926553672316385, + "grad_norm": 0.04477889835834503, + "learning_rate": 2.821487847692872e-10, + "loss": 1.0285, + "step": 2653 + }, + { + "epoch": 2.9937853107344634, + "grad_norm": 0.044497277587652206, + "learning_rate": 7.053722107519533e-11, + "loss": 0.9643, + "step": 2654 + }, + { + "epoch": 2.994915254237288, + "grad_norm": 0.04465208947658539, + "learning_rate": 0.0, + "loss": 0.9576, + "step": 2655 + }, + { + "epoch": 2.994915254237288, + "eval_loss": 0.984717071056366, + "eval_runtime": 562.7269, + "eval_samples_per_second": 17.385, + "eval_steps_per_second": 8.693, + "step": 2655 + } + ], + "logging_steps": 1, + "max_steps": 2655, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 885, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.202210731046994e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}