diff --git "a/outputs/qlora-out/checkpoint-885/trainer_state.json" "b/outputs/qlora-out/checkpoint-885/trainer_state.json" new file mode 100644--- /dev/null +++ "b/outputs/qlora-out/checkpoint-885/trainer_state.json" @@ -0,0 +1,6260 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997175939000282, + "eval_steps": 295, + "global_step": 885, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011296243998870376, + "grad_norm": 0.03433802351355553, + "learning_rate": 2e-05, + "loss": 1.1396, + "step": 1 + }, + { + "epoch": 0.0011296243998870376, + "eval_loss": 1.1312694549560547, + "eval_runtime": 554.2279, + "eval_samples_per_second": 17.652, + "eval_steps_per_second": 8.827, + "step": 1 + }, + { + "epoch": 0.0022592487997740753, + "grad_norm": 0.03576268255710602, + "learning_rate": 4e-05, + "loss": 1.1188, + "step": 2 + }, + { + "epoch": 0.003388873199661113, + "grad_norm": 0.03207174688577652, + "learning_rate": 6e-05, + "loss": 1.2194, + "step": 3 + }, + { + "epoch": 0.0045184975995481505, + "grad_norm": 0.03231927007436752, + "learning_rate": 8e-05, + "loss": 1.1735, + "step": 4 + }, + { + "epoch": 0.005648121999435188, + "grad_norm": 0.03306754678487778, + "learning_rate": 0.0001, + "loss": 1.1689, + "step": 5 + }, + { + "epoch": 0.006777746399322226, + "grad_norm": 0.035009413957595825, + "learning_rate": 0.00012, + "loss": 1.1808, + "step": 6 + }, + { + "epoch": 0.007907370799209263, + "grad_norm": 0.035900842398405075, + "learning_rate": 0.00014, + "loss": 1.0441, + "step": 7 + }, + { + "epoch": 0.009036995199096301, + "grad_norm": 0.028419604524970055, + "learning_rate": 0.00016, + "loss": 1.0858, + "step": 8 + }, + { + "epoch": 0.010166619598983339, + "grad_norm": 0.024826928973197937, + "learning_rate": 0.00018, + "loss": 1.1937, + "step": 9 + }, + { + "epoch": 0.011296243998870376, + "grad_norm": 0.026519587263464928, + "learning_rate": 0.0002, + "loss": 1.0726, + "step": 10 + }, + { + "epoch": 0.012425868398757414, + "grad_norm": 0.024713166058063507, + "learning_rate": 0.00019999992946277893, + "loss": 1.1191, + "step": 11 + }, + { + "epoch": 0.013555492798644452, + "grad_norm": 0.02494538575410843, + "learning_rate": 0.00019999971785121523, + "loss": 1.1751, + "step": 12 + }, + { + "epoch": 0.014685117198531489, + "grad_norm": 0.023202786222100258, + "learning_rate": 0.00019999936516560744, + "loss": 1.1801, + "step": 13 + }, + { + "epoch": 0.015814741598418527, + "grad_norm": 0.021448194980621338, + "learning_rate": 0.00019999887140645308, + "loss": 1.1676, + "step": 14 + }, + { + "epoch": 0.016944365998305563, + "grad_norm": 0.021449347957968712, + "learning_rate": 0.00019999823657444873, + "loss": 1.1224, + "step": 15 + }, + { + "epoch": 0.018073990398192602, + "grad_norm": 0.021438075229525566, + "learning_rate": 0.00019999746067049, + "loss": 1.0713, + "step": 16 + }, + { + "epoch": 0.019203614798079638, + "grad_norm": 0.020727725699543953, + "learning_rate": 0.00019999654369567147, + "loss": 1.1317, + "step": 17 + }, + { + "epoch": 0.020333239197966677, + "grad_norm": 0.023839153349399567, + "learning_rate": 0.00019999548565128678, + "loss": 0.994, + "step": 18 + }, + { + "epoch": 0.021462863597853713, + "grad_norm": 0.020854901522397995, + "learning_rate": 0.0001999942865388285, + "loss": 1.0086, + "step": 19 + }, + { + "epoch": 0.022592487997740753, + "grad_norm": 0.019373737275600433, + "learning_rate": 0.00019999294635998833, + "loss": 1.0284, + "step": 20 + }, + { + "epoch": 0.02372211239762779, + "grad_norm": 0.020383458584547043, + "learning_rate": 0.00019999146511665692, + "loss": 1.0926, + "step": 21 + }, + { + "epoch": 0.024851736797514828, + "grad_norm": 0.018995020538568497, + "learning_rate": 0.0001999898428109239, + "loss": 1.0333, + "step": 22 + }, + { + "epoch": 0.025981361197401864, + "grad_norm": 0.018192732706665993, + "learning_rate": 0.00019998807944507791, + "loss": 1.1471, + "step": 23 + }, + { + "epoch": 0.027110985597288903, + "grad_norm": 0.017383620142936707, + "learning_rate": 0.00019998617502160664, + "loss": 1.0417, + "step": 24 + }, + { + "epoch": 0.02824060999717594, + "grad_norm": 0.0169441569596529, + "learning_rate": 0.00019998412954319675, + "loss": 1.0375, + "step": 25 + }, + { + "epoch": 0.029370234397062978, + "grad_norm": 0.017483294010162354, + "learning_rate": 0.0001999819430127339, + "loss": 1.1357, + "step": 26 + }, + { + "epoch": 0.030499858796950014, + "grad_norm": 0.016769247129559517, + "learning_rate": 0.00019997961543330269, + "loss": 1.0641, + "step": 27 + }, + { + "epoch": 0.031629483196837054, + "grad_norm": 0.017466655001044273, + "learning_rate": 0.00019997714680818673, + "loss": 1.0919, + "step": 28 + }, + { + "epoch": 0.03275910759672409, + "grad_norm": 0.01684914343059063, + "learning_rate": 0.00019997453714086866, + "loss": 1.1458, + "step": 29 + }, + { + "epoch": 0.033888731996611125, + "grad_norm": 0.017144447192549706, + "learning_rate": 0.00019997178643503004, + "loss": 1.0499, + "step": 30 + }, + { + "epoch": 0.03501835639649816, + "grad_norm": 0.01914755254983902, + "learning_rate": 0.0001999688946945514, + "loss": 1.03, + "step": 31 + }, + { + "epoch": 0.036147980796385204, + "grad_norm": 0.017046675086021423, + "learning_rate": 0.00019996586192351225, + "loss": 1.0747, + "step": 32 + }, + { + "epoch": 0.03727760519627224, + "grad_norm": 0.01652969978749752, + "learning_rate": 0.00019996268812619107, + "loss": 1.1694, + "step": 33 + }, + { + "epoch": 0.038407229596159276, + "grad_norm": 0.016702843829989433, + "learning_rate": 0.00019995937330706526, + "loss": 1.0339, + "step": 34 + }, + { + "epoch": 0.03953685399604631, + "grad_norm": 0.016058262437582016, + "learning_rate": 0.00019995591747081122, + "loss": 1.1002, + "step": 35 + }, + { + "epoch": 0.040666478395933355, + "grad_norm": 0.01609026826918125, + "learning_rate": 0.0001999523206223042, + "loss": 1.0934, + "step": 36 + }, + { + "epoch": 0.04179610279582039, + "grad_norm": 0.017004741355776787, + "learning_rate": 0.00019994858276661844, + "loss": 1.0748, + "step": 37 + }, + { + "epoch": 0.042925727195707426, + "grad_norm": 0.017046496272087097, + "learning_rate": 0.00019994470390902712, + "loss": 1.0851, + "step": 38 + }, + { + "epoch": 0.04405535159559446, + "grad_norm": 0.016196010634303093, + "learning_rate": 0.0001999406840550023, + "loss": 0.9796, + "step": 39 + }, + { + "epoch": 0.045184975995481505, + "grad_norm": 0.01703856885433197, + "learning_rate": 0.000199936523210215, + "loss": 1.0511, + "step": 40 + }, + { + "epoch": 0.04631460039536854, + "grad_norm": 0.017249640077352524, + "learning_rate": 0.00019993222138053507, + "loss": 0.9521, + "step": 41 + }, + { + "epoch": 0.04744422479525558, + "grad_norm": 0.01665697991847992, + "learning_rate": 0.0001999277785720313, + "loss": 1.0732, + "step": 42 + }, + { + "epoch": 0.04857384919514261, + "grad_norm": 0.016691800206899643, + "learning_rate": 0.0001999231947909714, + "loss": 1.1504, + "step": 43 + }, + { + "epoch": 0.049703473595029656, + "grad_norm": 0.016521582379937172, + "learning_rate": 0.00019991847004382186, + "loss": 1.1022, + "step": 44 + }, + { + "epoch": 0.05083309799491669, + "grad_norm": 0.017150631174445152, + "learning_rate": 0.00019991360433724813, + "loss": 1.0843, + "step": 45 + }, + { + "epoch": 0.05196272239480373, + "grad_norm": 0.016666896641254425, + "learning_rate": 0.00019990859767811444, + "loss": 1.0328, + "step": 46 + }, + { + "epoch": 0.05309234679469076, + "grad_norm": 0.016831668093800545, + "learning_rate": 0.0001999034500734839, + "loss": 1.0285, + "step": 47 + }, + { + "epoch": 0.054221971194577806, + "grad_norm": 0.01656479388475418, + "learning_rate": 0.00019989816153061853, + "loss": 0.9825, + "step": 48 + }, + { + "epoch": 0.05535159559446484, + "grad_norm": 0.016812896355986595, + "learning_rate": 0.00019989273205697904, + "loss": 1.0912, + "step": 49 + }, + { + "epoch": 0.05648121999435188, + "grad_norm": 0.01740356534719467, + "learning_rate": 0.0001998871616602251, + "loss": 1.0601, + "step": 50 + }, + { + "epoch": 0.057610844394238914, + "grad_norm": 0.017337938770651817, + "learning_rate": 0.00019988145034821502, + "loss": 1.1482, + "step": 51 + }, + { + "epoch": 0.058740468794125957, + "grad_norm": 0.017546923831105232, + "learning_rate": 0.0001998755981290061, + "loss": 1.0472, + "step": 52 + }, + { + "epoch": 0.05987009319401299, + "grad_norm": 0.017129387706518173, + "learning_rate": 0.00019986960501085428, + "loss": 1.1083, + "step": 53 + }, + { + "epoch": 0.06099971759390003, + "grad_norm": 0.01688031479716301, + "learning_rate": 0.00019986347100221433, + "loss": 1.0589, + "step": 54 + }, + { + "epoch": 0.062129341993787064, + "grad_norm": 0.017208745703101158, + "learning_rate": 0.00019985719611173973, + "loss": 1.1292, + "step": 55 + }, + { + "epoch": 0.06325896639367411, + "grad_norm": 0.01749996654689312, + "learning_rate": 0.0001998507803482828, + "loss": 1.1252, + "step": 56 + }, + { + "epoch": 0.06438859079356114, + "grad_norm": 0.017861831933259964, + "learning_rate": 0.00019984422372089453, + "loss": 1.104, + "step": 57 + }, + { + "epoch": 0.06551821519344818, + "grad_norm": 0.01797177456319332, + "learning_rate": 0.00019983752623882462, + "loss": 1.0569, + "step": 58 + }, + { + "epoch": 0.06664783959333521, + "grad_norm": 0.01702985167503357, + "learning_rate": 0.00019983068791152152, + "loss": 1.0238, + "step": 59 + }, + { + "epoch": 0.06777746399322225, + "grad_norm": 0.01766786351799965, + "learning_rate": 0.00019982370874863236, + "loss": 1.0509, + "step": 60 + }, + { + "epoch": 0.06890708839310929, + "grad_norm": 0.017592614516615868, + "learning_rate": 0.00019981658876000298, + "loss": 1.0613, + "step": 61 + }, + { + "epoch": 0.07003671279299632, + "grad_norm": 0.018118126317858696, + "learning_rate": 0.00019980932795567782, + "loss": 1.1727, + "step": 62 + }, + { + "epoch": 0.07116633719288337, + "grad_norm": 0.017846032977104187, + "learning_rate": 0.00019980192634590007, + "loss": 1.0042, + "step": 63 + }, + { + "epoch": 0.07229596159277041, + "grad_norm": 0.018457984551787376, + "learning_rate": 0.00019979438394111145, + "loss": 1.0648, + "step": 64 + }, + { + "epoch": 0.07342558599265744, + "grad_norm": 0.019480090588331223, + "learning_rate": 0.0001997867007519524, + "loss": 1.0307, + "step": 65 + }, + { + "epoch": 0.07455521039254448, + "grad_norm": 0.018102938309311867, + "learning_rate": 0.00019977887678926195, + "loss": 1.1129, + "step": 66 + }, + { + "epoch": 0.07568483479243152, + "grad_norm": 0.017858000472187996, + "learning_rate": 0.00019977091206407768, + "loss": 1.1574, + "step": 67 + }, + { + "epoch": 0.07681445919231855, + "grad_norm": 0.019296329468488693, + "learning_rate": 0.0001997628065876358, + "loss": 1.046, + "step": 68 + }, + { + "epoch": 0.07794408359220559, + "grad_norm": 0.017915885895490646, + "learning_rate": 0.0001997545603713711, + "loss": 1.0357, + "step": 69 + }, + { + "epoch": 0.07907370799209262, + "grad_norm": 0.01840789057314396, + "learning_rate": 0.00019974617342691678, + "loss": 1.0383, + "step": 70 + }, + { + "epoch": 0.08020333239197967, + "grad_norm": 0.019900545477867126, + "learning_rate": 0.00019973764576610478, + "loss": 0.9994, + "step": 71 + }, + { + "epoch": 0.08133295679186671, + "grad_norm": 0.0189906544983387, + "learning_rate": 0.0001997289774009654, + "loss": 0.9418, + "step": 72 + }, + { + "epoch": 0.08246258119175374, + "grad_norm": 0.01873377151787281, + "learning_rate": 0.00019972016834372749, + "loss": 0.9937, + "step": 73 + }, + { + "epoch": 0.08359220559164078, + "grad_norm": 0.019596470519900322, + "learning_rate": 0.0001997112186068184, + "loss": 1.0887, + "step": 74 + }, + { + "epoch": 0.08472182999152782, + "grad_norm": 0.020303891971707344, + "learning_rate": 0.00019970212820286394, + "loss": 1.0142, + "step": 75 + }, + { + "epoch": 0.08585145439141485, + "grad_norm": 0.019804317504167557, + "learning_rate": 0.00019969289714468825, + "loss": 1.0394, + "step": 76 + }, + { + "epoch": 0.08698107879130189, + "grad_norm": 0.019536610692739487, + "learning_rate": 0.0001996835254453141, + "loss": 0.9576, + "step": 77 + }, + { + "epoch": 0.08811070319118892, + "grad_norm": 0.019902685657143593, + "learning_rate": 0.0001996740131179625, + "loss": 0.9835, + "step": 78 + }, + { + "epoch": 0.08924032759107597, + "grad_norm": 0.01986609399318695, + "learning_rate": 0.00019966436017605297, + "loss": 1.0597, + "step": 79 + }, + { + "epoch": 0.09036995199096301, + "grad_norm": 0.019435487687587738, + "learning_rate": 0.00019965456663320329, + "loss": 1.0863, + "step": 80 + }, + { + "epoch": 0.09149957639085005, + "grad_norm": 0.019000260159373283, + "learning_rate": 0.00019964463250322966, + "loss": 1.0935, + "step": 81 + }, + { + "epoch": 0.09262920079073708, + "grad_norm": 0.018888210877776146, + "learning_rate": 0.0001996345578001466, + "loss": 1.0399, + "step": 82 + }, + { + "epoch": 0.09375882519062412, + "grad_norm": 0.019765490666031837, + "learning_rate": 0.00019962434253816694, + "loss": 1.0265, + "step": 83 + }, + { + "epoch": 0.09488844959051115, + "grad_norm": 0.01926722563803196, + "learning_rate": 0.00019961398673170181, + "loss": 1.0307, + "step": 84 + }, + { + "epoch": 0.09601807399039819, + "grad_norm": 0.019572502002120018, + "learning_rate": 0.00019960349039536062, + "loss": 1.0217, + "step": 85 + }, + { + "epoch": 0.09714769839028523, + "grad_norm": 0.024138784036040306, + "learning_rate": 0.000199592853543951, + "loss": 1.1376, + "step": 86 + }, + { + "epoch": 0.09827732279017226, + "grad_norm": 0.03155818581581116, + "learning_rate": 0.0001995820761924788, + "loss": 1.1029, + "step": 87 + }, + { + "epoch": 0.09940694719005931, + "grad_norm": 0.019589390605688095, + "learning_rate": 0.00019957115835614816, + "loss": 1.0353, + "step": 88 + }, + { + "epoch": 0.10053657158994635, + "grad_norm": 0.020419439300894737, + "learning_rate": 0.00019956010005036133, + "loss": 1.0228, + "step": 89 + }, + { + "epoch": 0.10166619598983338, + "grad_norm": 0.02149847149848938, + "learning_rate": 0.00019954890129071876, + "loss": 1.1214, + "step": 90 + }, + { + "epoch": 0.10279582038972042, + "grad_norm": 0.01982825994491577, + "learning_rate": 0.00019953756209301903, + "loss": 1.0302, + "step": 91 + }, + { + "epoch": 0.10392544478960745, + "grad_norm": 0.01985330507159233, + "learning_rate": 0.00019952608247325885, + "loss": 1.0674, + "step": 92 + }, + { + "epoch": 0.10505506918949449, + "grad_norm": 0.020196454599499702, + "learning_rate": 0.00019951446244763309, + "loss": 1.0113, + "step": 93 + }, + { + "epoch": 0.10618469358938153, + "grad_norm": 0.020652327686548233, + "learning_rate": 0.00019950270203253454, + "loss": 1.0635, + "step": 94 + }, + { + "epoch": 0.10731431798926856, + "grad_norm": 0.020714478567242622, + "learning_rate": 0.00019949080124455416, + "loss": 1.0226, + "step": 95 + }, + { + "epoch": 0.10844394238915561, + "grad_norm": 0.021647842600941658, + "learning_rate": 0.000199478760100481, + "loss": 1.0575, + "step": 96 + }, + { + "epoch": 0.10957356678904265, + "grad_norm": 0.02076675370335579, + "learning_rate": 0.00019946657861730194, + "loss": 1.1146, + "step": 97 + }, + { + "epoch": 0.11070319118892968, + "grad_norm": 0.02103651687502861, + "learning_rate": 0.000199454256812202, + "loss": 0.9953, + "step": 98 + }, + { + "epoch": 0.11183281558881672, + "grad_norm": 0.02276523970067501, + "learning_rate": 0.00019944179470256405, + "loss": 1.021, + "step": 99 + }, + { + "epoch": 0.11296243998870376, + "grad_norm": 0.020957166329026222, + "learning_rate": 0.00019942919230596896, + "loss": 0.9838, + "step": 100 + }, + { + "epoch": 0.11409206438859079, + "grad_norm": 0.022394055500626564, + "learning_rate": 0.00019941644964019552, + "loss": 1.0169, + "step": 101 + }, + { + "epoch": 0.11522168878847783, + "grad_norm": 0.02139163948595524, + "learning_rate": 0.00019940356672322037, + "loss": 1.0788, + "step": 102 + }, + { + "epoch": 0.11635131318836486, + "grad_norm": 0.021381577476859093, + "learning_rate": 0.00019939054357321799, + "loss": 1.0669, + "step": 103 + }, + { + "epoch": 0.11748093758825191, + "grad_norm": 0.02302641049027443, + "learning_rate": 0.00019937738020856072, + "loss": 1.0122, + "step": 104 + }, + { + "epoch": 0.11861056198813895, + "grad_norm": 0.021372724324464798, + "learning_rate": 0.00019936407664781868, + "loss": 1.0974, + "step": 105 + }, + { + "epoch": 0.11974018638802598, + "grad_norm": 0.021260784938931465, + "learning_rate": 0.00019935063290975986, + "loss": 0.9996, + "step": 106 + }, + { + "epoch": 0.12086981078791302, + "grad_norm": 0.021557705476880074, + "learning_rate": 0.0001993370490133499, + "loss": 1.0215, + "step": 107 + }, + { + "epoch": 0.12199943518780006, + "grad_norm": 0.023252975195646286, + "learning_rate": 0.00019932332497775215, + "loss": 1.0908, + "step": 108 + }, + { + "epoch": 0.12312905958768709, + "grad_norm": 0.02185026742517948, + "learning_rate": 0.00019930946082232783, + "loss": 1.0751, + "step": 109 + }, + { + "epoch": 0.12425868398757413, + "grad_norm": 0.022223595529794693, + "learning_rate": 0.00019929545656663562, + "loss": 0.9737, + "step": 110 + }, + { + "epoch": 0.12538830838746118, + "grad_norm": 0.021415019407868385, + "learning_rate": 0.000199281312230432, + "loss": 1.0864, + "step": 111 + }, + { + "epoch": 0.12651793278734821, + "grad_norm": 0.02144046686589718, + "learning_rate": 0.000199267027833671, + "loss": 0.9984, + "step": 112 + }, + { + "epoch": 0.12764755718723525, + "grad_norm": 0.0225879717618227, + "learning_rate": 0.00019925260339650428, + "loss": 1.0685, + "step": 113 + }, + { + "epoch": 0.12877718158712229, + "grad_norm": 0.022809404879808426, + "learning_rate": 0.000199238038939281, + "loss": 1.0733, + "step": 114 + }, + { + "epoch": 0.12990680598700932, + "grad_norm": 0.023381488397717476, + "learning_rate": 0.00019922333448254786, + "loss": 1.0107, + "step": 115 + }, + { + "epoch": 0.13103643038689636, + "grad_norm": 0.022633766755461693, + "learning_rate": 0.00019920849004704914, + "loss": 0.9885, + "step": 116 + }, + { + "epoch": 0.1321660547867834, + "grad_norm": 0.02235741913318634, + "learning_rate": 0.00019919350565372656, + "loss": 1.0714, + "step": 117 + }, + { + "epoch": 0.13329567918667043, + "grad_norm": 0.02206304483115673, + "learning_rate": 0.00019917838132371923, + "loss": 1.0749, + "step": 118 + }, + { + "epoch": 0.13442530358655747, + "grad_norm": 0.022310512140393257, + "learning_rate": 0.0001991631170783637, + "loss": 1.0437, + "step": 119 + }, + { + "epoch": 0.1355549279864445, + "grad_norm": 0.021498341113328934, + "learning_rate": 0.00019914771293919395, + "loss": 1.0317, + "step": 120 + }, + { + "epoch": 0.13668455238633154, + "grad_norm": 0.021773051470518112, + "learning_rate": 0.0001991321689279413, + "loss": 1.0489, + "step": 121 + }, + { + "epoch": 0.13781417678621857, + "grad_norm": 0.021639568731188774, + "learning_rate": 0.0001991164850665343, + "loss": 0.9893, + "step": 122 + }, + { + "epoch": 0.1389438011861056, + "grad_norm": 0.022304847836494446, + "learning_rate": 0.00019910066137709896, + "loss": 1.0542, + "step": 123 + }, + { + "epoch": 0.14007342558599264, + "grad_norm": 0.022173380479216576, + "learning_rate": 0.0001990846978819584, + "loss": 1.0776, + "step": 124 + }, + { + "epoch": 0.1412030499858797, + "grad_norm": 0.023623231798410416, + "learning_rate": 0.00019906859460363307, + "loss": 1.1279, + "step": 125 + }, + { + "epoch": 0.14233267438576674, + "grad_norm": 0.022697214037179947, + "learning_rate": 0.0001990523515648406, + "loss": 0.9973, + "step": 126 + }, + { + "epoch": 0.14346229878565378, + "grad_norm": 0.02267601527273655, + "learning_rate": 0.00019903596878849568, + "loss": 1.1131, + "step": 127 + }, + { + "epoch": 0.14459192318554082, + "grad_norm": 0.02244328148663044, + "learning_rate": 0.0001990194462977103, + "loss": 1.0157, + "step": 128 + }, + { + "epoch": 0.14572154758542785, + "grad_norm": 0.02371121197938919, + "learning_rate": 0.00019900278411579344, + "loss": 0.9888, + "step": 129 + }, + { + "epoch": 0.1468511719853149, + "grad_norm": 0.0227819811552763, + "learning_rate": 0.00019898598226625119, + "loss": 1.0003, + "step": 130 + }, + { + "epoch": 0.14798079638520192, + "grad_norm": 0.02316221408545971, + "learning_rate": 0.00019896904077278663, + "loss": 1.0181, + "step": 131 + }, + { + "epoch": 0.14911042078508896, + "grad_norm": 0.022808320820331573, + "learning_rate": 0.00019895195965929994, + "loss": 1.0546, + "step": 132 + }, + { + "epoch": 0.150240045184976, + "grad_norm": 0.022865859791636467, + "learning_rate": 0.00019893473894988815, + "loss": 1.1513, + "step": 133 + }, + { + "epoch": 0.15136966958486303, + "grad_norm": 0.024973077699542046, + "learning_rate": 0.0001989173786688453, + "loss": 1.0109, + "step": 134 + }, + { + "epoch": 0.15249929398475007, + "grad_norm": 0.02237241342663765, + "learning_rate": 0.00019889987884066237, + "loss": 1.0991, + "step": 135 + }, + { + "epoch": 0.1536289183846371, + "grad_norm": 0.023280750960111618, + "learning_rate": 0.000198882239490027, + "loss": 1.0501, + "step": 136 + }, + { + "epoch": 0.15475854278452414, + "grad_norm": 0.022803954780101776, + "learning_rate": 0.00019886446064182396, + "loss": 1.0033, + "step": 137 + }, + { + "epoch": 0.15588816718441117, + "grad_norm": 0.02270156517624855, + "learning_rate": 0.0001988465423211346, + "loss": 1.0715, + "step": 138 + }, + { + "epoch": 0.1570177915842982, + "grad_norm": 0.023484455421566963, + "learning_rate": 0.00019882848455323704, + "loss": 1.1598, + "step": 139 + }, + { + "epoch": 0.15814741598418525, + "grad_norm": 0.02300065942108631, + "learning_rate": 0.00019881028736360622, + "loss": 1.0813, + "step": 140 + }, + { + "epoch": 0.15927704038407228, + "grad_norm": 0.02320142462849617, + "learning_rate": 0.00019879195077791376, + "loss": 1.0169, + "step": 141 + }, + { + "epoch": 0.16040666478395935, + "grad_norm": 0.02317328006029129, + "learning_rate": 0.00019877347482202785, + "loss": 1.0301, + "step": 142 + }, + { + "epoch": 0.16153628918384638, + "grad_norm": 0.02378895878791809, + "learning_rate": 0.0001987548595220133, + "loss": 0.977, + "step": 143 + }, + { + "epoch": 0.16266591358373342, + "grad_norm": 0.023976027965545654, + "learning_rate": 0.00019873610490413166, + "loss": 1.0859, + "step": 144 + }, + { + "epoch": 0.16379553798362045, + "grad_norm": 0.022843923419713974, + "learning_rate": 0.0001987172109948408, + "loss": 1.0103, + "step": 145 + }, + { + "epoch": 0.1649251623835075, + "grad_norm": 0.023723525926470757, + "learning_rate": 0.00019869817782079525, + "loss": 1.0704, + "step": 146 + }, + { + "epoch": 0.16605478678339453, + "grad_norm": 0.02391059510409832, + "learning_rate": 0.00019867900540884592, + "loss": 1.058, + "step": 147 + }, + { + "epoch": 0.16718441118328156, + "grad_norm": 0.023995989933609962, + "learning_rate": 0.0001986596937860402, + "loss": 1.0034, + "step": 148 + }, + { + "epoch": 0.1683140355831686, + "grad_norm": 0.024091636762022972, + "learning_rate": 0.00019864024297962186, + "loss": 1.1214, + "step": 149 + }, + { + "epoch": 0.16944365998305563, + "grad_norm": 0.024035949259996414, + "learning_rate": 0.000198620653017031, + "loss": 1.0219, + "step": 150 + }, + { + "epoch": 0.17057328438294267, + "grad_norm": 0.02359904535114765, + "learning_rate": 0.00019860092392590408, + "loss": 0.9627, + "step": 151 + }, + { + "epoch": 0.1717029087828297, + "grad_norm": 0.023622050881385803, + "learning_rate": 0.00019858105573407377, + "loss": 1.0582, + "step": 152 + }, + { + "epoch": 0.17283253318271674, + "grad_norm": 0.02392633818089962, + "learning_rate": 0.00019856104846956906, + "loss": 1.0089, + "step": 153 + }, + { + "epoch": 0.17396215758260378, + "grad_norm": 0.024305053055286407, + "learning_rate": 0.00019854090216061502, + "loss": 1.0222, + "step": 154 + }, + { + "epoch": 0.1750917819824908, + "grad_norm": 0.024212822318077087, + "learning_rate": 0.00019852061683563296, + "loss": 1.0429, + "step": 155 + }, + { + "epoch": 0.17622140638237785, + "grad_norm": 0.024261048063635826, + "learning_rate": 0.00019850019252324032, + "loss": 1.0506, + "step": 156 + }, + { + "epoch": 0.17735103078226488, + "grad_norm": 0.022689295932650566, + "learning_rate": 0.0001984796292522506, + "loss": 0.9999, + "step": 157 + }, + { + "epoch": 0.17848065518215195, + "grad_norm": 0.023288823664188385, + "learning_rate": 0.00019845892705167324, + "loss": 1.0242, + "step": 158 + }, + { + "epoch": 0.17961027958203898, + "grad_norm": 0.030173135921359062, + "learning_rate": 0.00019843808595071383, + "loss": 1.0641, + "step": 159 + }, + { + "epoch": 0.18073990398192602, + "grad_norm": 0.024562738835811615, + "learning_rate": 0.00019841710597877382, + "loss": 0.9781, + "step": 160 + }, + { + "epoch": 0.18186952838181306, + "grad_norm": 0.024897055700421333, + "learning_rate": 0.00019839598716545057, + "loss": 1.1015, + "step": 161 + }, + { + "epoch": 0.1829991527817001, + "grad_norm": 0.023950692266225815, + "learning_rate": 0.00019837472954053732, + "loss": 1.125, + "step": 162 + }, + { + "epoch": 0.18412877718158713, + "grad_norm": 0.025099674239754677, + "learning_rate": 0.00019835333313402318, + "loss": 1.0359, + "step": 163 + }, + { + "epoch": 0.18525840158147416, + "grad_norm": 0.025351393967866898, + "learning_rate": 0.000198331797976093, + "loss": 1.0281, + "step": 164 + }, + { + "epoch": 0.1863880259813612, + "grad_norm": 0.024693114683032036, + "learning_rate": 0.00019831012409712737, + "loss": 1.1521, + "step": 165 + }, + { + "epoch": 0.18751765038124824, + "grad_norm": 0.024255136027932167, + "learning_rate": 0.0001982883115277026, + "loss": 1.0842, + "step": 166 + }, + { + "epoch": 0.18864727478113527, + "grad_norm": 0.02501499280333519, + "learning_rate": 0.00019826636029859066, + "loss": 0.9975, + "step": 167 + }, + { + "epoch": 0.1897768991810223, + "grad_norm": 0.025276506319642067, + "learning_rate": 0.00019824427044075912, + "loss": 1.0119, + "step": 168 + }, + { + "epoch": 0.19090652358090934, + "grad_norm": 0.024857770651578903, + "learning_rate": 0.0001982220419853711, + "loss": 0.9733, + "step": 169 + }, + { + "epoch": 0.19203614798079638, + "grad_norm": 0.02459135465323925, + "learning_rate": 0.0001981996749637853, + "loss": 1.0791, + "step": 170 + }, + { + "epoch": 0.19316577238068341, + "grad_norm": 0.026056725531816483, + "learning_rate": 0.00019817716940755586, + "loss": 1.0849, + "step": 171 + }, + { + "epoch": 0.19429539678057045, + "grad_norm": 0.024211106821894646, + "learning_rate": 0.0001981545253484324, + "loss": 1.0253, + "step": 172 + }, + { + "epoch": 0.1954250211804575, + "grad_norm": 0.024450423195958138, + "learning_rate": 0.00019813174281835982, + "loss": 1.1101, + "step": 173 + }, + { + "epoch": 0.19655464558034452, + "grad_norm": 0.02433086559176445, + "learning_rate": 0.0001981088218494785, + "loss": 0.9887, + "step": 174 + }, + { + "epoch": 0.1976842699802316, + "grad_norm": 0.02424442023038864, + "learning_rate": 0.0001980857624741241, + "loss": 1.074, + "step": 175 + }, + { + "epoch": 0.19881389438011862, + "grad_norm": 0.02318243682384491, + "learning_rate": 0.00019806256472482744, + "loss": 1.1045, + "step": 176 + }, + { + "epoch": 0.19994351878000566, + "grad_norm": 0.02407553791999817, + "learning_rate": 0.00019803922863431467, + "loss": 1.0062, + "step": 177 + }, + { + "epoch": 0.2010731431798927, + "grad_norm": 0.02463892102241516, + "learning_rate": 0.000198015754235507, + "loss": 1.0689, + "step": 178 + }, + { + "epoch": 0.20220276757977973, + "grad_norm": 0.023701028898358345, + "learning_rate": 0.00019799214156152083, + "loss": 1.0672, + "step": 179 + }, + { + "epoch": 0.20333239197966677, + "grad_norm": 0.02471453696489334, + "learning_rate": 0.00019796839064566761, + "loss": 1.033, + "step": 180 + }, + { + "epoch": 0.2044620163795538, + "grad_norm": 0.02426736056804657, + "learning_rate": 0.00019794450152145382, + "loss": 1.0831, + "step": 181 + }, + { + "epoch": 0.20559164077944084, + "grad_norm": 0.0243529062718153, + "learning_rate": 0.0001979204742225809, + "loss": 1.0815, + "step": 182 + }, + { + "epoch": 0.20672126517932787, + "grad_norm": 0.0243973471224308, + "learning_rate": 0.00019789630878294526, + "loss": 1.0541, + "step": 183 + }, + { + "epoch": 0.2078508895792149, + "grad_norm": 0.02461186796426773, + "learning_rate": 0.0001978720052366381, + "loss": 1.1203, + "step": 184 + }, + { + "epoch": 0.20898051397910195, + "grad_norm": 0.02479882724583149, + "learning_rate": 0.00019784756361794555, + "loss": 1.078, + "step": 185 + }, + { + "epoch": 0.21011013837898898, + "grad_norm": 0.02605288103222847, + "learning_rate": 0.00019782298396134844, + "loss": 1.01, + "step": 186 + }, + { + "epoch": 0.21123976277887602, + "grad_norm": 0.025911834090948105, + "learning_rate": 0.00019779826630152245, + "loss": 1.1173, + "step": 187 + }, + { + "epoch": 0.21236938717876305, + "grad_norm": 0.024420902132987976, + "learning_rate": 0.00019777341067333786, + "loss": 1.0023, + "step": 188 + }, + { + "epoch": 0.2134990115786501, + "grad_norm": 0.024010393768548965, + "learning_rate": 0.0001977484171118596, + "loss": 1.1382, + "step": 189 + }, + { + "epoch": 0.21462863597853712, + "grad_norm": 0.024915101006627083, + "learning_rate": 0.00019772328565234717, + "loss": 1.0734, + "step": 190 + }, + { + "epoch": 0.21575826037842416, + "grad_norm": 0.025032367557287216, + "learning_rate": 0.0001976980163302547, + "loss": 0.9585, + "step": 191 + }, + { + "epoch": 0.21688788477831122, + "grad_norm": 0.024727528914809227, + "learning_rate": 0.0001976726091812307, + "loss": 1.0731, + "step": 192 + }, + { + "epoch": 0.21801750917819826, + "grad_norm": 0.024914614856243134, + "learning_rate": 0.00019764706424111816, + "loss": 0.9522, + "step": 193 + }, + { + "epoch": 0.2191471335780853, + "grad_norm": 0.024750174954533577, + "learning_rate": 0.00019762138154595446, + "loss": 0.9646, + "step": 194 + }, + { + "epoch": 0.22027675797797233, + "grad_norm": 0.02512511797249317, + "learning_rate": 0.00019759556113197135, + "loss": 1.0643, + "step": 195 + }, + { + "epoch": 0.22140638237785937, + "grad_norm": 0.026546582579612732, + "learning_rate": 0.00019756960303559483, + "loss": 1.1158, + "step": 196 + }, + { + "epoch": 0.2225360067777464, + "grad_norm": 0.02506748028099537, + "learning_rate": 0.0001975435072934451, + "loss": 1.0261, + "step": 197 + }, + { + "epoch": 0.22366563117763344, + "grad_norm": 0.024585796520113945, + "learning_rate": 0.00019751727394233667, + "loss": 1.017, + "step": 198 + }, + { + "epoch": 0.22479525557752048, + "grad_norm": 0.02528531290590763, + "learning_rate": 0.00019749090301927796, + "loss": 1.042, + "step": 199 + }, + { + "epoch": 0.2259248799774075, + "grad_norm": 0.025023646652698517, + "learning_rate": 0.00019746439456147172, + "loss": 0.9618, + "step": 200 + }, + { + "epoch": 0.22705450437729455, + "grad_norm": 0.025859549641609192, + "learning_rate": 0.00019743774860631457, + "loss": 0.9982, + "step": 201 + }, + { + "epoch": 0.22818412877718158, + "grad_norm": 0.026021264493465424, + "learning_rate": 0.00019741096519139713, + "loss": 1.0131, + "step": 202 + }, + { + "epoch": 0.22931375317706862, + "grad_norm": 0.025675011798739433, + "learning_rate": 0.00019738404435450395, + "loss": 1.0186, + "step": 203 + }, + { + "epoch": 0.23044337757695565, + "grad_norm": 0.025758078321814537, + "learning_rate": 0.00019735698613361347, + "loss": 1.0869, + "step": 204 + }, + { + "epoch": 0.2315730019768427, + "grad_norm": 0.02666814811527729, + "learning_rate": 0.00019732979056689794, + "loss": 1.0894, + "step": 205 + }, + { + "epoch": 0.23270262637672973, + "grad_norm": 0.024690723046660423, + "learning_rate": 0.0001973024576927233, + "loss": 1.0898, + "step": 206 + }, + { + "epoch": 0.23383225077661676, + "grad_norm": 0.025678694248199463, + "learning_rate": 0.00019727498754964928, + "loss": 1.091, + "step": 207 + }, + { + "epoch": 0.23496187517650383, + "grad_norm": 0.025275958701968193, + "learning_rate": 0.00019724738017642924, + "loss": 1.089, + "step": 208 + }, + { + "epoch": 0.23609149957639086, + "grad_norm": 0.02560093067586422, + "learning_rate": 0.00019721963561201012, + "loss": 0.9755, + "step": 209 + }, + { + "epoch": 0.2372211239762779, + "grad_norm": 0.026244761422276497, + "learning_rate": 0.00019719175389553242, + "loss": 1.0696, + "step": 210 + }, + { + "epoch": 0.23835074837616493, + "grad_norm": 0.025443457067012787, + "learning_rate": 0.0001971637350663301, + "loss": 1.0032, + "step": 211 + }, + { + "epoch": 0.23948037277605197, + "grad_norm": 0.027356769889593124, + "learning_rate": 0.00019713557916393058, + "loss": 1.0393, + "step": 212 + }, + { + "epoch": 0.240609997175939, + "grad_norm": 0.025765880942344666, + "learning_rate": 0.0001971072862280546, + "loss": 1.015, + "step": 213 + }, + { + "epoch": 0.24173962157582604, + "grad_norm": 0.025718411430716515, + "learning_rate": 0.00019707885629861632, + "loss": 1.0343, + "step": 214 + }, + { + "epoch": 0.24286924597571308, + "grad_norm": 0.026691369712352753, + "learning_rate": 0.00019705028941572307, + "loss": 1.0896, + "step": 215 + }, + { + "epoch": 0.2439988703756001, + "grad_norm": 0.025440771132707596, + "learning_rate": 0.00019702158561967544, + "loss": 0.9986, + "step": 216 + }, + { + "epoch": 0.24512849477548715, + "grad_norm": 0.02483600750565529, + "learning_rate": 0.00019699274495096712, + "loss": 1.0287, + "step": 217 + }, + { + "epoch": 0.24625811917537418, + "grad_norm": 0.027423838153481483, + "learning_rate": 0.00019696376745028497, + "loss": 1.0626, + "step": 218 + }, + { + "epoch": 0.24738774357526122, + "grad_norm": 0.026005201041698456, + "learning_rate": 0.0001969346531585088, + "loss": 1.0203, + "step": 219 + }, + { + "epoch": 0.24851736797514826, + "grad_norm": 0.026350049301981926, + "learning_rate": 0.00019690540211671144, + "loss": 1.0482, + "step": 220 + }, + { + "epoch": 0.2496469923750353, + "grad_norm": 0.026930196210741997, + "learning_rate": 0.00019687601436615864, + "loss": 1.0258, + "step": 221 + }, + { + "epoch": 0.25077661677492236, + "grad_norm": 0.025890439748764038, + "learning_rate": 0.00019684648994830903, + "loss": 1.0886, + "step": 222 + }, + { + "epoch": 0.25190624117480936, + "grad_norm": 0.025864360854029655, + "learning_rate": 0.00019681682890481398, + "loss": 0.976, + "step": 223 + }, + { + "epoch": 0.25303586557469643, + "grad_norm": 0.025524241849780083, + "learning_rate": 0.00019678703127751763, + "loss": 1.0251, + "step": 224 + }, + { + "epoch": 0.25416548997458344, + "grad_norm": 0.02650127001106739, + "learning_rate": 0.00019675709710845687, + "loss": 1.0435, + "step": 225 + }, + { + "epoch": 0.2552951143744705, + "grad_norm": 0.025557860732078552, + "learning_rate": 0.00019672702643986113, + "loss": 1.0555, + "step": 226 + }, + { + "epoch": 0.2564247387743575, + "grad_norm": 0.027075499296188354, + "learning_rate": 0.0001966968193141524, + "loss": 0.9965, + "step": 227 + }, + { + "epoch": 0.25755436317424457, + "grad_norm": 0.025682270526885986, + "learning_rate": 0.00019666647577394527, + "loss": 1.0151, + "step": 228 + }, + { + "epoch": 0.2586839875741316, + "grad_norm": 0.026663288474082947, + "learning_rate": 0.00019663599586204673, + "loss": 1.0354, + "step": 229 + }, + { + "epoch": 0.25981361197401864, + "grad_norm": 0.026434747502207756, + "learning_rate": 0.0001966053796214561, + "loss": 1.0551, + "step": 230 + }, + { + "epoch": 0.26094323637390565, + "grad_norm": 0.025536926463246346, + "learning_rate": 0.0001965746270953651, + "loss": 0.9731, + "step": 231 + }, + { + "epoch": 0.2620728607737927, + "grad_norm": 0.07522192597389221, + "learning_rate": 0.0001965437383271577, + "loss": 0.9796, + "step": 232 + }, + { + "epoch": 0.2632024851736798, + "grad_norm": 0.027285447344183922, + "learning_rate": 0.00019651271336040997, + "loss": 1.011, + "step": 233 + }, + { + "epoch": 0.2643321095735668, + "grad_norm": 0.026399778202176094, + "learning_rate": 0.0001964815522388903, + "loss": 1.0199, + "step": 234 + }, + { + "epoch": 0.26546173397345385, + "grad_norm": 0.026532689109444618, + "learning_rate": 0.00019645025500655906, + "loss": 0.9918, + "step": 235 + }, + { + "epoch": 0.26659135837334086, + "grad_norm": 0.025576921179890633, + "learning_rate": 0.00019641882170756862, + "loss": 1.0198, + "step": 236 + }, + { + "epoch": 0.2677209827732279, + "grad_norm": 0.026158379390835762, + "learning_rate": 0.00019638725238626335, + "loss": 1.0204, + "step": 237 + }, + { + "epoch": 0.26885060717311493, + "grad_norm": 0.025530420243740082, + "learning_rate": 0.00019635554708717946, + "loss": 1.0885, + "step": 238 + }, + { + "epoch": 0.269980231573002, + "grad_norm": 0.02707337960600853, + "learning_rate": 0.00019632370585504502, + "loss": 1.0649, + "step": 239 + }, + { + "epoch": 0.271109855972889, + "grad_norm": 0.027028286829590797, + "learning_rate": 0.00019629172873477995, + "loss": 1.0544, + "step": 240 + }, + { + "epoch": 0.27223948037277607, + "grad_norm": 0.02564058266580105, + "learning_rate": 0.0001962596157714957, + "loss": 1.0481, + "step": 241 + }, + { + "epoch": 0.2733691047726631, + "grad_norm": 0.026479296386241913, + "learning_rate": 0.0001962273670104955, + "loss": 1.0413, + "step": 242 + }, + { + "epoch": 0.27449872917255014, + "grad_norm": 0.0330955870449543, + "learning_rate": 0.00019619498249727412, + "loss": 1.0292, + "step": 243 + }, + { + "epoch": 0.27562835357243715, + "grad_norm": 0.02611500211060047, + "learning_rate": 0.0001961624622775178, + "loss": 1.009, + "step": 244 + }, + { + "epoch": 0.2767579779723242, + "grad_norm": 0.026876097545027733, + "learning_rate": 0.00019612980639710428, + "loss": 0.9854, + "step": 245 + }, + { + "epoch": 0.2778876023722112, + "grad_norm": 0.02685077115893364, + "learning_rate": 0.00019609701490210264, + "loss": 1.0282, + "step": 246 + }, + { + "epoch": 0.2790172267720983, + "grad_norm": 0.026131028309464455, + "learning_rate": 0.00019606408783877334, + "loss": 1.0673, + "step": 247 + }, + { + "epoch": 0.2801468511719853, + "grad_norm": 0.02628222666680813, + "learning_rate": 0.00019603102525356798, + "loss": 1.0659, + "step": 248 + }, + { + "epoch": 0.28127647557187235, + "grad_norm": 0.027401477098464966, + "learning_rate": 0.00019599782719312948, + "loss": 0.9942, + "step": 249 + }, + { + "epoch": 0.2824060999717594, + "grad_norm": 0.02594529278576374, + "learning_rate": 0.00019596449370429183, + "loss": 1.0091, + "step": 250 + }, + { + "epoch": 0.2835357243716464, + "grad_norm": 0.028301890939474106, + "learning_rate": 0.00019593102483408, + "loss": 1.0083, + "step": 251 + }, + { + "epoch": 0.2846653487715335, + "grad_norm": 0.02808901108801365, + "learning_rate": 0.00019589742062971007, + "loss": 1.071, + "step": 252 + }, + { + "epoch": 0.2857949731714205, + "grad_norm": 0.02654552273452282, + "learning_rate": 0.00019586368113858892, + "loss": 1.0865, + "step": 253 + }, + { + "epoch": 0.28692459757130756, + "grad_norm": 0.02610975131392479, + "learning_rate": 0.00019582980640831443, + "loss": 1.1093, + "step": 254 + }, + { + "epoch": 0.28805422197119457, + "grad_norm": 0.027240293100476265, + "learning_rate": 0.0001957957964866751, + "loss": 1.0822, + "step": 255 + }, + { + "epoch": 0.28918384637108163, + "grad_norm": 0.027821950614452362, + "learning_rate": 0.00019576165142165032, + "loss": 1.0371, + "step": 256 + }, + { + "epoch": 0.29031347077096864, + "grad_norm": 0.02755453623831272, + "learning_rate": 0.00019572737126141002, + "loss": 1.0752, + "step": 257 + }, + { + "epoch": 0.2914430951708557, + "grad_norm": 0.02676587551832199, + "learning_rate": 0.0001956929560543147, + "loss": 1.0599, + "step": 258 + }, + { + "epoch": 0.2925727195707427, + "grad_norm": 0.02904544584453106, + "learning_rate": 0.00019565840584891549, + "loss": 1.0568, + "step": 259 + }, + { + "epoch": 0.2937023439706298, + "grad_norm": 0.027289781719446182, + "learning_rate": 0.00019562372069395384, + "loss": 1.0671, + "step": 260 + }, + { + "epoch": 0.2948319683705168, + "grad_norm": 0.025955747812986374, + "learning_rate": 0.00019558890063836167, + "loss": 0.9118, + "step": 261 + }, + { + "epoch": 0.29596159277040385, + "grad_norm": 0.028641648590564728, + "learning_rate": 0.00019555394573126118, + "loss": 1.0498, + "step": 262 + }, + { + "epoch": 0.29709121717029086, + "grad_norm": 0.028356773778796196, + "learning_rate": 0.0001955188560219648, + "loss": 1.0238, + "step": 263 + }, + { + "epoch": 0.2982208415701779, + "grad_norm": 0.02746075950562954, + "learning_rate": 0.00019548363155997517, + "loss": 1.0741, + "step": 264 + }, + { + "epoch": 0.2993504659700649, + "grad_norm": 0.02712567336857319, + "learning_rate": 0.000195448272394985, + "loss": 1.0861, + "step": 265 + }, + { + "epoch": 0.300480090369952, + "grad_norm": 0.026709580793976784, + "learning_rate": 0.00019541277857687694, + "loss": 1.0024, + "step": 266 + }, + { + "epoch": 0.30160971476983905, + "grad_norm": 0.027716003358364105, + "learning_rate": 0.00019537715015572382, + "loss": 1.0406, + "step": 267 + }, + { + "epoch": 0.30273933916972606, + "grad_norm": 0.02704858034849167, + "learning_rate": 0.00019534138718178818, + "loss": 1.0088, + "step": 268 + }, + { + "epoch": 0.3038689635696131, + "grad_norm": 0.026793915778398514, + "learning_rate": 0.00019530548970552247, + "loss": 1.0556, + "step": 269 + }, + { + "epoch": 0.30499858796950013, + "grad_norm": 0.028323287144303322, + "learning_rate": 0.00019526945777756879, + "loss": 1.057, + "step": 270 + }, + { + "epoch": 0.3061282123693872, + "grad_norm": 0.0279136560857296, + "learning_rate": 0.00019523329144875904, + "loss": 1.0654, + "step": 271 + }, + { + "epoch": 0.3072578367692742, + "grad_norm": 0.02878638356924057, + "learning_rate": 0.00019519699077011465, + "loss": 1.0357, + "step": 272 + }, + { + "epoch": 0.30838746116916127, + "grad_norm": 0.026021145284175873, + "learning_rate": 0.00019516055579284658, + "loss": 1.092, + "step": 273 + }, + { + "epoch": 0.3095170855690483, + "grad_norm": 0.0282638818025589, + "learning_rate": 0.00019512398656835528, + "loss": 1.0242, + "step": 274 + }, + { + "epoch": 0.31064670996893534, + "grad_norm": 0.0277785062789917, + "learning_rate": 0.00019508728314823062, + "loss": 1.0922, + "step": 275 + }, + { + "epoch": 0.31177633436882235, + "grad_norm": 0.027666205540299416, + "learning_rate": 0.00019505044558425168, + "loss": 1.0434, + "step": 276 + }, + { + "epoch": 0.3129059587687094, + "grad_norm": 0.02734490856528282, + "learning_rate": 0.0001950134739283869, + "loss": 1.0726, + "step": 277 + }, + { + "epoch": 0.3140355831685964, + "grad_norm": 0.026907166466116905, + "learning_rate": 0.0001949763682327938, + "loss": 1.0807, + "step": 278 + }, + { + "epoch": 0.3151652075684835, + "grad_norm": 0.02773541398346424, + "learning_rate": 0.00019493912854981905, + "loss": 1.0941, + "step": 279 + }, + { + "epoch": 0.3162948319683705, + "grad_norm": 0.027467425912618637, + "learning_rate": 0.00019490175493199833, + "loss": 1.031, + "step": 280 + }, + { + "epoch": 0.31742445636825756, + "grad_norm": 0.02712651528418064, + "learning_rate": 0.00019486424743205626, + "loss": 1.0015, + "step": 281 + }, + { + "epoch": 0.31855408076814457, + "grad_norm": 0.026572776958346367, + "learning_rate": 0.00019482660610290636, + "loss": 0.9459, + "step": 282 + }, + { + "epoch": 0.31968370516803163, + "grad_norm": 0.02701294608414173, + "learning_rate": 0.00019478883099765086, + "loss": 1.0652, + "step": 283 + }, + { + "epoch": 0.3208133295679187, + "grad_norm": 0.02713761292397976, + "learning_rate": 0.0001947509221695808, + "loss": 1.0455, + "step": 284 + }, + { + "epoch": 0.3219429539678057, + "grad_norm": 0.028251413255929947, + "learning_rate": 0.00019471287967217594, + "loss": 0.9885, + "step": 285 + }, + { + "epoch": 0.32307257836769276, + "grad_norm": 0.028362903743982315, + "learning_rate": 0.00019467470355910438, + "loss": 1.0896, + "step": 286 + }, + { + "epoch": 0.3242022027675798, + "grad_norm": 0.027835773304104805, + "learning_rate": 0.00019463639388422297, + "loss": 0.9381, + "step": 287 + }, + { + "epoch": 0.32533182716746684, + "grad_norm": 0.026659086346626282, + "learning_rate": 0.0001945979507015768, + "loss": 0.9987, + "step": 288 + }, + { + "epoch": 0.32646145156735384, + "grad_norm": 0.028285473585128784, + "learning_rate": 0.0001945593740653994, + "loss": 1.0055, + "step": 289 + }, + { + "epoch": 0.3275910759672409, + "grad_norm": 0.027459239587187767, + "learning_rate": 0.00019452066403011253, + "loss": 1.0468, + "step": 290 + }, + { + "epoch": 0.3287207003671279, + "grad_norm": 0.028836321085691452, + "learning_rate": 0.00019448182065032621, + "loss": 1.0855, + "step": 291 + }, + { + "epoch": 0.329850324767015, + "grad_norm": 0.029597043991088867, + "learning_rate": 0.00019444284398083847, + "loss": 1.1135, + "step": 292 + }, + { + "epoch": 0.330979949166902, + "grad_norm": 0.029845820739865303, + "learning_rate": 0.00019440373407663542, + "loss": 1.0117, + "step": 293 + }, + { + "epoch": 0.33210957356678905, + "grad_norm": 0.027042267844080925, + "learning_rate": 0.00019436449099289119, + "loss": 1.0173, + "step": 294 + }, + { + "epoch": 0.33323919796667606, + "grad_norm": 0.027646934613585472, + "learning_rate": 0.00019432511478496768, + "loss": 1.0777, + "step": 295 + }, + { + "epoch": 0.33323919796667606, + "eval_loss": 1.0277949571609497, + "eval_runtime": 565.1236, + "eval_samples_per_second": 17.311, + "eval_steps_per_second": 8.657, + "step": 295 + }, + { + "epoch": 0.3343688223665631, + "grad_norm": 0.026499278843402863, + "learning_rate": 0.00019428560550841472, + "loss": 0.9618, + "step": 296 + }, + { + "epoch": 0.33549844676645013, + "grad_norm": 0.027500445023179054, + "learning_rate": 0.00019424596321896976, + "loss": 0.9794, + "step": 297 + }, + { + "epoch": 0.3366280711663372, + "grad_norm": 0.027349818497896194, + "learning_rate": 0.00019420618797255795, + "loss": 1.1008, + "step": 298 + }, + { + "epoch": 0.3377576955662242, + "grad_norm": 0.027657683938741684, + "learning_rate": 0.000194166279825292, + "loss": 1.0801, + "step": 299 + }, + { + "epoch": 0.33888731996611127, + "grad_norm": 0.027384718880057335, + "learning_rate": 0.00019412623883347207, + "loss": 1.038, + "step": 300 + }, + { + "epoch": 0.34001694436599833, + "grad_norm": 0.026920663192868233, + "learning_rate": 0.00019408606505358583, + "loss": 0.9868, + "step": 301 + }, + { + "epoch": 0.34114656876588534, + "grad_norm": 0.028844624757766724, + "learning_rate": 0.00019404575854230818, + "loss": 1.0293, + "step": 302 + }, + { + "epoch": 0.3422761931657724, + "grad_norm": 0.02755833975970745, + "learning_rate": 0.00019400531935650128, + "loss": 1.0087, + "step": 303 + }, + { + "epoch": 0.3434058175656594, + "grad_norm": 0.027301400899887085, + "learning_rate": 0.00019396474755321456, + "loss": 1.0318, + "step": 304 + }, + { + "epoch": 0.3445354419655465, + "grad_norm": 0.02760390006005764, + "learning_rate": 0.0001939240431896844, + "loss": 0.9421, + "step": 305 + }, + { + "epoch": 0.3456650663654335, + "grad_norm": 0.027442464604973793, + "learning_rate": 0.00019388320632333429, + "loss": 1.0801, + "step": 306 + }, + { + "epoch": 0.34679469076532055, + "grad_norm": 0.027593247592449188, + "learning_rate": 0.00019384223701177455, + "loss": 1.0607, + "step": 307 + }, + { + "epoch": 0.34792431516520755, + "grad_norm": 0.028117630630731583, + "learning_rate": 0.00019380113531280245, + "loss": 1.054, + "step": 308 + }, + { + "epoch": 0.3490539395650946, + "grad_norm": 0.029217706993222237, + "learning_rate": 0.00019375990128440204, + "loss": 1.0997, + "step": 309 + }, + { + "epoch": 0.3501835639649816, + "grad_norm": 0.027274932712316513, + "learning_rate": 0.0001937185349847439, + "loss": 1.0051, + "step": 310 + }, + { + "epoch": 0.3513131883648687, + "grad_norm": 0.03279178589582443, + "learning_rate": 0.0001936770364721854, + "loss": 1.0293, + "step": 311 + }, + { + "epoch": 0.3524428127647557, + "grad_norm": 0.026957320049405098, + "learning_rate": 0.00019363540580527025, + "loss": 1.0358, + "step": 312 + }, + { + "epoch": 0.35357243716464276, + "grad_norm": 0.029469158500432968, + "learning_rate": 0.0001935936430427287, + "loss": 1.1446, + "step": 313 + }, + { + "epoch": 0.35470206156452977, + "grad_norm": 0.03025597333908081, + "learning_rate": 0.00019355174824347735, + "loss": 1.0722, + "step": 314 + }, + { + "epoch": 0.35583168596441683, + "grad_norm": 0.02727232687175274, + "learning_rate": 0.00019350972146661905, + "loss": 1.0592, + "step": 315 + }, + { + "epoch": 0.3569613103643039, + "grad_norm": 0.028911981731653214, + "learning_rate": 0.00019346756277144285, + "loss": 1.1644, + "step": 316 + }, + { + "epoch": 0.3580909347641909, + "grad_norm": 0.02783570997416973, + "learning_rate": 0.0001934252722174239, + "loss": 0.9406, + "step": 317 + }, + { + "epoch": 0.35922055916407797, + "grad_norm": 0.02677338756620884, + "learning_rate": 0.00019338284986422335, + "loss": 0.9287, + "step": 318 + }, + { + "epoch": 0.360350183563965, + "grad_norm": 0.027951853349804878, + "learning_rate": 0.00019334029577168827, + "loss": 0.9541, + "step": 319 + }, + { + "epoch": 0.36147980796385204, + "grad_norm": 0.028323214501142502, + "learning_rate": 0.00019329760999985167, + "loss": 1.1566, + "step": 320 + }, + { + "epoch": 0.36260943236373905, + "grad_norm": 0.027881423011422157, + "learning_rate": 0.00019325479260893223, + "loss": 1.0662, + "step": 321 + }, + { + "epoch": 0.3637390567636261, + "grad_norm": 0.02717737667262554, + "learning_rate": 0.00019321184365933433, + "loss": 1.0317, + "step": 322 + }, + { + "epoch": 0.3648686811635131, + "grad_norm": 0.028628146275877953, + "learning_rate": 0.00019316876321164798, + "loss": 1.0503, + "step": 323 + }, + { + "epoch": 0.3659983055634002, + "grad_norm": 0.02851051092147827, + "learning_rate": 0.0001931255513266487, + "loss": 1.0565, + "step": 324 + }, + { + "epoch": 0.3671279299632872, + "grad_norm": 0.02863175794482231, + "learning_rate": 0.00019308220806529738, + "loss": 1.0243, + "step": 325 + }, + { + "epoch": 0.36825755436317426, + "grad_norm": 0.03015504591166973, + "learning_rate": 0.0001930387334887403, + "loss": 1.0208, + "step": 326 + }, + { + "epoch": 0.36938717876306126, + "grad_norm": 0.02771030366420746, + "learning_rate": 0.00019299512765830895, + "loss": 1.0094, + "step": 327 + }, + { + "epoch": 0.3705168031629483, + "grad_norm": 0.027864158153533936, + "learning_rate": 0.00019295139063552007, + "loss": 0.9863, + "step": 328 + }, + { + "epoch": 0.37164642756283534, + "grad_norm": 0.028755534440279007, + "learning_rate": 0.00019290752248207537, + "loss": 1.0542, + "step": 329 + }, + { + "epoch": 0.3727760519627224, + "grad_norm": 0.029860056936740875, + "learning_rate": 0.00019286352325986164, + "loss": 1.0006, + "step": 330 + }, + { + "epoch": 0.3739056763626094, + "grad_norm": 0.027963971719145775, + "learning_rate": 0.0001928193930309505, + "loss": 0.9609, + "step": 331 + }, + { + "epoch": 0.37503530076249647, + "grad_norm": 0.02750619500875473, + "learning_rate": 0.00019277513185759844, + "loss": 1.0076, + "step": 332 + }, + { + "epoch": 0.37616492516238353, + "grad_norm": 0.02815542183816433, + "learning_rate": 0.0001927307398022467, + "loss": 1.04, + "step": 333 + }, + { + "epoch": 0.37729454956227054, + "grad_norm": 0.028742128983139992, + "learning_rate": 0.00019268621692752108, + "loss": 0.9947, + "step": 334 + }, + { + "epoch": 0.3784241739621576, + "grad_norm": 0.027735736221075058, + "learning_rate": 0.00019264156329623197, + "loss": 1.0265, + "step": 335 + }, + { + "epoch": 0.3795537983620446, + "grad_norm": 0.02745204232633114, + "learning_rate": 0.00019259677897137426, + "loss": 1.0308, + "step": 336 + }, + { + "epoch": 0.3806834227619317, + "grad_norm": 0.028459064662456512, + "learning_rate": 0.00019255186401612718, + "loss": 1.0069, + "step": 337 + }, + { + "epoch": 0.3818130471618187, + "grad_norm": 0.028107335790991783, + "learning_rate": 0.00019250681849385424, + "loss": 1.0812, + "step": 338 + }, + { + "epoch": 0.38294267156170575, + "grad_norm": 0.029490889981389046, + "learning_rate": 0.00019246164246810316, + "loss": 1.0247, + "step": 339 + }, + { + "epoch": 0.38407229596159276, + "grad_norm": 0.027926163747906685, + "learning_rate": 0.00019241633600260578, + "loss": 0.9761, + "step": 340 + }, + { + "epoch": 0.3852019203614798, + "grad_norm": 0.02847837097942829, + "learning_rate": 0.00019237089916127793, + "loss": 1.0841, + "step": 341 + }, + { + "epoch": 0.38633154476136683, + "grad_norm": 0.027178598567843437, + "learning_rate": 0.00019232533200821942, + "loss": 1.1123, + "step": 342 + }, + { + "epoch": 0.3874611691612539, + "grad_norm": 0.027773573994636536, + "learning_rate": 0.00019227963460771377, + "loss": 0.9871, + "step": 343 + }, + { + "epoch": 0.3885907935611409, + "grad_norm": 0.027409275993704796, + "learning_rate": 0.00019223380702422844, + "loss": 1.0916, + "step": 344 + }, + { + "epoch": 0.38972041796102797, + "grad_norm": 0.028152553364634514, + "learning_rate": 0.00019218784932241434, + "loss": 1.0301, + "step": 345 + }, + { + "epoch": 0.390850042360915, + "grad_norm": 0.028817711398005486, + "learning_rate": 0.00019214176156710612, + "loss": 1.0203, + "step": 346 + }, + { + "epoch": 0.39197966676080204, + "grad_norm": 0.02772883139550686, + "learning_rate": 0.0001920955438233218, + "loss": 0.9991, + "step": 347 + }, + { + "epoch": 0.39310929116068904, + "grad_norm": 0.028133943676948547, + "learning_rate": 0.00019204919615626275, + "loss": 0.9834, + "step": 348 + }, + { + "epoch": 0.3942389155605761, + "grad_norm": 0.02936532348394394, + "learning_rate": 0.00019200271863131375, + "loss": 1.0227, + "step": 349 + }, + { + "epoch": 0.3953685399604632, + "grad_norm": 0.028890248388051987, + "learning_rate": 0.0001919561113140427, + "loss": 0.9551, + "step": 350 + }, + { + "epoch": 0.3964981643603502, + "grad_norm": 0.02820666879415512, + "learning_rate": 0.0001919093742702006, + "loss": 1.0343, + "step": 351 + }, + { + "epoch": 0.39762778876023724, + "grad_norm": 0.029474567621946335, + "learning_rate": 0.00019186250756572144, + "loss": 0.9853, + "step": 352 + }, + { + "epoch": 0.39875741316012425, + "grad_norm": 0.02914329618215561, + "learning_rate": 0.0001918155112667222, + "loss": 0.9542, + "step": 353 + }, + { + "epoch": 0.3998870375600113, + "grad_norm": 0.028036657720804214, + "learning_rate": 0.00019176838543950267, + "loss": 0.945, + "step": 354 + }, + { + "epoch": 0.4010166619598983, + "grad_norm": 0.027309326454997063, + "learning_rate": 0.00019172113015054532, + "loss": 0.977, + "step": 355 + }, + { + "epoch": 0.4021462863597854, + "grad_norm": 0.027427159249782562, + "learning_rate": 0.00019167374546651526, + "loss": 1.0505, + "step": 356 + }, + { + "epoch": 0.4032759107596724, + "grad_norm": 0.03023376129567623, + "learning_rate": 0.0001916262314542602, + "loss": 1.1378, + "step": 357 + }, + { + "epoch": 0.40440553515955946, + "grad_norm": 0.027807191014289856, + "learning_rate": 0.00019157858818081026, + "loss": 1.0516, + "step": 358 + }, + { + "epoch": 0.40553515955944647, + "grad_norm": 0.028308499604463577, + "learning_rate": 0.00019153081571337795, + "loss": 1.0673, + "step": 359 + }, + { + "epoch": 0.40666478395933353, + "grad_norm": 0.028541473671793938, + "learning_rate": 0.00019148291411935796, + "loss": 1.0567, + "step": 360 + }, + { + "epoch": 0.40779440835922054, + "grad_norm": 0.027455326169729233, + "learning_rate": 0.00019143488346632723, + "loss": 1.0078, + "step": 361 + }, + { + "epoch": 0.4089240327591076, + "grad_norm": 0.02952658385038376, + "learning_rate": 0.00019138672382204471, + "loss": 1.0686, + "step": 362 + }, + { + "epoch": 0.4100536571589946, + "grad_norm": 0.028435127809643745, + "learning_rate": 0.0001913384352544514, + "loss": 0.9846, + "step": 363 + }, + { + "epoch": 0.4111832815588817, + "grad_norm": 0.028838949277997017, + "learning_rate": 0.00019129001783167005, + "loss": 1.0602, + "step": 364 + }, + { + "epoch": 0.4123129059587687, + "grad_norm": 0.029650872573256493, + "learning_rate": 0.00019124147162200535, + "loss": 0.9967, + "step": 365 + }, + { + "epoch": 0.41344253035865575, + "grad_norm": 0.028792966157197952, + "learning_rate": 0.00019119279669394353, + "loss": 1.0562, + "step": 366 + }, + { + "epoch": 0.4145721547585428, + "grad_norm": 0.029962720349431038, + "learning_rate": 0.00019114399311615253, + "loss": 1.0016, + "step": 367 + }, + { + "epoch": 0.4157017791584298, + "grad_norm": 0.029513955116271973, + "learning_rate": 0.00019109506095748167, + "loss": 1.007, + "step": 368 + }, + { + "epoch": 0.4168314035583169, + "grad_norm": 0.028869032859802246, + "learning_rate": 0.00019104600028696175, + "loss": 1.033, + "step": 369 + }, + { + "epoch": 0.4179610279582039, + "grad_norm": 0.02818440832197666, + "learning_rate": 0.00019099681117380486, + "loss": 0.9947, + "step": 370 + }, + { + "epoch": 0.41909065235809095, + "grad_norm": 0.030735397711396217, + "learning_rate": 0.00019094749368740423, + "loss": 1.031, + "step": 371 + }, + { + "epoch": 0.42022027675797796, + "grad_norm": 0.029516831040382385, + "learning_rate": 0.00019089804789733424, + "loss": 1.1093, + "step": 372 + }, + { + "epoch": 0.421349901157865, + "grad_norm": 0.028589509427547455, + "learning_rate": 0.00019084847387335025, + "loss": 1.0524, + "step": 373 + }, + { + "epoch": 0.42247952555775203, + "grad_norm": 0.029599323868751526, + "learning_rate": 0.00019079877168538855, + "loss": 1.0867, + "step": 374 + }, + { + "epoch": 0.4236091499576391, + "grad_norm": 0.029633615165948868, + "learning_rate": 0.00019074894140356624, + "loss": 1.0187, + "step": 375 + }, + { + "epoch": 0.4247387743575261, + "grad_norm": 0.029569542035460472, + "learning_rate": 0.00019069898309818106, + "loss": 1.0172, + "step": 376 + }, + { + "epoch": 0.42586839875741317, + "grad_norm": 0.02864873595535755, + "learning_rate": 0.00019064889683971149, + "loss": 1.0408, + "step": 377 + }, + { + "epoch": 0.4269980231573002, + "grad_norm": 0.02849559485912323, + "learning_rate": 0.0001905986826988164, + "loss": 1.0513, + "step": 378 + }, + { + "epoch": 0.42812764755718724, + "grad_norm": 0.028202759101986885, + "learning_rate": 0.00019054834074633506, + "loss": 1.0536, + "step": 379 + }, + { + "epoch": 0.42925727195707425, + "grad_norm": 0.02983192540705204, + "learning_rate": 0.00019049787105328715, + "loss": 1.0294, + "step": 380 + }, + { + "epoch": 0.4303868963569613, + "grad_norm": 0.028043275699019432, + "learning_rate": 0.0001904472736908725, + "loss": 0.9645, + "step": 381 + }, + { + "epoch": 0.4315165207568483, + "grad_norm": 0.02895670384168625, + "learning_rate": 0.0001903965487304711, + "loss": 1.154, + "step": 382 + }, + { + "epoch": 0.4326461451567354, + "grad_norm": 0.02832162007689476, + "learning_rate": 0.0001903456962436428, + "loss": 1.0332, + "step": 383 + }, + { + "epoch": 0.43377576955662245, + "grad_norm": 0.029863545671105385, + "learning_rate": 0.00019029471630212762, + "loss": 1.0002, + "step": 384 + }, + { + "epoch": 0.43490539395650946, + "grad_norm": 0.02890811115503311, + "learning_rate": 0.00019024360897784508, + "loss": 1.0644, + "step": 385 + }, + { + "epoch": 0.4360350183563965, + "grad_norm": 0.03050493635237217, + "learning_rate": 0.0001901923743428946, + "loss": 1.0324, + "step": 386 + }, + { + "epoch": 0.43716464275628353, + "grad_norm": 0.029246153309941292, + "learning_rate": 0.00019014101246955515, + "loss": 1.0591, + "step": 387 + }, + { + "epoch": 0.4382942671561706, + "grad_norm": 0.02876698225736618, + "learning_rate": 0.00019008952343028526, + "loss": 0.9519, + "step": 388 + }, + { + "epoch": 0.4394238915560576, + "grad_norm": 0.029059743508696556, + "learning_rate": 0.00019003790729772273, + "loss": 1.0165, + "step": 389 + }, + { + "epoch": 0.44055351595594466, + "grad_norm": 0.02885555475950241, + "learning_rate": 0.00018998616414468478, + "loss": 1.004, + "step": 390 + }, + { + "epoch": 0.44168314035583167, + "grad_norm": 0.02809917740523815, + "learning_rate": 0.00018993429404416773, + "loss": 0.9685, + "step": 391 + }, + { + "epoch": 0.44281276475571874, + "grad_norm": 0.028004605323076248, + "learning_rate": 0.0001898822970693471, + "loss": 0.9923, + "step": 392 + }, + { + "epoch": 0.44394238915560574, + "grad_norm": 0.029958872124552727, + "learning_rate": 0.00018983017329357729, + "loss": 1.0468, + "step": 393 + }, + { + "epoch": 0.4450720135554928, + "grad_norm": 0.03032870590686798, + "learning_rate": 0.00018977792279039162, + "loss": 0.9573, + "step": 394 + }, + { + "epoch": 0.4462016379553798, + "grad_norm": 0.029365211725234985, + "learning_rate": 0.0001897255456335022, + "loss": 0.9673, + "step": 395 + }, + { + "epoch": 0.4473312623552669, + "grad_norm": 0.03092394582927227, + "learning_rate": 0.00018967304189679984, + "loss": 1.1468, + "step": 396 + }, + { + "epoch": 0.4484608867551539, + "grad_norm": 0.029345886781811714, + "learning_rate": 0.00018962041165435388, + "loss": 1.1213, + "step": 397 + }, + { + "epoch": 0.44959051115504095, + "grad_norm": 0.029504388570785522, + "learning_rate": 0.0001895676549804121, + "loss": 1.0483, + "step": 398 + }, + { + "epoch": 0.450720135554928, + "grad_norm": 0.029384993016719818, + "learning_rate": 0.00018951477194940075, + "loss": 0.9973, + "step": 399 + }, + { + "epoch": 0.451849759954815, + "grad_norm": 0.02798447571694851, + "learning_rate": 0.0001894617626359242, + "loss": 1.0041, + "step": 400 + }, + { + "epoch": 0.4529793843547021, + "grad_norm": 0.028576720505952835, + "learning_rate": 0.00018940862711476513, + "loss": 1.0699, + "step": 401 + }, + { + "epoch": 0.4541090087545891, + "grad_norm": 0.029531830921769142, + "learning_rate": 0.0001893553654608841, + "loss": 1.0396, + "step": 402 + }, + { + "epoch": 0.45523863315447616, + "grad_norm": 0.02875913865864277, + "learning_rate": 0.00018930197774941974, + "loss": 1.0302, + "step": 403 + }, + { + "epoch": 0.45636825755436317, + "grad_norm": 0.02790944278240204, + "learning_rate": 0.00018924846405568845, + "loss": 1.1243, + "step": 404 + }, + { + "epoch": 0.45749788195425023, + "grad_norm": 0.02811037190258503, + "learning_rate": 0.00018919482445518436, + "loss": 1.0377, + "step": 405 + }, + { + "epoch": 0.45862750635413724, + "grad_norm": 0.029786163941025734, + "learning_rate": 0.00018914105902357925, + "loss": 0.9825, + "step": 406 + }, + { + "epoch": 0.4597571307540243, + "grad_norm": 0.028242526575922966, + "learning_rate": 0.0001890871678367224, + "loss": 1.0738, + "step": 407 + }, + { + "epoch": 0.4608867551539113, + "grad_norm": 0.028527051210403442, + "learning_rate": 0.00018903315097064055, + "loss": 1.0024, + "step": 408 + }, + { + "epoch": 0.4620163795537984, + "grad_norm": 0.02773975394666195, + "learning_rate": 0.0001889790085015376, + "loss": 1.0042, + "step": 409 + }, + { + "epoch": 0.4631460039536854, + "grad_norm": 0.028500793501734734, + "learning_rate": 0.0001889247405057948, + "loss": 1.0938, + "step": 410 + }, + { + "epoch": 0.46427562835357244, + "grad_norm": 0.028347400948405266, + "learning_rate": 0.0001888703470599704, + "loss": 0.9892, + "step": 411 + }, + { + "epoch": 0.46540525275345945, + "grad_norm": 0.030584534630179405, + "learning_rate": 0.00018881582824079965, + "loss": 0.9977, + "step": 412 + }, + { + "epoch": 0.4665348771533465, + "grad_norm": 0.030196473002433777, + "learning_rate": 0.0001887611841251947, + "loss": 1.0442, + "step": 413 + }, + { + "epoch": 0.4676645015532335, + "grad_norm": 0.02942134439945221, + "learning_rate": 0.00018870641479024438, + "loss": 1.0096, + "step": 414 + }, + { + "epoch": 0.4687941259531206, + "grad_norm": 0.0283603947609663, + "learning_rate": 0.00018865152031321427, + "loss": 1.1341, + "step": 415 + }, + { + "epoch": 0.46992375035300765, + "grad_norm": 0.02936590276658535, + "learning_rate": 0.0001885965007715464, + "loss": 1.0823, + "step": 416 + }, + { + "epoch": 0.47105337475289466, + "grad_norm": 0.029375478625297546, + "learning_rate": 0.00018854135624285935, + "loss": 1.1148, + "step": 417 + }, + { + "epoch": 0.4721829991527817, + "grad_norm": 0.02892325632274151, + "learning_rate": 0.00018848608680494788, + "loss": 1.0905, + "step": 418 + }, + { + "epoch": 0.47331262355266873, + "grad_norm": 0.028916003182530403, + "learning_rate": 0.00018843069253578312, + "loss": 1.0133, + "step": 419 + }, + { + "epoch": 0.4744422479525558, + "grad_norm": 0.03031068667769432, + "learning_rate": 0.00018837517351351214, + "loss": 0.9835, + "step": 420 + }, + { + "epoch": 0.4755718723524428, + "grad_norm": 0.02931569144129753, + "learning_rate": 0.00018831952981645817, + "loss": 0.9664, + "step": 421 + }, + { + "epoch": 0.47670149675232987, + "grad_norm": 0.029150547459721565, + "learning_rate": 0.0001882637615231202, + "loss": 0.9604, + "step": 422 + }, + { + "epoch": 0.4778311211522169, + "grad_norm": 0.03003125637769699, + "learning_rate": 0.00018820786871217305, + "loss": 1.0735, + "step": 423 + }, + { + "epoch": 0.47896074555210394, + "grad_norm": 0.030021261423826218, + "learning_rate": 0.00018815185146246716, + "loss": 1.0005, + "step": 424 + }, + { + "epoch": 0.48009036995199095, + "grad_norm": 0.029816657304763794, + "learning_rate": 0.00018809570985302862, + "loss": 0.9366, + "step": 425 + }, + { + "epoch": 0.481219994351878, + "grad_norm": 0.02971251681447029, + "learning_rate": 0.00018803944396305884, + "loss": 1.0121, + "step": 426 + }, + { + "epoch": 0.482349618751765, + "grad_norm": 0.03110647387802601, + "learning_rate": 0.00018798305387193463, + "loss": 1.0021, + "step": 427 + }, + { + "epoch": 0.4834792431516521, + "grad_norm": 0.030216267332434654, + "learning_rate": 0.000187926539659208, + "loss": 0.9594, + "step": 428 + }, + { + "epoch": 0.4846088675515391, + "grad_norm": 0.030311699956655502, + "learning_rate": 0.000187869901404606, + "loss": 1.0478, + "step": 429 + }, + { + "epoch": 0.48573849195142615, + "grad_norm": 0.028579862788319588, + "learning_rate": 0.00018781313918803086, + "loss": 0.9539, + "step": 430 + }, + { + "epoch": 0.48686811635131316, + "grad_norm": 0.03003637120127678, + "learning_rate": 0.00018775625308955942, + "loss": 1.0172, + "step": 431 + }, + { + "epoch": 0.4879977407512002, + "grad_norm": 0.03043578751385212, + "learning_rate": 0.0001876992431894435, + "loss": 0.9997, + "step": 432 + }, + { + "epoch": 0.4891273651510873, + "grad_norm": 0.03140099346637726, + "learning_rate": 0.0001876421095681095, + "loss": 1.0307, + "step": 433 + }, + { + "epoch": 0.4902569895509743, + "grad_norm": 0.03060254082083702, + "learning_rate": 0.00018758485230615837, + "loss": 0.9873, + "step": 434 + }, + { + "epoch": 0.49138661395086136, + "grad_norm": 0.030223416164517403, + "learning_rate": 0.00018752747148436543, + "loss": 1.0629, + "step": 435 + }, + { + "epoch": 0.49251623835074837, + "grad_norm": 0.030368085950613022, + "learning_rate": 0.00018746996718368037, + "loss": 0.9692, + "step": 436 + }, + { + "epoch": 0.49364586275063543, + "grad_norm": 0.03002486564218998, + "learning_rate": 0.00018741233948522707, + "loss": 1.0334, + "step": 437 + }, + { + "epoch": 0.49477548715052244, + "grad_norm": 0.029050812125205994, + "learning_rate": 0.0001873545884703035, + "loss": 0.9861, + "step": 438 + }, + { + "epoch": 0.4959051115504095, + "grad_norm": 0.030488910153508186, + "learning_rate": 0.0001872967142203815, + "loss": 1.1141, + "step": 439 + }, + { + "epoch": 0.4970347359502965, + "grad_norm": 0.029405072331428528, + "learning_rate": 0.00018723871681710697, + "loss": 1.0318, + "step": 440 + }, + { + "epoch": 0.4981643603501836, + "grad_norm": 0.030446210876107216, + "learning_rate": 0.0001871805963422993, + "loss": 0.9895, + "step": 441 + }, + { + "epoch": 0.4992939847500706, + "grad_norm": 0.029718847945332527, + "learning_rate": 0.00018712235287795176, + "loss": 1.1104, + "step": 442 + }, + { + "epoch": 0.5004236091499576, + "grad_norm": 0.03045968897640705, + "learning_rate": 0.00018706398650623088, + "loss": 0.9305, + "step": 443 + }, + { + "epoch": 0.5015532335498447, + "grad_norm": 0.030085409060120583, + "learning_rate": 0.0001870054973094767, + "loss": 1.0243, + "step": 444 + }, + { + "epoch": 0.5026828579497317, + "grad_norm": 0.030122725293040276, + "learning_rate": 0.0001869468853702026, + "loss": 1.0977, + "step": 445 + }, + { + "epoch": 0.5038124823496187, + "grad_norm": 0.03070569783449173, + "learning_rate": 0.00018688815077109498, + "loss": 1.0352, + "step": 446 + }, + { + "epoch": 0.5049421067495058, + "grad_norm": 0.029172202572226524, + "learning_rate": 0.00018682929359501338, + "loss": 1.0018, + "step": 447 + }, + { + "epoch": 0.5060717311493929, + "grad_norm": 0.02992609702050686, + "learning_rate": 0.00018677031392499023, + "loss": 1.0543, + "step": 448 + }, + { + "epoch": 0.5072013555492799, + "grad_norm": 0.03060738928616047, + "learning_rate": 0.00018671121184423076, + "loss": 0.9548, + "step": 449 + }, + { + "epoch": 0.5083309799491669, + "grad_norm": 0.03061763569712639, + "learning_rate": 0.0001866519874361129, + "loss": 1.0017, + "step": 450 + }, + { + "epoch": 0.5094606043490539, + "grad_norm": 0.031224450096488, + "learning_rate": 0.00018659264078418718, + "loss": 1.0203, + "step": 451 + }, + { + "epoch": 0.510590228748941, + "grad_norm": 0.028874509036540985, + "learning_rate": 0.00018653317197217653, + "loss": 1.0266, + "step": 452 + }, + { + "epoch": 0.5117198531488281, + "grad_norm": 0.029967116191983223, + "learning_rate": 0.00018647358108397625, + "loss": 1.0335, + "step": 453 + }, + { + "epoch": 0.512849477548715, + "grad_norm": 0.030794909223914146, + "learning_rate": 0.00018641386820365385, + "loss": 1.0284, + "step": 454 + }, + { + "epoch": 0.5139791019486021, + "grad_norm": 0.031100483611226082, + "learning_rate": 0.000186354033415449, + "loss": 1.0486, + "step": 455 + }, + { + "epoch": 0.5151087263484891, + "grad_norm": 0.030945099890232086, + "learning_rate": 0.00018629407680377318, + "loss": 1.0685, + "step": 456 + }, + { + "epoch": 0.5162383507483762, + "grad_norm": 0.030694004148244858, + "learning_rate": 0.00018623399845320993, + "loss": 0.9765, + "step": 457 + }, + { + "epoch": 0.5173679751482632, + "grad_norm": 0.03131450340151787, + "learning_rate": 0.00018617379844851443, + "loss": 1.0927, + "step": 458 + }, + { + "epoch": 0.5184975995481502, + "grad_norm": 0.030793707817792892, + "learning_rate": 0.00018611347687461349, + "loss": 0.9999, + "step": 459 + }, + { + "epoch": 0.5196272239480373, + "grad_norm": 0.029182102531194687, + "learning_rate": 0.00018605303381660543, + "loss": 0.967, + "step": 460 + }, + { + "epoch": 0.5207568483479244, + "grad_norm": 0.030693160369992256, + "learning_rate": 0.00018599246935976, + "loss": 1.084, + "step": 461 + }, + { + "epoch": 0.5218864727478113, + "grad_norm": 0.030196724459528923, + "learning_rate": 0.0001859317835895181, + "loss": 1.024, + "step": 462 + }, + { + "epoch": 0.5230160971476984, + "grad_norm": 0.029934274032711983, + "learning_rate": 0.0001858709765914919, + "loss": 1.0975, + "step": 463 + }, + { + "epoch": 0.5241457215475854, + "grad_norm": 0.030209926888346672, + "learning_rate": 0.00018581004845146453, + "loss": 1.0485, + "step": 464 + }, + { + "epoch": 0.5252753459474725, + "grad_norm": 0.0305222999304533, + "learning_rate": 0.00018574899925538998, + "loss": 1.0272, + "step": 465 + }, + { + "epoch": 0.5264049703473596, + "grad_norm": 0.029943542554974556, + "learning_rate": 0.00018568782908939309, + "loss": 1.0122, + "step": 466 + }, + { + "epoch": 0.5275345947472465, + "grad_norm": 0.02910439483821392, + "learning_rate": 0.00018562653803976936, + "loss": 0.8831, + "step": 467 + }, + { + "epoch": 0.5286642191471336, + "grad_norm": 0.030156375840306282, + "learning_rate": 0.00018556512619298472, + "loss": 1.0245, + "step": 468 + }, + { + "epoch": 0.5297938435470206, + "grad_norm": 0.029457733035087585, + "learning_rate": 0.00018550359363567567, + "loss": 0.9933, + "step": 469 + }, + { + "epoch": 0.5309234679469077, + "grad_norm": 0.03006352297961712, + "learning_rate": 0.00018544194045464886, + "loss": 0.9978, + "step": 470 + }, + { + "epoch": 0.5320530923467947, + "grad_norm": 0.03152355179190636, + "learning_rate": 0.0001853801667368812, + "loss": 0.9832, + "step": 471 + }, + { + "epoch": 0.5331827167466817, + "grad_norm": 0.02921919897198677, + "learning_rate": 0.00018531827256951962, + "loss": 0.9178, + "step": 472 + }, + { + "epoch": 0.5343123411465688, + "grad_norm": 0.031064407899975777, + "learning_rate": 0.00018525625803988104, + "loss": 1.0384, + "step": 473 + }, + { + "epoch": 0.5354419655464558, + "grad_norm": 0.029859617352485657, + "learning_rate": 0.00018519412323545194, + "loss": 0.9886, + "step": 474 + }, + { + "epoch": 0.5365715899463428, + "grad_norm": 0.030883649364113808, + "learning_rate": 0.00018513186824388879, + "loss": 1.1247, + "step": 475 + }, + { + "epoch": 0.5377012143462299, + "grad_norm": 0.030706819146871567, + "learning_rate": 0.00018506949315301742, + "loss": 0.9923, + "step": 476 + }, + { + "epoch": 0.5388308387461169, + "grad_norm": 0.02973487228155136, + "learning_rate": 0.00018500699805083318, + "loss": 0.9388, + "step": 477 + }, + { + "epoch": 0.539960463146004, + "grad_norm": 0.03165286406874657, + "learning_rate": 0.00018494438302550062, + "loss": 1.0297, + "step": 478 + }, + { + "epoch": 0.5410900875458909, + "grad_norm": 0.0324639268219471, + "learning_rate": 0.0001848816481653536, + "loss": 1.0399, + "step": 479 + }, + { + "epoch": 0.542219711945778, + "grad_norm": 0.03156152740120888, + "learning_rate": 0.00018481879355889495, + "loss": 0.9528, + "step": 480 + }, + { + "epoch": 0.5433493363456651, + "grad_norm": 0.030102282762527466, + "learning_rate": 0.00018475581929479646, + "loss": 0.9972, + "step": 481 + }, + { + "epoch": 0.5444789607455521, + "grad_norm": 0.03062708117067814, + "learning_rate": 0.0001846927254618987, + "loss": 0.9629, + "step": 482 + }, + { + "epoch": 0.5456085851454392, + "grad_norm": 0.02973772957921028, + "learning_rate": 0.000184629512149211, + "loss": 1.1059, + "step": 483 + }, + { + "epoch": 0.5467382095453261, + "grad_norm": 0.030491316691040993, + "learning_rate": 0.00018456617944591111, + "loss": 1.093, + "step": 484 + }, + { + "epoch": 0.5478678339452132, + "grad_norm": 0.029982471838593483, + "learning_rate": 0.00018450272744134532, + "loss": 1.0719, + "step": 485 + }, + { + "epoch": 0.5489974583451003, + "grad_norm": 0.03204856067895889, + "learning_rate": 0.00018443915622502822, + "loss": 1.0136, + "step": 486 + }, + { + "epoch": 0.5501270827449873, + "grad_norm": 0.030183738097548485, + "learning_rate": 0.00018437546588664252, + "loss": 1.0613, + "step": 487 + }, + { + "epoch": 0.5512567071448743, + "grad_norm": 0.03049345500767231, + "learning_rate": 0.00018431165651603903, + "loss": 0.9428, + "step": 488 + }, + { + "epoch": 0.5523863315447614, + "grad_norm": 0.030976206064224243, + "learning_rate": 0.00018424772820323644, + "loss": 0.9908, + "step": 489 + }, + { + "epoch": 0.5535159559446484, + "grad_norm": 0.030059922486543655, + "learning_rate": 0.00018418368103842125, + "loss": 0.9546, + "step": 490 + }, + { + "epoch": 0.5546455803445355, + "grad_norm": 0.029848681762814522, + "learning_rate": 0.0001841195151119477, + "loss": 1.0269, + "step": 491 + }, + { + "epoch": 0.5557752047444224, + "grad_norm": 0.03216058760881424, + "learning_rate": 0.00018405523051433743, + "loss": 0.9717, + "step": 492 + }, + { + "epoch": 0.5569048291443095, + "grad_norm": 0.030524935573339462, + "learning_rate": 0.00018399082733627965, + "loss": 1.0208, + "step": 493 + }, + { + "epoch": 0.5580344535441966, + "grad_norm": 0.03152266517281532, + "learning_rate": 0.00018392630566863076, + "loss": 1.0353, + "step": 494 + }, + { + "epoch": 0.5591640779440836, + "grad_norm": 0.03233015537261963, + "learning_rate": 0.00018386166560241434, + "loss": 1.1238, + "step": 495 + }, + { + "epoch": 0.5602937023439706, + "grad_norm": 0.031183136627078056, + "learning_rate": 0.000183796907228821, + "loss": 1.0266, + "step": 496 + }, + { + "epoch": 0.5614233267438576, + "grad_norm": 0.030228251591324806, + "learning_rate": 0.00018373203063920822, + "loss": 1.0074, + "step": 497 + }, + { + "epoch": 0.5625529511437447, + "grad_norm": 0.031268905848264694, + "learning_rate": 0.00018366703592510034, + "loss": 1.0106, + "step": 498 + }, + { + "epoch": 0.5636825755436318, + "grad_norm": 0.031185952946543694, + "learning_rate": 0.0001836019231781883, + "loss": 1.0476, + "step": 499 + }, + { + "epoch": 0.5648121999435188, + "grad_norm": 0.03026709146797657, + "learning_rate": 0.0001835366924903295, + "loss": 1.0619, + "step": 500 + }, + { + "epoch": 0.5659418243434058, + "grad_norm": 0.029817136004567146, + "learning_rate": 0.00018347134395354776, + "loss": 1.0016, + "step": 501 + }, + { + "epoch": 0.5670714487432928, + "grad_norm": 0.030526304617524147, + "learning_rate": 0.00018340587766003323, + "loss": 1.0559, + "step": 502 + }, + { + "epoch": 0.5682010731431799, + "grad_norm": 0.03136800602078438, + "learning_rate": 0.00018334029370214208, + "loss": 0.9867, + "step": 503 + }, + { + "epoch": 0.569330697543067, + "grad_norm": 0.030273810029029846, + "learning_rate": 0.0001832745921723965, + "loss": 0.9358, + "step": 504 + }, + { + "epoch": 0.5704603219429539, + "grad_norm": 0.02991536259651184, + "learning_rate": 0.00018320877316348454, + "loss": 0.9964, + "step": 505 + }, + { + "epoch": 0.571589946342841, + "grad_norm": 0.031318966299295425, + "learning_rate": 0.00018314283676826009, + "loss": 0.9946, + "step": 506 + }, + { + "epoch": 0.5727195707427281, + "grad_norm": 0.030620397999882698, + "learning_rate": 0.00018307678307974241, + "loss": 1.0597, + "step": 507 + }, + { + "epoch": 0.5738491951426151, + "grad_norm": 0.03023059107363224, + "learning_rate": 0.0001830106121911165, + "loss": 0.9825, + "step": 508 + }, + { + "epoch": 0.5749788195425021, + "grad_norm": 0.03067387081682682, + "learning_rate": 0.0001829443241957325, + "loss": 0.9863, + "step": 509 + }, + { + "epoch": 0.5761084439423891, + "grad_norm": 0.03259598836302757, + "learning_rate": 0.00018287791918710587, + "loss": 1.0366, + "step": 510 + }, + { + "epoch": 0.5772380683422762, + "grad_norm": 0.03081597201526165, + "learning_rate": 0.00018281139725891707, + "loss": 1.144, + "step": 511 + }, + { + "epoch": 0.5783676927421633, + "grad_norm": 0.03100423514842987, + "learning_rate": 0.00018274475850501158, + "loss": 1.011, + "step": 512 + }, + { + "epoch": 0.5794973171420502, + "grad_norm": 0.030796082690358162, + "learning_rate": 0.00018267800301939965, + "loss": 0.8843, + "step": 513 + }, + { + "epoch": 0.5806269415419373, + "grad_norm": 0.030977580696344376, + "learning_rate": 0.00018261113089625613, + "loss": 1.0606, + "step": 514 + }, + { + "epoch": 0.5817565659418243, + "grad_norm": 0.03037908300757408, + "learning_rate": 0.0001825441422299206, + "loss": 0.9751, + "step": 515 + }, + { + "epoch": 0.5828861903417114, + "grad_norm": 0.03079284355044365, + "learning_rate": 0.00018247703711489686, + "loss": 1.0062, + "step": 516 + }, + { + "epoch": 0.5840158147415985, + "grad_norm": 0.031534090638160706, + "learning_rate": 0.00018240981564585313, + "loss": 0.949, + "step": 517 + }, + { + "epoch": 0.5851454391414854, + "grad_norm": 0.03137180209159851, + "learning_rate": 0.0001823424779176217, + "loss": 1.0799, + "step": 518 + }, + { + "epoch": 0.5862750635413725, + "grad_norm": 0.0305685643106699, + "learning_rate": 0.00018227502402519893, + "loss": 1.0609, + "step": 519 + }, + { + "epoch": 0.5874046879412596, + "grad_norm": 0.02950458414852619, + "learning_rate": 0.00018220745406374498, + "loss": 0.9671, + "step": 520 + }, + { + "epoch": 0.5885343123411466, + "grad_norm": 0.030199820175766945, + "learning_rate": 0.00018213976812858382, + "loss": 1.0684, + "step": 521 + }, + { + "epoch": 0.5896639367410336, + "grad_norm": 0.031708989292383194, + "learning_rate": 0.00018207196631520297, + "loss": 0.9994, + "step": 522 + }, + { + "epoch": 0.5907935611409206, + "grad_norm": 0.03120891936123371, + "learning_rate": 0.00018200404871925353, + "loss": 1.001, + "step": 523 + }, + { + "epoch": 0.5919231855408077, + "grad_norm": 0.033152077347040176, + "learning_rate": 0.0001819360154365498, + "loss": 1.0489, + "step": 524 + }, + { + "epoch": 0.5930528099406948, + "grad_norm": 0.03135927394032478, + "learning_rate": 0.00018186786656306935, + "loss": 1.1065, + "step": 525 + }, + { + "epoch": 0.5941824343405817, + "grad_norm": 0.030605459585785866, + "learning_rate": 0.0001817996021949529, + "loss": 1.0116, + "step": 526 + }, + { + "epoch": 0.5953120587404688, + "grad_norm": 0.031958550214767456, + "learning_rate": 0.00018173122242850397, + "loss": 1.0113, + "step": 527 + }, + { + "epoch": 0.5964416831403558, + "grad_norm": 0.033079009503126144, + "learning_rate": 0.00018166272736018895, + "loss": 0.9531, + "step": 528 + }, + { + "epoch": 0.5975713075402429, + "grad_norm": 0.0316440686583519, + "learning_rate": 0.00018159411708663684, + "loss": 0.9916, + "step": 529 + }, + { + "epoch": 0.5987009319401299, + "grad_norm": 0.030489858239889145, + "learning_rate": 0.00018152539170463925, + "loss": 0.995, + "step": 530 + }, + { + "epoch": 0.5998305563400169, + "grad_norm": 0.0322355218231678, + "learning_rate": 0.00018145655131115009, + "loss": 1.0784, + "step": 531 + }, + { + "epoch": 0.600960180739904, + "grad_norm": 0.03130833059549332, + "learning_rate": 0.00018138759600328563, + "loss": 1.0537, + "step": 532 + }, + { + "epoch": 0.602089805139791, + "grad_norm": 0.031001951545476913, + "learning_rate": 0.0001813185258783241, + "loss": 0.9956, + "step": 533 + }, + { + "epoch": 0.6032194295396781, + "grad_norm": 0.03067929483950138, + "learning_rate": 0.0001812493410337058, + "loss": 1.0148, + "step": 534 + }, + { + "epoch": 0.6043490539395651, + "grad_norm": 0.03192298486828804, + "learning_rate": 0.00018118004156703296, + "loss": 0.9635, + "step": 535 + }, + { + "epoch": 0.6054786783394521, + "grad_norm": 0.031253885477781296, + "learning_rate": 0.00018111062757606932, + "loss": 0.9987, + "step": 536 + }, + { + "epoch": 0.6066083027393392, + "grad_norm": 0.031125715002417564, + "learning_rate": 0.0001810410991587403, + "loss": 0.9915, + "step": 537 + }, + { + "epoch": 0.6077379271392263, + "grad_norm": 0.03175501897931099, + "learning_rate": 0.00018097145641313272, + "loss": 1.0357, + "step": 538 + }, + { + "epoch": 0.6088675515391132, + "grad_norm": 0.031910236924886703, + "learning_rate": 0.00018090169943749476, + "loss": 1.0679, + "step": 539 + }, + { + "epoch": 0.6099971759390003, + "grad_norm": 0.03214259445667267, + "learning_rate": 0.00018083182833023562, + "loss": 1.0173, + "step": 540 + }, + { + "epoch": 0.6111268003388873, + "grad_norm": 0.03169810026884079, + "learning_rate": 0.00018076184318992558, + "loss": 1.0428, + "step": 541 + }, + { + "epoch": 0.6122564247387744, + "grad_norm": 0.03129338473081589, + "learning_rate": 0.00018069174411529577, + "loss": 1.0236, + "step": 542 + }, + { + "epoch": 0.6133860491386613, + "grad_norm": 0.03245764225721359, + "learning_rate": 0.0001806215312052381, + "loss": 1.0081, + "step": 543 + }, + { + "epoch": 0.6145156735385484, + "grad_norm": 0.030435949563980103, + "learning_rate": 0.0001805512045588051, + "loss": 1.0731, + "step": 544 + }, + { + "epoch": 0.6156452979384355, + "grad_norm": 0.030730856582522392, + "learning_rate": 0.0001804807642752096, + "loss": 1.0793, + "step": 545 + }, + { + "epoch": 0.6167749223383225, + "grad_norm": 0.02937515825033188, + "learning_rate": 0.00018041021045382485, + "loss": 1.0123, + "step": 546 + }, + { + "epoch": 0.6179045467382095, + "grad_norm": 0.03019302524626255, + "learning_rate": 0.0001803395431941843, + "loss": 1.0232, + "step": 547 + }, + { + "epoch": 0.6190341711380966, + "grad_norm": 0.04123188927769661, + "learning_rate": 0.00018026876259598135, + "loss": 1.0309, + "step": 548 + }, + { + "epoch": 0.6201637955379836, + "grad_norm": 0.03046722523868084, + "learning_rate": 0.00018019786875906935, + "loss": 0.9721, + "step": 549 + }, + { + "epoch": 0.6212934199378707, + "grad_norm": 0.033260468393564224, + "learning_rate": 0.00018012686178346142, + "loss": 1.0726, + "step": 550 + }, + { + "epoch": 0.6224230443377577, + "grad_norm": 0.03144606575369835, + "learning_rate": 0.0001800557417693302, + "loss": 0.947, + "step": 551 + }, + { + "epoch": 0.6235526687376447, + "grad_norm": 0.03095083311200142, + "learning_rate": 0.00017998450881700787, + "loss": 0.9937, + "step": 552 + }, + { + "epoch": 0.6246822931375318, + "grad_norm": 0.03133854269981384, + "learning_rate": 0.00017991316302698595, + "loss": 0.9502, + "step": 553 + }, + { + "epoch": 0.6258119175374188, + "grad_norm": 0.03148304298520088, + "learning_rate": 0.00017984170449991506, + "loss": 1.1628, + "step": 554 + }, + { + "epoch": 0.6269415419373059, + "grad_norm": 0.03164827451109886, + "learning_rate": 0.000179770133336605, + "loss": 0.8814, + "step": 555 + }, + { + "epoch": 0.6280711663371928, + "grad_norm": 0.03083074651658535, + "learning_rate": 0.0001796984496380243, + "loss": 0.9999, + "step": 556 + }, + { + "epoch": 0.6292007907370799, + "grad_norm": 0.03223288804292679, + "learning_rate": 0.0001796266535053004, + "loss": 1.0819, + "step": 557 + }, + { + "epoch": 0.630330415136967, + "grad_norm": 0.03053288348019123, + "learning_rate": 0.00017955474503971925, + "loss": 1.1077, + "step": 558 + }, + { + "epoch": 0.631460039536854, + "grad_norm": 0.03127776086330414, + "learning_rate": 0.00017948272434272535, + "loss": 1.041, + "step": 559 + }, + { + "epoch": 0.632589663936741, + "grad_norm": 0.03209880739450455, + "learning_rate": 0.00017941059151592147, + "loss": 1.0081, + "step": 560 + }, + { + "epoch": 0.633719288336628, + "grad_norm": 0.02959609404206276, + "learning_rate": 0.00017933834666106864, + "loss": 0.9875, + "step": 561 + }, + { + "epoch": 0.6348489127365151, + "grad_norm": 0.03344092145562172, + "learning_rate": 0.00017926598988008582, + "loss": 0.9677, + "step": 562 + }, + { + "epoch": 0.6359785371364022, + "grad_norm": 0.03260407596826553, + "learning_rate": 0.00017919352127505, + "loss": 1.0449, + "step": 563 + }, + { + "epoch": 0.6371081615362891, + "grad_norm": 0.031249945983290672, + "learning_rate": 0.0001791209409481958, + "loss": 1.0662, + "step": 564 + }, + { + "epoch": 0.6382377859361762, + "grad_norm": 0.031923823058605194, + "learning_rate": 0.00017904824900191556, + "loss": 1.0379, + "step": 565 + }, + { + "epoch": 0.6393674103360633, + "grad_norm": 0.030242929235100746, + "learning_rate": 0.00017897544553875902, + "loss": 1.0257, + "step": 566 + }, + { + "epoch": 0.6404970347359503, + "grad_norm": 0.032716382294893265, + "learning_rate": 0.00017890253066143324, + "loss": 0.9987, + "step": 567 + }, + { + "epoch": 0.6416266591358374, + "grad_norm": 0.03140626102685928, + "learning_rate": 0.0001788295044728025, + "loss": 1.0162, + "step": 568 + }, + { + "epoch": 0.6427562835357243, + "grad_norm": 0.029912738129496574, + "learning_rate": 0.0001787563670758881, + "loss": 1.0318, + "step": 569 + }, + { + "epoch": 0.6438859079356114, + "grad_norm": 0.03130066394805908, + "learning_rate": 0.0001786831185738682, + "loss": 1.0026, + "step": 570 + }, + { + "epoch": 0.6450155323354985, + "grad_norm": 0.033079057931900024, + "learning_rate": 0.00017860975907007772, + "loss": 1.0262, + "step": 571 + }, + { + "epoch": 0.6461451567353855, + "grad_norm": 0.03027520515024662, + "learning_rate": 0.00017853628866800812, + "loss": 1.0075, + "step": 572 + }, + { + "epoch": 0.6472747811352725, + "grad_norm": 0.03166157007217407, + "learning_rate": 0.00017846270747130742, + "loss": 1.0858, + "step": 573 + }, + { + "epoch": 0.6484044055351595, + "grad_norm": 0.03081650286912918, + "learning_rate": 0.00017838901558377986, + "loss": 1.0215, + "step": 574 + }, + { + "epoch": 0.6495340299350466, + "grad_norm": 0.03256387263536453, + "learning_rate": 0.0001783152131093859, + "loss": 0.973, + "step": 575 + }, + { + "epoch": 0.6506636543349337, + "grad_norm": 0.030604898929595947, + "learning_rate": 0.00017824130015224192, + "loss": 1.057, + "step": 576 + }, + { + "epoch": 0.6517932787348206, + "grad_norm": 0.030695458874106407, + "learning_rate": 0.00017816727681662023, + "loss": 1.0804, + "step": 577 + }, + { + "epoch": 0.6529229031347077, + "grad_norm": 0.031340498477220535, + "learning_rate": 0.0001780931432069488, + "loss": 0.9722, + "step": 578 + }, + { + "epoch": 0.6540525275345948, + "grad_norm": 0.03206819295883179, + "learning_rate": 0.00017801889942781126, + "loss": 1.0593, + "step": 579 + }, + { + "epoch": 0.6551821519344818, + "grad_norm": 0.030380915850400925, + "learning_rate": 0.00017794454558394657, + "loss": 0.9263, + "step": 580 + }, + { + "epoch": 0.6563117763343688, + "grad_norm": 0.03320132568478584, + "learning_rate": 0.00017787008178024905, + "loss": 1.0798, + "step": 581 + }, + { + "epoch": 0.6574414007342558, + "grad_norm": 0.0311865396797657, + "learning_rate": 0.00017779550812176806, + "loss": 0.9205, + "step": 582 + }, + { + "epoch": 0.6585710251341429, + "grad_norm": 0.032210152596235275, + "learning_rate": 0.00017772082471370797, + "loss": 1.0411, + "step": 583 + }, + { + "epoch": 0.65970064953403, + "grad_norm": 0.03178109973669052, + "learning_rate": 0.00017764603166142798, + "loss": 1.0502, + "step": 584 + }, + { + "epoch": 0.660830273933917, + "grad_norm": 0.0323721244931221, + "learning_rate": 0.000177571129070442, + "loss": 1.0372, + "step": 585 + }, + { + "epoch": 0.661959898333804, + "grad_norm": 0.031241752207279205, + "learning_rate": 0.0001774961170464184, + "loss": 0.9741, + "step": 586 + }, + { + "epoch": 0.663089522733691, + "grad_norm": 0.03263148292899132, + "learning_rate": 0.00017742099569518, + "loss": 1.0956, + "step": 587 + }, + { + "epoch": 0.6642191471335781, + "grad_norm": 0.031760070472955704, + "learning_rate": 0.00017734576512270383, + "loss": 0.9795, + "step": 588 + }, + { + "epoch": 0.6653487715334652, + "grad_norm": 0.03184381127357483, + "learning_rate": 0.00017727042543512099, + "loss": 0.9054, + "step": 589 + }, + { + "epoch": 0.6664783959333521, + "grad_norm": 0.03145081177353859, + "learning_rate": 0.00017719497673871653, + "loss": 1.0219, + "step": 590 + }, + { + "epoch": 0.6664783959333521, + "eval_loss": 1.0118999481201172, + "eval_runtime": 547.41, + "eval_samples_per_second": 17.871, + "eval_steps_per_second": 8.937, + "step": 590 + }, + { + "epoch": 0.6676080203332392, + "grad_norm": 0.03253559768199921, + "learning_rate": 0.00017711941913992928, + "loss": 0.9635, + "step": 591 + }, + { + "epoch": 0.6687376447331262, + "grad_norm": 0.03181855380535126, + "learning_rate": 0.00017704375274535167, + "loss": 0.8852, + "step": 592 + }, + { + "epoch": 0.6698672691330133, + "grad_norm": 0.03165988251566887, + "learning_rate": 0.0001769679776617297, + "loss": 1.0201, + "step": 593 + }, + { + "epoch": 0.6709968935329003, + "grad_norm": 0.03077312745153904, + "learning_rate": 0.00017689209399596257, + "loss": 1.0307, + "step": 594 + }, + { + "epoch": 0.6721265179327873, + "grad_norm": 0.032232630997896194, + "learning_rate": 0.00017681610185510285, + "loss": 1.0121, + "step": 595 + }, + { + "epoch": 0.6732561423326744, + "grad_norm": 0.03249699994921684, + "learning_rate": 0.0001767400013463559, + "loss": 0.9288, + "step": 596 + }, + { + "epoch": 0.6743857667325615, + "grad_norm": 0.03216133266687393, + "learning_rate": 0.0001766637925770802, + "loss": 0.9665, + "step": 597 + }, + { + "epoch": 0.6755153911324484, + "grad_norm": 0.03151794150471687, + "learning_rate": 0.00017658747565478677, + "loss": 1.0497, + "step": 598 + }, + { + "epoch": 0.6766450155323355, + "grad_norm": 0.03118024580180645, + "learning_rate": 0.00017651105068713935, + "loss": 0.9403, + "step": 599 + }, + { + "epoch": 0.6777746399322225, + "grad_norm": 0.030804403126239777, + "learning_rate": 0.00017643451778195395, + "loss": 1.0011, + "step": 600 + }, + { + "epoch": 0.6789042643321096, + "grad_norm": 0.03352154418826103, + "learning_rate": 0.000176357877047199, + "loss": 1.0294, + "step": 601 + }, + { + "epoch": 0.6800338887319967, + "grad_norm": 0.03205511346459389, + "learning_rate": 0.00017628112859099498, + "loss": 1.0487, + "step": 602 + }, + { + "epoch": 0.6811635131318836, + "grad_norm": 0.031228026375174522, + "learning_rate": 0.00017620427252161433, + "loss": 0.9319, + "step": 603 + }, + { + "epoch": 0.6822931375317707, + "grad_norm": 0.031973280012607574, + "learning_rate": 0.00017612730894748136, + "loss": 1.0829, + "step": 604 + }, + { + "epoch": 0.6834227619316577, + "grad_norm": 0.03331442177295685, + "learning_rate": 0.00017605023797717195, + "loss": 1.0669, + "step": 605 + }, + { + "epoch": 0.6845523863315448, + "grad_norm": 0.0336139053106308, + "learning_rate": 0.00017597305971941358, + "loss": 1.0722, + "step": 606 + }, + { + "epoch": 0.6856820107314318, + "grad_norm": 0.03086121752858162, + "learning_rate": 0.00017589577428308502, + "loss": 1.092, + "step": 607 + }, + { + "epoch": 0.6868116351313188, + "grad_norm": 0.032204341143369675, + "learning_rate": 0.0001758183817772163, + "loss": 0.9483, + "step": 608 + }, + { + "epoch": 0.6879412595312059, + "grad_norm": 0.03183162584900856, + "learning_rate": 0.00017574088231098843, + "loss": 1.0029, + "step": 609 + }, + { + "epoch": 0.689070883931093, + "grad_norm": 0.031096026301383972, + "learning_rate": 0.00017566327599373338, + "loss": 1.0094, + "step": 610 + }, + { + "epoch": 0.6902005083309799, + "grad_norm": 0.032303981482982635, + "learning_rate": 0.0001755855629349338, + "loss": 0.976, + "step": 611 + }, + { + "epoch": 0.691330132730867, + "grad_norm": 0.03237254545092583, + "learning_rate": 0.00017550774324422296, + "loss": 0.9472, + "step": 612 + }, + { + "epoch": 0.692459757130754, + "grad_norm": 0.03161952272057533, + "learning_rate": 0.0001754298170313846, + "loss": 0.995, + "step": 613 + }, + { + "epoch": 0.6935893815306411, + "grad_norm": 0.032882727682590485, + "learning_rate": 0.00017535178440635264, + "loss": 0.9078, + "step": 614 + }, + { + "epoch": 0.694719005930528, + "grad_norm": 0.030476143583655357, + "learning_rate": 0.0001752736454792112, + "loss": 0.9488, + "step": 615 + }, + { + "epoch": 0.6958486303304151, + "grad_norm": 0.032640308141708374, + "learning_rate": 0.00017519540036019428, + "loss": 0.9968, + "step": 616 + }, + { + "epoch": 0.6969782547303022, + "grad_norm": 0.03207506611943245, + "learning_rate": 0.00017511704915968581, + "loss": 1.0598, + "step": 617 + }, + { + "epoch": 0.6981078791301892, + "grad_norm": 0.04379906877875328, + "learning_rate": 0.0001750385919882193, + "loss": 1.0801, + "step": 618 + }, + { + "epoch": 0.6992375035300763, + "grad_norm": 0.03258811682462692, + "learning_rate": 0.00017496002895647775, + "loss": 1.0901, + "step": 619 + }, + { + "epoch": 0.7003671279299633, + "grad_norm": 0.034346289932727814, + "learning_rate": 0.0001748813601752935, + "loss": 0.9995, + "step": 620 + }, + { + "epoch": 0.7014967523298503, + "grad_norm": 0.0332537442445755, + "learning_rate": 0.0001748025857556481, + "loss": 1.0382, + "step": 621 + }, + { + "epoch": 0.7026263767297374, + "grad_norm": 0.031845249235630035, + "learning_rate": 0.0001747237058086722, + "loss": 1.1217, + "step": 622 + }, + { + "epoch": 0.7037560011296244, + "grad_norm": 0.032574612647295, + "learning_rate": 0.00017464472044564512, + "loss": 0.8765, + "step": 623 + }, + { + "epoch": 0.7048856255295114, + "grad_norm": 0.031227873638272285, + "learning_rate": 0.00017456562977799514, + "loss": 0.9676, + "step": 624 + }, + { + "epoch": 0.7060152499293985, + "grad_norm": 0.032643262296915054, + "learning_rate": 0.00017448643391729888, + "loss": 0.9842, + "step": 625 + }, + { + "epoch": 0.7071448743292855, + "grad_norm": 0.031137650832533836, + "learning_rate": 0.00017440713297528154, + "loss": 0.9877, + "step": 626 + }, + { + "epoch": 0.7082744987291726, + "grad_norm": 0.030961019918322563, + "learning_rate": 0.0001743277270638164, + "loss": 1.017, + "step": 627 + }, + { + "epoch": 0.7094041231290595, + "grad_norm": 0.032677747309207916, + "learning_rate": 0.00017424821629492495, + "loss": 1.0023, + "step": 628 + }, + { + "epoch": 0.7105337475289466, + "grad_norm": 0.032716501504182816, + "learning_rate": 0.00017416860078077657, + "loss": 0.9893, + "step": 629 + }, + { + "epoch": 0.7116633719288337, + "grad_norm": 0.03239135444164276, + "learning_rate": 0.0001740888806336884, + "loss": 0.9949, + "step": 630 + }, + { + "epoch": 0.7127929963287207, + "grad_norm": 0.03276536986231804, + "learning_rate": 0.0001740090559661252, + "loss": 1.0778, + "step": 631 + }, + { + "epoch": 0.7139226207286078, + "grad_norm": 0.030909627676010132, + "learning_rate": 0.00017392912689069917, + "loss": 1.0098, + "step": 632 + }, + { + "epoch": 0.7150522451284947, + "grad_norm": 0.032094262540340424, + "learning_rate": 0.00017384909352016975, + "loss": 0.9703, + "step": 633 + }, + { + "epoch": 0.7161818695283818, + "grad_norm": 0.03388513997197151, + "learning_rate": 0.00017376895596744367, + "loss": 1.0014, + "step": 634 + }, + { + "epoch": 0.7173114939282689, + "grad_norm": 0.03186871111392975, + "learning_rate": 0.00017368871434557447, + "loss": 1.0076, + "step": 635 + }, + { + "epoch": 0.7184411183281559, + "grad_norm": 0.03189585730433464, + "learning_rate": 0.00017360836876776256, + "loss": 0.9721, + "step": 636 + }, + { + "epoch": 0.7195707427280429, + "grad_norm": 0.033433668315410614, + "learning_rate": 0.0001735279193473551, + "loss": 0.9798, + "step": 637 + }, + { + "epoch": 0.72070036712793, + "grad_norm": 0.031073307618498802, + "learning_rate": 0.00017344736619784553, + "loss": 0.9629, + "step": 638 + }, + { + "epoch": 0.721829991527817, + "grad_norm": 0.030326619744300842, + "learning_rate": 0.00017336670943287388, + "loss": 1.0727, + "step": 639 + }, + { + "epoch": 0.7229596159277041, + "grad_norm": 0.03189557045698166, + "learning_rate": 0.00017328594916622616, + "loss": 1.0175, + "step": 640 + }, + { + "epoch": 0.724089240327591, + "grad_norm": 0.03241390734910965, + "learning_rate": 0.00017320508551183446, + "loss": 1.1313, + "step": 641 + }, + { + "epoch": 0.7252188647274781, + "grad_norm": 0.0323265865445137, + "learning_rate": 0.0001731241185837768, + "loss": 1.0418, + "step": 642 + }, + { + "epoch": 0.7263484891273652, + "grad_norm": 0.032344575971364975, + "learning_rate": 0.00017304304849627677, + "loss": 1.0882, + "step": 643 + }, + { + "epoch": 0.7274781135272522, + "grad_norm": 0.032916001975536346, + "learning_rate": 0.00017296187536370355, + "loss": 0.9596, + "step": 644 + }, + { + "epoch": 0.7286077379271392, + "grad_norm": 0.031346168369054794, + "learning_rate": 0.00017288059930057166, + "loss": 0.9729, + "step": 645 + }, + { + "epoch": 0.7297373623270262, + "grad_norm": 0.032094355672597885, + "learning_rate": 0.00017279922042154092, + "loss": 1.0331, + "step": 646 + }, + { + "epoch": 0.7308669867269133, + "grad_norm": 0.03289850801229477, + "learning_rate": 0.00017271773884141607, + "loss": 1.0411, + "step": 647 + }, + { + "epoch": 0.7319966111268004, + "grad_norm": 0.03297988697886467, + "learning_rate": 0.0001726361546751468, + "loss": 0.9847, + "step": 648 + }, + { + "epoch": 0.7331262355266874, + "grad_norm": 0.03368350863456726, + "learning_rate": 0.00017255446803782754, + "loss": 0.9978, + "step": 649 + }, + { + "epoch": 0.7342558599265744, + "grad_norm": 0.03280177712440491, + "learning_rate": 0.00017247267904469725, + "loss": 1.0363, + "step": 650 + }, + { + "epoch": 0.7353854843264614, + "grad_norm": 0.031202217563986778, + "learning_rate": 0.00017239078781113926, + "loss": 1.025, + "step": 651 + }, + { + "epoch": 0.7365151087263485, + "grad_norm": 0.03219619765877724, + "learning_rate": 0.00017230879445268124, + "loss": 0.9878, + "step": 652 + }, + { + "epoch": 0.7376447331262356, + "grad_norm": 0.033005617558956146, + "learning_rate": 0.00017222669908499482, + "loss": 1.0223, + "step": 653 + }, + { + "epoch": 0.7387743575261225, + "grad_norm": 0.03350326791405678, + "learning_rate": 0.00017214450182389559, + "loss": 1.0173, + "step": 654 + }, + { + "epoch": 0.7399039819260096, + "grad_norm": 0.031389541923999786, + "learning_rate": 0.00017206220278534286, + "loss": 1.0458, + "step": 655 + }, + { + "epoch": 0.7410336063258967, + "grad_norm": 0.031541019678115845, + "learning_rate": 0.00017197980208543954, + "loss": 0.9489, + "step": 656 + }, + { + "epoch": 0.7421632307257837, + "grad_norm": 0.03202977776527405, + "learning_rate": 0.00017189729984043204, + "loss": 1.0364, + "step": 657 + }, + { + "epoch": 0.7432928551256707, + "grad_norm": 0.03152487054467201, + "learning_rate": 0.00017181469616670984, + "loss": 0.9827, + "step": 658 + }, + { + "epoch": 0.7444224795255577, + "grad_norm": 0.03225429356098175, + "learning_rate": 0.00017173199118080564, + "loss": 1.0996, + "step": 659 + }, + { + "epoch": 0.7455521039254448, + "grad_norm": 0.03217494860291481, + "learning_rate": 0.00017164918499939504, + "loss": 0.9355, + "step": 660 + }, + { + "epoch": 0.7466817283253319, + "grad_norm": 0.032104648649692535, + "learning_rate": 0.00017156627773929644, + "loss": 1.0552, + "step": 661 + }, + { + "epoch": 0.7478113527252188, + "grad_norm": 0.03186746686697006, + "learning_rate": 0.0001714832695174707, + "loss": 1.071, + "step": 662 + }, + { + "epoch": 0.7489409771251059, + "grad_norm": 0.03182530775666237, + "learning_rate": 0.00017140016045102133, + "loss": 1.0688, + "step": 663 + }, + { + "epoch": 0.7500706015249929, + "grad_norm": 0.03153397887945175, + "learning_rate": 0.00017131695065719386, + "loss": 0.9624, + "step": 664 + }, + { + "epoch": 0.75120022592488, + "grad_norm": 0.03226126730442047, + "learning_rate": 0.0001712336402533761, + "loss": 1.1134, + "step": 665 + }, + { + "epoch": 0.7523298503247671, + "grad_norm": 0.031511638313531876, + "learning_rate": 0.00017115022935709778, + "loss": 1.0753, + "step": 666 + }, + { + "epoch": 0.753459474724654, + "grad_norm": 0.03331499546766281, + "learning_rate": 0.00017106671808603027, + "loss": 0.9709, + "step": 667 + }, + { + "epoch": 0.7545890991245411, + "grad_norm": 0.032829850912094116, + "learning_rate": 0.0001709831065579867, + "loss": 0.9839, + "step": 668 + }, + { + "epoch": 0.7557187235244281, + "grad_norm": 0.032828208059072495, + "learning_rate": 0.00017089939489092152, + "loss": 0.9924, + "step": 669 + }, + { + "epoch": 0.7568483479243152, + "grad_norm": 0.0320126973092556, + "learning_rate": 0.00017081558320293055, + "loss": 0.9649, + "step": 670 + }, + { + "epoch": 0.7579779723242022, + "grad_norm": 0.03252957761287689, + "learning_rate": 0.0001707316716122506, + "loss": 0.9643, + "step": 671 + }, + { + "epoch": 0.7591075967240892, + "grad_norm": 0.032323576509952545, + "learning_rate": 0.00017064766023725948, + "loss": 0.9962, + "step": 672 + }, + { + "epoch": 0.7602372211239763, + "grad_norm": 0.03305547684431076, + "learning_rate": 0.00017056354919647583, + "loss": 1.0864, + "step": 673 + }, + { + "epoch": 0.7613668455238634, + "grad_norm": 0.0321505106985569, + "learning_rate": 0.0001704793386085588, + "loss": 1.01, + "step": 674 + }, + { + "epoch": 0.7624964699237503, + "grad_norm": 0.03243474289774895, + "learning_rate": 0.000170395028592308, + "loss": 1.1192, + "step": 675 + }, + { + "epoch": 0.7636260943236374, + "grad_norm": 0.03369235247373581, + "learning_rate": 0.00017031061926666333, + "loss": 0.9846, + "step": 676 + }, + { + "epoch": 0.7647557187235244, + "grad_norm": 0.03172389790415764, + "learning_rate": 0.00017022611075070474, + "loss": 1.0406, + "step": 677 + }, + { + "epoch": 0.7658853431234115, + "grad_norm": 0.03241589665412903, + "learning_rate": 0.00017014150316365216, + "loss": 0.9235, + "step": 678 + }, + { + "epoch": 0.7670149675232985, + "grad_norm": 0.03271762281656265, + "learning_rate": 0.0001700567966248653, + "loss": 0.9516, + "step": 679 + }, + { + "epoch": 0.7681445919231855, + "grad_norm": 0.032931018620729446, + "learning_rate": 0.00016997199125384343, + "loss": 1.0315, + "step": 680 + }, + { + "epoch": 0.7692742163230726, + "grad_norm": 0.032814498990774155, + "learning_rate": 0.00016988708717022522, + "loss": 0.941, + "step": 681 + }, + { + "epoch": 0.7704038407229596, + "grad_norm": 0.031445201486349106, + "learning_rate": 0.00016980208449378866, + "loss": 1.0588, + "step": 682 + }, + { + "epoch": 0.7715334651228467, + "grad_norm": 0.033274564892053604, + "learning_rate": 0.0001697169833444508, + "loss": 0.9968, + "step": 683 + }, + { + "epoch": 0.7726630895227337, + "grad_norm": 0.03313668072223663, + "learning_rate": 0.00016963178384226763, + "loss": 1.0308, + "step": 684 + }, + { + "epoch": 0.7737927139226207, + "grad_norm": 0.032828278839588165, + "learning_rate": 0.00016954648610743384, + "loss": 1.0245, + "step": 685 + }, + { + "epoch": 0.7749223383225078, + "grad_norm": 0.03268923610448837, + "learning_rate": 0.00016946109026028274, + "loss": 1.0515, + "step": 686 + }, + { + "epoch": 0.7760519627223949, + "grad_norm": 0.03162987902760506, + "learning_rate": 0.00016937559642128604, + "loss": 0.9649, + "step": 687 + }, + { + "epoch": 0.7771815871222818, + "grad_norm": 0.03206837549805641, + "learning_rate": 0.0001692900047110537, + "loss": 1.0174, + "step": 688 + }, + { + "epoch": 0.7783112115221689, + "grad_norm": 0.03194599226117134, + "learning_rate": 0.0001692043152503338, + "loss": 0.9872, + "step": 689 + }, + { + "epoch": 0.7794408359220559, + "grad_norm": 0.032261595129966736, + "learning_rate": 0.0001691185281600122, + "loss": 1.0046, + "step": 690 + }, + { + "epoch": 0.780570460321943, + "grad_norm": 0.032003022730350494, + "learning_rate": 0.00016903264356111258, + "loss": 1.0223, + "step": 691 + }, + { + "epoch": 0.78170008472183, + "grad_norm": 0.03204648569226265, + "learning_rate": 0.00016894666157479614, + "loss": 0.9402, + "step": 692 + }, + { + "epoch": 0.782829709121717, + "grad_norm": 0.03307194262742996, + "learning_rate": 0.00016886058232236156, + "loss": 0.977, + "step": 693 + }, + { + "epoch": 0.7839593335216041, + "grad_norm": 0.03305744007229805, + "learning_rate": 0.00016877440592524457, + "loss": 1.0158, + "step": 694 + }, + { + "epoch": 0.7850889579214911, + "grad_norm": 0.03376347944140434, + "learning_rate": 0.0001686881325050181, + "loss": 0.9266, + "step": 695 + }, + { + "epoch": 0.7862185823213781, + "grad_norm": 0.031977638602256775, + "learning_rate": 0.0001686017621833919, + "loss": 0.9966, + "step": 696 + }, + { + "epoch": 0.7873482067212652, + "grad_norm": 0.02983999252319336, + "learning_rate": 0.00016851529508221235, + "loss": 1.0418, + "step": 697 + }, + { + "epoch": 0.7884778311211522, + "grad_norm": 0.032275013625621796, + "learning_rate": 0.00016842873132346252, + "loss": 1.0745, + "step": 698 + }, + { + "epoch": 0.7896074555210393, + "grad_norm": 0.031730227172374725, + "learning_rate": 0.0001683420710292617, + "loss": 1.0505, + "step": 699 + }, + { + "epoch": 0.7907370799209263, + "grad_norm": 0.03272555023431778, + "learning_rate": 0.00016825531432186543, + "loss": 0.9826, + "step": 700 + }, + { + "epoch": 0.7918667043208133, + "grad_norm": 0.03270925581455231, + "learning_rate": 0.00016816846132366523, + "loss": 0.9954, + "step": 701 + }, + { + "epoch": 0.7929963287207004, + "grad_norm": 0.03216206654906273, + "learning_rate": 0.00016808151215718853, + "loss": 1.0266, + "step": 702 + }, + { + "epoch": 0.7941259531205874, + "grad_norm": 0.035289812833070755, + "learning_rate": 0.00016799446694509834, + "loss": 0.9776, + "step": 703 + }, + { + "epoch": 0.7952555775204745, + "grad_norm": 0.03199274092912674, + "learning_rate": 0.00016790732581019321, + "loss": 1.1088, + "step": 704 + }, + { + "epoch": 0.7963852019203614, + "grad_norm": 0.032748714089393616, + "learning_rate": 0.00016782008887540704, + "loss": 1.0957, + "step": 705 + }, + { + "epoch": 0.7975148263202485, + "grad_norm": 0.03328438848257065, + "learning_rate": 0.00016773275626380882, + "loss": 1.033, + "step": 706 + }, + { + "epoch": 0.7986444507201356, + "grad_norm": 0.031454868614673615, + "learning_rate": 0.00016764532809860255, + "loss": 0.9854, + "step": 707 + }, + { + "epoch": 0.7997740751200226, + "grad_norm": 0.032634053379297256, + "learning_rate": 0.00016755780450312705, + "loss": 0.9914, + "step": 708 + }, + { + "epoch": 0.8009036995199096, + "grad_norm": 0.03341520577669144, + "learning_rate": 0.00016747018560085572, + "loss": 0.9696, + "step": 709 + }, + { + "epoch": 0.8020333239197966, + "grad_norm": 0.03382014483213425, + "learning_rate": 0.00016738247151539643, + "loss": 1.0987, + "step": 710 + }, + { + "epoch": 0.8031629483196837, + "grad_norm": 0.03288474678993225, + "learning_rate": 0.00016729466237049137, + "loss": 1.0378, + "step": 711 + }, + { + "epoch": 0.8042925727195708, + "grad_norm": 0.03332820534706116, + "learning_rate": 0.00016720675829001675, + "loss": 1.0544, + "step": 712 + }, + { + "epoch": 0.8054221971194577, + "grad_norm": 0.033210329711437225, + "learning_rate": 0.0001671187593979828, + "loss": 0.9805, + "step": 713 + }, + { + "epoch": 0.8065518215193448, + "grad_norm": 0.03217016160488129, + "learning_rate": 0.00016703066581853345, + "loss": 1.0576, + "step": 714 + }, + { + "epoch": 0.8076814459192319, + "grad_norm": 0.03476932644844055, + "learning_rate": 0.00016694247767594624, + "loss": 1.0224, + "step": 715 + }, + { + "epoch": 0.8088110703191189, + "grad_norm": 0.03346959874033928, + "learning_rate": 0.00016685419509463213, + "loss": 1.0332, + "step": 716 + }, + { + "epoch": 0.809940694719006, + "grad_norm": 0.03398541361093521, + "learning_rate": 0.00016676581819913516, + "loss": 0.8649, + "step": 717 + }, + { + "epoch": 0.8110703191188929, + "grad_norm": 0.033196430653333664, + "learning_rate": 0.0001666773471141327, + "loss": 0.9434, + "step": 718 + }, + { + "epoch": 0.81219994351878, + "grad_norm": 0.03290561959147453, + "learning_rate": 0.00016658878196443476, + "loss": 0.9993, + "step": 719 + }, + { + "epoch": 0.8133295679186671, + "grad_norm": 0.03241690620779991, + "learning_rate": 0.00016650012287498412, + "loss": 1.0136, + "step": 720 + }, + { + "epoch": 0.8144591923185541, + "grad_norm": 0.03312429040670395, + "learning_rate": 0.00016641136997085608, + "loss": 1.0292, + "step": 721 + }, + { + "epoch": 0.8155888167184411, + "grad_norm": 0.03105839341878891, + "learning_rate": 0.0001663225233772584, + "loss": 0.9277, + "step": 722 + }, + { + "epoch": 0.8167184411183281, + "grad_norm": 0.0325755774974823, + "learning_rate": 0.00016623358321953078, + "loss": 1.0722, + "step": 723 + }, + { + "epoch": 0.8178480655182152, + "grad_norm": 0.033952005207538605, + "learning_rate": 0.00016614454962314516, + "loss": 1.0253, + "step": 724 + }, + { + "epoch": 0.8189776899181023, + "grad_norm": 0.0334470197558403, + "learning_rate": 0.00016605542271370513, + "loss": 1.0267, + "step": 725 + }, + { + "epoch": 0.8201073143179892, + "grad_norm": 0.03237008675932884, + "learning_rate": 0.00016596620261694604, + "loss": 1.0669, + "step": 726 + }, + { + "epoch": 0.8212369387178763, + "grad_norm": 0.03195658326148987, + "learning_rate": 0.00016587688945873458, + "loss": 0.9879, + "step": 727 + }, + { + "epoch": 0.8223665631177633, + "grad_norm": 0.03366916999220848, + "learning_rate": 0.0001657874833650688, + "loss": 0.9801, + "step": 728 + }, + { + "epoch": 0.8234961875176504, + "grad_norm": 0.03287327662110329, + "learning_rate": 0.0001656979844620779, + "loss": 0.9283, + "step": 729 + }, + { + "epoch": 0.8246258119175374, + "grad_norm": 0.03366275876760483, + "learning_rate": 0.00016560839287602192, + "loss": 1.0678, + "step": 730 + }, + { + "epoch": 0.8257554363174244, + "grad_norm": 0.03446445241570473, + "learning_rate": 0.00016551870873329167, + "loss": 0.9899, + "step": 731 + }, + { + "epoch": 0.8268850607173115, + "grad_norm": 0.03468972072005272, + "learning_rate": 0.0001654289321604086, + "loss": 1.0614, + "step": 732 + }, + { + "epoch": 0.8280146851171986, + "grad_norm": 0.03443734720349312, + "learning_rate": 0.00016533906328402448, + "loss": 1.0321, + "step": 733 + }, + { + "epoch": 0.8291443095170856, + "grad_norm": 0.03276367112994194, + "learning_rate": 0.0001652491022309213, + "loss": 0.9848, + "step": 734 + }, + { + "epoch": 0.8302739339169726, + "grad_norm": 0.03289159759879112, + "learning_rate": 0.00016515904912801118, + "loss": 1.0121, + "step": 735 + }, + { + "epoch": 0.8314035583168596, + "grad_norm": 0.034025318920612335, + "learning_rate": 0.000165068904102336, + "loss": 1.0589, + "step": 736 + }, + { + "epoch": 0.8325331827167467, + "grad_norm": 0.03421149030327797, + "learning_rate": 0.00016497866728106735, + "loss": 1.0138, + "step": 737 + }, + { + "epoch": 0.8336628071166338, + "grad_norm": 0.03334156796336174, + "learning_rate": 0.0001648883387915063, + "loss": 1.0337, + "step": 738 + }, + { + "epoch": 0.8347924315165207, + "grad_norm": 0.03213927149772644, + "learning_rate": 0.0001647979187610833, + "loss": 1.0248, + "step": 739 + }, + { + "epoch": 0.8359220559164078, + "grad_norm": 0.03407248482108116, + "learning_rate": 0.00016470740731735787, + "loss": 0.9995, + "step": 740 + }, + { + "epoch": 0.8370516803162948, + "grad_norm": 0.03234965354204178, + "learning_rate": 0.00016461680458801858, + "loss": 1.0526, + "step": 741 + }, + { + "epoch": 0.8381813047161819, + "grad_norm": 0.03325793519616127, + "learning_rate": 0.0001645261107008827, + "loss": 0.9461, + "step": 742 + }, + { + "epoch": 0.8393109291160689, + "grad_norm": 0.034206606447696686, + "learning_rate": 0.00016443532578389606, + "loss": 0.9095, + "step": 743 + }, + { + "epoch": 0.8404405535159559, + "grad_norm": 0.03346103057265282, + "learning_rate": 0.00016434444996513305, + "loss": 1.0337, + "step": 744 + }, + { + "epoch": 0.841570177915843, + "grad_norm": 0.03360540792346001, + "learning_rate": 0.0001642534833727962, + "loss": 0.9532, + "step": 745 + }, + { + "epoch": 0.84269980231573, + "grad_norm": 0.03263968229293823, + "learning_rate": 0.0001641624261352161, + "loss": 1.0579, + "step": 746 + }, + { + "epoch": 0.843829426715617, + "grad_norm": 0.033077508211135864, + "learning_rate": 0.0001640712783808513, + "loss": 0.9993, + "step": 747 + }, + { + "epoch": 0.8449590511155041, + "grad_norm": 0.03186168894171715, + "learning_rate": 0.00016398004023828797, + "loss": 0.9576, + "step": 748 + }, + { + "epoch": 0.8460886755153911, + "grad_norm": 0.032343216240406036, + "learning_rate": 0.00016388871183623977, + "loss": 1.0693, + "step": 749 + }, + { + "epoch": 0.8472182999152782, + "grad_norm": 0.03365077078342438, + "learning_rate": 0.00016379729330354774, + "loss": 0.9867, + "step": 750 + }, + { + "epoch": 0.8483479243151653, + "grad_norm": 0.03302355110645294, + "learning_rate": 0.00016370578476918008, + "loss": 1.002, + "step": 751 + }, + { + "epoch": 0.8494775487150522, + "grad_norm": 0.034719739109277725, + "learning_rate": 0.00016361418636223198, + "loss": 0.9621, + "step": 752 + }, + { + "epoch": 0.8506071731149393, + "grad_norm": 0.03225456923246384, + "learning_rate": 0.0001635224982119253, + "loss": 1.0285, + "step": 753 + }, + { + "epoch": 0.8517367975148263, + "grad_norm": 0.03379584476351738, + "learning_rate": 0.0001634307204476087, + "loss": 1.0787, + "step": 754 + }, + { + "epoch": 0.8528664219147134, + "grad_norm": 0.03374066203832626, + "learning_rate": 0.00016333885319875702, + "loss": 1.0322, + "step": 755 + }, + { + "epoch": 0.8539960463146004, + "grad_norm": 0.03438407927751541, + "learning_rate": 0.00016324689659497155, + "loss": 1.0204, + "step": 756 + }, + { + "epoch": 0.8551256707144874, + "grad_norm": 0.03300711140036583, + "learning_rate": 0.00016315485076597957, + "loss": 1.0088, + "step": 757 + }, + { + "epoch": 0.8562552951143745, + "grad_norm": 0.032439880073070526, + "learning_rate": 0.00016306271584163416, + "loss": 1.0198, + "step": 758 + }, + { + "epoch": 0.8573849195142615, + "grad_norm": 0.0341016985476017, + "learning_rate": 0.00016297049195191415, + "loss": 1.0242, + "step": 759 + }, + { + "epoch": 0.8585145439141485, + "grad_norm": 0.032917320728302, + "learning_rate": 0.00016287817922692395, + "loss": 1.0012, + "step": 760 + }, + { + "epoch": 0.8596441683140356, + "grad_norm": 0.03229722008109093, + "learning_rate": 0.00016278577779689314, + "loss": 0.9944, + "step": 761 + }, + { + "epoch": 0.8607737927139226, + "grad_norm": 0.0344838984310627, + "learning_rate": 0.0001626932877921766, + "loss": 0.9813, + "step": 762 + }, + { + "epoch": 0.8619034171138097, + "grad_norm": 0.033522870391607285, + "learning_rate": 0.00016260070934325402, + "loss": 1.0256, + "step": 763 + }, + { + "epoch": 0.8630330415136966, + "grad_norm": 0.03514671325683594, + "learning_rate": 0.00016250804258072997, + "loss": 0.9543, + "step": 764 + }, + { + "epoch": 0.8641626659135837, + "grad_norm": 0.03211130201816559, + "learning_rate": 0.00016241528763533353, + "loss": 1.0009, + "step": 765 + }, + { + "epoch": 0.8652922903134708, + "grad_norm": 0.033048368990421295, + "learning_rate": 0.00016232244463791826, + "loss": 1.0042, + "step": 766 + }, + { + "epoch": 0.8664219147133578, + "grad_norm": 0.031953100115060806, + "learning_rate": 0.00016222951371946192, + "loss": 1.0096, + "step": 767 + }, + { + "epoch": 0.8675515391132449, + "grad_norm": 0.03293442353606224, + "learning_rate": 0.00016213649501106622, + "loss": 0.9987, + "step": 768 + }, + { + "epoch": 0.8686811635131318, + "grad_norm": 0.033335424959659576, + "learning_rate": 0.00016204338864395684, + "loss": 1.0035, + "step": 769 + }, + { + "epoch": 0.8698107879130189, + "grad_norm": 0.04050195962190628, + "learning_rate": 0.00016195019474948299, + "loss": 1.0326, + "step": 770 + }, + { + "epoch": 0.870940412312906, + "grad_norm": 0.03311360627412796, + "learning_rate": 0.00016185691345911755, + "loss": 1.0184, + "step": 771 + }, + { + "epoch": 0.872070036712793, + "grad_norm": 0.03323720395565033, + "learning_rate": 0.0001617635449044565, + "loss": 0.9625, + "step": 772 + }, + { + "epoch": 0.87319966111268, + "grad_norm": 0.03422234579920769, + "learning_rate": 0.00016167008921721902, + "loss": 1.0654, + "step": 773 + }, + { + "epoch": 0.8743292855125671, + "grad_norm": 0.034163184463977814, + "learning_rate": 0.00016157654652924723, + "loss": 0.9953, + "step": 774 + }, + { + "epoch": 0.8754589099124541, + "grad_norm": 0.03320545703172684, + "learning_rate": 0.00016148291697250594, + "loss": 0.9766, + "step": 775 + }, + { + "epoch": 0.8765885343123412, + "grad_norm": 0.03346817195415497, + "learning_rate": 0.0001613892006790825, + "loss": 0.9201, + "step": 776 + }, + { + "epoch": 0.8777181587122281, + "grad_norm": 0.03284529596567154, + "learning_rate": 0.00016129539778118667, + "loss": 0.9284, + "step": 777 + }, + { + "epoch": 0.8788477831121152, + "grad_norm": 0.032990384846925735, + "learning_rate": 0.00016120150841115037, + "loss": 1.0058, + "step": 778 + }, + { + "epoch": 0.8799774075120023, + "grad_norm": 0.03396923094987869, + "learning_rate": 0.0001611075327014275, + "loss": 1.0831, + "step": 779 + }, + { + "epoch": 0.8811070319118893, + "grad_norm": 0.03224315121769905, + "learning_rate": 0.00016101347078459373, + "loss": 0.9318, + "step": 780 + }, + { + "epoch": 0.8822366563117763, + "grad_norm": 0.031812380999326706, + "learning_rate": 0.00016091932279334645, + "loss": 1.0566, + "step": 781 + }, + { + "epoch": 0.8833662807116633, + "grad_norm": 0.033773597329854965, + "learning_rate": 0.00016082508886050437, + "loss": 0.9349, + "step": 782 + }, + { + "epoch": 0.8844959051115504, + "grad_norm": 0.03375870734453201, + "learning_rate": 0.00016073076911900754, + "loss": 0.9875, + "step": 783 + }, + { + "epoch": 0.8856255295114375, + "grad_norm": 0.03443336486816406, + "learning_rate": 0.00016063636370191692, + "loss": 1.0604, + "step": 784 + }, + { + "epoch": 0.8867551539113245, + "grad_norm": 0.032187797129154205, + "learning_rate": 0.0001605418727424145, + "loss": 0.986, + "step": 785 + }, + { + "epoch": 0.8878847783112115, + "grad_norm": 0.03341427072882652, + "learning_rate": 0.00016044729637380284, + "loss": 0.9184, + "step": 786 + }, + { + "epoch": 0.8890144027110986, + "grad_norm": 0.03245866298675537, + "learning_rate": 0.000160352634729505, + "loss": 1.1511, + "step": 787 + }, + { + "epoch": 0.8901440271109856, + "grad_norm": 0.032569848001003265, + "learning_rate": 0.00016025788794306442, + "loss": 1.0948, + "step": 788 + }, + { + "epoch": 0.8912736515108727, + "grad_norm": 0.03429558128118515, + "learning_rate": 0.0001601630561481446, + "loss": 0.9379, + "step": 789 + }, + { + "epoch": 0.8924032759107596, + "grad_norm": 0.033720601350069046, + "learning_rate": 0.00016006813947852893, + "loss": 0.9845, + "step": 790 + }, + { + "epoch": 0.8935329003106467, + "grad_norm": 0.033525846898555756, + "learning_rate": 0.00015997313806812057, + "loss": 1.0279, + "step": 791 + }, + { + "epoch": 0.8946625247105338, + "grad_norm": 0.03577594459056854, + "learning_rate": 0.00015987805205094227, + "loss": 0.9654, + "step": 792 + }, + { + "epoch": 0.8957921491104208, + "grad_norm": 0.03572090342640877, + "learning_rate": 0.00015978288156113604, + "loss": 1.0292, + "step": 793 + }, + { + "epoch": 0.8969217735103078, + "grad_norm": 0.0330742709338665, + "learning_rate": 0.00015968762673296318, + "loss": 1.0898, + "step": 794 + }, + { + "epoch": 0.8980513979101948, + "grad_norm": 0.03374762088060379, + "learning_rate": 0.0001595922877008039, + "loss": 1.0368, + "step": 795 + }, + { + "epoch": 0.8991810223100819, + "grad_norm": 0.035000476986169815, + "learning_rate": 0.00015949686459915715, + "loss": 1.0531, + "step": 796 + }, + { + "epoch": 0.900310646709969, + "grad_norm": 0.03325015306472778, + "learning_rate": 0.00015940135756264062, + "loss": 1.0199, + "step": 797 + }, + { + "epoch": 0.901440271109856, + "grad_norm": 0.03547768294811249, + "learning_rate": 0.0001593057667259902, + "loss": 0.9988, + "step": 798 + }, + { + "epoch": 0.902569895509743, + "grad_norm": 0.03294992819428444, + "learning_rate": 0.0001592100922240603, + "loss": 0.9943, + "step": 799 + }, + { + "epoch": 0.90369951990963, + "grad_norm": 0.03369821235537529, + "learning_rate": 0.00015911433419182305, + "loss": 1.0186, + "step": 800 + }, + { + "epoch": 0.9048291443095171, + "grad_norm": 0.03317281976342201, + "learning_rate": 0.00015901849276436862, + "loss": 0.9601, + "step": 801 + }, + { + "epoch": 0.9059587687094042, + "grad_norm": 0.036661259829998016, + "learning_rate": 0.00015892256807690478, + "loss": 1.0847, + "step": 802 + }, + { + "epoch": 0.9070883931092911, + "grad_norm": 0.0334974080324173, + "learning_rate": 0.00015882656026475672, + "loss": 1.0264, + "step": 803 + }, + { + "epoch": 0.9082180175091782, + "grad_norm": 0.03364727646112442, + "learning_rate": 0.00015873046946336694, + "loss": 0.9768, + "step": 804 + }, + { + "epoch": 0.9093476419090653, + "grad_norm": 0.03534623235464096, + "learning_rate": 0.000158634295808295, + "loss": 1.0705, + "step": 805 + }, + { + "epoch": 0.9104772663089523, + "grad_norm": 0.032764844596385956, + "learning_rate": 0.00015853803943521733, + "loss": 0.9543, + "step": 806 + }, + { + "epoch": 0.9116068907088393, + "grad_norm": 0.03310185670852661, + "learning_rate": 0.00015844170047992712, + "loss": 1.0077, + "step": 807 + }, + { + "epoch": 0.9127365151087263, + "grad_norm": 0.0327795036137104, + "learning_rate": 0.00015834527907833396, + "loss": 0.9765, + "step": 808 + }, + { + "epoch": 0.9138661395086134, + "grad_norm": 0.03351445123553276, + "learning_rate": 0.00015824877536646382, + "loss": 1.0634, + "step": 809 + }, + { + "epoch": 0.9149957639085005, + "grad_norm": 0.03497536852955818, + "learning_rate": 0.00015815218948045878, + "loss": 0.9211, + "step": 810 + }, + { + "epoch": 0.9161253883083874, + "grad_norm": 0.03262564167380333, + "learning_rate": 0.00015805552155657683, + "loss": 0.9841, + "step": 811 + }, + { + "epoch": 0.9172550127082745, + "grad_norm": 0.03305838629603386, + "learning_rate": 0.00015795877173119176, + "loss": 0.9968, + "step": 812 + }, + { + "epoch": 0.9183846371081615, + "grad_norm": 0.03393985703587532, + "learning_rate": 0.00015786194014079274, + "loss": 1.0257, + "step": 813 + }, + { + "epoch": 0.9195142615080486, + "grad_norm": 0.03377285972237587, + "learning_rate": 0.00015776502692198448, + "loss": 0.979, + "step": 814 + }, + { + "epoch": 0.9206438859079357, + "grad_norm": 0.03390325978398323, + "learning_rate": 0.00015766803221148673, + "loss": 1.0935, + "step": 815 + }, + { + "epoch": 0.9217735103078226, + "grad_norm": 0.034586288034915924, + "learning_rate": 0.00015757095614613427, + "loss": 1.0286, + "step": 816 + }, + { + "epoch": 0.9229031347077097, + "grad_norm": 0.034462425857782364, + "learning_rate": 0.00015747379886287655, + "loss": 0.9826, + "step": 817 + }, + { + "epoch": 0.9240327591075967, + "grad_norm": 0.03412788733839989, + "learning_rate": 0.0001573765604987777, + "loss": 1.0391, + "step": 818 + }, + { + "epoch": 0.9251623835074838, + "grad_norm": 0.03411950543522835, + "learning_rate": 0.0001572792411910162, + "loss": 1.014, + "step": 819 + }, + { + "epoch": 0.9262920079073708, + "grad_norm": 0.03366335481405258, + "learning_rate": 0.0001571818410768848, + "loss": 1.0191, + "step": 820 + }, + { + "epoch": 0.9274216323072578, + "grad_norm": 0.033515483140945435, + "learning_rate": 0.00015708436029379004, + "loss": 1.0072, + "step": 821 + }, + { + "epoch": 0.9285512567071449, + "grad_norm": 0.033421795815229416, + "learning_rate": 0.0001569867989792525, + "loss": 1.0311, + "step": 822 + }, + { + "epoch": 0.929680881107032, + "grad_norm": 0.032961517572402954, + "learning_rate": 0.00015688915727090613, + "loss": 1.0476, + "step": 823 + }, + { + "epoch": 0.9308105055069189, + "grad_norm": 0.03382313251495361, + "learning_rate": 0.00015679143530649854, + "loss": 0.9863, + "step": 824 + }, + { + "epoch": 0.931940129906806, + "grad_norm": 0.03453601896762848, + "learning_rate": 0.0001566936332238904, + "loss": 0.981, + "step": 825 + }, + { + "epoch": 0.933069754306693, + "grad_norm": 0.03426108881831169, + "learning_rate": 0.00015659575116105544, + "loss": 1.0615, + "step": 826 + }, + { + "epoch": 0.9341993787065801, + "grad_norm": 0.03343765065073967, + "learning_rate": 0.0001564977892560803, + "loss": 1.0745, + "step": 827 + }, + { + "epoch": 0.935329003106467, + "grad_norm": 0.03495456650853157, + "learning_rate": 0.00015639974764716414, + "loss": 0.9985, + "step": 828 + }, + { + "epoch": 0.9364586275063541, + "grad_norm": 0.033679116517305374, + "learning_rate": 0.0001563016264726186, + "loss": 1.0216, + "step": 829 + }, + { + "epoch": 0.9375882519062412, + "grad_norm": 0.03362250700592995, + "learning_rate": 0.0001562034258708676, + "loss": 1.0337, + "step": 830 + }, + { + "epoch": 0.9387178763061282, + "grad_norm": 0.034377310425043106, + "learning_rate": 0.00015610514598044707, + "loss": 1.0583, + "step": 831 + }, + { + "epoch": 0.9398475007060153, + "grad_norm": 0.033647313714027405, + "learning_rate": 0.00015600678694000487, + "loss": 1.0126, + "step": 832 + }, + { + "epoch": 0.9409771251059023, + "grad_norm": 0.03457539901137352, + "learning_rate": 0.0001559083488883004, + "loss": 1.1528, + "step": 833 + }, + { + "epoch": 0.9421067495057893, + "grad_norm": 0.03426367789506912, + "learning_rate": 0.00015580983196420464, + "loss": 0.9055, + "step": 834 + }, + { + "epoch": 0.9432363739056764, + "grad_norm": 0.03347745165228844, + "learning_rate": 0.0001557112363066998, + "loss": 0.9978, + "step": 835 + }, + { + "epoch": 0.9443659983055634, + "grad_norm": 0.03355059772729874, + "learning_rate": 0.00015561256205487908, + "loss": 0.9844, + "step": 836 + }, + { + "epoch": 0.9454956227054504, + "grad_norm": 0.032837532460689545, + "learning_rate": 0.0001555138093479467, + "loss": 0.932, + "step": 837 + }, + { + "epoch": 0.9466252471053375, + "grad_norm": 0.03441225364804268, + "learning_rate": 0.0001554149783252175, + "loss": 0.9975, + "step": 838 + }, + { + "epoch": 0.9477548715052245, + "grad_norm": 0.033451907336711884, + "learning_rate": 0.00015531606912611674, + "loss": 0.9707, + "step": 839 + }, + { + "epoch": 0.9488844959051116, + "grad_norm": 0.03538847342133522, + "learning_rate": 0.00015521708189018005, + "loss": 1.0129, + "step": 840 + }, + { + "epoch": 0.9500141203049985, + "grad_norm": 0.033600080758333206, + "learning_rate": 0.00015511801675705312, + "loss": 1.0403, + "step": 841 + }, + { + "epoch": 0.9511437447048856, + "grad_norm": 0.03426308557391167, + "learning_rate": 0.00015501887386649155, + "loss": 0.9879, + "step": 842 + }, + { + "epoch": 0.9522733691047727, + "grad_norm": 0.033120229840278625, + "learning_rate": 0.00015491965335836055, + "loss": 1.0627, + "step": 843 + }, + { + "epoch": 0.9534029935046597, + "grad_norm": 0.0343567430973053, + "learning_rate": 0.00015482035537263498, + "loss": 1.0308, + "step": 844 + }, + { + "epoch": 0.9545326179045467, + "grad_norm": 0.033301327377557755, + "learning_rate": 0.00015472098004939888, + "loss": 1.0106, + "step": 845 + }, + { + "epoch": 0.9556622423044338, + "grad_norm": 0.03342900052666664, + "learning_rate": 0.00015462152752884544, + "loss": 1.0261, + "step": 846 + }, + { + "epoch": 0.9567918667043208, + "grad_norm": 0.032714009284973145, + "learning_rate": 0.00015452199795127678, + "loss": 0.8953, + "step": 847 + }, + { + "epoch": 0.9579214911042079, + "grad_norm": 0.0333135612308979, + "learning_rate": 0.00015442239145710364, + "loss": 1.0105, + "step": 848 + }, + { + "epoch": 0.9590511155040949, + "grad_norm": 0.03534407541155815, + "learning_rate": 0.00015432270818684532, + "loss": 0.9325, + "step": 849 + }, + { + "epoch": 0.9601807399039819, + "grad_norm": 0.03319082036614418, + "learning_rate": 0.00015422294828112954, + "loss": 0.9187, + "step": 850 + }, + { + "epoch": 0.961310364303869, + "grad_norm": 0.03402223438024521, + "learning_rate": 0.00015412311188069193, + "loss": 0.9523, + "step": 851 + }, + { + "epoch": 0.962439988703756, + "grad_norm": 0.038419678807258606, + "learning_rate": 0.00015402319912637613, + "loss": 1.0135, + "step": 852 + }, + { + "epoch": 0.9635696131036431, + "grad_norm": 0.03462392836809158, + "learning_rate": 0.00015392321015913357, + "loss": 1.0811, + "step": 853 + }, + { + "epoch": 0.96469923750353, + "grad_norm": 0.033567875623703, + "learning_rate": 0.0001538231451200231, + "loss": 1.0052, + "step": 854 + }, + { + "epoch": 0.9658288619034171, + "grad_norm": 0.03398734703660011, + "learning_rate": 0.00015372300415021091, + "loss": 0.9939, + "step": 855 + }, + { + "epoch": 0.9669584863033042, + "grad_norm": 0.03315124288201332, + "learning_rate": 0.00015362278739097026, + "loss": 1.0515, + "step": 856 + }, + { + "epoch": 0.9680881107031912, + "grad_norm": 0.03387816995382309, + "learning_rate": 0.0001535224949836815, + "loss": 1.0906, + "step": 857 + }, + { + "epoch": 0.9692177351030782, + "grad_norm": 0.033208638429641724, + "learning_rate": 0.00015342212706983153, + "loss": 0.9542, + "step": 858 + }, + { + "epoch": 0.9703473595029652, + "grad_norm": 0.0338163860142231, + "learning_rate": 0.00015332168379101377, + "loss": 0.9892, + "step": 859 + }, + { + "epoch": 0.9714769839028523, + "grad_norm": 0.033496033400297165, + "learning_rate": 0.00015322116528892807, + "loss": 1.0253, + "step": 860 + }, + { + "epoch": 0.9726066083027394, + "grad_norm": 0.034597545862197876, + "learning_rate": 0.00015312057170538035, + "loss": 1.0102, + "step": 861 + }, + { + "epoch": 0.9737362327026263, + "grad_norm": 0.03476065397262573, + "learning_rate": 0.00015301990318228244, + "loss": 0.938, + "step": 862 + }, + { + "epoch": 0.9748658571025134, + "grad_norm": 0.036271460354328156, + "learning_rate": 0.00015291915986165186, + "loss": 0.9072, + "step": 863 + }, + { + "epoch": 0.9759954815024005, + "grad_norm": 0.032739460468292236, + "learning_rate": 0.00015281834188561174, + "loss": 0.9955, + "step": 864 + }, + { + "epoch": 0.9771251059022875, + "grad_norm": 0.03603595495223999, + "learning_rate": 0.0001527174493963905, + "loss": 0.978, + "step": 865 + }, + { + "epoch": 0.9782547303021746, + "grad_norm": 0.03469686582684517, + "learning_rate": 0.00015261648253632156, + "loss": 1.0928, + "step": 866 + }, + { + "epoch": 0.9793843547020615, + "grad_norm": 0.03487220034003258, + "learning_rate": 0.0001525154414478434, + "loss": 1.0144, + "step": 867 + }, + { + "epoch": 0.9805139791019486, + "grad_norm": 0.03308931365609169, + "learning_rate": 0.00015241432627349918, + "loss": 0.9912, + "step": 868 + }, + { + "epoch": 0.9816436035018357, + "grad_norm": 0.0350349023938179, + "learning_rate": 0.00015231313715593662, + "loss": 1.0209, + "step": 869 + }, + { + "epoch": 0.9827732279017227, + "grad_norm": 0.034897249191999435, + "learning_rate": 0.0001522118742379076, + "loss": 0.9873, + "step": 870 + }, + { + "epoch": 0.9839028523016097, + "grad_norm": 0.03427942842245102, + "learning_rate": 0.00015211053766226828, + "loss": 0.9497, + "step": 871 + }, + { + "epoch": 0.9850324767014967, + "grad_norm": 0.0339798741042614, + "learning_rate": 0.00015200912757197868, + "loss": 0.9741, + "step": 872 + }, + { + "epoch": 0.9861621011013838, + "grad_norm": 0.03557536378502846, + "learning_rate": 0.00015190764411010247, + "loss": 0.9747, + "step": 873 + }, + { + "epoch": 0.9872917255012709, + "grad_norm": 0.036786146461963654, + "learning_rate": 0.00015180608741980692, + "loss": 1.0296, + "step": 874 + }, + { + "epoch": 0.9884213499011578, + "grad_norm": 0.03306087478995323, + "learning_rate": 0.00015170445764436252, + "loss": 1.0559, + "step": 875 + }, + { + "epoch": 0.9895509743010449, + "grad_norm": 0.03436678647994995, + "learning_rate": 0.00015160275492714296, + "loss": 0.9572, + "step": 876 + }, + { + "epoch": 0.990680598700932, + "grad_norm": 0.03426647186279297, + "learning_rate": 0.00015150097941162474, + "loss": 0.999, + "step": 877 + }, + { + "epoch": 0.991810223100819, + "grad_norm": 0.03366367891430855, + "learning_rate": 0.00015139913124138715, + "loss": 1.0365, + "step": 878 + }, + { + "epoch": 0.992939847500706, + "grad_norm": 0.034058500081300735, + "learning_rate": 0.00015129721056011185, + "loss": 0.9835, + "step": 879 + }, + { + "epoch": 0.994069471900593, + "grad_norm": 0.03479884937405586, + "learning_rate": 0.00015119521751158296, + "loss": 1.0604, + "step": 880 + }, + { + "epoch": 0.9951990963004801, + "grad_norm": 0.03426951542496681, + "learning_rate": 0.00015109315223968655, + "loss": 1.0344, + "step": 881 + }, + { + "epoch": 0.9963287207003672, + "grad_norm": 0.034726936370134354, + "learning_rate": 0.0001509910148884106, + "loss": 0.927, + "step": 882 + }, + { + "epoch": 0.9974583451002542, + "grad_norm": 0.03522869199514389, + "learning_rate": 0.00015088880560184493, + "loss": 1.035, + "step": 883 + }, + { + "epoch": 0.9985879695001412, + "grad_norm": 0.03507549315690994, + "learning_rate": 0.00015078652452418063, + "loss": 0.952, + "step": 884 + }, + { + "epoch": 0.9997175939000282, + "grad_norm": 0.03401617333292961, + "learning_rate": 0.00015068417179971014, + "loss": 1.0006, + "step": 885 + }, + { + "epoch": 0.9997175939000282, + "eval_loss": 1.0020042657852173, + "eval_runtime": 552.3244, + "eval_samples_per_second": 17.712, + "eval_steps_per_second": 8.857, + "step": 885 + } + ], + "logging_steps": 1, + "max_steps": 2655, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 885, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0674035770156646e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}