{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 3415, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014654698662758747, "grad_norm": 3.7154194217547345, "learning_rate": 0.0, "loss": 1.1894, "num_tokens": 3198322.0, "step": 1 }, { "epoch": 0.0029309397325517493, "grad_norm": 3.8159096754478, "learning_rate": 3.883495145631068e-07, "loss": 1.1959, "num_tokens": 6267889.0, "step": 2 }, { "epoch": 0.004396409598827624, "grad_norm": 3.831445269987945, "learning_rate": 7.766990291262136e-07, "loss": 1.237, "num_tokens": 9183528.0, "step": 3 }, { "epoch": 0.005861879465103499, "grad_norm": 3.756428669522261, "learning_rate": 1.1650485436893206e-06, "loss": 1.2097, "num_tokens": 12005997.0, "step": 4 }, { "epoch": 0.007327349331379373, "grad_norm": 3.7049281074937817, "learning_rate": 1.5533980582524272e-06, "loss": 1.2168, "num_tokens": 14867401.0, "step": 5 }, { "epoch": 0.008792819197655248, "grad_norm": 3.2899890256573694, "learning_rate": 1.941747572815534e-06, "loss": 1.2075, "num_tokens": 17926912.0, "step": 6 }, { "epoch": 0.010258289063931123, "grad_norm": 3.0589726023363477, "learning_rate": 2.330097087378641e-06, "loss": 1.183, "num_tokens": 20972782.0, "step": 7 }, { "epoch": 0.011723758930206997, "grad_norm": 2.290087708190892, "learning_rate": 2.718446601941748e-06, "loss": 1.1912, "num_tokens": 23831942.0, "step": 8 }, { "epoch": 0.013189228796482872, "grad_norm": 2.205407335694658, "learning_rate": 3.1067961165048544e-06, "loss": 1.1797, "num_tokens": 26808849.0, "step": 9 }, { "epoch": 0.014654698662758746, "grad_norm": 2.1597756695335093, "learning_rate": 3.4951456310679615e-06, "loss": 1.1746, "num_tokens": 29800658.0, "step": 10 }, { "epoch": 0.016120168529034622, "grad_norm": 3.2808602516524483, "learning_rate": 3.883495145631068e-06, "loss": 1.1718, "num_tokens": 32709194.0, "step": 11 }, { "epoch": 0.017585638395310497, "grad_norm": 3.6239058889427964, "learning_rate": 4.271844660194175e-06, "loss": 1.1736, "num_tokens": 35774224.0, "step": 12 }, { "epoch": 0.01905110826158637, "grad_norm": 3.2224743488745724, "learning_rate": 4.660194174757282e-06, "loss": 1.1688, "num_tokens": 38705681.0, "step": 13 }, { "epoch": 0.020516578127862246, "grad_norm": 2.188551653370355, "learning_rate": 5.048543689320389e-06, "loss": 1.1255, "num_tokens": 41755116.0, "step": 14 }, { "epoch": 0.02198204799413812, "grad_norm": 2.455305133554404, "learning_rate": 5.436893203883496e-06, "loss": 1.1036, "num_tokens": 44857316.0, "step": 15 }, { "epoch": 0.023447517860413995, "grad_norm": 2.6182275949629936, "learning_rate": 5.825242718446602e-06, "loss": 1.1218, "num_tokens": 47585617.0, "step": 16 }, { "epoch": 0.02491298772668987, "grad_norm": 6.20879928353618, "learning_rate": 6.213592233009709e-06, "loss": 1.0892, "num_tokens": 50672319.0, "step": 17 }, { "epoch": 0.026378457592965743, "grad_norm": 2.209383359381836, "learning_rate": 6.601941747572816e-06, "loss": 1.0662, "num_tokens": 53726652.0, "step": 18 }, { "epoch": 0.027843927459241618, "grad_norm": 2.2063531579858724, "learning_rate": 6.990291262135923e-06, "loss": 1.0671, "num_tokens": 56942695.0, "step": 19 }, { "epoch": 0.029309397325517492, "grad_norm": 1.6889738500376674, "learning_rate": 7.37864077669903e-06, "loss": 1.0496, "num_tokens": 60025080.0, "step": 20 }, { "epoch": 0.03077486719179337, "grad_norm": 5.050365716432631, "learning_rate": 7.766990291262136e-06, "loss": 1.0, "num_tokens": 63190690.0, "step": 21 }, { "epoch": 0.032240337058069245, "grad_norm": 1.3669283816660662, "learning_rate": 8.155339805825243e-06, "loss": 0.9853, "num_tokens": 66159787.0, "step": 22 }, { "epoch": 0.03370580692434512, "grad_norm": 1.2365578960470651, "learning_rate": 8.54368932038835e-06, "loss": 0.9868, "num_tokens": 69372254.0, "step": 23 }, { "epoch": 0.035171276790620994, "grad_norm": 1.004333292885707, "learning_rate": 8.932038834951458e-06, "loss": 0.9841, "num_tokens": 72158598.0, "step": 24 }, { "epoch": 0.03663674665689687, "grad_norm": 0.8552509734385357, "learning_rate": 9.320388349514565e-06, "loss": 0.9954, "num_tokens": 75199277.0, "step": 25 }, { "epoch": 0.03810221652317274, "grad_norm": 0.8435970179653405, "learning_rate": 9.708737864077671e-06, "loss": 0.9613, "num_tokens": 78505230.0, "step": 26 }, { "epoch": 0.03956768638944862, "grad_norm": 0.7965616227829095, "learning_rate": 1.0097087378640778e-05, "loss": 0.9647, "num_tokens": 81528695.0, "step": 27 }, { "epoch": 0.04103315625572449, "grad_norm": 0.6986712691976343, "learning_rate": 1.0485436893203885e-05, "loss": 0.9534, "num_tokens": 84629623.0, "step": 28 }, { "epoch": 0.042498626122000366, "grad_norm": 0.6499330969978084, "learning_rate": 1.0873786407766991e-05, "loss": 0.9424, "num_tokens": 87822492.0, "step": 29 }, { "epoch": 0.04396409598827624, "grad_norm": 0.5704142147162633, "learning_rate": 1.1262135922330098e-05, "loss": 0.9647, "num_tokens": 91028813.0, "step": 30 }, { "epoch": 0.045429565854552115, "grad_norm": 0.5687018137715497, "learning_rate": 1.1650485436893204e-05, "loss": 0.9564, "num_tokens": 94210012.0, "step": 31 }, { "epoch": 0.04689503572082799, "grad_norm": 0.5057684258625181, "learning_rate": 1.2038834951456311e-05, "loss": 0.9535, "num_tokens": 97431648.0, "step": 32 }, { "epoch": 0.048360505587103864, "grad_norm": 0.5368187773607731, "learning_rate": 1.2427184466019418e-05, "loss": 0.9419, "num_tokens": 100240083.0, "step": 33 }, { "epoch": 0.04982597545337974, "grad_norm": 0.521600938283564, "learning_rate": 1.2815533980582526e-05, "loss": 0.9342, "num_tokens": 103326466.0, "step": 34 }, { "epoch": 0.05129144531965561, "grad_norm": 0.48855050271178957, "learning_rate": 1.3203883495145633e-05, "loss": 0.9254, "num_tokens": 106214597.0, "step": 35 }, { "epoch": 0.05275691518593149, "grad_norm": 0.455737524836322, "learning_rate": 1.359223300970874e-05, "loss": 0.9206, "num_tokens": 109354828.0, "step": 36 }, { "epoch": 0.05422238505220736, "grad_norm": 0.508354078655406, "learning_rate": 1.3980582524271846e-05, "loss": 0.926, "num_tokens": 112421290.0, "step": 37 }, { "epoch": 0.055687854918483236, "grad_norm": 0.4786981791275268, "learning_rate": 1.4368932038834953e-05, "loss": 0.9123, "num_tokens": 115343538.0, "step": 38 }, { "epoch": 0.05715332478475911, "grad_norm": 0.46294861923447456, "learning_rate": 1.475728155339806e-05, "loss": 0.9113, "num_tokens": 118367356.0, "step": 39 }, { "epoch": 0.058618794651034985, "grad_norm": 0.5295701920957653, "learning_rate": 1.5145631067961166e-05, "loss": 0.9226, "num_tokens": 121358192.0, "step": 40 }, { "epoch": 0.060084264517310866, "grad_norm": 0.49306899051866787, "learning_rate": 1.5533980582524273e-05, "loss": 0.9091, "num_tokens": 124534846.0, "step": 41 }, { "epoch": 0.06154973438358674, "grad_norm": 0.46334592033574157, "learning_rate": 1.592233009708738e-05, "loss": 0.911, "num_tokens": 127636604.0, "step": 42 }, { "epoch": 0.06301520424986261, "grad_norm": 0.5062020274630149, "learning_rate": 1.6310679611650486e-05, "loss": 0.9267, "num_tokens": 130520629.0, "step": 43 }, { "epoch": 0.06448067411613849, "grad_norm": 0.44534995182779014, "learning_rate": 1.6699029126213594e-05, "loss": 0.9073, "num_tokens": 133517640.0, "step": 44 }, { "epoch": 0.06594614398241436, "grad_norm": 0.4864222301298436, "learning_rate": 1.70873786407767e-05, "loss": 0.915, "num_tokens": 136872123.0, "step": 45 }, { "epoch": 0.06741161384869024, "grad_norm": 0.43845702373502604, "learning_rate": 1.7475728155339808e-05, "loss": 0.9046, "num_tokens": 139899803.0, "step": 46 }, { "epoch": 0.0688770837149661, "grad_norm": 0.48413509568112467, "learning_rate": 1.7864077669902916e-05, "loss": 0.9004, "num_tokens": 143029695.0, "step": 47 }, { "epoch": 0.07034255358124199, "grad_norm": 0.4979005902336758, "learning_rate": 1.825242718446602e-05, "loss": 0.9081, "num_tokens": 146136067.0, "step": 48 }, { "epoch": 0.07180802344751785, "grad_norm": 0.48474843137917467, "learning_rate": 1.864077669902913e-05, "loss": 0.8975, "num_tokens": 149134152.0, "step": 49 }, { "epoch": 0.07327349331379374, "grad_norm": 0.5520629070878562, "learning_rate": 1.9029126213592234e-05, "loss": 0.9135, "num_tokens": 152171548.0, "step": 50 }, { "epoch": 0.0747389631800696, "grad_norm": 0.41162920146592064, "learning_rate": 1.9417475728155343e-05, "loss": 0.8686, "num_tokens": 155385344.0, "step": 51 }, { "epoch": 0.07620443304634548, "grad_norm": 0.46665397621944266, "learning_rate": 1.9805825242718447e-05, "loss": 0.8828, "num_tokens": 158641919.0, "step": 52 }, { "epoch": 0.07766990291262135, "grad_norm": 0.49616593118930535, "learning_rate": 2.0194174757281556e-05, "loss": 0.8942, "num_tokens": 161685973.0, "step": 53 }, { "epoch": 0.07913537277889723, "grad_norm": 0.4689210013321571, "learning_rate": 2.058252427184466e-05, "loss": 0.8968, "num_tokens": 164656830.0, "step": 54 }, { "epoch": 0.08060084264517312, "grad_norm": 0.5546915841150304, "learning_rate": 2.097087378640777e-05, "loss": 0.8843, "num_tokens": 167703016.0, "step": 55 }, { "epoch": 0.08206631251144898, "grad_norm": 0.5253100741527619, "learning_rate": 2.1359223300970874e-05, "loss": 0.8954, "num_tokens": 170862180.0, "step": 56 }, { "epoch": 0.08353178237772486, "grad_norm": 0.4871450938003483, "learning_rate": 2.1747572815533982e-05, "loss": 0.8791, "num_tokens": 173771441.0, "step": 57 }, { "epoch": 0.08499725224400073, "grad_norm": 0.7115493810638506, "learning_rate": 2.2135922330097087e-05, "loss": 0.9003, "num_tokens": 176699325.0, "step": 58 }, { "epoch": 0.08646272211027661, "grad_norm": 0.5788787720979568, "learning_rate": 2.2524271844660196e-05, "loss": 0.8965, "num_tokens": 179878886.0, "step": 59 }, { "epoch": 0.08792819197655248, "grad_norm": 0.7218463417280757, "learning_rate": 2.29126213592233e-05, "loss": 0.872, "num_tokens": 182854320.0, "step": 60 }, { "epoch": 0.08939366184282836, "grad_norm": 0.6405516461924528, "learning_rate": 2.330097087378641e-05, "loss": 0.8908, "num_tokens": 185743356.0, "step": 61 }, { "epoch": 0.09085913170910423, "grad_norm": 0.7642503850701262, "learning_rate": 2.3689320388349514e-05, "loss": 0.8984, "num_tokens": 188709848.0, "step": 62 }, { "epoch": 0.09232460157538011, "grad_norm": 0.5282646333755459, "learning_rate": 2.4077669902912622e-05, "loss": 0.895, "num_tokens": 192129742.0, "step": 63 }, { "epoch": 0.09379007144165598, "grad_norm": 0.7703478916276936, "learning_rate": 2.4466019417475727e-05, "loss": 0.8567, "num_tokens": 195114755.0, "step": 64 }, { "epoch": 0.09525554130793186, "grad_norm": 0.5769241566237074, "learning_rate": 2.4854368932038836e-05, "loss": 0.8861, "num_tokens": 198140939.0, "step": 65 }, { "epoch": 0.09672101117420773, "grad_norm": 0.7287654485975635, "learning_rate": 2.5242718446601947e-05, "loss": 0.8627, "num_tokens": 201019131.0, "step": 66 }, { "epoch": 0.09818648104048361, "grad_norm": 0.6390322091922136, "learning_rate": 2.5631067961165052e-05, "loss": 0.8757, "num_tokens": 204203222.0, "step": 67 }, { "epoch": 0.09965195090675948, "grad_norm": 0.6642161473425475, "learning_rate": 2.601941747572816e-05, "loss": 0.8699, "num_tokens": 207301837.0, "step": 68 }, { "epoch": 0.10111742077303536, "grad_norm": 0.6093151136946741, "learning_rate": 2.6407766990291266e-05, "loss": 0.8684, "num_tokens": 210327220.0, "step": 69 }, { "epoch": 0.10258289063931122, "grad_norm": 0.6789884618682116, "learning_rate": 2.6796116504854374e-05, "loss": 0.8644, "num_tokens": 213155631.0, "step": 70 }, { "epoch": 0.1040483605055871, "grad_norm": 0.5795597272375321, "learning_rate": 2.718446601941748e-05, "loss": 0.8469, "num_tokens": 216233990.0, "step": 71 }, { "epoch": 0.10551383037186297, "grad_norm": 0.6394234581377922, "learning_rate": 2.7572815533980587e-05, "loss": 0.876, "num_tokens": 219210949.0, "step": 72 }, { "epoch": 0.10697930023813886, "grad_norm": 0.6103946781659355, "learning_rate": 2.7961165048543692e-05, "loss": 0.8733, "num_tokens": 222342989.0, "step": 73 }, { "epoch": 0.10844477010441472, "grad_norm": 0.8026490815081038, "learning_rate": 2.83495145631068e-05, "loss": 0.8632, "num_tokens": 225683535.0, "step": 74 }, { "epoch": 0.1099102399706906, "grad_norm": 0.6498093598726404, "learning_rate": 2.8737864077669905e-05, "loss": 0.8549, "num_tokens": 228993739.0, "step": 75 }, { "epoch": 0.11137570983696647, "grad_norm": 0.6796573878122779, "learning_rate": 2.9126213592233014e-05, "loss": 0.8701, "num_tokens": 232307768.0, "step": 76 }, { "epoch": 0.11284117970324235, "grad_norm": 0.5991007871834552, "learning_rate": 2.951456310679612e-05, "loss": 0.8778, "num_tokens": 235284000.0, "step": 77 }, { "epoch": 0.11430664956951822, "grad_norm": 0.6225913604825317, "learning_rate": 2.9902912621359227e-05, "loss": 0.8496, "num_tokens": 238369671.0, "step": 78 }, { "epoch": 0.1157721194357941, "grad_norm": 0.805492420861617, "learning_rate": 3.0291262135922332e-05, "loss": 0.8519, "num_tokens": 241594371.0, "step": 79 }, { "epoch": 0.11723758930206997, "grad_norm": 0.7420095626227796, "learning_rate": 3.067961165048544e-05, "loss": 0.8547, "num_tokens": 244622780.0, "step": 80 }, { "epoch": 0.11870305916834585, "grad_norm": 0.6037490765727215, "learning_rate": 3.1067961165048545e-05, "loss": 0.8685, "num_tokens": 247756320.0, "step": 81 }, { "epoch": 0.12016852903462173, "grad_norm": 0.7097892538296358, "learning_rate": 3.1456310679611654e-05, "loss": 0.8499, "num_tokens": 250888113.0, "step": 82 }, { "epoch": 0.1216339989008976, "grad_norm": 0.6423028146339036, "learning_rate": 3.184466019417476e-05, "loss": 0.8774, "num_tokens": 253982723.0, "step": 83 }, { "epoch": 0.12309946876717348, "grad_norm": 0.7476981768912153, "learning_rate": 3.2233009708737864e-05, "loss": 0.8595, "num_tokens": 257284764.0, "step": 84 }, { "epoch": 0.12456493863344935, "grad_norm": 0.6000248431150975, "learning_rate": 3.262135922330097e-05, "loss": 0.8564, "num_tokens": 260479493.0, "step": 85 }, { "epoch": 0.12603040849972522, "grad_norm": 0.7338133703090208, "learning_rate": 3.300970873786408e-05, "loss": 0.8363, "num_tokens": 263554740.0, "step": 86 }, { "epoch": 0.1274958783660011, "grad_norm": 0.7838147196534231, "learning_rate": 3.339805825242719e-05, "loss": 0.8619, "num_tokens": 266523921.0, "step": 87 }, { "epoch": 0.12896134823227698, "grad_norm": 0.5395144411175702, "learning_rate": 3.378640776699029e-05, "loss": 0.8535, "num_tokens": 269868631.0, "step": 88 }, { "epoch": 0.13042681809855286, "grad_norm": 0.9256661786843077, "learning_rate": 3.41747572815534e-05, "loss": 0.8367, "num_tokens": 273242182.0, "step": 89 }, { "epoch": 0.1318922879648287, "grad_norm": 0.6320711229649394, "learning_rate": 3.456310679611651e-05, "loss": 0.8676, "num_tokens": 276420774.0, "step": 90 }, { "epoch": 0.1333577578311046, "grad_norm": 0.9111610484555086, "learning_rate": 3.4951456310679615e-05, "loss": 0.8525, "num_tokens": 279322789.0, "step": 91 }, { "epoch": 0.13482322769738048, "grad_norm": 0.6630400558870672, "learning_rate": 3.5339805825242724e-05, "loss": 0.8459, "num_tokens": 282594946.0, "step": 92 }, { "epoch": 0.13628869756365636, "grad_norm": 0.7872835328404213, "learning_rate": 3.572815533980583e-05, "loss": 0.8696, "num_tokens": 285739114.0, "step": 93 }, { "epoch": 0.1377541674299322, "grad_norm": 0.7757734012108687, "learning_rate": 3.6116504854368933e-05, "loss": 0.8404, "num_tokens": 288842961.0, "step": 94 }, { "epoch": 0.1392196372962081, "grad_norm": 0.8172131583040233, "learning_rate": 3.650485436893204e-05, "loss": 0.8349, "num_tokens": 291857499.0, "step": 95 }, { "epoch": 0.14068510716248397, "grad_norm": 0.7505284006119732, "learning_rate": 3.689320388349515e-05, "loss": 0.8506, "num_tokens": 294786027.0, "step": 96 }, { "epoch": 0.14215057702875986, "grad_norm": 0.8544005803634707, "learning_rate": 3.728155339805826e-05, "loss": 0.8381, "num_tokens": 297533328.0, "step": 97 }, { "epoch": 0.1436160468950357, "grad_norm": 0.5925127226227879, "learning_rate": 3.766990291262136e-05, "loss": 0.8218, "num_tokens": 300605580.0, "step": 98 }, { "epoch": 0.1450815167613116, "grad_norm": 0.9521999060037152, "learning_rate": 3.805825242718447e-05, "loss": 0.8471, "num_tokens": 303698254.0, "step": 99 }, { "epoch": 0.14654698662758747, "grad_norm": 0.7010064402363387, "learning_rate": 3.844660194174758e-05, "loss": 0.8273, "num_tokens": 306988972.0, "step": 100 }, { "epoch": 0.14801245649386335, "grad_norm": 0.9654068200560665, "learning_rate": 3.8834951456310685e-05, "loss": 0.8429, "num_tokens": 309998481.0, "step": 101 }, { "epoch": 0.1494779263601392, "grad_norm": 0.7428113583562915, "learning_rate": 3.9223300970873787e-05, "loss": 0.8606, "num_tokens": 312761538.0, "step": 102 }, { "epoch": 0.1509433962264151, "grad_norm": 1.1685023275237025, "learning_rate": 3.9611650485436895e-05, "loss": 0.8526, "num_tokens": 316066460.0, "step": 103 }, { "epoch": 0.15240886609269097, "grad_norm": 0.8975147633009299, "learning_rate": 4e-05, "loss": 0.8401, "num_tokens": 319285989.0, "step": 104 }, { "epoch": 0.15387433595896685, "grad_norm": 0.9617782907220275, "learning_rate": 3.999999190230219e-05, "loss": 0.8521, "num_tokens": 322459463.0, "step": 105 }, { "epoch": 0.1553398058252427, "grad_norm": 0.9014787361849639, "learning_rate": 3.9999967609216026e-05, "loss": 0.8682, "num_tokens": 325398105.0, "step": 106 }, { "epoch": 0.1568052756915186, "grad_norm": 0.8448968557840734, "learning_rate": 3.9999927120763375e-05, "loss": 0.8515, "num_tokens": 328354575.0, "step": 107 }, { "epoch": 0.15827074555779447, "grad_norm": 0.7675935741857141, "learning_rate": 3.9999870436980666e-05, "loss": 0.849, "num_tokens": 331380106.0, "step": 108 }, { "epoch": 0.15973621542407035, "grad_norm": 0.7819100282863429, "learning_rate": 3.99997975579189e-05, "loss": 0.8492, "num_tokens": 334337989.0, "step": 109 }, { "epoch": 0.16120168529034623, "grad_norm": 0.6316748610620255, "learning_rate": 3.999970848364365e-05, "loss": 0.8335, "num_tokens": 337316918.0, "step": 110 }, { "epoch": 0.16266715515662208, "grad_norm": 0.9880671696016186, "learning_rate": 3.999960321423505e-05, "loss": 0.8565, "num_tokens": 340569368.0, "step": 111 }, { "epoch": 0.16413262502289797, "grad_norm": 0.8459771138972552, "learning_rate": 3.9999481749787836e-05, "loss": 0.8351, "num_tokens": 343785795.0, "step": 112 }, { "epoch": 0.16559809488917385, "grad_norm": 0.7228713159313074, "learning_rate": 3.9999344090411276e-05, "loss": 0.8432, "num_tokens": 347009560.0, "step": 113 }, { "epoch": 0.16706356475544973, "grad_norm": 0.6915081540386461, "learning_rate": 3.9999190236229236e-05, "loss": 0.8427, "num_tokens": 350077813.0, "step": 114 }, { "epoch": 0.16852903462172558, "grad_norm": 0.7399809312221232, "learning_rate": 3.999902018738014e-05, "loss": 0.854, "num_tokens": 353365170.0, "step": 115 }, { "epoch": 0.16999450448800146, "grad_norm": 0.6204926157232024, "learning_rate": 3.9998833944017004e-05, "loss": 0.8672, "num_tokens": 356249147.0, "step": 116 }, { "epoch": 0.17145997435427734, "grad_norm": 0.8167529790814838, "learning_rate": 3.9998631506307376e-05, "loss": 0.8396, "num_tokens": 359194019.0, "step": 117 }, { "epoch": 0.17292544422055323, "grad_norm": 0.7448155552014681, "learning_rate": 3.9998412874433426e-05, "loss": 0.8432, "num_tokens": 362220812.0, "step": 118 }, { "epoch": 0.17439091408682908, "grad_norm": 0.7704376811076306, "learning_rate": 3.999817804859184e-05, "loss": 0.8335, "num_tokens": 365054187.0, "step": 119 }, { "epoch": 0.17585638395310496, "grad_norm": 0.8236095964572204, "learning_rate": 3.999792702899391e-05, "loss": 0.8401, "num_tokens": 368095469.0, "step": 120 }, { "epoch": 0.17732185381938084, "grad_norm": 0.7242411939290744, "learning_rate": 3.999765981586551e-05, "loss": 0.8421, "num_tokens": 371179280.0, "step": 121 }, { "epoch": 0.17878732368565672, "grad_norm": 0.7602144061016769, "learning_rate": 3.9997376409447025e-05, "loss": 0.82, "num_tokens": 374510666.0, "step": 122 }, { "epoch": 0.18025279355193258, "grad_norm": 0.6611171707745039, "learning_rate": 3.9997076809993475e-05, "loss": 0.818, "num_tokens": 377732134.0, "step": 123 }, { "epoch": 0.18171826341820846, "grad_norm": 0.8899181285907293, "learning_rate": 3.999676101777443e-05, "loss": 0.8437, "num_tokens": 380914452.0, "step": 124 }, { "epoch": 0.18318373328448434, "grad_norm": 0.7437575189790447, "learning_rate": 3.999642903307399e-05, "loss": 0.8385, "num_tokens": 383855630.0, "step": 125 }, { "epoch": 0.18464920315076022, "grad_norm": 0.79914125957133, "learning_rate": 3.999608085619088e-05, "loss": 0.8438, "num_tokens": 386829036.0, "step": 126 }, { "epoch": 0.18611467301703608, "grad_norm": 0.7133956104576291, "learning_rate": 3.999571648743837e-05, "loss": 0.8319, "num_tokens": 389956141.0, "step": 127 }, { "epoch": 0.18758014288331196, "grad_norm": 0.7472988405317954, "learning_rate": 3.999533592714429e-05, "loss": 0.8323, "num_tokens": 392958264.0, "step": 128 }, { "epoch": 0.18904561274958784, "grad_norm": 0.5783827236951032, "learning_rate": 3.999493917565105e-05, "loss": 0.8223, "num_tokens": 396253470.0, "step": 129 }, { "epoch": 0.19051108261586372, "grad_norm": 0.677932958212559, "learning_rate": 3.999452623331563e-05, "loss": 0.8381, "num_tokens": 399257893.0, "step": 130 }, { "epoch": 0.19197655248213957, "grad_norm": 0.6926833614130089, "learning_rate": 3.999409710050957e-05, "loss": 0.8576, "num_tokens": 402276427.0, "step": 131 }, { "epoch": 0.19344202234841545, "grad_norm": 0.6312795952253336, "learning_rate": 3.999365177761897e-05, "loss": 0.8461, "num_tokens": 405193016.0, "step": 132 }, { "epoch": 0.19490749221469134, "grad_norm": 1.0920656953343262, "learning_rate": 3.999319026504452e-05, "loss": 0.8329, "num_tokens": 408193549.0, "step": 133 }, { "epoch": 0.19637296208096722, "grad_norm": 0.5899652588744977, "learning_rate": 3.999271256320145e-05, "loss": 0.8472, "num_tokens": 411359687.0, "step": 134 }, { "epoch": 0.1978384319472431, "grad_norm": 1.0499494065913662, "learning_rate": 3.9992218672519585e-05, "loss": 0.8424, "num_tokens": 414484226.0, "step": 135 }, { "epoch": 0.19930390181351895, "grad_norm": 0.7735587808705342, "learning_rate": 3.99917085934433e-05, "loss": 0.8192, "num_tokens": 417469898.0, "step": 136 }, { "epoch": 0.20076937167979483, "grad_norm": 0.7893087239695173, "learning_rate": 3.999118232643152e-05, "loss": 0.8353, "num_tokens": 420518881.0, "step": 137 }, { "epoch": 0.20223484154607071, "grad_norm": 0.8844659631297803, "learning_rate": 3.9990639871957755e-05, "loss": 0.8505, "num_tokens": 423606313.0, "step": 138 }, { "epoch": 0.2037003114123466, "grad_norm": 0.7422524640015375, "learning_rate": 3.9990081230510086e-05, "loss": 0.8201, "num_tokens": 426791799.0, "step": 139 }, { "epoch": 0.20516578127862245, "grad_norm": 0.7895678346896589, "learning_rate": 3.998950640259113e-05, "loss": 0.8532, "num_tokens": 429715234.0, "step": 140 }, { "epoch": 0.20663125114489833, "grad_norm": 0.7162901158544891, "learning_rate": 3.998891538871811e-05, "loss": 0.8367, "num_tokens": 432885607.0, "step": 141 }, { "epoch": 0.2080967210111742, "grad_norm": 1.0051799893599505, "learning_rate": 3.9988308189422764e-05, "loss": 0.8187, "num_tokens": 436138685.0, "step": 142 }, { "epoch": 0.2095621908774501, "grad_norm": 0.6031350174028013, "learning_rate": 3.9987684805251434e-05, "loss": 0.8293, "num_tokens": 439190104.0, "step": 143 }, { "epoch": 0.21102766074372595, "grad_norm": 1.08526344989608, "learning_rate": 3.998704523676499e-05, "loss": 0.8226, "num_tokens": 442137425.0, "step": 144 }, { "epoch": 0.21249313061000183, "grad_norm": 0.6482151176096894, "learning_rate": 3.9986389484538886e-05, "loss": 0.8287, "num_tokens": 445161524.0, "step": 145 }, { "epoch": 0.2139586004762777, "grad_norm": 0.860680740857215, "learning_rate": 3.998571754916314e-05, "loss": 0.8153, "num_tokens": 448239655.0, "step": 146 }, { "epoch": 0.2154240703425536, "grad_norm": 0.7415041111321264, "learning_rate": 3.9985029431242305e-05, "loss": 0.8497, "num_tokens": 451132218.0, "step": 147 }, { "epoch": 0.21688954020882945, "grad_norm": 0.8029762433268514, "learning_rate": 3.998432513139553e-05, "loss": 0.8384, "num_tokens": 454285865.0, "step": 148 }, { "epoch": 0.21835501007510533, "grad_norm": 0.6349460139245732, "learning_rate": 3.998360465025649e-05, "loss": 0.8313, "num_tokens": 457309502.0, "step": 149 }, { "epoch": 0.2198204799413812, "grad_norm": 0.8030936589895405, "learning_rate": 3.9982867988473446e-05, "loss": 0.8156, "num_tokens": 460241737.0, "step": 150 }, { "epoch": 0.2212859498076571, "grad_norm": 0.5663180192493148, "learning_rate": 3.998211514670919e-05, "loss": 0.8299, "num_tokens": 463436310.0, "step": 151 }, { "epoch": 0.22275141967393294, "grad_norm": 0.819436376636797, "learning_rate": 3.99813461256411e-05, "loss": 0.8193, "num_tokens": 466381548.0, "step": 152 }, { "epoch": 0.22421688954020882, "grad_norm": 0.7434237957955598, "learning_rate": 3.998056092596109e-05, "loss": 0.8621, "num_tokens": 469350892.0, "step": 153 }, { "epoch": 0.2256823594064847, "grad_norm": 0.7452172603634878, "learning_rate": 3.997975954837564e-05, "loss": 0.8249, "num_tokens": 472530190.0, "step": 154 }, { "epoch": 0.2271478292727606, "grad_norm": 0.7123150549669498, "learning_rate": 3.99789419936058e-05, "loss": 0.8407, "num_tokens": 475507490.0, "step": 155 }, { "epoch": 0.22861329913903644, "grad_norm": 0.7865493328792948, "learning_rate": 3.9978108262387135e-05, "loss": 0.8165, "num_tokens": 478670275.0, "step": 156 }, { "epoch": 0.23007876900531232, "grad_norm": 0.6101056307376534, "learning_rate": 3.997725835546981e-05, "loss": 0.827, "num_tokens": 481730944.0, "step": 157 }, { "epoch": 0.2315442388715882, "grad_norm": 0.8565396599611025, "learning_rate": 3.9976392273618514e-05, "loss": 0.8385, "num_tokens": 484600898.0, "step": 158 }, { "epoch": 0.23300970873786409, "grad_norm": 0.7336100996232997, "learning_rate": 3.99755100176125e-05, "loss": 0.8263, "num_tokens": 487778777.0, "step": 159 }, { "epoch": 0.23447517860413994, "grad_norm": 0.6937429258421548, "learning_rate": 3.9974611588245576e-05, "loss": 0.827, "num_tokens": 490635037.0, "step": 160 }, { "epoch": 0.23594064847041582, "grad_norm": 0.6441213653525039, "learning_rate": 3.99736969863261e-05, "loss": 0.8332, "num_tokens": 493613420.0, "step": 161 }, { "epoch": 0.2374061183366917, "grad_norm": 0.7216488874387245, "learning_rate": 3.997276621267697e-05, "loss": 0.8252, "num_tokens": 496857348.0, "step": 162 }, { "epoch": 0.23887158820296758, "grad_norm": 0.8074819977336892, "learning_rate": 3.9971819268135646e-05, "loss": 0.8374, "num_tokens": 499788261.0, "step": 163 }, { "epoch": 0.24033705806924346, "grad_norm": 0.6077381796651184, "learning_rate": 3.997085615355416e-05, "loss": 0.8164, "num_tokens": 503170980.0, "step": 164 }, { "epoch": 0.24180252793551932, "grad_norm": 0.7944494723827582, "learning_rate": 3.996987686979903e-05, "loss": 0.8154, "num_tokens": 506409642.0, "step": 165 }, { "epoch": 0.2432679978017952, "grad_norm": 0.6690511398862766, "learning_rate": 3.996888141775139e-05, "loss": 0.8171, "num_tokens": 509537723.0, "step": 166 }, { "epoch": 0.24473346766807108, "grad_norm": 0.8344368839873392, "learning_rate": 3.996786979830687e-05, "loss": 0.8265, "num_tokens": 512407522.0, "step": 167 }, { "epoch": 0.24619893753434696, "grad_norm": 0.6592804149080247, "learning_rate": 3.996684201237569e-05, "loss": 0.8545, "num_tokens": 515316936.0, "step": 168 }, { "epoch": 0.24766440740062282, "grad_norm": 0.6466670112342553, "learning_rate": 3.996579806088259e-05, "loss": 0.8315, "num_tokens": 518454253.0, "step": 169 }, { "epoch": 0.2491298772668987, "grad_norm": 0.803154966017149, "learning_rate": 3.996473794476685e-05, "loss": 0.8316, "num_tokens": 521488424.0, "step": 170 }, { "epoch": 0.25059534713317455, "grad_norm": 0.6940984560980675, "learning_rate": 3.996366166498231e-05, "loss": 0.8201, "num_tokens": 524663901.0, "step": 171 }, { "epoch": 0.25206081699945043, "grad_norm": 0.6805309224566704, "learning_rate": 3.996256922249734e-05, "loss": 0.8312, "num_tokens": 527436321.0, "step": 172 }, { "epoch": 0.2535262868657263, "grad_norm": 0.6749512403454886, "learning_rate": 3.996146061829487e-05, "loss": 0.8161, "num_tokens": 530418450.0, "step": 173 }, { "epoch": 0.2549917567320022, "grad_norm": 0.7730068368828539, "learning_rate": 3.9960335853372345e-05, "loss": 0.821, "num_tokens": 533701141.0, "step": 174 }, { "epoch": 0.2564572265982781, "grad_norm": 0.7203670802264126, "learning_rate": 3.995919492874178e-05, "loss": 0.8104, "num_tokens": 536860257.0, "step": 175 }, { "epoch": 0.25792269646455396, "grad_norm": 0.7984702395294277, "learning_rate": 3.995803784542971e-05, "loss": 0.8465, "num_tokens": 539725159.0, "step": 176 }, { "epoch": 0.25938816633082984, "grad_norm": 0.5362726653124847, "learning_rate": 3.9956864604477214e-05, "loss": 0.8267, "num_tokens": 542644241.0, "step": 177 }, { "epoch": 0.2608536361971057, "grad_norm": 0.6835100341938316, "learning_rate": 3.9955675206939906e-05, "loss": 0.8317, "num_tokens": 545588641.0, "step": 178 }, { "epoch": 0.26231910606338155, "grad_norm": 0.8074313766534024, "learning_rate": 3.995446965388794e-05, "loss": 0.8174, "num_tokens": 548795621.0, "step": 179 }, { "epoch": 0.2637845759296574, "grad_norm": 0.5507924702900725, "learning_rate": 3.995324794640601e-05, "loss": 0.8387, "num_tokens": 551569263.0, "step": 180 }, { "epoch": 0.2652500457959333, "grad_norm": 0.6814789822425229, "learning_rate": 3.995201008559334e-05, "loss": 0.836, "num_tokens": 554671550.0, "step": 181 }, { "epoch": 0.2667155156622092, "grad_norm": 0.811174854448483, "learning_rate": 3.995075607256368e-05, "loss": 0.8184, "num_tokens": 557598005.0, "step": 182 }, { "epoch": 0.26818098552848507, "grad_norm": 0.5709985096651433, "learning_rate": 3.994948590844533e-05, "loss": 0.7927, "num_tokens": 560902990.0, "step": 183 }, { "epoch": 0.26964645539476095, "grad_norm": 0.9184572254839238, "learning_rate": 3.994819959438111e-05, "loss": 0.8215, "num_tokens": 563930630.0, "step": 184 }, { "epoch": 0.27111192526103683, "grad_norm": 0.6528820775338945, "learning_rate": 3.9946897131528374e-05, "loss": 0.8105, "num_tokens": 567208936.0, "step": 185 }, { "epoch": 0.2725773951273127, "grad_norm": 0.7296617750225377, "learning_rate": 3.9945578521059e-05, "loss": 0.812, "num_tokens": 570240635.0, "step": 186 }, { "epoch": 0.27404286499358854, "grad_norm": 0.7285613438531743, "learning_rate": 3.994424376415941e-05, "loss": 0.82, "num_tokens": 573184515.0, "step": 187 }, { "epoch": 0.2755083348598644, "grad_norm": 0.6901016626567648, "learning_rate": 3.994289286203053e-05, "loss": 0.811, "num_tokens": 576074626.0, "step": 188 }, { "epoch": 0.2769738047261403, "grad_norm": 0.5757190070173106, "learning_rate": 3.9941525815887836e-05, "loss": 0.824, "num_tokens": 579444824.0, "step": 189 }, { "epoch": 0.2784392745924162, "grad_norm": 0.8966354598014828, "learning_rate": 3.994014262696133e-05, "loss": 0.817, "num_tokens": 582565685.0, "step": 190 }, { "epoch": 0.27990474445869207, "grad_norm": 0.6254396798953378, "learning_rate": 3.99387432964955e-05, "loss": 0.8342, "num_tokens": 585677800.0, "step": 191 }, { "epoch": 0.28137021432496795, "grad_norm": 0.7372678089826386, "learning_rate": 3.993732782574942e-05, "loss": 0.7932, "num_tokens": 588847079.0, "step": 192 }, { "epoch": 0.28283568419124383, "grad_norm": 0.7299211035819455, "learning_rate": 3.993589621599662e-05, "loss": 0.7952, "num_tokens": 591843647.0, "step": 193 }, { "epoch": 0.2843011540575197, "grad_norm": 0.7200112059284413, "learning_rate": 3.99344484685252e-05, "loss": 0.8195, "num_tokens": 594992889.0, "step": 194 }, { "epoch": 0.2857666239237956, "grad_norm": 0.8431571261967636, "learning_rate": 3.993298458463776e-05, "loss": 0.8351, "num_tokens": 598058589.0, "step": 195 }, { "epoch": 0.2872320937900714, "grad_norm": 0.5891969513358577, "learning_rate": 3.993150456565143e-05, "loss": 0.8348, "num_tokens": 601074188.0, "step": 196 }, { "epoch": 0.2886975636563473, "grad_norm": 0.7584934823993437, "learning_rate": 3.993000841289782e-05, "loss": 0.8292, "num_tokens": 604082070.0, "step": 197 }, { "epoch": 0.2901630335226232, "grad_norm": 0.6632460309991204, "learning_rate": 3.9928496127723114e-05, "loss": 0.8002, "num_tokens": 607498039.0, "step": 198 }, { "epoch": 0.29162850338889906, "grad_norm": 0.7422033104844754, "learning_rate": 3.9926967711487966e-05, "loss": 0.8176, "num_tokens": 610437913.0, "step": 199 }, { "epoch": 0.29309397325517494, "grad_norm": 0.5154286101153708, "learning_rate": 3.9925423165567564e-05, "loss": 0.8074, "num_tokens": 613524270.0, "step": 200 }, { "epoch": 0.2945594431214508, "grad_norm": 0.7019171226982792, "learning_rate": 3.992386249135161e-05, "loss": 0.8238, "num_tokens": 616438860.0, "step": 201 }, { "epoch": 0.2960249129877267, "grad_norm": 0.7459400715093033, "learning_rate": 3.99222856902443e-05, "loss": 0.8105, "num_tokens": 619976734.0, "step": 202 }, { "epoch": 0.2974903828540026, "grad_norm": 0.8354419436277302, "learning_rate": 3.992069276366436e-05, "loss": 0.8177, "num_tokens": 623066903.0, "step": 203 }, { "epoch": 0.2989558527202784, "grad_norm": 0.6196032193238038, "learning_rate": 3.991908371304502e-05, "loss": 0.81, "num_tokens": 626141402.0, "step": 204 }, { "epoch": 0.3004213225865543, "grad_norm": 0.7017892859541583, "learning_rate": 3.9917458539834e-05, "loss": 0.8213, "num_tokens": 629307909.0, "step": 205 }, { "epoch": 0.3018867924528302, "grad_norm": 0.6712223525214189, "learning_rate": 3.9915817245493555e-05, "loss": 0.8089, "num_tokens": 632370796.0, "step": 206 }, { "epoch": 0.30335226231910606, "grad_norm": 0.6752590187047599, "learning_rate": 3.991415983150042e-05, "loss": 0.8492, "num_tokens": 635295397.0, "step": 207 }, { "epoch": 0.30481773218538194, "grad_norm": 0.8611888424687264, "learning_rate": 3.991248629934585e-05, "loss": 0.8288, "num_tokens": 638322212.0, "step": 208 }, { "epoch": 0.3062832020516578, "grad_norm": 0.5131364545213245, "learning_rate": 3.9910796650535594e-05, "loss": 0.8076, "num_tokens": 641478294.0, "step": 209 }, { "epoch": 0.3077486719179337, "grad_norm": 0.7907027963530079, "learning_rate": 3.99090908865899e-05, "loss": 0.8292, "num_tokens": 644684314.0, "step": 210 }, { "epoch": 0.3092141417842096, "grad_norm": 0.6935867625715167, "learning_rate": 3.9907369009043525e-05, "loss": 0.8186, "num_tokens": 647703251.0, "step": 211 }, { "epoch": 0.3106796116504854, "grad_norm": 0.6529157200526053, "learning_rate": 3.9905631019445706e-05, "loss": 0.8106, "num_tokens": 650647298.0, "step": 212 }, { "epoch": 0.3121450815167613, "grad_norm": 0.7831886799826963, "learning_rate": 3.9903876919360207e-05, "loss": 0.8209, "num_tokens": 653422516.0, "step": 213 }, { "epoch": 0.3136105513830372, "grad_norm": 0.6510852093617102, "learning_rate": 3.990210671036527e-05, "loss": 0.8177, "num_tokens": 656293002.0, "step": 214 }, { "epoch": 0.31507602124931305, "grad_norm": 0.6425154337484317, "learning_rate": 3.99003203940536e-05, "loss": 0.8283, "num_tokens": 659414419.0, "step": 215 }, { "epoch": 0.31654149111558894, "grad_norm": 0.837178388828837, "learning_rate": 3.9898517972032456e-05, "loss": 0.8187, "num_tokens": 662466418.0, "step": 216 }, { "epoch": 0.3180069609818648, "grad_norm": 0.7419767469506745, "learning_rate": 3.989669944592355e-05, "loss": 0.8152, "num_tokens": 665332711.0, "step": 217 }, { "epoch": 0.3194724308481407, "grad_norm": 0.6899215374772198, "learning_rate": 3.989486481736308e-05, "loss": 0.8053, "num_tokens": 668191041.0, "step": 218 }, { "epoch": 0.3209379007144166, "grad_norm": 0.9102526563098543, "learning_rate": 3.9893014088001754e-05, "loss": 0.8002, "num_tokens": 671019312.0, "step": 219 }, { "epoch": 0.32240337058069246, "grad_norm": 0.7618911735458993, "learning_rate": 3.989114725950475e-05, "loss": 0.7975, "num_tokens": 674135552.0, "step": 220 }, { "epoch": 0.3238688404469683, "grad_norm": 0.8828199745356196, "learning_rate": 3.988926433355174e-05, "loss": 0.8024, "num_tokens": 677512820.0, "step": 221 }, { "epoch": 0.32533431031324417, "grad_norm": 0.5871783833268484, "learning_rate": 3.9887365311836865e-05, "loss": 0.8033, "num_tokens": 680449923.0, "step": 222 }, { "epoch": 0.32679978017952005, "grad_norm": 1.3220464396030602, "learning_rate": 3.9885450196068774e-05, "loss": 0.8136, "num_tokens": 683398145.0, "step": 223 }, { "epoch": 0.32826525004579593, "grad_norm": 0.8448212854927531, "learning_rate": 3.9883518987970564e-05, "loss": 0.8169, "num_tokens": 686301937.0, "step": 224 }, { "epoch": 0.3297307199120718, "grad_norm": 1.337087513449658, "learning_rate": 3.988157168927984e-05, "loss": 0.82, "num_tokens": 689608883.0, "step": 225 }, { "epoch": 0.3311961897783477, "grad_norm": 1.0891499449411013, "learning_rate": 3.9879608301748676e-05, "loss": 0.8196, "num_tokens": 692335164.0, "step": 226 }, { "epoch": 0.3326616596446236, "grad_norm": 1.1160541404724815, "learning_rate": 3.987762882714361e-05, "loss": 0.8112, "num_tokens": 695609166.0, "step": 227 }, { "epoch": 0.33412712951089946, "grad_norm": 1.0211109905827904, "learning_rate": 3.987563326724566e-05, "loss": 0.8064, "num_tokens": 698716359.0, "step": 228 }, { "epoch": 0.3355925993771753, "grad_norm": 0.8574404998176582, "learning_rate": 3.987362162385033e-05, "loss": 0.8264, "num_tokens": 701775841.0, "step": 229 }, { "epoch": 0.33705806924345116, "grad_norm": 0.964153884020382, "learning_rate": 3.987159389876758e-05, "loss": 0.8316, "num_tokens": 704772315.0, "step": 230 }, { "epoch": 0.33852353910972705, "grad_norm": 0.6003963708381315, "learning_rate": 3.986955009382185e-05, "loss": 0.8013, "num_tokens": 707787887.0, "step": 231 }, { "epoch": 0.3399890089760029, "grad_norm": 0.9262843585184378, "learning_rate": 3.9867490210852023e-05, "loss": 0.828, "num_tokens": 710898819.0, "step": 232 }, { "epoch": 0.3414544788422788, "grad_norm": 0.5636810797185664, "learning_rate": 3.9865414251711484e-05, "loss": 0.805, "num_tokens": 714001836.0, "step": 233 }, { "epoch": 0.3429199487085547, "grad_norm": 0.9796068494628346, "learning_rate": 3.986332221826806e-05, "loss": 0.8104, "num_tokens": 716937618.0, "step": 234 }, { "epoch": 0.34438541857483057, "grad_norm": 0.8212024547318494, "learning_rate": 3.986121411240404e-05, "loss": 0.8119, "num_tokens": 720053014.0, "step": 235 }, { "epoch": 0.34585088844110645, "grad_norm": 0.7990815267851159, "learning_rate": 3.9859089936016194e-05, "loss": 0.8088, "num_tokens": 723012456.0, "step": 236 }, { "epoch": 0.3473163583073823, "grad_norm": 0.932304241352169, "learning_rate": 3.985694969101573e-05, "loss": 0.8095, "num_tokens": 725810540.0, "step": 237 }, { "epoch": 0.34878182817365816, "grad_norm": 0.6575298005513099, "learning_rate": 3.9854793379328314e-05, "loss": 0.8253, "num_tokens": 728859351.0, "step": 238 }, { "epoch": 0.35024729803993404, "grad_norm": 0.9884944468854905, "learning_rate": 3.9852621002894084e-05, "loss": 0.8187, "num_tokens": 731985755.0, "step": 239 }, { "epoch": 0.3517127679062099, "grad_norm": 0.6499706972996803, "learning_rate": 3.9850432563667624e-05, "loss": 0.7985, "num_tokens": 734818087.0, "step": 240 }, { "epoch": 0.3531782377724858, "grad_norm": 0.9432189990448124, "learning_rate": 3.984822806361797e-05, "loss": 0.8061, "num_tokens": 737882416.0, "step": 241 }, { "epoch": 0.3546437076387617, "grad_norm": 0.7416121449625138, "learning_rate": 3.9846007504728593e-05, "loss": 0.7946, "num_tokens": 740872334.0, "step": 242 }, { "epoch": 0.35610917750503757, "grad_norm": 0.7958157545430461, "learning_rate": 3.9843770888997444e-05, "loss": 0.7975, "num_tokens": 744025224.0, "step": 243 }, { "epoch": 0.35757464737131345, "grad_norm": 0.8168175520110875, "learning_rate": 3.98415182184369e-05, "loss": 0.825, "num_tokens": 747083529.0, "step": 244 }, { "epoch": 0.35904011723758933, "grad_norm": 0.6333502231811122, "learning_rate": 3.983924949507379e-05, "loss": 0.8089, "num_tokens": 749927489.0, "step": 245 }, { "epoch": 0.36050558710386515, "grad_norm": 0.7167453216406626, "learning_rate": 3.9836964720949376e-05, "loss": 0.7852, "num_tokens": 753003946.0, "step": 246 }, { "epoch": 0.36197105697014104, "grad_norm": 0.6586488885722229, "learning_rate": 3.983466389811938e-05, "loss": 0.8022, "num_tokens": 756206664.0, "step": 247 }, { "epoch": 0.3634365268364169, "grad_norm": 0.6739528099479933, "learning_rate": 3.9832347028653956e-05, "loss": 0.7856, "num_tokens": 759305905.0, "step": 248 }, { "epoch": 0.3649019967026928, "grad_norm": 0.756683568899693, "learning_rate": 3.9830014114637685e-05, "loss": 0.7916, "num_tokens": 762317675.0, "step": 249 }, { "epoch": 0.3663674665689687, "grad_norm": 0.5972547793831667, "learning_rate": 3.98276651581696e-05, "loss": 0.8033, "num_tokens": 765233847.0, "step": 250 }, { "epoch": 0.36783293643524456, "grad_norm": 0.7968804539912702, "learning_rate": 3.982530016136316e-05, "loss": 0.8171, "num_tokens": 768110450.0, "step": 251 }, { "epoch": 0.36929840630152044, "grad_norm": 0.6402463575133113, "learning_rate": 3.982291912634625e-05, "loss": 0.8081, "num_tokens": 770993870.0, "step": 252 }, { "epoch": 0.3707638761677963, "grad_norm": 0.5854244804235733, "learning_rate": 3.9820522055261205e-05, "loss": 0.8046, "num_tokens": 773870954.0, "step": 253 }, { "epoch": 0.37222934603407215, "grad_norm": 0.8352382465839165, "learning_rate": 3.981810895026476e-05, "loss": 0.8152, "num_tokens": 776774139.0, "step": 254 }, { "epoch": 0.37369481590034803, "grad_norm": 0.5477078216784473, "learning_rate": 3.9815679813528107e-05, "loss": 0.7987, "num_tokens": 779627344.0, "step": 255 }, { "epoch": 0.3751602857666239, "grad_norm": 0.9948671635332358, "learning_rate": 3.9813234647236835e-05, "loss": 0.7874, "num_tokens": 782724891.0, "step": 256 }, { "epoch": 0.3766257556328998, "grad_norm": 0.7177257458743718, "learning_rate": 3.981077345359098e-05, "loss": 0.7984, "num_tokens": 785864127.0, "step": 257 }, { "epoch": 0.3780912254991757, "grad_norm": 1.0248541853552995, "learning_rate": 3.980829623480498e-05, "loss": 0.8065, "num_tokens": 788860443.0, "step": 258 }, { "epoch": 0.37955669536545156, "grad_norm": 0.8269486779942563, "learning_rate": 3.98058029931077e-05, "loss": 0.7892, "num_tokens": 791965513.0, "step": 259 }, { "epoch": 0.38102216523172744, "grad_norm": 0.8210289323787733, "learning_rate": 3.980329373074242e-05, "loss": 0.7909, "num_tokens": 795188967.0, "step": 260 }, { "epoch": 0.3824876350980033, "grad_norm": 0.7413417161587943, "learning_rate": 3.980076844996683e-05, "loss": 0.8017, "num_tokens": 798214311.0, "step": 261 }, { "epoch": 0.38395310496427915, "grad_norm": 0.6897198702390833, "learning_rate": 3.979822715305305e-05, "loss": 0.8075, "num_tokens": 801079842.0, "step": 262 }, { "epoch": 0.385418574830555, "grad_norm": 0.6951560105884733, "learning_rate": 3.9795669842287575e-05, "loss": 0.8122, "num_tokens": 804219719.0, "step": 263 }, { "epoch": 0.3868840446968309, "grad_norm": 0.6195012813677117, "learning_rate": 3.9793096519971354e-05, "loss": 0.8068, "num_tokens": 807280123.0, "step": 264 }, { "epoch": 0.3883495145631068, "grad_norm": 0.7587752207285182, "learning_rate": 3.97905071884197e-05, "loss": 0.8067, "num_tokens": 810041493.0, "step": 265 }, { "epoch": 0.38981498442938267, "grad_norm": 0.6222903416283405, "learning_rate": 3.9787901849962355e-05, "loss": 0.8065, "num_tokens": 812919486.0, "step": 266 }, { "epoch": 0.39128045429565855, "grad_norm": 0.6674186345466703, "learning_rate": 3.9785280506943465e-05, "loss": 0.7986, "num_tokens": 816094940.0, "step": 267 }, { "epoch": 0.39274592416193443, "grad_norm": 0.7153698861223848, "learning_rate": 3.978264316172156e-05, "loss": 0.8086, "num_tokens": 819154728.0, "step": 268 }, { "epoch": 0.3942113940282103, "grad_norm": 0.5121286382129685, "learning_rate": 3.977998981666957e-05, "loss": 0.8062, "num_tokens": 822342162.0, "step": 269 }, { "epoch": 0.3956768638944862, "grad_norm": 0.8154009432621302, "learning_rate": 3.977732047417485e-05, "loss": 0.8087, "num_tokens": 825271625.0, "step": 270 }, { "epoch": 0.397142333760762, "grad_norm": 0.6427132566769365, "learning_rate": 3.9774635136639106e-05, "loss": 0.7945, "num_tokens": 828216400.0, "step": 271 }, { "epoch": 0.3986078036270379, "grad_norm": 0.7439761634674241, "learning_rate": 3.9771933806478455e-05, "loss": 0.8059, "num_tokens": 831222721.0, "step": 272 }, { "epoch": 0.4000732734933138, "grad_norm": 0.7090772876336, "learning_rate": 3.976921648612341e-05, "loss": 0.7848, "num_tokens": 834382978.0, "step": 273 }, { "epoch": 0.40153874335958967, "grad_norm": 0.6604548780890596, "learning_rate": 3.976648317801886e-05, "loss": 0.7996, "num_tokens": 837453495.0, "step": 274 }, { "epoch": 0.40300421322586555, "grad_norm": 0.6988608505240668, "learning_rate": 3.976373388462409e-05, "loss": 0.8125, "num_tokens": 840495565.0, "step": 275 }, { "epoch": 0.40446968309214143, "grad_norm": 0.5309357857778855, "learning_rate": 3.9760968608412755e-05, "loss": 0.8153, "num_tokens": 843584708.0, "step": 276 }, { "epoch": 0.4059351529584173, "grad_norm": 0.7055946980845533, "learning_rate": 3.9758187351872895e-05, "loss": 0.8005, "num_tokens": 846486455.0, "step": 277 }, { "epoch": 0.4074006228246932, "grad_norm": 0.6784624675781239, "learning_rate": 3.9755390117506924e-05, "loss": 0.7907, "num_tokens": 849756701.0, "step": 278 }, { "epoch": 0.408866092690969, "grad_norm": 0.5924931852271991, "learning_rate": 3.975257690783166e-05, "loss": 0.7945, "num_tokens": 852723313.0, "step": 279 }, { "epoch": 0.4103315625572449, "grad_norm": 0.7632430943044748, "learning_rate": 3.9749747725378245e-05, "loss": 0.7974, "num_tokens": 855700614.0, "step": 280 }, { "epoch": 0.4117970324235208, "grad_norm": 0.5546325040716105, "learning_rate": 3.974690257269224e-05, "loss": 0.7984, "num_tokens": 858771918.0, "step": 281 }, { "epoch": 0.41326250228979666, "grad_norm": 0.8281035270624759, "learning_rate": 3.974404145233354e-05, "loss": 0.8052, "num_tokens": 861559560.0, "step": 282 }, { "epoch": 0.41472797215607254, "grad_norm": 0.570178531770508, "learning_rate": 3.974116436687643e-05, "loss": 0.7844, "num_tokens": 864562982.0, "step": 283 }, { "epoch": 0.4161934420223484, "grad_norm": 0.6470628944736802, "learning_rate": 3.9738271318909544e-05, "loss": 0.7975, "num_tokens": 867609919.0, "step": 284 }, { "epoch": 0.4176589118886243, "grad_norm": 0.7056418954685535, "learning_rate": 3.9735362311035894e-05, "loss": 0.8221, "num_tokens": 870665592.0, "step": 285 }, { "epoch": 0.4191243817549002, "grad_norm": 0.7723744780339872, "learning_rate": 3.973243734587284e-05, "loss": 0.7755, "num_tokens": 873694884.0, "step": 286 }, { "epoch": 0.420589851621176, "grad_norm": 0.6891367616734481, "learning_rate": 3.97294964260521e-05, "loss": 0.796, "num_tokens": 876730774.0, "step": 287 }, { "epoch": 0.4220553214874519, "grad_norm": 0.5892571573919205, "learning_rate": 3.972653955421975e-05, "loss": 0.8076, "num_tokens": 879715258.0, "step": 288 }, { "epoch": 0.4235207913537278, "grad_norm": 0.8101736963219275, "learning_rate": 3.972356673303622e-05, "loss": 0.8176, "num_tokens": 882831143.0, "step": 289 }, { "epoch": 0.42498626122000366, "grad_norm": 0.6682034633143411, "learning_rate": 3.972057796517628e-05, "loss": 0.8112, "num_tokens": 886199642.0, "step": 290 }, { "epoch": 0.42645173108627954, "grad_norm": 0.6904065303589332, "learning_rate": 3.9717573253329076e-05, "loss": 0.813, "num_tokens": 889091703.0, "step": 291 }, { "epoch": 0.4279172009525554, "grad_norm": 0.676075982662874, "learning_rate": 3.9714552600198056e-05, "loss": 0.8026, "num_tokens": 892457899.0, "step": 292 }, { "epoch": 0.4293826708188313, "grad_norm": 0.6386175341949273, "learning_rate": 3.971151600850105e-05, "loss": 0.7951, "num_tokens": 895354175.0, "step": 293 }, { "epoch": 0.4308481406851072, "grad_norm": 0.8040523475084294, "learning_rate": 3.9708463480970224e-05, "loss": 0.7881, "num_tokens": 898455626.0, "step": 294 }, { "epoch": 0.432313610551383, "grad_norm": 0.6037494861758412, "learning_rate": 3.970539502035205e-05, "loss": 0.8072, "num_tokens": 901682272.0, "step": 295 }, { "epoch": 0.4337790804176589, "grad_norm": 0.8445458036330848, "learning_rate": 3.970231062940736e-05, "loss": 0.8455, "num_tokens": 904622788.0, "step": 296 }, { "epoch": 0.4352445502839348, "grad_norm": 0.6079029180527582, "learning_rate": 3.969921031091133e-05, "loss": 0.7967, "num_tokens": 907679701.0, "step": 297 }, { "epoch": 0.43671002015021065, "grad_norm": 0.8638414962718668, "learning_rate": 3.969609406765345e-05, "loss": 0.7715, "num_tokens": 910720194.0, "step": 298 }, { "epoch": 0.43817549001648654, "grad_norm": 0.5879101715228864, "learning_rate": 3.9692961902437536e-05, "loss": 0.7968, "num_tokens": 913883318.0, "step": 299 }, { "epoch": 0.4396409598827624, "grad_norm": 1.1033230276301016, "learning_rate": 3.968981381808174e-05, "loss": 0.8025, "num_tokens": 916930317.0, "step": 300 }, { "epoch": 0.4411064297490383, "grad_norm": 0.7204023357354239, "learning_rate": 3.968664981741852e-05, "loss": 0.803, "num_tokens": 919860522.0, "step": 301 }, { "epoch": 0.4425718996153142, "grad_norm": 1.1140147070786401, "learning_rate": 3.968346990329469e-05, "loss": 0.7883, "num_tokens": 923091105.0, "step": 302 }, { "epoch": 0.44403736948159006, "grad_norm": 0.931350919822376, "learning_rate": 3.968027407857134e-05, "loss": 0.794, "num_tokens": 926028964.0, "step": 303 }, { "epoch": 0.4455028393478659, "grad_norm": 0.9984343179523539, "learning_rate": 3.967706234612391e-05, "loss": 0.7777, "num_tokens": 929348858.0, "step": 304 }, { "epoch": 0.44696830921414177, "grad_norm": 0.8807420148602723, "learning_rate": 3.967383470884213e-05, "loss": 0.8031, "num_tokens": 932222204.0, "step": 305 }, { "epoch": 0.44843377908041765, "grad_norm": 0.8750755663477989, "learning_rate": 3.967059116963004e-05, "loss": 0.8033, "num_tokens": 935364724.0, "step": 306 }, { "epoch": 0.44989924894669353, "grad_norm": 0.8628230525343615, "learning_rate": 3.9667331731406015e-05, "loss": 0.8133, "num_tokens": 938525625.0, "step": 307 }, { "epoch": 0.4513647188129694, "grad_norm": 0.7845540994685867, "learning_rate": 3.966405639710269e-05, "loss": 0.8156, "num_tokens": 941740823.0, "step": 308 }, { "epoch": 0.4528301886792453, "grad_norm": 0.8150532723202197, "learning_rate": 3.966076516966705e-05, "loss": 0.7963, "num_tokens": 944914711.0, "step": 309 }, { "epoch": 0.4542956585455212, "grad_norm": 0.6892761616796849, "learning_rate": 3.965745805206035e-05, "loss": 0.7926, "num_tokens": 948008105.0, "step": 310 }, { "epoch": 0.45576112841179706, "grad_norm": 0.7309392964250384, "learning_rate": 3.965413504725815e-05, "loss": 0.7914, "num_tokens": 951233502.0, "step": 311 }, { "epoch": 0.4572265982780729, "grad_norm": 0.7253252405163354, "learning_rate": 3.9650796158250304e-05, "loss": 0.7882, "num_tokens": 954103566.0, "step": 312 }, { "epoch": 0.45869206814434876, "grad_norm": 0.66944422228442, "learning_rate": 3.964744138804096e-05, "loss": 0.7825, "num_tokens": 957109159.0, "step": 313 }, { "epoch": 0.46015753801062464, "grad_norm": 0.569515057687289, "learning_rate": 3.964407073964854e-05, "loss": 0.7941, "num_tokens": 960136750.0, "step": 314 }, { "epoch": 0.4616230078769005, "grad_norm": 0.6938206879198919, "learning_rate": 3.964068421610579e-05, "loss": 0.8057, "num_tokens": 963127053.0, "step": 315 }, { "epoch": 0.4630884777431764, "grad_norm": 0.6512216286182886, "learning_rate": 3.9637281820459696e-05, "loss": 0.794, "num_tokens": 966385018.0, "step": 316 }, { "epoch": 0.4645539476094523, "grad_norm": 0.5956523162117932, "learning_rate": 3.963386355577155e-05, "loss": 0.7958, "num_tokens": 969223008.0, "step": 317 }, { "epoch": 0.46601941747572817, "grad_norm": 0.6460599725708294, "learning_rate": 3.9630429425116924e-05, "loss": 0.7853, "num_tokens": 972338208.0, "step": 318 }, { "epoch": 0.46748488734200405, "grad_norm": 0.6323153278934952, "learning_rate": 3.962697943158564e-05, "loss": 0.7834, "num_tokens": 975501698.0, "step": 319 }, { "epoch": 0.4689503572082799, "grad_norm": 0.6910925634491862, "learning_rate": 3.9623513578281814e-05, "loss": 0.7978, "num_tokens": 978725425.0, "step": 320 }, { "epoch": 0.47041582707455576, "grad_norm": 0.5623156008534718, "learning_rate": 3.9620031868323834e-05, "loss": 0.7916, "num_tokens": 982024542.0, "step": 321 }, { "epoch": 0.47188129694083164, "grad_norm": 0.5869379770798241, "learning_rate": 3.9616534304844355e-05, "loss": 0.7983, "num_tokens": 985153492.0, "step": 322 }, { "epoch": 0.4733467668071075, "grad_norm": 0.645479405742131, "learning_rate": 3.961302089099027e-05, "loss": 0.7844, "num_tokens": 988100099.0, "step": 323 }, { "epoch": 0.4748122366733834, "grad_norm": 0.6116473211266792, "learning_rate": 3.960949162992276e-05, "loss": 0.7909, "num_tokens": 991146505.0, "step": 324 }, { "epoch": 0.4762777065396593, "grad_norm": 0.753635641137463, "learning_rate": 3.960594652481727e-05, "loss": 0.7947, "num_tokens": 994303276.0, "step": 325 }, { "epoch": 0.47774317640593517, "grad_norm": 0.5061771930164656, "learning_rate": 3.960238557886347e-05, "loss": 0.7685, "num_tokens": 997379414.0, "step": 326 }, { "epoch": 0.47920864627221105, "grad_norm": 0.6105881092651655, "learning_rate": 3.959880879526531e-05, "loss": 0.7825, "num_tokens": 1000390205.0, "step": 327 }, { "epoch": 0.48067411613848693, "grad_norm": 0.5438321751943127, "learning_rate": 3.959521617724098e-05, "loss": 0.7822, "num_tokens": 1003438133.0, "step": 328 }, { "epoch": 0.48213958600476275, "grad_norm": 0.8333424366739319, "learning_rate": 3.9591607728022915e-05, "loss": 0.7938, "num_tokens": 1006560380.0, "step": 329 }, { "epoch": 0.48360505587103864, "grad_norm": 0.5779230897698857, "learning_rate": 3.95879834508578e-05, "loss": 0.7972, "num_tokens": 1009754494.0, "step": 330 }, { "epoch": 0.4850705257373145, "grad_norm": 0.6645147682746726, "learning_rate": 3.9584343349006554e-05, "loss": 0.7991, "num_tokens": 1012777513.0, "step": 331 }, { "epoch": 0.4865359956035904, "grad_norm": 0.6547200183154098, "learning_rate": 3.9580687425744336e-05, "loss": 0.7958, "num_tokens": 1015900012.0, "step": 332 }, { "epoch": 0.4880014654698663, "grad_norm": 0.725519819194977, "learning_rate": 3.957701568436054e-05, "loss": 0.7794, "num_tokens": 1018865976.0, "step": 333 }, { "epoch": 0.48946693533614216, "grad_norm": 0.5428382191286174, "learning_rate": 3.95733281281588e-05, "loss": 0.7831, "num_tokens": 1021912685.0, "step": 334 }, { "epoch": 0.49093240520241804, "grad_norm": 0.5866903093772561, "learning_rate": 3.956962476045698e-05, "loss": 0.7921, "num_tokens": 1024858000.0, "step": 335 }, { "epoch": 0.4923978750686939, "grad_norm": 0.7383254938022301, "learning_rate": 3.956590558458714e-05, "loss": 0.7817, "num_tokens": 1028067724.0, "step": 336 }, { "epoch": 0.49386334493496975, "grad_norm": 0.7228307517265506, "learning_rate": 3.956217060389561e-05, "loss": 0.809, "num_tokens": 1030949134.0, "step": 337 }, { "epoch": 0.49532881480124563, "grad_norm": 0.5758838905550404, "learning_rate": 3.955841982174292e-05, "loss": 0.8036, "num_tokens": 1034039321.0, "step": 338 }, { "epoch": 0.4967942846675215, "grad_norm": 0.6632232135488345, "learning_rate": 3.9554653241503785e-05, "loss": 0.7936, "num_tokens": 1037238595.0, "step": 339 }, { "epoch": 0.4982597545337974, "grad_norm": 0.6954498018354363, "learning_rate": 3.9550870866567186e-05, "loss": 0.7831, "num_tokens": 1040511123.0, "step": 340 }, { "epoch": 0.4997252244000733, "grad_norm": 0.6315183800034614, "learning_rate": 3.95470727003363e-05, "loss": 0.8094, "num_tokens": 1043604871.0, "step": 341 }, { "epoch": 0.5011906942663491, "grad_norm": 0.5996472827135996, "learning_rate": 3.954325874622848e-05, "loss": 0.7767, "num_tokens": 1046621373.0, "step": 342 }, { "epoch": 0.502656164132625, "grad_norm": 0.5474040962151834, "learning_rate": 3.953942900767533e-05, "loss": 0.8032, "num_tokens": 1049599585.0, "step": 343 }, { "epoch": 0.5041216339989009, "grad_norm": 0.5543290180624266, "learning_rate": 3.953558348812263e-05, "loss": 0.8225, "num_tokens": 1052567894.0, "step": 344 }, { "epoch": 0.5055871038651768, "grad_norm": 0.7581832677374636, "learning_rate": 3.953172219103036e-05, "loss": 0.7849, "num_tokens": 1055667078.0, "step": 345 }, { "epoch": 0.5070525737314526, "grad_norm": 0.5984245809336709, "learning_rate": 3.952784511987269e-05, "loss": 0.802, "num_tokens": 1058805802.0, "step": 346 }, { "epoch": 0.5085180435977286, "grad_norm": 0.5481121604127609, "learning_rate": 3.952395227813802e-05, "loss": 0.8092, "num_tokens": 1061690228.0, "step": 347 }, { "epoch": 0.5099835134640044, "grad_norm": 0.6257314219554806, "learning_rate": 3.9520043669328896e-05, "loss": 0.7891, "num_tokens": 1064713785.0, "step": 348 }, { "epoch": 0.5114489833302802, "grad_norm": 0.5963159753362327, "learning_rate": 3.951611929696206e-05, "loss": 0.8011, "num_tokens": 1067670764.0, "step": 349 }, { "epoch": 0.5129144531965562, "grad_norm": 0.6794277560003524, "learning_rate": 3.9512179164568446e-05, "loss": 0.7741, "num_tokens": 1070805078.0, "step": 350 }, { "epoch": 0.514379923062832, "grad_norm": 0.6359463570012431, "learning_rate": 3.950822327569318e-05, "loss": 0.7971, "num_tokens": 1073917951.0, "step": 351 }, { "epoch": 0.5158453929291079, "grad_norm": 0.6999400118469834, "learning_rate": 3.950425163389553e-05, "loss": 0.8135, "num_tokens": 1077041988.0, "step": 352 }, { "epoch": 0.5173108627953837, "grad_norm": 0.6467988992899758, "learning_rate": 3.950026424274897e-05, "loss": 0.8002, "num_tokens": 1080239292.0, "step": 353 }, { "epoch": 0.5187763326616597, "grad_norm": 0.4342948331930541, "learning_rate": 3.949626110584112e-05, "loss": 0.8039, "num_tokens": 1083360690.0, "step": 354 }, { "epoch": 0.5202418025279355, "grad_norm": 0.6917638123972273, "learning_rate": 3.9492242226773796e-05, "loss": 0.7702, "num_tokens": 1086576694.0, "step": 355 }, { "epoch": 0.5217072723942114, "grad_norm": 0.686982527796773, "learning_rate": 3.9488207609162946e-05, "loss": 0.7715, "num_tokens": 1089946956.0, "step": 356 }, { "epoch": 0.5231727422604873, "grad_norm": 0.5805771358591818, "learning_rate": 3.948415725663871e-05, "loss": 0.7941, "num_tokens": 1093097656.0, "step": 357 }, { "epoch": 0.5246382121267631, "grad_norm": 0.6947326498781151, "learning_rate": 3.948009117284536e-05, "loss": 0.7844, "num_tokens": 1095943615.0, "step": 358 }, { "epoch": 0.526103681993039, "grad_norm": 0.6461136086064133, "learning_rate": 3.9476009361441316e-05, "loss": 0.7932, "num_tokens": 1099018381.0, "step": 359 }, { "epoch": 0.5275691518593149, "grad_norm": 0.4945645826622396, "learning_rate": 3.947191182609919e-05, "loss": 0.7752, "num_tokens": 1102099212.0, "step": 360 }, { "epoch": 0.5290346217255908, "grad_norm": 0.9569932332248041, "learning_rate": 3.946779857050571e-05, "loss": 0.7959, "num_tokens": 1105067286.0, "step": 361 }, { "epoch": 0.5305000915918666, "grad_norm": 0.5557588276975498, "learning_rate": 3.946366959836174e-05, "loss": 0.7781, "num_tokens": 1107966322.0, "step": 362 }, { "epoch": 0.5319655614581426, "grad_norm": 0.773491420016794, "learning_rate": 3.9459524913382315e-05, "loss": 0.7671, "num_tokens": 1111109458.0, "step": 363 }, { "epoch": 0.5334310313244184, "grad_norm": 0.6832784929303869, "learning_rate": 3.945536451929657e-05, "loss": 0.782, "num_tokens": 1114146576.0, "step": 364 }, { "epoch": 0.5348965011906943, "grad_norm": 0.6103917935833321, "learning_rate": 3.9451188419847825e-05, "loss": 0.8011, "num_tokens": 1117297618.0, "step": 365 }, { "epoch": 0.5363619710569701, "grad_norm": 0.754558711716202, "learning_rate": 3.9446996618793475e-05, "loss": 0.8023, "num_tokens": 1120224816.0, "step": 366 }, { "epoch": 0.537827440923246, "grad_norm": 0.5819548538974069, "learning_rate": 3.944278911990509e-05, "loss": 0.7984, "num_tokens": 1123350152.0, "step": 367 }, { "epoch": 0.5392929107895219, "grad_norm": 0.7092411227189372, "learning_rate": 3.943856592696832e-05, "loss": 0.771, "num_tokens": 1126551904.0, "step": 368 }, { "epoch": 0.5407583806557977, "grad_norm": 0.584353224153154, "learning_rate": 3.943432704378297e-05, "loss": 0.7857, "num_tokens": 1129793804.0, "step": 369 }, { "epoch": 0.5422238505220737, "grad_norm": 0.6509116472948473, "learning_rate": 3.9430072474162956e-05, "loss": 0.8032, "num_tokens": 1132646681.0, "step": 370 }, { "epoch": 0.5436893203883495, "grad_norm": 0.658935817704477, "learning_rate": 3.9425802221936297e-05, "loss": 0.7915, "num_tokens": 1135669439.0, "step": 371 }, { "epoch": 0.5451547902546254, "grad_norm": 0.6326213942256899, "learning_rate": 3.942151629094513e-05, "loss": 0.8007, "num_tokens": 1138920242.0, "step": 372 }, { "epoch": 0.5466202601209013, "grad_norm": 0.5359650740188716, "learning_rate": 3.941721468504569e-05, "loss": 0.7814, "num_tokens": 1142055088.0, "step": 373 }, { "epoch": 0.5480857299871771, "grad_norm": 0.627386838556622, "learning_rate": 3.941289740810833e-05, "loss": 0.7864, "num_tokens": 1145189774.0, "step": 374 }, { "epoch": 0.549551199853453, "grad_norm": 0.597519572284839, "learning_rate": 3.940856446401749e-05, "loss": 0.7735, "num_tokens": 1147993618.0, "step": 375 }, { "epoch": 0.5510166697197288, "grad_norm": 0.7502438584293525, "learning_rate": 3.9404215856671714e-05, "loss": 0.793, "num_tokens": 1151068236.0, "step": 376 }, { "epoch": 0.5524821395860048, "grad_norm": 0.5907966795842524, "learning_rate": 3.9399851589983636e-05, "loss": 0.7858, "num_tokens": 1154095768.0, "step": 377 }, { "epoch": 0.5539476094522806, "grad_norm": 0.6486592223973918, "learning_rate": 3.939547166787997e-05, "loss": 0.7732, "num_tokens": 1157114624.0, "step": 378 }, { "epoch": 0.5554130793185565, "grad_norm": 0.5086127371666345, "learning_rate": 3.9391076094301544e-05, "loss": 0.7734, "num_tokens": 1160391302.0, "step": 379 }, { "epoch": 0.5568785491848324, "grad_norm": 0.7111022612740963, "learning_rate": 3.938666487320323e-05, "loss": 0.789, "num_tokens": 1163468792.0, "step": 380 }, { "epoch": 0.5583440190511083, "grad_norm": 0.6089368963135149, "learning_rate": 3.938223800855402e-05, "loss": 0.7846, "num_tokens": 1166565708.0, "step": 381 }, { "epoch": 0.5598094889173841, "grad_norm": 0.525040717736477, "learning_rate": 3.937779550433694e-05, "loss": 0.7641, "num_tokens": 1170031638.0, "step": 382 }, { "epoch": 0.56127495878366, "grad_norm": 0.6863441114088674, "learning_rate": 3.9373337364549115e-05, "loss": 0.7636, "num_tokens": 1173067839.0, "step": 383 }, { "epoch": 0.5627404286499359, "grad_norm": 0.5270062193185172, "learning_rate": 3.936886359320174e-05, "loss": 0.7802, "num_tokens": 1176040768.0, "step": 384 }, { "epoch": 0.5642058985162117, "grad_norm": 0.6740752203015982, "learning_rate": 3.9364374194320045e-05, "loss": 0.7902, "num_tokens": 1178909285.0, "step": 385 }, { "epoch": 0.5656713683824877, "grad_norm": 0.5466221291761898, "learning_rate": 3.9359869171943356e-05, "loss": 0.7744, "num_tokens": 1181817393.0, "step": 386 }, { "epoch": 0.5671368382487635, "grad_norm": 0.6160945556506481, "learning_rate": 3.935534853012504e-05, "loss": 0.7791, "num_tokens": 1184603430.0, "step": 387 }, { "epoch": 0.5686023081150394, "grad_norm": 0.6409266957731405, "learning_rate": 3.9350812272932504e-05, "loss": 0.7794, "num_tokens": 1187698443.0, "step": 388 }, { "epoch": 0.5700677779813152, "grad_norm": 0.6141934112720102, "learning_rate": 3.934626040444724e-05, "loss": 0.7885, "num_tokens": 1190909470.0, "step": 389 }, { "epoch": 0.5715332478475912, "grad_norm": 0.5316224934856943, "learning_rate": 3.934169292876475e-05, "loss": 0.7822, "num_tokens": 1193782360.0, "step": 390 }, { "epoch": 0.572998717713867, "grad_norm": 0.6855633623383222, "learning_rate": 3.9337109849994606e-05, "loss": 0.7819, "num_tokens": 1196698697.0, "step": 391 }, { "epoch": 0.5744641875801428, "grad_norm": 0.5043655130170671, "learning_rate": 3.933251117226039e-05, "loss": 0.7861, "num_tokens": 1199962102.0, "step": 392 }, { "epoch": 0.5759296574464188, "grad_norm": 0.7055760117776626, "learning_rate": 3.932789689969975e-05, "loss": 0.7836, "num_tokens": 1202749896.0, "step": 393 }, { "epoch": 0.5773951273126946, "grad_norm": 0.6052393958224018, "learning_rate": 3.932326703646435e-05, "loss": 0.783, "num_tokens": 1205844591.0, "step": 394 }, { "epoch": 0.5788605971789705, "grad_norm": 0.6709060336300964, "learning_rate": 3.9318621586719865e-05, "loss": 0.7868, "num_tokens": 1208875270.0, "step": 395 }, { "epoch": 0.5803260670452464, "grad_norm": 0.5504767987809124, "learning_rate": 3.931396055464603e-05, "loss": 0.7945, "num_tokens": 1211788947.0, "step": 396 }, { "epoch": 0.5817915369115223, "grad_norm": 0.6845434695696377, "learning_rate": 3.930928394443658e-05, "loss": 0.7871, "num_tokens": 1214838846.0, "step": 397 }, { "epoch": 0.5832570067777981, "grad_norm": 0.5786687718557336, "learning_rate": 3.9304591760299266e-05, "loss": 0.7889, "num_tokens": 1217622512.0, "step": 398 }, { "epoch": 0.584722476644074, "grad_norm": 0.6684765415797735, "learning_rate": 3.929988400645585e-05, "loss": 0.7703, "num_tokens": 1220437989.0, "step": 399 }, { "epoch": 0.5861879465103499, "grad_norm": 0.6565781443921442, "learning_rate": 3.929516068714211e-05, "loss": 0.7759, "num_tokens": 1223454041.0, "step": 400 }, { "epoch": 0.5876534163766257, "grad_norm": 0.5449202898670066, "learning_rate": 3.929042180660782e-05, "loss": 0.787, "num_tokens": 1226435353.0, "step": 401 }, { "epoch": 0.5891188862429017, "grad_norm": 0.6875512812145042, "learning_rate": 3.9285667369116764e-05, "loss": 0.7888, "num_tokens": 1229405928.0, "step": 402 }, { "epoch": 0.5905843561091775, "grad_norm": 0.4735567416076701, "learning_rate": 3.928089737894672e-05, "loss": 0.775, "num_tokens": 1232720332.0, "step": 403 }, { "epoch": 0.5920498259754534, "grad_norm": 0.735516944930375, "learning_rate": 3.9276111840389456e-05, "loss": 0.7752, "num_tokens": 1235824664.0, "step": 404 }, { "epoch": 0.5935152958417292, "grad_norm": 0.6645257659658121, "learning_rate": 3.927131075775074e-05, "loss": 0.805, "num_tokens": 1238529918.0, "step": 405 }, { "epoch": 0.5949807657080052, "grad_norm": 0.5638827920306675, "learning_rate": 3.9266494135350306e-05, "loss": 0.784, "num_tokens": 1241546540.0, "step": 406 }, { "epoch": 0.596446235574281, "grad_norm": 0.6112222243437431, "learning_rate": 3.9261661977521894e-05, "loss": 0.781, "num_tokens": 1244682810.0, "step": 407 }, { "epoch": 0.5979117054405568, "grad_norm": 0.6838035001907536, "learning_rate": 3.92568142886132e-05, "loss": 0.7665, "num_tokens": 1247620565.0, "step": 408 }, { "epoch": 0.5993771753068328, "grad_norm": 0.5170317822843736, "learning_rate": 3.925195107298592e-05, "loss": 0.7842, "num_tokens": 1250627433.0, "step": 409 }, { "epoch": 0.6008426451731086, "grad_norm": 0.6347012338565816, "learning_rate": 3.924707233501568e-05, "loss": 0.7931, "num_tokens": 1253693480.0, "step": 410 }, { "epoch": 0.6023081150393845, "grad_norm": 0.6736550040624822, "learning_rate": 3.9242178079092116e-05, "loss": 0.7742, "num_tokens": 1256774656.0, "step": 411 }, { "epoch": 0.6037735849056604, "grad_norm": 0.593308429622272, "learning_rate": 3.9237268309618796e-05, "loss": 0.7807, "num_tokens": 1259897203.0, "step": 412 }, { "epoch": 0.6052390547719363, "grad_norm": 0.6138921785316901, "learning_rate": 3.923234303101326e-05, "loss": 0.781, "num_tokens": 1263043492.0, "step": 413 }, { "epoch": 0.6067045246382121, "grad_norm": 0.5987593300522766, "learning_rate": 3.922740224770701e-05, "loss": 0.7732, "num_tokens": 1266154425.0, "step": 414 }, { "epoch": 0.608169994504488, "grad_norm": 0.5212218843700888, "learning_rate": 3.9222445964145466e-05, "loss": 0.7817, "num_tokens": 1269169061.0, "step": 415 }, { "epoch": 0.6096354643707639, "grad_norm": 0.5665301085024289, "learning_rate": 3.921747418478803e-05, "loss": 0.7866, "num_tokens": 1272417238.0, "step": 416 }, { "epoch": 0.6111009342370397, "grad_norm": 0.5137543210301353, "learning_rate": 3.921248691410802e-05, "loss": 0.776, "num_tokens": 1275474812.0, "step": 417 }, { "epoch": 0.6125664041033156, "grad_norm": 0.5527216864080352, "learning_rate": 3.920748415659272e-05, "loss": 0.7871, "num_tokens": 1278861034.0, "step": 418 }, { "epoch": 0.6140318739695915, "grad_norm": 0.6134234420326042, "learning_rate": 3.920246591674332e-05, "loss": 0.7845, "num_tokens": 1281795150.0, "step": 419 }, { "epoch": 0.6154973438358674, "grad_norm": 0.7442124436399065, "learning_rate": 3.9197432199074955e-05, "loss": 0.7883, "num_tokens": 1284921095.0, "step": 420 }, { "epoch": 0.6169628137021432, "grad_norm": 0.5380476473910203, "learning_rate": 3.9192383008116684e-05, "loss": 0.7809, "num_tokens": 1287873067.0, "step": 421 }, { "epoch": 0.6184282835684192, "grad_norm": 0.6606212619924012, "learning_rate": 3.9187318348411484e-05, "loss": 0.7732, "num_tokens": 1290760966.0, "step": 422 }, { "epoch": 0.619893753434695, "grad_norm": 0.592132378648681, "learning_rate": 3.9182238224516264e-05, "loss": 0.766, "num_tokens": 1293817398.0, "step": 423 }, { "epoch": 0.6213592233009708, "grad_norm": 0.8815843889587126, "learning_rate": 3.917714264100183e-05, "loss": 0.7811, "num_tokens": 1296831296.0, "step": 424 }, { "epoch": 0.6228246931672468, "grad_norm": 0.47232065460420103, "learning_rate": 3.9172031602452884e-05, "loss": 0.7817, "num_tokens": 1299973604.0, "step": 425 }, { "epoch": 0.6242901630335226, "grad_norm": 1.115093151183982, "learning_rate": 3.9166905113468086e-05, "loss": 0.7751, "num_tokens": 1303233366.0, "step": 426 }, { "epoch": 0.6257556328997985, "grad_norm": 0.7426681872407431, "learning_rate": 3.916176317865995e-05, "loss": 0.7888, "num_tokens": 1306379980.0, "step": 427 }, { "epoch": 0.6272211027660743, "grad_norm": 1.0384039770021642, "learning_rate": 3.9156605802654896e-05, "loss": 0.7799, "num_tokens": 1309404593.0, "step": 428 }, { "epoch": 0.6286865726323503, "grad_norm": 0.7670076455276811, "learning_rate": 3.9151432990093255e-05, "loss": 0.774, "num_tokens": 1312596883.0, "step": 429 }, { "epoch": 0.6301520424986261, "grad_norm": 0.9763673607777046, "learning_rate": 3.914624474562923e-05, "loss": 0.7653, "num_tokens": 1315917367.0, "step": 430 }, { "epoch": 0.631617512364902, "grad_norm": 0.7540824912315313, "learning_rate": 3.914104107393091e-05, "loss": 0.7566, "num_tokens": 1318837842.0, "step": 431 }, { "epoch": 0.6330829822311779, "grad_norm": 0.953332107944831, "learning_rate": 3.913582197968027e-05, "loss": 0.7644, "num_tokens": 1322279369.0, "step": 432 }, { "epoch": 0.6345484520974537, "grad_norm": 0.8161246900306962, "learning_rate": 3.9130587467573156e-05, "loss": 0.7815, "num_tokens": 1325342829.0, "step": 433 }, { "epoch": 0.6360139219637296, "grad_norm": 0.7848078108515719, "learning_rate": 3.9125337542319306e-05, "loss": 0.7904, "num_tokens": 1328280112.0, "step": 434 }, { "epoch": 0.6374793918300055, "grad_norm": 0.7229667163099274, "learning_rate": 3.9120072208642296e-05, "loss": 0.7868, "num_tokens": 1331197587.0, "step": 435 }, { "epoch": 0.6389448616962814, "grad_norm": 0.8101404125823519, "learning_rate": 3.911479147127958e-05, "loss": 0.7858, "num_tokens": 1334364662.0, "step": 436 }, { "epoch": 0.6404103315625572, "grad_norm": 0.621806279389199, "learning_rate": 3.9109495334982484e-05, "loss": 0.789, "num_tokens": 1337342389.0, "step": 437 }, { "epoch": 0.6418758014288332, "grad_norm": 0.7762252985115091, "learning_rate": 3.910418380451615e-05, "loss": 0.7683, "num_tokens": 1340543873.0, "step": 438 }, { "epoch": 0.643341271295109, "grad_norm": 0.6468013970310809, "learning_rate": 3.909885688465962e-05, "loss": 0.7954, "num_tokens": 1343548675.0, "step": 439 }, { "epoch": 0.6448067411613849, "grad_norm": 0.8112026541833866, "learning_rate": 3.909351458020575e-05, "loss": 0.7896, "num_tokens": 1346577580.0, "step": 440 }, { "epoch": 0.6462722110276607, "grad_norm": 0.7295288410932435, "learning_rate": 3.908815689596125e-05, "loss": 0.7651, "num_tokens": 1349560639.0, "step": 441 }, { "epoch": 0.6477376808939366, "grad_norm": 0.6982691689196974, "learning_rate": 3.9082783836746665e-05, "loss": 0.788, "num_tokens": 1352759086.0, "step": 442 }, { "epoch": 0.6492031507602125, "grad_norm": 0.8147723140051017, "learning_rate": 3.9077395407396365e-05, "loss": 0.7754, "num_tokens": 1355817913.0, "step": 443 }, { "epoch": 0.6506686206264883, "grad_norm": 0.6167307174187188, "learning_rate": 3.907199161275858e-05, "loss": 0.7621, "num_tokens": 1358828142.0, "step": 444 }, { "epoch": 0.6521340904927643, "grad_norm": 0.729215913207261, "learning_rate": 3.906657245769532e-05, "loss": 0.7753, "num_tokens": 1361889206.0, "step": 445 }, { "epoch": 0.6535995603590401, "grad_norm": 0.6522801125783623, "learning_rate": 3.9061137947082445e-05, "loss": 0.7879, "num_tokens": 1365138469.0, "step": 446 }, { "epoch": 0.655065030225316, "grad_norm": 0.7112828083812891, "learning_rate": 3.9055688085809635e-05, "loss": 0.7735, "num_tokens": 1368277984.0, "step": 447 }, { "epoch": 0.6565305000915919, "grad_norm": 0.5650826012003861, "learning_rate": 3.905022287878036e-05, "loss": 0.7747, "num_tokens": 1371434001.0, "step": 448 }, { "epoch": 0.6579959699578677, "grad_norm": 0.6451291680871525, "learning_rate": 3.9044742330911904e-05, "loss": 0.7715, "num_tokens": 1374392968.0, "step": 449 }, { "epoch": 0.6594614398241436, "grad_norm": 0.5762246494944382, "learning_rate": 3.9039246447135374e-05, "loss": 0.79, "num_tokens": 1377171680.0, "step": 450 }, { "epoch": 0.6609269096904195, "grad_norm": 0.6953658230451008, "learning_rate": 3.903373523239565e-05, "loss": 0.7936, "num_tokens": 1380071522.0, "step": 451 }, { "epoch": 0.6623923795566954, "grad_norm": 0.5468782132147643, "learning_rate": 3.902820869165141e-05, "loss": 0.7618, "num_tokens": 1383012986.0, "step": 452 }, { "epoch": 0.6638578494229712, "grad_norm": 0.8415941788046309, "learning_rate": 3.902266682987514e-05, "loss": 0.7966, "num_tokens": 1385910595.0, "step": 453 }, { "epoch": 0.6653233192892472, "grad_norm": 0.6087189391945046, "learning_rate": 3.9017109652053085e-05, "loss": 0.757, "num_tokens": 1389136825.0, "step": 454 }, { "epoch": 0.666788789155523, "grad_norm": 0.8174995843209238, "learning_rate": 3.9011537163185294e-05, "loss": 0.7779, "num_tokens": 1392384785.0, "step": 455 }, { "epoch": 0.6682542590217989, "grad_norm": 0.6976440952330858, "learning_rate": 3.9005949368285575e-05, "loss": 0.8005, "num_tokens": 1395371553.0, "step": 456 }, { "epoch": 0.6697197288880747, "grad_norm": 0.661072210064578, "learning_rate": 3.9000346272381516e-05, "loss": 0.7781, "num_tokens": 1398462633.0, "step": 457 }, { "epoch": 0.6711851987543506, "grad_norm": 0.6777063662822085, "learning_rate": 3.899472788051448e-05, "loss": 0.7852, "num_tokens": 1401596380.0, "step": 458 }, { "epoch": 0.6726506686206265, "grad_norm": 0.7089751388455204, "learning_rate": 3.898909419773956e-05, "loss": 0.7788, "num_tokens": 1404918933.0, "step": 459 }, { "epoch": 0.6741161384869023, "grad_norm": 0.580366557626494, "learning_rate": 3.898344522912565e-05, "loss": 0.7719, "num_tokens": 1407976306.0, "step": 460 }, { "epoch": 0.6755816083531783, "grad_norm": 0.7152891481812426, "learning_rate": 3.897778097975537e-05, "loss": 0.7836, "num_tokens": 1411177240.0, "step": 461 }, { "epoch": 0.6770470782194541, "grad_norm": 0.534737628124812, "learning_rate": 3.897210145472509e-05, "loss": 0.7817, "num_tokens": 1414218311.0, "step": 462 }, { "epoch": 0.67851254808573, "grad_norm": 0.6488712988579897, "learning_rate": 3.896640665914494e-05, "loss": 0.7755, "num_tokens": 1417136629.0, "step": 463 }, { "epoch": 0.6799780179520059, "grad_norm": 0.48440960634838226, "learning_rate": 3.896069659813878e-05, "loss": 0.7725, "num_tokens": 1420178133.0, "step": 464 }, { "epoch": 0.6814434878182818, "grad_norm": 0.7834086421903245, "learning_rate": 3.8954971276844184e-05, "loss": 0.7938, "num_tokens": 1423121554.0, "step": 465 }, { "epoch": 0.6829089576845576, "grad_norm": 0.599529069777397, "learning_rate": 3.89492307004125e-05, "loss": 0.7888, "num_tokens": 1426180217.0, "step": 466 }, { "epoch": 0.6843744275508334, "grad_norm": 0.6866984516490249, "learning_rate": 3.894347487400876e-05, "loss": 0.7763, "num_tokens": 1429299039.0, "step": 467 }, { "epoch": 0.6858398974171094, "grad_norm": 0.7533196608962499, "learning_rate": 3.893770380281175e-05, "loss": 0.7718, "num_tokens": 1432360004.0, "step": 468 }, { "epoch": 0.6873053672833852, "grad_norm": 0.5443359379346597, "learning_rate": 3.893191749201395e-05, "loss": 0.7578, "num_tokens": 1435359898.0, "step": 469 }, { "epoch": 0.6887708371496611, "grad_norm": 0.853703474916966, "learning_rate": 3.892611594682156e-05, "loss": 0.7773, "num_tokens": 1438209499.0, "step": 470 }, { "epoch": 0.690236307015937, "grad_norm": 0.6040732201550099, "learning_rate": 3.892029917245449e-05, "loss": 0.7915, "num_tokens": 1441072799.0, "step": 471 }, { "epoch": 0.6917017768822129, "grad_norm": 0.7784781059909279, "learning_rate": 3.891446717414635e-05, "loss": 0.7753, "num_tokens": 1444188254.0, "step": 472 }, { "epoch": 0.6931672467484887, "grad_norm": 0.634458856385789, "learning_rate": 3.8908619957144446e-05, "loss": 0.7839, "num_tokens": 1447221260.0, "step": 473 }, { "epoch": 0.6946327166147646, "grad_norm": 0.764673041289811, "learning_rate": 3.890275752670978e-05, "loss": 0.788, "num_tokens": 1450300624.0, "step": 474 }, { "epoch": 0.6960981864810405, "grad_norm": 0.6606186030664977, "learning_rate": 3.8896879888117026e-05, "loss": 0.7798, "num_tokens": 1453288079.0, "step": 475 }, { "epoch": 0.6975636563473163, "grad_norm": 0.6083189697546619, "learning_rate": 3.889098704665457e-05, "loss": 0.7699, "num_tokens": 1456315233.0, "step": 476 }, { "epoch": 0.6990291262135923, "grad_norm": 0.7083192251172743, "learning_rate": 3.8885079007624456e-05, "loss": 0.7735, "num_tokens": 1459527291.0, "step": 477 }, { "epoch": 0.7004945960798681, "grad_norm": 0.5412000913569363, "learning_rate": 3.8879155776342405e-05, "loss": 0.7885, "num_tokens": 1462729527.0, "step": 478 }, { "epoch": 0.701960065946144, "grad_norm": 0.9034052043802873, "learning_rate": 3.8873217358137814e-05, "loss": 0.7653, "num_tokens": 1465896073.0, "step": 479 }, { "epoch": 0.7034255358124198, "grad_norm": 0.6303286250366069, "learning_rate": 3.8867263758353746e-05, "loss": 0.7532, "num_tokens": 1469030392.0, "step": 480 }, { "epoch": 0.7048910056786958, "grad_norm": 0.7924394172209809, "learning_rate": 3.8861294982346905e-05, "loss": 0.7538, "num_tokens": 1472130503.0, "step": 481 }, { "epoch": 0.7063564755449716, "grad_norm": 0.602683271414693, "learning_rate": 3.885531103548768e-05, "loss": 0.768, "num_tokens": 1475017493.0, "step": 482 }, { "epoch": 0.7078219454112474, "grad_norm": 0.7463579993927153, "learning_rate": 3.884931192316007e-05, "loss": 0.7825, "num_tokens": 1478011774.0, "step": 483 }, { "epoch": 0.7092874152775234, "grad_norm": 0.5652187937745072, "learning_rate": 3.884329765076176e-05, "loss": 0.7591, "num_tokens": 1481019740.0, "step": 484 }, { "epoch": 0.7107528851437992, "grad_norm": 0.7411279672493576, "learning_rate": 3.8837268223704045e-05, "loss": 0.7722, "num_tokens": 1483932004.0, "step": 485 }, { "epoch": 0.7122183550100751, "grad_norm": 0.5838860770566946, "learning_rate": 3.883122364741188e-05, "loss": 0.7861, "num_tokens": 1486898777.0, "step": 486 }, { "epoch": 0.713683824876351, "grad_norm": 0.8460749934271568, "learning_rate": 3.8825163927323825e-05, "loss": 0.7775, "num_tokens": 1490122731.0, "step": 487 }, { "epoch": 0.7151492947426269, "grad_norm": 0.6077038252936527, "learning_rate": 3.881908906889208e-05, "loss": 0.7933, "num_tokens": 1493182135.0, "step": 488 }, { "epoch": 0.7166147646089027, "grad_norm": 0.7473797197512128, "learning_rate": 3.881299907758247e-05, "loss": 0.7685, "num_tokens": 1496024863.0, "step": 489 }, { "epoch": 0.7180802344751787, "grad_norm": 0.6096257828107025, "learning_rate": 3.880689395887443e-05, "loss": 0.7585, "num_tokens": 1499232607.0, "step": 490 }, { "epoch": 0.7195457043414545, "grad_norm": 0.7702386432638039, "learning_rate": 3.8800773718261e-05, "loss": 0.7686, "num_tokens": 1502277307.0, "step": 491 }, { "epoch": 0.7210111742077303, "grad_norm": 0.6071741645971063, "learning_rate": 3.879463836124882e-05, "loss": 0.8017, "num_tokens": 1505081067.0, "step": 492 }, { "epoch": 0.7224766440740062, "grad_norm": 0.833216656808556, "learning_rate": 3.878848789335817e-05, "loss": 0.7591, "num_tokens": 1508130314.0, "step": 493 }, { "epoch": 0.7239421139402821, "grad_norm": 0.7333919896146535, "learning_rate": 3.878232232012287e-05, "loss": 0.7818, "num_tokens": 1511194457.0, "step": 494 }, { "epoch": 0.725407583806558, "grad_norm": 0.7226789755837114, "learning_rate": 3.8776141647090375e-05, "loss": 0.7846, "num_tokens": 1514382240.0, "step": 495 }, { "epoch": 0.7268730536728338, "grad_norm": 0.7569312313476318, "learning_rate": 3.87699458798217e-05, "loss": 0.8025, "num_tokens": 1517380000.0, "step": 496 }, { "epoch": 0.7283385235391098, "grad_norm": 0.528979561273253, "learning_rate": 3.8763735023891464e-05, "loss": 0.7741, "num_tokens": 1520176959.0, "step": 497 }, { "epoch": 0.7298039934053856, "grad_norm": 0.8142373663026293, "learning_rate": 3.875750908488784e-05, "loss": 0.7628, "num_tokens": 1523340234.0, "step": 498 }, { "epoch": 0.7312694632716614, "grad_norm": 0.5947926582686632, "learning_rate": 3.875126806841258e-05, "loss": 0.7675, "num_tokens": 1526236003.0, "step": 499 }, { "epoch": 0.7327349331379374, "grad_norm": 0.7104407265737775, "learning_rate": 3.8745011980081e-05, "loss": 0.7724, "num_tokens": 1529336302.0, "step": 500 }, { "epoch": 0.7342004030042132, "grad_norm": 0.6697752916424362, "learning_rate": 3.873874082552199e-05, "loss": 0.7758, "num_tokens": 1532401849.0, "step": 501 }, { "epoch": 0.7356658728704891, "grad_norm": 0.7068147486297662, "learning_rate": 3.873245461037797e-05, "loss": 0.7964, "num_tokens": 1535510117.0, "step": 502 }, { "epoch": 0.737131342736765, "grad_norm": 0.6843236221096626, "learning_rate": 3.8726153340304926e-05, "loss": 0.7732, "num_tokens": 1538520247.0, "step": 503 }, { "epoch": 0.7385968126030409, "grad_norm": 0.6172541260094148, "learning_rate": 3.871983702097241e-05, "loss": 0.7662, "num_tokens": 1541583964.0, "step": 504 }, { "epoch": 0.7400622824693167, "grad_norm": 0.6765277604121307, "learning_rate": 3.8713505658063476e-05, "loss": 0.771, "num_tokens": 1544463505.0, "step": 505 }, { "epoch": 0.7415277523355926, "grad_norm": 0.5230724744281858, "learning_rate": 3.8707159257274734e-05, "loss": 0.7809, "num_tokens": 1547557309.0, "step": 506 }, { "epoch": 0.7429932222018685, "grad_norm": 0.6284111259896931, "learning_rate": 3.870079782431632e-05, "loss": 0.7791, "num_tokens": 1550458461.0, "step": 507 }, { "epoch": 0.7444586920681443, "grad_norm": 0.6094597467251823, "learning_rate": 3.869442136491191e-05, "loss": 0.7733, "num_tokens": 1553466783.0, "step": 508 }, { "epoch": 0.7459241619344202, "grad_norm": 0.6263105032387869, "learning_rate": 3.868802988479866e-05, "loss": 0.7565, "num_tokens": 1557008221.0, "step": 509 }, { "epoch": 0.7473896318006961, "grad_norm": 0.5884792669598419, "learning_rate": 3.8681623389727295e-05, "loss": 0.7713, "num_tokens": 1560029652.0, "step": 510 }, { "epoch": 0.748855101666972, "grad_norm": 0.5006096746404923, "learning_rate": 3.8675201885462e-05, "loss": 0.766, "num_tokens": 1563004556.0, "step": 511 }, { "epoch": 0.7503205715332478, "grad_norm": 0.7167339651060948, "learning_rate": 3.866876537778049e-05, "loss": 0.7824, "num_tokens": 1566005000.0, "step": 512 }, { "epoch": 0.7517860413995238, "grad_norm": 0.4953478599593932, "learning_rate": 3.8662313872473995e-05, "loss": 0.7686, "num_tokens": 1569059505.0, "step": 513 }, { "epoch": 0.7532515112657996, "grad_norm": 0.5031118002515562, "learning_rate": 3.8655847375347206e-05, "loss": 0.7757, "num_tokens": 1572217300.0, "step": 514 }, { "epoch": 0.7547169811320755, "grad_norm": 0.5426351733471624, "learning_rate": 3.864936589221831e-05, "loss": 0.7659, "num_tokens": 1575210357.0, "step": 515 }, { "epoch": 0.7561824509983514, "grad_norm": 0.6525469950913664, "learning_rate": 3.864286942891898e-05, "loss": 0.7663, "num_tokens": 1578123445.0, "step": 516 }, { "epoch": 0.7576479208646272, "grad_norm": 0.630243241204321, "learning_rate": 3.86363579912944e-05, "loss": 0.7809, "num_tokens": 1581090645.0, "step": 517 }, { "epoch": 0.7591133907309031, "grad_norm": 0.49214684456991553, "learning_rate": 3.862983158520316e-05, "loss": 0.7747, "num_tokens": 1584330857.0, "step": 518 }, { "epoch": 0.7605788605971789, "grad_norm": 0.5863879783395399, "learning_rate": 3.862329021651739e-05, "loss": 0.768, "num_tokens": 1587377799.0, "step": 519 }, { "epoch": 0.7620443304634549, "grad_norm": 0.6522482556156715, "learning_rate": 3.861673389112262e-05, "loss": 0.7742, "num_tokens": 1590573823.0, "step": 520 }, { "epoch": 0.7635098003297307, "grad_norm": 0.41971785510632237, "learning_rate": 3.8610162614917894e-05, "loss": 0.7707, "num_tokens": 1593652876.0, "step": 521 }, { "epoch": 0.7649752701960066, "grad_norm": 0.8835081541898636, "learning_rate": 3.860357639381566e-05, "loss": 0.7801, "num_tokens": 1596867208.0, "step": 522 }, { "epoch": 0.7664407400622825, "grad_norm": 0.5152010993737639, "learning_rate": 3.859697523374183e-05, "loss": 0.7809, "num_tokens": 1599825066.0, "step": 523 }, { "epoch": 0.7679062099285583, "grad_norm": 0.8714571453596084, "learning_rate": 3.859035914063577e-05, "loss": 0.7873, "num_tokens": 1602974774.0, "step": 524 }, { "epoch": 0.7693716797948342, "grad_norm": 0.5784098116916205, "learning_rate": 3.858372812045028e-05, "loss": 0.7771, "num_tokens": 1606035021.0, "step": 525 }, { "epoch": 0.77083714966111, "grad_norm": 0.8381661227962233, "learning_rate": 3.857708217915156e-05, "loss": 0.7522, "num_tokens": 1609266943.0, "step": 526 }, { "epoch": 0.772302619527386, "grad_norm": 0.5982072885783272, "learning_rate": 3.857042132271926e-05, "loss": 0.7529, "num_tokens": 1612830214.0, "step": 527 }, { "epoch": 0.7737680893936618, "grad_norm": 0.7758919332290647, "learning_rate": 3.8563745557146466e-05, "loss": 0.7943, "num_tokens": 1615457501.0, "step": 528 }, { "epoch": 0.7752335592599378, "grad_norm": 0.5661228425225252, "learning_rate": 3.8557054888439636e-05, "loss": 0.772, "num_tokens": 1618288030.0, "step": 529 }, { "epoch": 0.7766990291262136, "grad_norm": 0.7197739030423916, "learning_rate": 3.8550349322618685e-05, "loss": 0.7735, "num_tokens": 1621146668.0, "step": 530 }, { "epoch": 0.7781644989924895, "grad_norm": 0.660519962074317, "learning_rate": 3.8543628865716886e-05, "loss": 0.8066, "num_tokens": 1624154648.0, "step": 531 }, { "epoch": 0.7796299688587653, "grad_norm": 0.8140158300335816, "learning_rate": 3.8536893523780944e-05, "loss": 0.7739, "num_tokens": 1627411378.0, "step": 532 }, { "epoch": 0.7810954387250412, "grad_norm": 0.6517068414951401, "learning_rate": 3.853014330287093e-05, "loss": 0.7677, "num_tokens": 1630447325.0, "step": 533 }, { "epoch": 0.7825609085913171, "grad_norm": 0.7886290194470019, "learning_rate": 3.852337820906033e-05, "loss": 0.78, "num_tokens": 1633236728.0, "step": 534 }, { "epoch": 0.7840263784575929, "grad_norm": 0.6662349152469276, "learning_rate": 3.8516598248436e-05, "loss": 0.7866, "num_tokens": 1636053550.0, "step": 535 }, { "epoch": 0.7854918483238689, "grad_norm": 0.6597644024068506, "learning_rate": 3.850980342709816e-05, "loss": 0.7674, "num_tokens": 1639185387.0, "step": 536 }, { "epoch": 0.7869573181901447, "grad_norm": 0.6628934212702776, "learning_rate": 3.8502993751160406e-05, "loss": 0.7733, "num_tokens": 1642324692.0, "step": 537 }, { "epoch": 0.7884227880564206, "grad_norm": 0.5531629243895193, "learning_rate": 3.8496169226749725e-05, "loss": 0.7857, "num_tokens": 1645256217.0, "step": 538 }, { "epoch": 0.7898882579226965, "grad_norm": 0.7072817019038333, "learning_rate": 3.8489329860006426e-05, "loss": 0.7546, "num_tokens": 1648541265.0, "step": 539 }, { "epoch": 0.7913537277889724, "grad_norm": 0.5655990813132724, "learning_rate": 3.848247565708419e-05, "loss": 0.7524, "num_tokens": 1651817564.0, "step": 540 }, { "epoch": 0.7928191976552482, "grad_norm": 0.518083058589993, "learning_rate": 3.8475606624150055e-05, "loss": 0.7666, "num_tokens": 1654699908.0, "step": 541 }, { "epoch": 0.794284667521524, "grad_norm": 0.6114597863287824, "learning_rate": 3.8468722767384386e-05, "loss": 0.7505, "num_tokens": 1657639430.0, "step": 542 }, { "epoch": 0.7957501373878, "grad_norm": 0.5686664510550941, "learning_rate": 3.84618240929809e-05, "loss": 0.781, "num_tokens": 1660594851.0, "step": 543 }, { "epoch": 0.7972156072540758, "grad_norm": 0.6263953588269895, "learning_rate": 3.8454910607146634e-05, "loss": 0.7592, "num_tokens": 1663687166.0, "step": 544 }, { "epoch": 0.7986810771203517, "grad_norm": 0.593559663144848, "learning_rate": 3.844798231610196e-05, "loss": 0.7649, "num_tokens": 1666804213.0, "step": 545 }, { "epoch": 0.8001465469866276, "grad_norm": 0.6393386034179077, "learning_rate": 3.844103922608057e-05, "loss": 0.7995, "num_tokens": 1669948659.0, "step": 546 }, { "epoch": 0.8016120168529035, "grad_norm": 0.5159120808173705, "learning_rate": 3.843408134332946e-05, "loss": 0.7532, "num_tokens": 1672947803.0, "step": 547 }, { "epoch": 0.8030774867191793, "grad_norm": 0.859704414753804, "learning_rate": 3.842710867410895e-05, "loss": 0.7712, "num_tokens": 1675898262.0, "step": 548 }, { "epoch": 0.8045429565854552, "grad_norm": 0.5255441946988123, "learning_rate": 3.842012122469266e-05, "loss": 0.7569, "num_tokens": 1678870562.0, "step": 549 }, { "epoch": 0.8060084264517311, "grad_norm": 0.6058251553544091, "learning_rate": 3.841311900136751e-05, "loss": 0.7463, "num_tokens": 1681961893.0, "step": 550 }, { "epoch": 0.8074738963180069, "grad_norm": 0.7387923965593198, "learning_rate": 3.84061020104337e-05, "loss": 0.7774, "num_tokens": 1684915961.0, "step": 551 }, { "epoch": 0.8089393661842829, "grad_norm": 0.508863830599595, "learning_rate": 3.839907025820474e-05, "loss": 0.7832, "num_tokens": 1687823301.0, "step": 552 }, { "epoch": 0.8104048360505587, "grad_norm": 0.5867690563669935, "learning_rate": 3.839202375100739e-05, "loss": 0.7934, "num_tokens": 1690689894.0, "step": 553 }, { "epoch": 0.8118703059168346, "grad_norm": 0.6860803174324955, "learning_rate": 3.8384962495181724e-05, "loss": 0.7835, "num_tokens": 1693640395.0, "step": 554 }, { "epoch": 0.8133357757831104, "grad_norm": 0.5034383439781328, "learning_rate": 3.8377886497081054e-05, "loss": 0.7659, "num_tokens": 1696721998.0, "step": 555 }, { "epoch": 0.8148012456493864, "grad_norm": 0.5723866443275422, "learning_rate": 3.8370795763071974e-05, "loss": 0.7715, "num_tokens": 1699940440.0, "step": 556 }, { "epoch": 0.8162667155156622, "grad_norm": 0.7362265331009978, "learning_rate": 3.836369029953432e-05, "loss": 0.7753, "num_tokens": 1703301089.0, "step": 557 }, { "epoch": 0.817732185381938, "grad_norm": 0.5840870466351289, "learning_rate": 3.835657011286121e-05, "loss": 0.7538, "num_tokens": 1706454921.0, "step": 558 }, { "epoch": 0.819197655248214, "grad_norm": 0.5685976903083878, "learning_rate": 3.834943520945897e-05, "loss": 0.7741, "num_tokens": 1709625437.0, "step": 559 }, { "epoch": 0.8206631251144898, "grad_norm": 0.8711292180046611, "learning_rate": 3.83422855957472e-05, "loss": 0.7546, "num_tokens": 1712752714.0, "step": 560 }, { "epoch": 0.8221285949807657, "grad_norm": 0.513888958021349, "learning_rate": 3.833512127815873e-05, "loss": 0.7712, "num_tokens": 1715832151.0, "step": 561 }, { "epoch": 0.8235940648470416, "grad_norm": 1.123361662531118, "learning_rate": 3.83279422631396e-05, "loss": 0.75, "num_tokens": 1718946506.0, "step": 562 }, { "epoch": 0.8250595347133175, "grad_norm": 0.9046599041199087, "learning_rate": 3.832074855714909e-05, "loss": 0.789, "num_tokens": 1722009749.0, "step": 563 }, { "epoch": 0.8265250045795933, "grad_norm": 1.0516863082542982, "learning_rate": 3.83135401666597e-05, "loss": 0.7683, "num_tokens": 1724900146.0, "step": 564 }, { "epoch": 0.8279904744458693, "grad_norm": 0.9850461210213872, "learning_rate": 3.8306317098157135e-05, "loss": 0.763, "num_tokens": 1728198817.0, "step": 565 }, { "epoch": 0.8294559443121451, "grad_norm": 0.848996674112302, "learning_rate": 3.829907935814031e-05, "loss": 0.7533, "num_tokens": 1731133358.0, "step": 566 }, { "epoch": 0.8309214141784209, "grad_norm": 0.7782184320669764, "learning_rate": 3.829182695312134e-05, "loss": 0.7608, "num_tokens": 1734403770.0, "step": 567 }, { "epoch": 0.8323868840446969, "grad_norm": 0.7828890158174929, "learning_rate": 3.828455988962553e-05, "loss": 0.7819, "num_tokens": 1737385413.0, "step": 568 }, { "epoch": 0.8338523539109727, "grad_norm": 0.7079152085683248, "learning_rate": 3.827727817419138e-05, "loss": 0.7766, "num_tokens": 1740556447.0, "step": 569 }, { "epoch": 0.8353178237772486, "grad_norm": 0.7639860590735518, "learning_rate": 3.8269981813370576e-05, "loss": 0.7727, "num_tokens": 1743547906.0, "step": 570 }, { "epoch": 0.8367832936435244, "grad_norm": 0.6579434434211288, "learning_rate": 3.8262670813727964e-05, "loss": 0.7725, "num_tokens": 1746704671.0, "step": 571 }, { "epoch": 0.8382487635098004, "grad_norm": 0.8593002193880989, "learning_rate": 3.825534518184159e-05, "loss": 0.7614, "num_tokens": 1749743059.0, "step": 572 }, { "epoch": 0.8397142333760762, "grad_norm": 0.6201447637005403, "learning_rate": 3.824800492430264e-05, "loss": 0.7417, "num_tokens": 1753037690.0, "step": 573 }, { "epoch": 0.841179703242352, "grad_norm": 0.7348326542706543, "learning_rate": 3.824065004771547e-05, "loss": 0.7621, "num_tokens": 1755987574.0, "step": 574 }, { "epoch": 0.842645173108628, "grad_norm": 0.5753543623117091, "learning_rate": 3.823328055869759e-05, "loss": 0.7687, "num_tokens": 1759155412.0, "step": 575 }, { "epoch": 0.8441106429749038, "grad_norm": 0.867551702155227, "learning_rate": 3.8225896463879645e-05, "loss": 0.7711, "num_tokens": 1762221945.0, "step": 576 }, { "epoch": 0.8455761128411797, "grad_norm": 0.6886997315319308, "learning_rate": 3.821849776990544e-05, "loss": 0.7593, "num_tokens": 1765184272.0, "step": 577 }, { "epoch": 0.8470415827074556, "grad_norm": 0.7583845133299721, "learning_rate": 3.8211084483431905e-05, "loss": 0.7552, "num_tokens": 1768176944.0, "step": 578 }, { "epoch": 0.8485070525737315, "grad_norm": 0.6706691330864654, "learning_rate": 3.820365661112911e-05, "loss": 0.7614, "num_tokens": 1771268843.0, "step": 579 }, { "epoch": 0.8499725224400073, "grad_norm": 0.8022246493600097, "learning_rate": 3.819621415968022e-05, "loss": 0.7765, "num_tokens": 1774237204.0, "step": 580 }, { "epoch": 0.8514379923062833, "grad_norm": 0.686071693150319, "learning_rate": 3.8188757135781555e-05, "loss": 0.7601, "num_tokens": 1777192729.0, "step": 581 }, { "epoch": 0.8529034621725591, "grad_norm": 0.7164221100816927, "learning_rate": 3.8181285546142516e-05, "loss": 0.7696, "num_tokens": 1780591122.0, "step": 582 }, { "epoch": 0.8543689320388349, "grad_norm": 0.6258251071096054, "learning_rate": 3.817379939748564e-05, "loss": 0.7793, "num_tokens": 1783720426.0, "step": 583 }, { "epoch": 0.8558344019051108, "grad_norm": 0.6613147576084321, "learning_rate": 3.8166298696546535e-05, "loss": 0.7492, "num_tokens": 1786456272.0, "step": 584 }, { "epoch": 0.8572998717713867, "grad_norm": 0.5617988990411139, "learning_rate": 3.815878345007391e-05, "loss": 0.7699, "num_tokens": 1789577159.0, "step": 585 }, { "epoch": 0.8587653416376626, "grad_norm": 0.7354277155930304, "learning_rate": 3.815125366482957e-05, "loss": 0.7583, "num_tokens": 1792698276.0, "step": 586 }, { "epoch": 0.8602308115039384, "grad_norm": 0.6010858642506439, "learning_rate": 3.8143709347588396e-05, "loss": 0.755, "num_tokens": 1795728576.0, "step": 587 }, { "epoch": 0.8616962813702144, "grad_norm": 0.6919995060659836, "learning_rate": 3.8136150505138336e-05, "loss": 0.786, "num_tokens": 1798751032.0, "step": 588 }, { "epoch": 0.8631617512364902, "grad_norm": 0.6231253593139175, "learning_rate": 3.8128577144280424e-05, "loss": 0.7697, "num_tokens": 1801685360.0, "step": 589 }, { "epoch": 0.864627221102766, "grad_norm": 0.5251147821580963, "learning_rate": 3.812098927182874e-05, "loss": 0.7834, "num_tokens": 1804558735.0, "step": 590 }, { "epoch": 0.866092690969042, "grad_norm": 0.5855076540161251, "learning_rate": 3.811338689461044e-05, "loss": 0.7675, "num_tokens": 1807850994.0, "step": 591 }, { "epoch": 0.8675581608353178, "grad_norm": 0.6121148974997432, "learning_rate": 3.8105770019465706e-05, "loss": 0.7601, "num_tokens": 1810969593.0, "step": 592 }, { "epoch": 0.8690236307015937, "grad_norm": 0.6031889914215367, "learning_rate": 3.8098138653247785e-05, "loss": 0.7798, "num_tokens": 1814052780.0, "step": 593 }, { "epoch": 0.8704891005678695, "grad_norm": 0.558087348577981, "learning_rate": 3.8090492802822946e-05, "loss": 0.7539, "num_tokens": 1817391223.0, "step": 594 }, { "epoch": 0.8719545704341455, "grad_norm": 0.5549099055701825, "learning_rate": 3.8082832475070505e-05, "loss": 0.7763, "num_tokens": 1820417584.0, "step": 595 }, { "epoch": 0.8734200403004213, "grad_norm": 0.6208111128014616, "learning_rate": 3.80751576768828e-05, "loss": 0.785, "num_tokens": 1823515514.0, "step": 596 }, { "epoch": 0.8748855101666972, "grad_norm": 0.4976925243765284, "learning_rate": 3.806746841516518e-05, "loss": 0.7591, "num_tokens": 1826899894.0, "step": 597 }, { "epoch": 0.8763509800329731, "grad_norm": 0.7321468338160054, "learning_rate": 3.805976469683602e-05, "loss": 0.7824, "num_tokens": 1829798308.0, "step": 598 }, { "epoch": 0.8778164498992489, "grad_norm": 0.47468106625610196, "learning_rate": 3.805204652882669e-05, "loss": 0.77, "num_tokens": 1832599869.0, "step": 599 }, { "epoch": 0.8792819197655248, "grad_norm": 0.589599283303487, "learning_rate": 3.804431391808157e-05, "loss": 0.7773, "num_tokens": 1835854114.0, "step": 600 }, { "epoch": 0.8807473896318007, "grad_norm": 0.5939112988176937, "learning_rate": 3.8036566871558035e-05, "loss": 0.7534, "num_tokens": 1839016372.0, "step": 601 }, { "epoch": 0.8822128594980766, "grad_norm": 0.7221522422621756, "learning_rate": 3.802880539622644e-05, "loss": 0.7578, "num_tokens": 1841980827.0, "step": 602 }, { "epoch": 0.8836783293643524, "grad_norm": 0.5087678196844709, "learning_rate": 3.802102949907012e-05, "loss": 0.7538, "num_tokens": 1845008150.0, "step": 603 }, { "epoch": 0.8851437992306284, "grad_norm": 0.7152874057852808, "learning_rate": 3.801323918708541e-05, "loss": 0.7638, "num_tokens": 1847736990.0, "step": 604 }, { "epoch": 0.8866092690969042, "grad_norm": 0.5427245440311039, "learning_rate": 3.800543446728159e-05, "loss": 0.7684, "num_tokens": 1850914429.0, "step": 605 }, { "epoch": 0.8880747389631801, "grad_norm": 0.7468173045710687, "learning_rate": 3.799761534668091e-05, "loss": 0.7644, "num_tokens": 1853981556.0, "step": 606 }, { "epoch": 0.889540208829456, "grad_norm": 0.6290630816339953, "learning_rate": 3.798978183231858e-05, "loss": 0.7542, "num_tokens": 1856959268.0, "step": 607 }, { "epoch": 0.8910056786957318, "grad_norm": 0.5729155712992845, "learning_rate": 3.7981933931242754e-05, "loss": 0.7735, "num_tokens": 1859919603.0, "step": 608 }, { "epoch": 0.8924711485620077, "grad_norm": 0.6710013158198507, "learning_rate": 3.797407165051455e-05, "loss": 0.7634, "num_tokens": 1863028848.0, "step": 609 }, { "epoch": 0.8939366184282835, "grad_norm": 0.654982737002341, "learning_rate": 3.796619499720799e-05, "loss": 0.7688, "num_tokens": 1866030242.0, "step": 610 }, { "epoch": 0.8954020882945595, "grad_norm": 0.486423032708723, "learning_rate": 3.795830397841007e-05, "loss": 0.761, "num_tokens": 1869043166.0, "step": 611 }, { "epoch": 0.8968675581608353, "grad_norm": 0.638014776803668, "learning_rate": 3.795039860122066e-05, "loss": 0.7688, "num_tokens": 1872129976.0, "step": 612 }, { "epoch": 0.8983330280271112, "grad_norm": 0.7837014284517078, "learning_rate": 3.7942478872752606e-05, "loss": 0.7694, "num_tokens": 1875247846.0, "step": 613 }, { "epoch": 0.8997984978933871, "grad_norm": 0.4713128952747065, "learning_rate": 3.7934544800131616e-05, "loss": 0.7573, "num_tokens": 1878495853.0, "step": 614 }, { "epoch": 0.9012639677596629, "grad_norm": 0.6479293136023899, "learning_rate": 3.792659639049634e-05, "loss": 0.7692, "num_tokens": 1881595328.0, "step": 615 }, { "epoch": 0.9027294376259388, "grad_norm": 0.5824132333654548, "learning_rate": 3.79186336509983e-05, "loss": 0.7742, "num_tokens": 1884535469.0, "step": 616 }, { "epoch": 0.9041949074922147, "grad_norm": 0.6436379597988445, "learning_rate": 3.7910656588801945e-05, "loss": 0.7686, "num_tokens": 1887531076.0, "step": 617 }, { "epoch": 0.9056603773584906, "grad_norm": 0.5656678989177333, "learning_rate": 3.7902665211084564e-05, "loss": 0.7419, "num_tokens": 1890365326.0, "step": 618 }, { "epoch": 0.9071258472247664, "grad_norm": 0.5674126326743792, "learning_rate": 3.789465952503638e-05, "loss": 0.7436, "num_tokens": 1893640789.0, "step": 619 }, { "epoch": 0.9085913170910423, "grad_norm": 0.7276819727120908, "learning_rate": 3.7886639537860444e-05, "loss": 0.7546, "num_tokens": 1896828926.0, "step": 620 }, { "epoch": 0.9100567869573182, "grad_norm": 0.4572270931522491, "learning_rate": 3.78786052567727e-05, "loss": 0.7852, "num_tokens": 1899616146.0, "step": 621 }, { "epoch": 0.9115222568235941, "grad_norm": 0.6660650488574197, "learning_rate": 3.7870556689001945e-05, "loss": 0.7516, "num_tokens": 1902694705.0, "step": 622 }, { "epoch": 0.9129877266898699, "grad_norm": 0.6345765146342996, "learning_rate": 3.7862493841789826e-05, "loss": 0.7715, "num_tokens": 1905706613.0, "step": 623 }, { "epoch": 0.9144531965561458, "grad_norm": 0.4783344256116178, "learning_rate": 3.785441672239085e-05, "loss": 0.7779, "num_tokens": 1908570121.0, "step": 624 }, { "epoch": 0.9159186664224217, "grad_norm": 0.686450956164066, "learning_rate": 3.784632533807235e-05, "loss": 0.7729, "num_tokens": 1911550249.0, "step": 625 }, { "epoch": 0.9173841362886975, "grad_norm": 0.6899407427286569, "learning_rate": 3.783821969611451e-05, "loss": 0.7472, "num_tokens": 1914636649.0, "step": 626 }, { "epoch": 0.9188496061549735, "grad_norm": 0.4934670114726233, "learning_rate": 3.783009980381034e-05, "loss": 0.7747, "num_tokens": 1917739554.0, "step": 627 }, { "epoch": 0.9203150760212493, "grad_norm": 0.8639122391165722, "learning_rate": 3.782196566846565e-05, "loss": 0.7715, "num_tokens": 1920751438.0, "step": 628 }, { "epoch": 0.9217805458875252, "grad_norm": 0.5585272785901677, "learning_rate": 3.7813817297399096e-05, "loss": 0.7525, "num_tokens": 1924023122.0, "step": 629 }, { "epoch": 0.923246015753801, "grad_norm": 0.9341746802404657, "learning_rate": 3.780565469794212e-05, "loss": 0.7738, "num_tokens": 1927229317.0, "step": 630 }, { "epoch": 0.924711485620077, "grad_norm": 0.6941226597369331, "learning_rate": 3.779747787743897e-05, "loss": 0.7849, "num_tokens": 1930194495.0, "step": 631 }, { "epoch": 0.9261769554863528, "grad_norm": 0.8787799800712817, "learning_rate": 3.77892868432467e-05, "loss": 0.7719, "num_tokens": 1933226447.0, "step": 632 }, { "epoch": 0.9276424253526286, "grad_norm": 0.6634970779532314, "learning_rate": 3.7781081602735145e-05, "loss": 0.759, "num_tokens": 1936221228.0, "step": 633 }, { "epoch": 0.9291078952189046, "grad_norm": 0.887136527526139, "learning_rate": 3.777286216328692e-05, "loss": 0.781, "num_tokens": 1939211467.0, "step": 634 }, { "epoch": 0.9305733650851804, "grad_norm": 0.6418875797886834, "learning_rate": 3.776462853229741e-05, "loss": 0.7721, "num_tokens": 1942214597.0, "step": 635 }, { "epoch": 0.9320388349514563, "grad_norm": 0.7994318117480244, "learning_rate": 3.7756380717174796e-05, "loss": 0.7591, "num_tokens": 1945156758.0, "step": 636 }, { "epoch": 0.9335043048177322, "grad_norm": 0.685468606703183, "learning_rate": 3.774811872533998e-05, "loss": 0.7615, "num_tokens": 1948231699.0, "step": 637 }, { "epoch": 0.9349697746840081, "grad_norm": 0.7091267136673098, "learning_rate": 3.7739842564226667e-05, "loss": 0.756, "num_tokens": 1951320148.0, "step": 638 }, { "epoch": 0.9364352445502839, "grad_norm": 0.6395586406376436, "learning_rate": 3.773155224128126e-05, "loss": 0.7481, "num_tokens": 1954425861.0, "step": 639 }, { "epoch": 0.9379007144165598, "grad_norm": 0.6514377068482021, "learning_rate": 3.772324776396294e-05, "loss": 0.765, "num_tokens": 1957327430.0, "step": 640 }, { "epoch": 0.9393661842828357, "grad_norm": 0.604915929784694, "learning_rate": 3.771492913974362e-05, "loss": 0.766, "num_tokens": 1960281867.0, "step": 641 }, { "epoch": 0.9408316541491115, "grad_norm": 0.5561202514864331, "learning_rate": 3.7706596376107916e-05, "loss": 0.7326, "num_tokens": 1963542347.0, "step": 642 }, { "epoch": 0.9422971240153875, "grad_norm": 0.6524460318326223, "learning_rate": 3.769824948055319e-05, "loss": 0.7667, "num_tokens": 1966572145.0, "step": 643 }, { "epoch": 0.9437625938816633, "grad_norm": 0.5037735999069346, "learning_rate": 3.768988846058953e-05, "loss": 0.7592, "num_tokens": 1969679554.0, "step": 644 }, { "epoch": 0.9452280637479392, "grad_norm": 0.5786731912448202, "learning_rate": 3.768151332373969e-05, "loss": 0.7491, "num_tokens": 1972803651.0, "step": 645 }, { "epoch": 0.946693533614215, "grad_norm": 0.523808608927916, "learning_rate": 3.767312407753917e-05, "loss": 0.7738, "num_tokens": 1975809709.0, "step": 646 }, { "epoch": 0.948159003480491, "grad_norm": 0.5995120358534207, "learning_rate": 3.766472072953613e-05, "loss": 0.7642, "num_tokens": 1978728863.0, "step": 647 }, { "epoch": 0.9496244733467668, "grad_norm": 0.4845529389777227, "learning_rate": 3.765630328729145e-05, "loss": 0.7471, "num_tokens": 1981859746.0, "step": 648 }, { "epoch": 0.9510899432130426, "grad_norm": 0.7277835379583072, "learning_rate": 3.7647871758378654e-05, "loss": 0.7462, "num_tokens": 1985036527.0, "step": 649 }, { "epoch": 0.9525554130793186, "grad_norm": 0.5203820949638468, "learning_rate": 3.7639426150383986e-05, "loss": 0.7756, "num_tokens": 1987994905.0, "step": 650 }, { "epoch": 0.9540208829455944, "grad_norm": 0.8262214620143367, "learning_rate": 3.7630966470906307e-05, "loss": 0.7588, "num_tokens": 1990952582.0, "step": 651 }, { "epoch": 0.9554863528118703, "grad_norm": 0.6958722537962205, "learning_rate": 3.762249272755719e-05, "loss": 0.7767, "num_tokens": 1994021014.0, "step": 652 }, { "epoch": 0.9569518226781462, "grad_norm": 0.6926954380394293, "learning_rate": 3.761400492796082e-05, "loss": 0.7654, "num_tokens": 1996988852.0, "step": 653 }, { "epoch": 0.9584172925444221, "grad_norm": 0.6978551806189677, "learning_rate": 3.760550307975405e-05, "loss": 0.7748, "num_tokens": 2000054813.0, "step": 654 }, { "epoch": 0.9598827624106979, "grad_norm": 0.605957194001, "learning_rate": 3.7596987190586374e-05, "loss": 0.7509, "num_tokens": 2003042990.0, "step": 655 }, { "epoch": 0.9613482322769739, "grad_norm": 0.6368133440966786, "learning_rate": 3.75884572681199e-05, "loss": 0.7508, "num_tokens": 2006319675.0, "step": 656 }, { "epoch": 0.9628137021432497, "grad_norm": 0.476910320318197, "learning_rate": 3.757991332002939e-05, "loss": 0.7661, "num_tokens": 2009422886.0, "step": 657 }, { "epoch": 0.9642791720095255, "grad_norm": 0.7325847239666787, "learning_rate": 3.75713553540022e-05, "loss": 0.7522, "num_tokens": 2012454764.0, "step": 658 }, { "epoch": 0.9657446418758014, "grad_norm": 0.5828842999264109, "learning_rate": 3.756278337773832e-05, "loss": 0.7772, "num_tokens": 2015465754.0, "step": 659 }, { "epoch": 0.9672101117420773, "grad_norm": 0.6543938148531748, "learning_rate": 3.7554197398950335e-05, "loss": 0.7525, "num_tokens": 2018379848.0, "step": 660 }, { "epoch": 0.9686755816083532, "grad_norm": 0.6074608961465975, "learning_rate": 3.7545597425363426e-05, "loss": 0.753, "num_tokens": 2021448637.0, "step": 661 }, { "epoch": 0.970141051474629, "grad_norm": 0.5277739122604715, "learning_rate": 3.7536983464715374e-05, "loss": 0.7794, "num_tokens": 2024365660.0, "step": 662 }, { "epoch": 0.971606521340905, "grad_norm": 0.6490356458883493, "learning_rate": 3.752835552475653e-05, "loss": 0.7609, "num_tokens": 2027404131.0, "step": 663 }, { "epoch": 0.9730719912071808, "grad_norm": 0.48663515563816123, "learning_rate": 3.751971361324985e-05, "loss": 0.7531, "num_tokens": 2030472960.0, "step": 664 }, { "epoch": 0.9745374610734566, "grad_norm": 0.8271509981819511, "learning_rate": 3.7511057737970824e-05, "loss": 0.7635, "num_tokens": 2033648017.0, "step": 665 }, { "epoch": 0.9760029309397326, "grad_norm": 0.6448244407421377, "learning_rate": 3.7502387906707536e-05, "loss": 0.785, "num_tokens": 2036775729.0, "step": 666 }, { "epoch": 0.9774684008060084, "grad_norm": 0.7868198580042994, "learning_rate": 3.7493704127260616e-05, "loss": 0.768, "num_tokens": 2039845178.0, "step": 667 }, { "epoch": 0.9789338706722843, "grad_norm": 0.6423425164607255, "learning_rate": 3.748500640744325e-05, "loss": 0.7589, "num_tokens": 2042913140.0, "step": 668 }, { "epoch": 0.9803993405385601, "grad_norm": 0.7312382148283119, "learning_rate": 3.747629475508115e-05, "loss": 0.7511, "num_tokens": 2045883851.0, "step": 669 }, { "epoch": 0.9818648104048361, "grad_norm": 0.5684524585243846, "learning_rate": 3.7467569178012575e-05, "loss": 0.7571, "num_tokens": 2048883833.0, "step": 670 }, { "epoch": 0.9833302802711119, "grad_norm": 0.7301807075780937, "learning_rate": 3.745882968408832e-05, "loss": 0.7452, "num_tokens": 2051893376.0, "step": 671 }, { "epoch": 0.9847957501373878, "grad_norm": 0.5149366367790847, "learning_rate": 3.7450076281171696e-05, "loss": 0.7633, "num_tokens": 2055080057.0, "step": 672 }, { "epoch": 0.9862612200036637, "grad_norm": 0.7592593842889864, "learning_rate": 3.7441308977138514e-05, "loss": 0.7622, "num_tokens": 2058308501.0, "step": 673 }, { "epoch": 0.9877266898699395, "grad_norm": 0.5840995350881396, "learning_rate": 3.743252777987712e-05, "loss": 0.7657, "num_tokens": 2061474649.0, "step": 674 }, { "epoch": 0.9891921597362154, "grad_norm": 0.7238127166623062, "learning_rate": 3.742373269728833e-05, "loss": 0.7641, "num_tokens": 2064280704.0, "step": 675 }, { "epoch": 0.9906576296024913, "grad_norm": 0.5656155902952759, "learning_rate": 3.741492373728548e-05, "loss": 0.7631, "num_tokens": 2067358410.0, "step": 676 }, { "epoch": 0.9921230994687672, "grad_norm": 0.8681921379583608, "learning_rate": 3.7406100907794375e-05, "loss": 0.7655, "num_tokens": 2070338357.0, "step": 677 }, { "epoch": 0.993588569335043, "grad_norm": 0.6811209766453731, "learning_rate": 3.739726421675331e-05, "loss": 0.7822, "num_tokens": 2073323968.0, "step": 678 }, { "epoch": 0.995054039201319, "grad_norm": 0.6764551214662876, "learning_rate": 3.738841367211304e-05, "loss": 0.7586, "num_tokens": 2076382759.0, "step": 679 }, { "epoch": 0.9965195090675948, "grad_norm": 0.7250122547172594, "learning_rate": 3.73795492818368e-05, "loss": 0.7664, "num_tokens": 2079519161.0, "step": 680 }, { "epoch": 0.9979849789338707, "grad_norm": 0.547601968793431, "learning_rate": 3.737067105390026e-05, "loss": 0.7553, "num_tokens": 2082558483.0, "step": 681 }, { "epoch": 0.9994504488001466, "grad_norm": 0.6722580383412239, "learning_rate": 3.736177899629156e-05, "loss": 0.7608, "num_tokens": 2085711886.0, "step": 682 }, { "epoch": 1.0, "grad_norm": 0.6722580383412239, "learning_rate": 3.735287311701129e-05, "loss": 0.7411, "num_tokens": 2086354946.0, "step": 683 }, { "epoch": 1.001465469866276, "grad_norm": 0.965701733730156, "learning_rate": 3.734395342407245e-05, "loss": 0.7464, "num_tokens": 2089424116.0, "step": 684 }, { "epoch": 1.0029309397325517, "grad_norm": 0.5133810627523421, "learning_rate": 3.733501992550048e-05, "loss": 0.7469, "num_tokens": 2092504087.0, "step": 685 }, { "epoch": 1.0043964095988276, "grad_norm": 0.6846944303065458, "learning_rate": 3.732607262933325e-05, "loss": 0.7501, "num_tokens": 2095695436.0, "step": 686 }, { "epoch": 1.0058618794651035, "grad_norm": 0.5492008160627736, "learning_rate": 3.7317111543621035e-05, "loss": 0.7623, "num_tokens": 2098818040.0, "step": 687 }, { "epoch": 1.0073273493313795, "grad_norm": 0.608048644578349, "learning_rate": 3.730813667642652e-05, "loss": 0.7297, "num_tokens": 2102081547.0, "step": 688 }, { "epoch": 1.0087928191976552, "grad_norm": 0.42408006198989767, "learning_rate": 3.729914803582479e-05, "loss": 0.7375, "num_tokens": 2105193885.0, "step": 689 }, { "epoch": 1.0102582890639311, "grad_norm": 0.5928279425557839, "learning_rate": 3.729014562990333e-05, "loss": 0.7562, "num_tokens": 2108095244.0, "step": 690 }, { "epoch": 1.011723758930207, "grad_norm": 0.6051913181331622, "learning_rate": 3.7281129466761995e-05, "loss": 0.7603, "num_tokens": 2111400323.0, "step": 691 }, { "epoch": 1.0131892287964828, "grad_norm": 0.6295552731757489, "learning_rate": 3.727209955451302e-05, "loss": 0.7417, "num_tokens": 2114436479.0, "step": 692 }, { "epoch": 1.0146546986627587, "grad_norm": 0.5359591982659879, "learning_rate": 3.7263055901281026e-05, "loss": 0.7568, "num_tokens": 2117679032.0, "step": 693 }, { "epoch": 1.0161201685290346, "grad_norm": 0.6670312571939421, "learning_rate": 3.7253998515202986e-05, "loss": 0.7493, "num_tokens": 2120717761.0, "step": 694 }, { "epoch": 1.0175856383953106, "grad_norm": 0.5887974320624524, "learning_rate": 3.724492740442822e-05, "loss": 0.732, "num_tokens": 2123668167.0, "step": 695 }, { "epoch": 1.0190511082615863, "grad_norm": 0.6428688747718982, "learning_rate": 3.723584257711842e-05, "loss": 0.7689, "num_tokens": 2126540267.0, "step": 696 }, { "epoch": 1.0205165781278622, "grad_norm": 0.5164418224075241, "learning_rate": 3.7226744041447607e-05, "loss": 0.7413, "num_tokens": 2129476421.0, "step": 697 }, { "epoch": 1.0219820479941382, "grad_norm": 0.6054376335161614, "learning_rate": 3.7217631805602125e-05, "loss": 0.734, "num_tokens": 2132390995.0, "step": 698 }, { "epoch": 1.023447517860414, "grad_norm": 0.5744791864215427, "learning_rate": 3.720850587778066e-05, "loss": 0.7429, "num_tokens": 2135359608.0, "step": 699 }, { "epoch": 1.0249129877266898, "grad_norm": 0.5649861377147157, "learning_rate": 3.719936626619422e-05, "loss": 0.7432, "num_tokens": 2138501286.0, "step": 700 }, { "epoch": 1.0263784575929658, "grad_norm": 0.5091744191362831, "learning_rate": 3.7190212979066107e-05, "loss": 0.7512, "num_tokens": 2141374314.0, "step": 701 }, { "epoch": 1.0278439274592417, "grad_norm": 0.5725742428395841, "learning_rate": 3.718104602463194e-05, "loss": 0.7407, "num_tokens": 2144285726.0, "step": 702 }, { "epoch": 1.0293093973255174, "grad_norm": 0.5304200237730711, "learning_rate": 3.717186541113964e-05, "loss": 0.7415, "num_tokens": 2147618227.0, "step": 703 }, { "epoch": 1.0307748671917933, "grad_norm": 0.5377789103815364, "learning_rate": 3.7162671146849414e-05, "loss": 0.7708, "num_tokens": 2150711331.0, "step": 704 }, { "epoch": 1.0322403370580693, "grad_norm": 0.5107631684950187, "learning_rate": 3.715346324003373e-05, "loss": 0.7681, "num_tokens": 2153694895.0, "step": 705 }, { "epoch": 1.0337058069243452, "grad_norm": 0.7161134476985407, "learning_rate": 3.714424169897737e-05, "loss": 0.7456, "num_tokens": 2157005666.0, "step": 706 }, { "epoch": 1.035171276790621, "grad_norm": 0.5206373746222844, "learning_rate": 3.713500653197734e-05, "loss": 0.7355, "num_tokens": 2160039775.0, "step": 707 }, { "epoch": 1.0366367466568969, "grad_norm": 0.5404919957857687, "learning_rate": 3.712575774734294e-05, "loss": 0.7595, "num_tokens": 2163129477.0, "step": 708 }, { "epoch": 1.0381022165231728, "grad_norm": 0.6472966435467952, "learning_rate": 3.7116495353395714e-05, "loss": 0.7479, "num_tokens": 2166037896.0, "step": 709 }, { "epoch": 1.0395676863894485, "grad_norm": 0.5231631163973381, "learning_rate": 3.710721935846944e-05, "loss": 0.7889, "num_tokens": 2169029002.0, "step": 710 }, { "epoch": 1.0410331562557245, "grad_norm": 0.530060468539575, "learning_rate": 3.7097929770910146e-05, "loss": 0.7482, "num_tokens": 2171870391.0, "step": 711 }, { "epoch": 1.0424986261220004, "grad_norm": 0.5622453631172125, "learning_rate": 3.708862659907608e-05, "loss": 0.7573, "num_tokens": 2175024201.0, "step": 712 }, { "epoch": 1.0439640959882763, "grad_norm": 0.5963062854922916, "learning_rate": 3.707930985133772e-05, "loss": 0.7462, "num_tokens": 2178280799.0, "step": 713 }, { "epoch": 1.045429565854552, "grad_norm": 0.5313545131659448, "learning_rate": 3.706997953607776e-05, "loss": 0.7648, "num_tokens": 2181266981.0, "step": 714 }, { "epoch": 1.046895035720828, "grad_norm": 0.5466249742925424, "learning_rate": 3.7060635661691084e-05, "loss": 0.7343, "num_tokens": 2184188084.0, "step": 715 }, { "epoch": 1.048360505587104, "grad_norm": 0.5663932311432857, "learning_rate": 3.70512782365848e-05, "loss": 0.7536, "num_tokens": 2187430737.0, "step": 716 }, { "epoch": 1.0498259754533796, "grad_norm": 0.4838039672619207, "learning_rate": 3.7041907269178195e-05, "loss": 0.7572, "num_tokens": 2190128313.0, "step": 717 }, { "epoch": 1.0512914453196556, "grad_norm": 0.5631770530991895, "learning_rate": 3.703252276790273e-05, "loss": 0.7513, "num_tokens": 2193187300.0, "step": 718 }, { "epoch": 1.0527569151859315, "grad_norm": 0.509897390962656, "learning_rate": 3.702312474120208e-05, "loss": 0.7551, "num_tokens": 2196209970.0, "step": 719 }, { "epoch": 1.0542223850522074, "grad_norm": 0.6061100376432039, "learning_rate": 3.701371319753205e-05, "loss": 0.7389, "num_tokens": 2199075223.0, "step": 720 }, { "epoch": 1.0556878549184832, "grad_norm": 0.49065247253595917, "learning_rate": 3.700428814536062e-05, "loss": 0.7411, "num_tokens": 2202166743.0, "step": 721 }, { "epoch": 1.057153324784759, "grad_norm": 0.6047032526105094, "learning_rate": 3.699484959316793e-05, "loss": 0.7482, "num_tokens": 2205269004.0, "step": 722 }, { "epoch": 1.058618794651035, "grad_norm": 0.5069813223229449, "learning_rate": 3.698539754944626e-05, "loss": 0.7453, "num_tokens": 2208177599.0, "step": 723 }, { "epoch": 1.060084264517311, "grad_norm": 0.5198781724310066, "learning_rate": 3.697593202270004e-05, "loss": 0.7732, "num_tokens": 2211161130.0, "step": 724 }, { "epoch": 1.0615497343835867, "grad_norm": 0.5297285255553046, "learning_rate": 3.696645302144582e-05, "loss": 0.7703, "num_tokens": 2214106845.0, "step": 725 }, { "epoch": 1.0630152042498626, "grad_norm": 0.5810591529235009, "learning_rate": 3.6956960554212264e-05, "loss": 0.74, "num_tokens": 2217036287.0, "step": 726 }, { "epoch": 1.0644806741161386, "grad_norm": 0.5253126415054268, "learning_rate": 3.694745462954018e-05, "loss": 0.7714, "num_tokens": 2219849225.0, "step": 727 }, { "epoch": 1.0659461439824143, "grad_norm": 0.5787768130087091, "learning_rate": 3.693793525598246e-05, "loss": 0.7594, "num_tokens": 2222920367.0, "step": 728 }, { "epoch": 1.0674116138486902, "grad_norm": 0.5951506746902782, "learning_rate": 3.6928402442104106e-05, "loss": 0.7481, "num_tokens": 2225714824.0, "step": 729 }, { "epoch": 1.0688770837149661, "grad_norm": 0.620649507315363, "learning_rate": 3.6918856196482204e-05, "loss": 0.7417, "num_tokens": 2228896692.0, "step": 730 }, { "epoch": 1.070342553581242, "grad_norm": 0.4214075647142451, "learning_rate": 3.6909296527705956e-05, "loss": 0.7537, "num_tokens": 2231821181.0, "step": 731 }, { "epoch": 1.0718080234475178, "grad_norm": 0.7835099958453671, "learning_rate": 3.68997234443766e-05, "loss": 0.746, "num_tokens": 2234900557.0, "step": 732 }, { "epoch": 1.0732734933137937, "grad_norm": 0.5186298880331794, "learning_rate": 3.6890136955107466e-05, "loss": 0.7435, "num_tokens": 2238144344.0, "step": 733 }, { "epoch": 1.0747389631800697, "grad_norm": 0.6085105303091098, "learning_rate": 3.6880537068523944e-05, "loss": 0.7423, "num_tokens": 2241146810.0, "step": 734 }, { "epoch": 1.0762044330463454, "grad_norm": 0.5636060930723763, "learning_rate": 3.6870923793263475e-05, "loss": 0.7577, "num_tokens": 2244224387.0, "step": 735 }, { "epoch": 1.0776699029126213, "grad_norm": 0.6067940184516105, "learning_rate": 3.686129713797555e-05, "loss": 0.7479, "num_tokens": 2247524768.0, "step": 736 }, { "epoch": 1.0791353727788973, "grad_norm": 0.5148018865782866, "learning_rate": 3.685165711132169e-05, "loss": 0.7427, "num_tokens": 2250688717.0, "step": 737 }, { "epoch": 1.0806008426451732, "grad_norm": 0.4802690919252581, "learning_rate": 3.684200372197547e-05, "loss": 0.7359, "num_tokens": 2253700064.0, "step": 738 }, { "epoch": 1.082066312511449, "grad_norm": 0.5275861366733628, "learning_rate": 3.683233697862245e-05, "loss": 0.7472, "num_tokens": 2256735602.0, "step": 739 }, { "epoch": 1.0835317823777249, "grad_norm": 0.6110310461543157, "learning_rate": 3.6822656889960233e-05, "loss": 0.7333, "num_tokens": 2259924265.0, "step": 740 }, { "epoch": 1.0849972522440008, "grad_norm": 0.5372957147826037, "learning_rate": 3.681296346469842e-05, "loss": 0.7551, "num_tokens": 2262916078.0, "step": 741 }, { "epoch": 1.0864627221102765, "grad_norm": 0.6659513036118253, "learning_rate": 3.680325671155863e-05, "loss": 0.7625, "num_tokens": 2265813741.0, "step": 742 }, { "epoch": 1.0879281919765524, "grad_norm": 0.4416955811201939, "learning_rate": 3.679353663927445e-05, "loss": 0.7484, "num_tokens": 2268793250.0, "step": 743 }, { "epoch": 1.0893936618428284, "grad_norm": 0.7027031517387361, "learning_rate": 3.678380325659145e-05, "loss": 0.7333, "num_tokens": 2271828634.0, "step": 744 }, { "epoch": 1.0908591317091043, "grad_norm": 0.5281969383089093, "learning_rate": 3.677405657226718e-05, "loss": 0.7676, "num_tokens": 2274614334.0, "step": 745 }, { "epoch": 1.09232460157538, "grad_norm": 0.5968650308781397, "learning_rate": 3.6764296595071185e-05, "loss": 0.7293, "num_tokens": 2277613326.0, "step": 746 }, { "epoch": 1.093790071441656, "grad_norm": 0.5649618616317241, "learning_rate": 3.6754523333784925e-05, "loss": 0.7567, "num_tokens": 2280784695.0, "step": 747 }, { "epoch": 1.095255541307932, "grad_norm": 0.5942567444664041, "learning_rate": 3.6744736797201856e-05, "loss": 0.7434, "num_tokens": 2283843683.0, "step": 748 }, { "epoch": 1.0967210111742078, "grad_norm": 0.5741612199460744, "learning_rate": 3.673493699412734e-05, "loss": 0.7318, "num_tokens": 2286716216.0, "step": 749 }, { "epoch": 1.0981864810404836, "grad_norm": 0.42116528502576295, "learning_rate": 3.672512393337871e-05, "loss": 0.7243, "num_tokens": 2289952952.0, "step": 750 }, { "epoch": 1.0996519509067595, "grad_norm": 0.6044648116673002, "learning_rate": 3.671529762378519e-05, "loss": 0.7452, "num_tokens": 2293060517.0, "step": 751 }, { "epoch": 1.1011174207730354, "grad_norm": 0.46348240332111157, "learning_rate": 3.670545807418796e-05, "loss": 0.7551, "num_tokens": 2296232202.0, "step": 752 }, { "epoch": 1.1025828906393111, "grad_norm": 0.5180548128420432, "learning_rate": 3.669560529344009e-05, "loss": 0.7579, "num_tokens": 2299313476.0, "step": 753 }, { "epoch": 1.104048360505587, "grad_norm": 0.5645230320912202, "learning_rate": 3.668573929040657e-05, "loss": 0.7316, "num_tokens": 2302211722.0, "step": 754 }, { "epoch": 1.105513830371863, "grad_norm": 0.6507893652662134, "learning_rate": 3.6675860073964275e-05, "loss": 0.7444, "num_tokens": 2305146236.0, "step": 755 }, { "epoch": 1.106979300238139, "grad_norm": 0.4689356093792895, "learning_rate": 3.666596765300197e-05, "loss": 0.7595, "num_tokens": 2308004712.0, "step": 756 }, { "epoch": 1.1084447701044147, "grad_norm": 0.5578643684173523, "learning_rate": 3.665606203642031e-05, "loss": 0.7456, "num_tokens": 2311076276.0, "step": 757 }, { "epoch": 1.1099102399706906, "grad_norm": 0.4923974582451954, "learning_rate": 3.664614323313181e-05, "loss": 0.7844, "num_tokens": 2314222276.0, "step": 758 }, { "epoch": 1.1113757098369665, "grad_norm": 0.5791975983862808, "learning_rate": 3.663621125206086e-05, "loss": 0.7539, "num_tokens": 2317277231.0, "step": 759 }, { "epoch": 1.1128411797032423, "grad_norm": 0.4948512630493004, "learning_rate": 3.66262661021437e-05, "loss": 0.7542, "num_tokens": 2320363787.0, "step": 760 }, { "epoch": 1.1143066495695182, "grad_norm": 0.6162555073830158, "learning_rate": 3.661630779232842e-05, "loss": 0.7462, "num_tokens": 2323562029.0, "step": 761 }, { "epoch": 1.1157721194357941, "grad_norm": 0.44356108900421054, "learning_rate": 3.660633633157495e-05, "loss": 0.7486, "num_tokens": 2326568575.0, "step": 762 }, { "epoch": 1.11723758930207, "grad_norm": 0.6717598253891943, "learning_rate": 3.6596351728855066e-05, "loss": 0.7553, "num_tokens": 2329403181.0, "step": 763 }, { "epoch": 1.1187030591683458, "grad_norm": 0.47370568301325094, "learning_rate": 3.658635399315233e-05, "loss": 0.7463, "num_tokens": 2332456267.0, "step": 764 }, { "epoch": 1.1201685290346217, "grad_norm": 0.6573236534532894, "learning_rate": 3.657634313346217e-05, "loss": 0.7609, "num_tokens": 2335289875.0, "step": 765 }, { "epoch": 1.1216339989008977, "grad_norm": 0.4608203279343698, "learning_rate": 3.6566319158791804e-05, "loss": 0.7717, "num_tokens": 2338029313.0, "step": 766 }, { "epoch": 1.1230994687671734, "grad_norm": 0.7499015540789802, "learning_rate": 3.655628207816023e-05, "loss": 0.7501, "num_tokens": 2341081528.0, "step": 767 }, { "epoch": 1.1245649386334493, "grad_norm": 0.5997630736891446, "learning_rate": 3.654623190059825e-05, "loss": 0.7707, "num_tokens": 2344094852.0, "step": 768 }, { "epoch": 1.1260304084997252, "grad_norm": 0.5725684757570828, "learning_rate": 3.6536168635148465e-05, "loss": 0.7354, "num_tokens": 2347198773.0, "step": 769 }, { "epoch": 1.1274958783660012, "grad_norm": 0.5888403125235339, "learning_rate": 3.652609229086523e-05, "loss": 0.7341, "num_tokens": 2350374813.0, "step": 770 }, { "epoch": 1.128961348232277, "grad_norm": 0.4836280198136197, "learning_rate": 3.6516002876814686e-05, "loss": 0.7431, "num_tokens": 2353533069.0, "step": 771 }, { "epoch": 1.1304268180985528, "grad_norm": 0.6400021899227083, "learning_rate": 3.650590040207473e-05, "loss": 0.7481, "num_tokens": 2356573904.0, "step": 772 }, { "epoch": 1.1318922879648288, "grad_norm": 0.4904287484823315, "learning_rate": 3.649578487573498e-05, "loss": 0.7548, "num_tokens": 2359819497.0, "step": 773 }, { "epoch": 1.1333577578311047, "grad_norm": 0.6436736840519235, "learning_rate": 3.6485656306896844e-05, "loss": 0.7445, "num_tokens": 2362852518.0, "step": 774 }, { "epoch": 1.1348232276973804, "grad_norm": 0.49343356331319965, "learning_rate": 3.647551470467343e-05, "loss": 0.7459, "num_tokens": 2366050595.0, "step": 775 }, { "epoch": 1.1362886975636564, "grad_norm": 0.574131251029063, "learning_rate": 3.6465360078189614e-05, "loss": 0.7397, "num_tokens": 2369039591.0, "step": 776 }, { "epoch": 1.1377541674299323, "grad_norm": 0.4761682632867504, "learning_rate": 3.645519243658193e-05, "loss": 0.742, "num_tokens": 2371936524.0, "step": 777 }, { "epoch": 1.139219637296208, "grad_norm": 0.5447938453075636, "learning_rate": 3.644501178899866e-05, "loss": 0.753, "num_tokens": 2374869651.0, "step": 778 }, { "epoch": 1.140685107162484, "grad_norm": 0.594985234757248, "learning_rate": 3.6434818144599794e-05, "loss": 0.7584, "num_tokens": 2377847693.0, "step": 779 }, { "epoch": 1.1421505770287599, "grad_norm": 0.5165556519729052, "learning_rate": 3.6424611512557e-05, "loss": 0.7413, "num_tokens": 2380837636.0, "step": 780 }, { "epoch": 1.1436160468950356, "grad_norm": 0.5484273098668319, "learning_rate": 3.641439190205363e-05, "loss": 0.7574, "num_tokens": 2384024622.0, "step": 781 }, { "epoch": 1.1450815167613115, "grad_norm": 0.49476176262851795, "learning_rate": 3.640415932228473e-05, "loss": 0.7462, "num_tokens": 2387127375.0, "step": 782 }, { "epoch": 1.1465469866275875, "grad_norm": 0.5673789447145892, "learning_rate": 3.639391378245699e-05, "loss": 0.7363, "num_tokens": 2390040686.0, "step": 783 }, { "epoch": 1.1480124564938634, "grad_norm": 0.4463761824945204, "learning_rate": 3.638365529178879e-05, "loss": 0.7225, "num_tokens": 2393069109.0, "step": 784 }, { "epoch": 1.1494779263601391, "grad_norm": 0.603203773557243, "learning_rate": 3.6373383859510135e-05, "loss": 0.7442, "num_tokens": 2396239053.0, "step": 785 }, { "epoch": 1.150943396226415, "grad_norm": 0.501386266641157, "learning_rate": 3.6363099494862704e-05, "loss": 0.7497, "num_tokens": 2399490843.0, "step": 786 }, { "epoch": 1.152408866092691, "grad_norm": 0.7142459542611617, "learning_rate": 3.635280220709977e-05, "loss": 0.7358, "num_tokens": 2402603598.0, "step": 787 }, { "epoch": 1.153874335958967, "grad_norm": 0.4884249292884803, "learning_rate": 3.6342492005486284e-05, "loss": 0.7537, "num_tokens": 2405554239.0, "step": 788 }, { "epoch": 1.1553398058252426, "grad_norm": 0.5448521132239552, "learning_rate": 3.633216889929877e-05, "loss": 0.7637, "num_tokens": 2408404215.0, "step": 789 }, { "epoch": 1.1568052756915186, "grad_norm": 0.5395111280241897, "learning_rate": 3.6321832897825396e-05, "loss": 0.7562, "num_tokens": 2411538139.0, "step": 790 }, { "epoch": 1.1582707455577945, "grad_norm": 0.5554376091980064, "learning_rate": 3.631148401036591e-05, "loss": 0.7623, "num_tokens": 2414484095.0, "step": 791 }, { "epoch": 1.1597362154240702, "grad_norm": 0.5472466773193756, "learning_rate": 3.630112224623167e-05, "loss": 0.7612, "num_tokens": 2417422671.0, "step": 792 }, { "epoch": 1.1612016852903462, "grad_norm": 0.47849574899464786, "learning_rate": 3.629074761474561e-05, "loss": 0.7364, "num_tokens": 2420428049.0, "step": 793 }, { "epoch": 1.1626671551566221, "grad_norm": 0.549409400606777, "learning_rate": 3.6280360125242234e-05, "loss": 0.7448, "num_tokens": 2423459433.0, "step": 794 }, { "epoch": 1.164132625022898, "grad_norm": 0.5949483510157997, "learning_rate": 3.626995978706765e-05, "loss": 0.7435, "num_tokens": 2426409962.0, "step": 795 }, { "epoch": 1.1655980948891738, "grad_norm": 0.43509381108765177, "learning_rate": 3.625954660957948e-05, "loss": 0.7342, "num_tokens": 2429570678.0, "step": 796 }, { "epoch": 1.1670635647554497, "grad_norm": 0.5173539496395994, "learning_rate": 3.6249120602146926e-05, "loss": 0.7525, "num_tokens": 2432651292.0, "step": 797 }, { "epoch": 1.1685290346217256, "grad_norm": 0.7109420938830574, "learning_rate": 3.623868177415074e-05, "loss": 0.7333, "num_tokens": 2435742723.0, "step": 798 }, { "epoch": 1.1699945044880016, "grad_norm": 0.4716566004637944, "learning_rate": 3.622823013498318e-05, "loss": 0.7341, "num_tokens": 2438697858.0, "step": 799 }, { "epoch": 1.1714599743542773, "grad_norm": 0.6207329486694613, "learning_rate": 3.621776569404806e-05, "loss": 0.7495, "num_tokens": 2441609339.0, "step": 800 }, { "epoch": 1.1729254442205532, "grad_norm": 0.5455232082217442, "learning_rate": 3.620728846076069e-05, "loss": 0.7449, "num_tokens": 2444460129.0, "step": 801 }, { "epoch": 1.1743909140868292, "grad_norm": 0.5715368187010545, "learning_rate": 3.619679844454791e-05, "loss": 0.7562, "num_tokens": 2447385702.0, "step": 802 }, { "epoch": 1.1758563839531049, "grad_norm": 0.6157891604805696, "learning_rate": 3.618629565484804e-05, "loss": 0.7425, "num_tokens": 2450596845.0, "step": 803 }, { "epoch": 1.1773218538193808, "grad_norm": 0.446325699112817, "learning_rate": 3.617578010111091e-05, "loss": 0.7277, "num_tokens": 2453748901.0, "step": 804 }, { "epoch": 1.1787873236856568, "grad_norm": 0.5441709024714138, "learning_rate": 3.616525179279784e-05, "loss": 0.7633, "num_tokens": 2456678338.0, "step": 805 }, { "epoch": 1.1802527935519325, "grad_norm": 0.6285552809644295, "learning_rate": 3.615471073938159e-05, "loss": 0.7302, "num_tokens": 2459606498.0, "step": 806 }, { "epoch": 1.1817182634182084, "grad_norm": 0.50570449381037, "learning_rate": 3.614415695034642e-05, "loss": 0.7561, "num_tokens": 2462718404.0, "step": 807 }, { "epoch": 1.1831837332844843, "grad_norm": 0.506231458394893, "learning_rate": 3.613359043518806e-05, "loss": 0.7354, "num_tokens": 2465810912.0, "step": 808 }, { "epoch": 1.1846492031507603, "grad_norm": 0.5096818741798217, "learning_rate": 3.612301120341365e-05, "loss": 0.7318, "num_tokens": 2468741379.0, "step": 809 }, { "epoch": 1.186114673017036, "grad_norm": 0.6600038539111857, "learning_rate": 3.61124192645418e-05, "loss": 0.7565, "num_tokens": 2471852662.0, "step": 810 }, { "epoch": 1.187580142883312, "grad_norm": 0.5539575573076669, "learning_rate": 3.610181462810254e-05, "loss": 0.7498, "num_tokens": 2475201268.0, "step": 811 }, { "epoch": 1.1890456127495879, "grad_norm": 0.5781660795578409, "learning_rate": 3.609119730363734e-05, "loss": 0.7429, "num_tokens": 2478183856.0, "step": 812 }, { "epoch": 1.1905110826158638, "grad_norm": 0.7656175651194363, "learning_rate": 3.608056730069906e-05, "loss": 0.7484, "num_tokens": 2481201221.0, "step": 813 }, { "epoch": 1.1919765524821395, "grad_norm": 0.5970484506671897, "learning_rate": 3.606992462885201e-05, "loss": 0.739, "num_tokens": 2484321297.0, "step": 814 }, { "epoch": 1.1934420223484155, "grad_norm": 0.6854697421273058, "learning_rate": 3.6059269297671844e-05, "loss": 0.7695, "num_tokens": 2487309910.0, "step": 815 }, { "epoch": 1.1949074922146914, "grad_norm": 0.7597415120672409, "learning_rate": 3.604860131674565e-05, "loss": 0.7445, "num_tokens": 2490232398.0, "step": 816 }, { "epoch": 1.196372962080967, "grad_norm": 0.5592664516086174, "learning_rate": 3.603792069567187e-05, "loss": 0.7512, "num_tokens": 2493447304.0, "step": 817 }, { "epoch": 1.197838431947243, "grad_norm": 0.8119270857697747, "learning_rate": 3.602722744406035e-05, "loss": 0.7496, "num_tokens": 2496716115.0, "step": 818 }, { "epoch": 1.199303901813519, "grad_norm": 0.5068081892319506, "learning_rate": 3.601652157153227e-05, "loss": 0.7572, "num_tokens": 2499573424.0, "step": 819 }, { "epoch": 1.200769371679795, "grad_norm": 0.8771479499729934, "learning_rate": 3.600580308772018e-05, "loss": 0.7468, "num_tokens": 2502631680.0, "step": 820 }, { "epoch": 1.2022348415460706, "grad_norm": 0.6003469705220338, "learning_rate": 3.599507200226796e-05, "loss": 0.7529, "num_tokens": 2505734677.0, "step": 821 }, { "epoch": 1.2037003114123466, "grad_norm": 0.8909743257302668, "learning_rate": 3.598432832483086e-05, "loss": 0.7424, "num_tokens": 2508663249.0, "step": 822 }, { "epoch": 1.2051657812786225, "grad_norm": 0.665613228337953, "learning_rate": 3.597357206507543e-05, "loss": 0.7587, "num_tokens": 2511523227.0, "step": 823 }, { "epoch": 1.2066312511448984, "grad_norm": 0.786596315501698, "learning_rate": 3.596280323267956e-05, "loss": 0.7335, "num_tokens": 2514710824.0, "step": 824 }, { "epoch": 1.2080967210111742, "grad_norm": 0.7115277495007731, "learning_rate": 3.595202183733244e-05, "loss": 0.7536, "num_tokens": 2517584775.0, "step": 825 }, { "epoch": 1.20956219087745, "grad_norm": 0.7014646239781099, "learning_rate": 3.594122788873457e-05, "loss": 0.7534, "num_tokens": 2520477375.0, "step": 826 }, { "epoch": 1.211027660743726, "grad_norm": 0.579524568878739, "learning_rate": 3.593042139659774e-05, "loss": 0.7393, "num_tokens": 2523649349.0, "step": 827 }, { "epoch": 1.2124931306100017, "grad_norm": 0.6768661068864853, "learning_rate": 3.5919602370645036e-05, "loss": 0.732, "num_tokens": 2526648143.0, "step": 828 }, { "epoch": 1.2139586004762777, "grad_norm": 0.5568210946116069, "learning_rate": 3.59087708206108e-05, "loss": 0.7336, "num_tokens": 2529986283.0, "step": 829 }, { "epoch": 1.2154240703425536, "grad_norm": 0.6464419441365561, "learning_rate": 3.589792675624067e-05, "loss": 0.7456, "num_tokens": 2533221035.0, "step": 830 }, { "epoch": 1.2168895402088293, "grad_norm": 0.6260454895826908, "learning_rate": 3.5887070187291516e-05, "loss": 0.7403, "num_tokens": 2536527199.0, "step": 831 }, { "epoch": 1.2183550100751053, "grad_norm": 0.49594840943509677, "learning_rate": 3.587620112353149e-05, "loss": 0.748, "num_tokens": 2539626744.0, "step": 832 }, { "epoch": 1.2198204799413812, "grad_norm": 0.7364413525453473, "learning_rate": 3.5865319574739965e-05, "loss": 0.7444, "num_tokens": 2542752358.0, "step": 833 }, { "epoch": 1.2212859498076571, "grad_norm": 0.5273930007968609, "learning_rate": 3.585442555070754e-05, "loss": 0.7242, "num_tokens": 2545592005.0, "step": 834 }, { "epoch": 1.2227514196739329, "grad_norm": 0.6778991032678832, "learning_rate": 3.584351906123606e-05, "loss": 0.73, "num_tokens": 2549012232.0, "step": 835 }, { "epoch": 1.2242168895402088, "grad_norm": 0.5183112827075003, "learning_rate": 3.583260011613857e-05, "loss": 0.752, "num_tokens": 2552055893.0, "step": 836 }, { "epoch": 1.2256823594064847, "grad_norm": 0.7821549164347252, "learning_rate": 3.582166872523933e-05, "loss": 0.7519, "num_tokens": 2555214237.0, "step": 837 }, { "epoch": 1.2271478292727607, "grad_norm": 0.6058890313797158, "learning_rate": 3.5810724898373794e-05, "loss": 0.7404, "num_tokens": 2558471673.0, "step": 838 }, { "epoch": 1.2286132991390364, "grad_norm": 0.8078949524200064, "learning_rate": 3.579976864538862e-05, "loss": 0.7361, "num_tokens": 2561663165.0, "step": 839 }, { "epoch": 1.2300787690053123, "grad_norm": 0.5594406291968924, "learning_rate": 3.578879997614161e-05, "loss": 0.7506, "num_tokens": 2564715693.0, "step": 840 }, { "epoch": 1.2315442388715883, "grad_norm": 0.6541570879836943, "learning_rate": 3.577781890050177e-05, "loss": 0.7299, "num_tokens": 2567862538.0, "step": 841 }, { "epoch": 1.233009708737864, "grad_norm": 0.6181771105707208, "learning_rate": 3.576682542834927e-05, "loss": 0.7247, "num_tokens": 2570985266.0, "step": 842 }, { "epoch": 1.23447517860414, "grad_norm": 0.6702058311025445, "learning_rate": 3.575581956957542e-05, "loss": 0.7581, "num_tokens": 2574141745.0, "step": 843 }, { "epoch": 1.2359406484704158, "grad_norm": 0.5415090090355892, "learning_rate": 3.5744801334082656e-05, "loss": 0.7366, "num_tokens": 2577186031.0, "step": 844 }, { "epoch": 1.2374061183366918, "grad_norm": 0.7304120868745388, "learning_rate": 3.57337707317846e-05, "loss": 0.7679, "num_tokens": 2580194225.0, "step": 845 }, { "epoch": 1.2388715882029675, "grad_norm": 0.5568789245984441, "learning_rate": 3.5722727772605954e-05, "loss": 0.7417, "num_tokens": 2583201268.0, "step": 846 }, { "epoch": 1.2403370580692434, "grad_norm": 0.6594030106680493, "learning_rate": 3.571167246648256e-05, "loss": 0.7581, "num_tokens": 2586260475.0, "step": 847 }, { "epoch": 1.2418025279355194, "grad_norm": 0.5883393365413924, "learning_rate": 3.570060482336137e-05, "loss": 0.7343, "num_tokens": 2589432034.0, "step": 848 }, { "epoch": 1.2432679978017953, "grad_norm": 0.5774334130789731, "learning_rate": 3.568952485320043e-05, "loss": 0.7388, "num_tokens": 2592559539.0, "step": 849 }, { "epoch": 1.244733467668071, "grad_norm": 0.5636442056489902, "learning_rate": 3.567843256596888e-05, "loss": 0.7453, "num_tokens": 2595456533.0, "step": 850 }, { "epoch": 1.246198937534347, "grad_norm": 0.5590164570830045, "learning_rate": 3.5667327971646934e-05, "loss": 0.7259, "num_tokens": 2598450360.0, "step": 851 }, { "epoch": 1.247664407400623, "grad_norm": 0.58963666002915, "learning_rate": 3.56562110802259e-05, "loss": 0.7399, "num_tokens": 2601810580.0, "step": 852 }, { "epoch": 1.2491298772668986, "grad_norm": 0.5532527541546471, "learning_rate": 3.564508190170812e-05, "loss": 0.7445, "num_tokens": 2604654098.0, "step": 853 }, { "epoch": 1.2505953471331746, "grad_norm": 0.5495565860270518, "learning_rate": 3.5633940446107026e-05, "loss": 0.7411, "num_tokens": 2607672619.0, "step": 854 }, { "epoch": 1.2520608169994505, "grad_norm": 0.5888307692148788, "learning_rate": 3.562278672344707e-05, "loss": 0.7543, "num_tokens": 2610561276.0, "step": 855 }, { "epoch": 1.2535262868657262, "grad_norm": 0.570711193868834, "learning_rate": 3.561162074376374e-05, "loss": 0.7358, "num_tokens": 2613497992.0, "step": 856 }, { "epoch": 1.2549917567320021, "grad_norm": 0.47406163794333306, "learning_rate": 3.5600442517103566e-05, "loss": 0.748, "num_tokens": 2616355657.0, "step": 857 }, { "epoch": 1.256457226598278, "grad_norm": 0.5476502333707635, "learning_rate": 3.5589252053524104e-05, "loss": 0.7371, "num_tokens": 2619405519.0, "step": 858 }, { "epoch": 1.257922696464554, "grad_norm": 0.48371903576850006, "learning_rate": 3.557804936309389e-05, "loss": 0.7313, "num_tokens": 2622608844.0, "step": 859 }, { "epoch": 1.25938816633083, "grad_norm": 0.6328645493932304, "learning_rate": 3.5566834455892505e-05, "loss": 0.751, "num_tokens": 2625478590.0, "step": 860 }, { "epoch": 1.2608536361971057, "grad_norm": 0.47925722201741144, "learning_rate": 3.555560734201047e-05, "loss": 0.7326, "num_tokens": 2628775410.0, "step": 861 }, { "epoch": 1.2623191060633816, "grad_norm": 0.5903566189409216, "learning_rate": 3.554436803154933e-05, "loss": 0.7462, "num_tokens": 2631955534.0, "step": 862 }, { "epoch": 1.2637845759296575, "grad_norm": 0.5574145870415723, "learning_rate": 3.55331165346216e-05, "loss": 0.7524, "num_tokens": 2635036510.0, "step": 863 }, { "epoch": 1.2652500457959333, "grad_norm": 0.57452401634078, "learning_rate": 3.552185286135072e-05, "loss": 0.7525, "num_tokens": 2638008798.0, "step": 864 }, { "epoch": 1.2667155156622092, "grad_norm": 0.4953923690178717, "learning_rate": 3.5510577021871145e-05, "loss": 0.7373, "num_tokens": 2640869157.0, "step": 865 }, { "epoch": 1.2681809855284851, "grad_norm": 0.4829210695497327, "learning_rate": 3.549928902632822e-05, "loss": 0.7204, "num_tokens": 2643766879.0, "step": 866 }, { "epoch": 1.2696464553947608, "grad_norm": 0.5408349654128393, "learning_rate": 3.548798888487827e-05, "loss": 0.7425, "num_tokens": 2646928103.0, "step": 867 }, { "epoch": 1.2711119252610368, "grad_norm": 0.5456776916813653, "learning_rate": 3.547667660768853e-05, "loss": 0.7375, "num_tokens": 2650178211.0, "step": 868 }, { "epoch": 1.2725773951273127, "grad_norm": 0.4812782524865056, "learning_rate": 3.5465352204937145e-05, "loss": 0.7466, "num_tokens": 2653318838.0, "step": 869 }, { "epoch": 1.2740428649935884, "grad_norm": 0.47967868325721497, "learning_rate": 3.54540156868132e-05, "loss": 0.7427, "num_tokens": 2656180221.0, "step": 870 }, { "epoch": 1.2755083348598644, "grad_norm": 0.6095567168366964, "learning_rate": 3.544266706351664e-05, "loss": 0.7382, "num_tokens": 2659195580.0, "step": 871 }, { "epoch": 1.2769738047261403, "grad_norm": 0.5163164285264217, "learning_rate": 3.5431306345258345e-05, "loss": 0.756, "num_tokens": 2662315281.0, "step": 872 }, { "epoch": 1.2784392745924162, "grad_norm": 0.6405609729624486, "learning_rate": 3.541993354226004e-05, "loss": 0.7477, "num_tokens": 2665382666.0, "step": 873 }, { "epoch": 1.2799047444586922, "grad_norm": 0.5697257241578433, "learning_rate": 3.540854866475433e-05, "loss": 0.7416, "num_tokens": 2668636950.0, "step": 874 }, { "epoch": 1.281370214324968, "grad_norm": 0.5449579903012629, "learning_rate": 3.539715172298472e-05, "loss": 0.758, "num_tokens": 2671520273.0, "step": 875 }, { "epoch": 1.2828356841912438, "grad_norm": 0.4613802454663772, "learning_rate": 3.5385742727205524e-05, "loss": 0.7383, "num_tokens": 2674478515.0, "step": 876 }, { "epoch": 1.2843011540575198, "grad_norm": 0.5723469994272484, "learning_rate": 3.5374321687681915e-05, "loss": 0.7431, "num_tokens": 2677611412.0, "step": 877 }, { "epoch": 1.2857666239237955, "grad_norm": 0.5047347499624695, "learning_rate": 3.5362888614689926e-05, "loss": 0.7466, "num_tokens": 2680677990.0, "step": 878 }, { "epoch": 1.2872320937900714, "grad_norm": 0.40220542916591606, "learning_rate": 3.5351443518516375e-05, "loss": 0.7338, "num_tokens": 2683873236.0, "step": 879 }, { "epoch": 1.2886975636563474, "grad_norm": 0.719296547305351, "learning_rate": 3.533998640945895e-05, "loss": 0.7421, "num_tokens": 2687040744.0, "step": 880 }, { "epoch": 1.290163033522623, "grad_norm": 0.4799917049068224, "learning_rate": 3.532851729782609e-05, "loss": 0.7463, "num_tokens": 2690231914.0, "step": 881 }, { "epoch": 1.291628503388899, "grad_norm": 0.649980288456009, "learning_rate": 3.5317036193937076e-05, "loss": 0.7562, "num_tokens": 2693452684.0, "step": 882 }, { "epoch": 1.293093973255175, "grad_norm": 0.5864776216563321, "learning_rate": 3.530554310812196e-05, "loss": 0.7281, "num_tokens": 2696480239.0, "step": 883 }, { "epoch": 1.2945594431214509, "grad_norm": 0.583749117807856, "learning_rate": 3.529403805072158e-05, "loss": 0.7196, "num_tokens": 2699452115.0, "step": 884 }, { "epoch": 1.2960249129877268, "grad_norm": 0.4439546221574545, "learning_rate": 3.528252103208755e-05, "loss": 0.7643, "num_tokens": 2702686208.0, "step": 885 }, { "epoch": 1.2974903828540025, "grad_norm": 0.6891188644057172, "learning_rate": 3.5270992062582236e-05, "loss": 0.7363, "num_tokens": 2705731990.0, "step": 886 }, { "epoch": 1.2989558527202785, "grad_norm": 0.591676696402332, "learning_rate": 3.525945115257876e-05, "loss": 0.7235, "num_tokens": 2708876244.0, "step": 887 }, { "epoch": 1.3004213225865544, "grad_norm": 0.4033024386014443, "learning_rate": 3.5247898312460994e-05, "loss": 0.7578, "num_tokens": 2711869617.0, "step": 888 }, { "epoch": 1.3018867924528301, "grad_norm": 0.5501485322714131, "learning_rate": 3.5236333552623536e-05, "loss": 0.7462, "num_tokens": 2714850812.0, "step": 889 }, { "epoch": 1.303352262319106, "grad_norm": 0.6634058318601604, "learning_rate": 3.522475688347171e-05, "loss": 0.7178, "num_tokens": 2717923189.0, "step": 890 }, { "epoch": 1.304817732185382, "grad_norm": 0.47786814637406694, "learning_rate": 3.5213168315421554e-05, "loss": 0.7275, "num_tokens": 2721026432.0, "step": 891 }, { "epoch": 1.3062832020516577, "grad_norm": 0.6881550321293585, "learning_rate": 3.5201567858899826e-05, "loss": 0.7455, "num_tokens": 2723974991.0, "step": 892 }, { "epoch": 1.3077486719179336, "grad_norm": 0.536795070157575, "learning_rate": 3.518995552434396e-05, "loss": 0.743, "num_tokens": 2727109890.0, "step": 893 }, { "epoch": 1.3092141417842096, "grad_norm": 0.5692289973097611, "learning_rate": 3.517833132220208e-05, "loss": 0.7436, "num_tokens": 2730015915.0, "step": 894 }, { "epoch": 1.3106796116504853, "grad_norm": 0.625149804943666, "learning_rate": 3.516669526293302e-05, "loss": 0.7519, "num_tokens": 2733262015.0, "step": 895 }, { "epoch": 1.3121450815167612, "grad_norm": 0.5464497519302005, "learning_rate": 3.515504735700624e-05, "loss": 0.7454, "num_tokens": 2736133138.0, "step": 896 }, { "epoch": 1.3136105513830372, "grad_norm": 0.9421700049481438, "learning_rate": 3.514338761490187e-05, "loss": 0.7372, "num_tokens": 2739271100.0, "step": 897 }, { "epoch": 1.315076021249313, "grad_norm": 0.6565265014193864, "learning_rate": 3.5131716047110704e-05, "loss": 0.7334, "num_tokens": 2742246498.0, "step": 898 }, { "epoch": 1.316541491115589, "grad_norm": 0.7901291103921996, "learning_rate": 3.512003266413416e-05, "loss": 0.725, "num_tokens": 2745337310.0, "step": 899 }, { "epoch": 1.3180069609818648, "grad_norm": 0.7269613575282566, "learning_rate": 3.51083374764843e-05, "loss": 0.7449, "num_tokens": 2748440341.0, "step": 900 }, { "epoch": 1.3194724308481407, "grad_norm": 0.7591733005729371, "learning_rate": 3.509663049468382e-05, "loss": 0.742, "num_tokens": 2751632761.0, "step": 901 }, { "epoch": 1.3209379007144166, "grad_norm": 0.573865126308196, "learning_rate": 3.508491172926597e-05, "loss": 0.7425, "num_tokens": 2754517329.0, "step": 902 }, { "epoch": 1.3224033705806923, "grad_norm": 0.8018713729767156, "learning_rate": 3.5073181190774665e-05, "loss": 0.7398, "num_tokens": 2757381874.0, "step": 903 }, { "epoch": 1.3238688404469683, "grad_norm": 0.513319093795368, "learning_rate": 3.5061438889764384e-05, "loss": 0.7378, "num_tokens": 2760492708.0, "step": 904 }, { "epoch": 1.3253343103132442, "grad_norm": 0.8144240071371247, "learning_rate": 3.50496848368002e-05, "loss": 0.7406, "num_tokens": 2763653827.0, "step": 905 }, { "epoch": 1.32679978017952, "grad_norm": 0.6678962157284265, "learning_rate": 3.503791904245774e-05, "loss": 0.7482, "num_tokens": 2766679813.0, "step": 906 }, { "epoch": 1.3282652500457959, "grad_norm": 0.7313164797008055, "learning_rate": 3.502614151732323e-05, "loss": 0.7483, "num_tokens": 2769515765.0, "step": 907 }, { "epoch": 1.3297307199120718, "grad_norm": 0.6108366670263453, "learning_rate": 3.5014352271993416e-05, "loss": 0.7563, "num_tokens": 2772513966.0, "step": 908 }, { "epoch": 1.3311961897783477, "grad_norm": 0.7368746447591218, "learning_rate": 3.5002551317075605e-05, "loss": 0.7453, "num_tokens": 2775430516.0, "step": 909 }, { "epoch": 1.3326616596446237, "grad_norm": 0.6535899430316167, "learning_rate": 3.499073866318763e-05, "loss": 0.7493, "num_tokens": 2778292388.0, "step": 910 }, { "epoch": 1.3341271295108994, "grad_norm": 0.7242669594512939, "learning_rate": 3.4978914320957877e-05, "loss": 0.7336, "num_tokens": 2781450279.0, "step": 911 }, { "epoch": 1.3355925993771753, "grad_norm": 0.7410910707778944, "learning_rate": 3.496707830102522e-05, "loss": 0.7348, "num_tokens": 2784588815.0, "step": 912 }, { "epoch": 1.3370580692434513, "grad_norm": 0.8021619705782066, "learning_rate": 3.495523061403904e-05, "loss": 0.7334, "num_tokens": 2787568115.0, "step": 913 }, { "epoch": 1.338523539109727, "grad_norm": 0.7166242360715578, "learning_rate": 3.4943371270659236e-05, "loss": 0.7449, "num_tokens": 2790342239.0, "step": 914 }, { "epoch": 1.339989008976003, "grad_norm": 0.6993206983026142, "learning_rate": 3.493150028155618e-05, "loss": 0.7402, "num_tokens": 2793388595.0, "step": 915 }, { "epoch": 1.3414544788422789, "grad_norm": 0.6663375919678785, "learning_rate": 3.491961765741073e-05, "loss": 0.7314, "num_tokens": 2796514001.0, "step": 916 }, { "epoch": 1.3429199487085546, "grad_norm": 0.6253520028667585, "learning_rate": 3.4907723408914196e-05, "loss": 0.7361, "num_tokens": 2799603959.0, "step": 917 }, { "epoch": 1.3443854185748305, "grad_norm": 0.5679769921239218, "learning_rate": 3.4895817546768365e-05, "loss": 0.7195, "num_tokens": 2802692213.0, "step": 918 }, { "epoch": 1.3458508884411065, "grad_norm": 0.6044390948647952, "learning_rate": 3.488390008168548e-05, "loss": 0.7567, "num_tokens": 2805650752.0, "step": 919 }, { "epoch": 1.3473163583073822, "grad_norm": 0.5294586623494583, "learning_rate": 3.4871971024388194e-05, "loss": 0.7166, "num_tokens": 2808966725.0, "step": 920 }, { "epoch": 1.348781828173658, "grad_norm": 0.6197481356480029, "learning_rate": 3.486003038560961e-05, "loss": 0.7558, "num_tokens": 2811968361.0, "step": 921 }, { "epoch": 1.350247298039934, "grad_norm": 0.4422127059512262, "learning_rate": 3.484807817609326e-05, "loss": 0.7295, "num_tokens": 2815190869.0, "step": 922 }, { "epoch": 1.35171276790621, "grad_norm": 0.6320052063219845, "learning_rate": 3.483611440659306e-05, "loss": 0.7287, "num_tokens": 2818489069.0, "step": 923 }, { "epoch": 1.353178237772486, "grad_norm": 0.4989912179242406, "learning_rate": 3.482413908787336e-05, "loss": 0.7614, "num_tokens": 2821507461.0, "step": 924 }, { "epoch": 1.3546437076387616, "grad_norm": 0.6948416855596531, "learning_rate": 3.4812152230708865e-05, "loss": 0.7277, "num_tokens": 2824764472.0, "step": 925 }, { "epoch": 1.3561091775050376, "grad_norm": 0.6054872670867794, "learning_rate": 3.4800153845884686e-05, "loss": 0.748, "num_tokens": 2827646531.0, "step": 926 }, { "epoch": 1.3575746473713135, "grad_norm": 0.6449690650564267, "learning_rate": 3.478814394419631e-05, "loss": 0.7273, "num_tokens": 2830527684.0, "step": 927 }, { "epoch": 1.3590401172375892, "grad_norm": 0.6459615081958557, "learning_rate": 3.477612253644956e-05, "loss": 0.7268, "num_tokens": 2833508673.0, "step": 928 }, { "epoch": 1.3605055871038652, "grad_norm": 0.6206023336750233, "learning_rate": 3.476408963346065e-05, "loss": 0.7472, "num_tokens": 2836495101.0, "step": 929 }, { "epoch": 1.361971056970141, "grad_norm": 0.5558160221659835, "learning_rate": 3.475204524605609e-05, "loss": 0.7375, "num_tokens": 2839618840.0, "step": 930 }, { "epoch": 1.3634365268364168, "grad_norm": 0.6834171666829266, "learning_rate": 3.473998938507276e-05, "loss": 0.7252, "num_tokens": 2842624399.0, "step": 931 }, { "epoch": 1.3649019967026927, "grad_norm": 0.535166478231198, "learning_rate": 3.472792206135786e-05, "loss": 0.7388, "num_tokens": 2845599578.0, "step": 932 }, { "epoch": 1.3663674665689687, "grad_norm": 0.6958789290012414, "learning_rate": 3.471584328576888e-05, "loss": 0.743, "num_tokens": 2849008132.0, "step": 933 }, { "epoch": 1.3678329364352446, "grad_norm": 0.5795017199080347, "learning_rate": 3.470375306917363e-05, "loss": 0.7271, "num_tokens": 2852241309.0, "step": 934 }, { "epoch": 1.3692984063015206, "grad_norm": 0.6430800398941781, "learning_rate": 3.469165142245022e-05, "loss": 0.7333, "num_tokens": 2855259819.0, "step": 935 }, { "epoch": 1.3707638761677963, "grad_norm": 0.6084882230817644, "learning_rate": 3.4679538356487026e-05, "loss": 0.734, "num_tokens": 2858303307.0, "step": 936 }, { "epoch": 1.3722293460340722, "grad_norm": 0.6178180948136079, "learning_rate": 3.4667413882182716e-05, "loss": 0.7472, "num_tokens": 2861516817.0, "step": 937 }, { "epoch": 1.3736948159003481, "grad_norm": 0.6337379342391498, "learning_rate": 3.465527801044622e-05, "loss": 0.742, "num_tokens": 2864492444.0, "step": 938 }, { "epoch": 1.3751602857666239, "grad_norm": 0.5602681633656569, "learning_rate": 3.464313075219671e-05, "loss": 0.738, "num_tokens": 2867632587.0, "step": 939 }, { "epoch": 1.3766257556328998, "grad_norm": 0.5722622089964792, "learning_rate": 3.463097211836361e-05, "loss": 0.7341, "num_tokens": 2870951702.0, "step": 940 }, { "epoch": 1.3780912254991757, "grad_norm": 0.5773895673524473, "learning_rate": 3.461880211988659e-05, "loss": 0.7417, "num_tokens": 2873623890.0, "step": 941 }, { "epoch": 1.3795566953654514, "grad_norm": 0.5525315648469733, "learning_rate": 3.4606620767715525e-05, "loss": 0.7232, "num_tokens": 2876713176.0, "step": 942 }, { "epoch": 1.3810221652317274, "grad_norm": 0.641720561109546, "learning_rate": 3.459442807281052e-05, "loss": 0.7448, "num_tokens": 2879951386.0, "step": 943 }, { "epoch": 1.3824876350980033, "grad_norm": 0.5224041688640501, "learning_rate": 3.458222404614188e-05, "loss": 0.7355, "num_tokens": 2883152057.0, "step": 944 }, { "epoch": 1.383953104964279, "grad_norm": 0.6819701096751417, "learning_rate": 3.4570008698690114e-05, "loss": 0.7137, "num_tokens": 2886164102.0, "step": 945 }, { "epoch": 1.385418574830555, "grad_norm": 0.5588874499535569, "learning_rate": 3.45577820414459e-05, "loss": 0.7346, "num_tokens": 2889284373.0, "step": 946 }, { "epoch": 1.386884044696831, "grad_norm": 0.5929731360099287, "learning_rate": 3.454554408541012e-05, "loss": 0.746, "num_tokens": 2892160330.0, "step": 947 }, { "epoch": 1.3883495145631068, "grad_norm": 0.5160411712842133, "learning_rate": 3.4533294841593776e-05, "loss": 0.7247, "num_tokens": 2895375745.0, "step": 948 }, { "epoch": 1.3898149844293828, "grad_norm": 0.7424513485301389, "learning_rate": 3.452103432101808e-05, "loss": 0.7565, "num_tokens": 2898300768.0, "step": 949 }, { "epoch": 1.3912804542956585, "grad_norm": 0.5544281865303642, "learning_rate": 3.450876253471434e-05, "loss": 0.7284, "num_tokens": 2901691257.0, "step": 950 }, { "epoch": 1.3927459241619344, "grad_norm": 0.6708300175017372, "learning_rate": 3.449647949372405e-05, "loss": 0.7187, "num_tokens": 2904669676.0, "step": 951 }, { "epoch": 1.3942113940282104, "grad_norm": 0.6765064450390005, "learning_rate": 3.4484185209098794e-05, "loss": 0.7348, "num_tokens": 2907777219.0, "step": 952 }, { "epoch": 1.395676863894486, "grad_norm": 0.5049804141461358, "learning_rate": 3.4471879691900286e-05, "loss": 0.7591, "num_tokens": 2910708073.0, "step": 953 }, { "epoch": 1.397142333760762, "grad_norm": 0.6468541829191335, "learning_rate": 3.445956295320033e-05, "loss": 0.7386, "num_tokens": 2913920906.0, "step": 954 }, { "epoch": 1.398607803627038, "grad_norm": 0.4467384601105551, "learning_rate": 3.444723500408085e-05, "loss": 0.746, "num_tokens": 2917014602.0, "step": 955 }, { "epoch": 1.4000732734933137, "grad_norm": 0.6180670226839605, "learning_rate": 3.443489585563385e-05, "loss": 0.7346, "num_tokens": 2919983150.0, "step": 956 }, { "epoch": 1.4015387433595896, "grad_norm": 0.5282588342729994, "learning_rate": 3.4422545518961414e-05, "loss": 0.7302, "num_tokens": 2922957480.0, "step": 957 }, { "epoch": 1.4030042132258655, "grad_norm": 0.7002150090693635, "learning_rate": 3.4410184005175664e-05, "loss": 0.7442, "num_tokens": 2926140811.0, "step": 958 }, { "epoch": 1.4044696830921415, "grad_norm": 0.47725222938137485, "learning_rate": 3.4397811325398814e-05, "loss": 0.7318, "num_tokens": 2929194529.0, "step": 959 }, { "epoch": 1.4059351529584174, "grad_norm": 0.747502314767629, "learning_rate": 3.4385427490763116e-05, "loss": 0.727, "num_tokens": 2932193635.0, "step": 960 }, { "epoch": 1.4074006228246931, "grad_norm": 0.552906821488841, "learning_rate": 3.4373032512410845e-05, "loss": 0.7563, "num_tokens": 2935381047.0, "step": 961 }, { "epoch": 1.408866092690969, "grad_norm": 0.8080846582881377, "learning_rate": 3.436062640149431e-05, "loss": 0.7363, "num_tokens": 2938530550.0, "step": 962 }, { "epoch": 1.410331562557245, "grad_norm": 0.7453059903364085, "learning_rate": 3.434820916917584e-05, "loss": 0.7401, "num_tokens": 2941383867.0, "step": 963 }, { "epoch": 1.4117970324235207, "grad_norm": 0.672349403884283, "learning_rate": 3.4335780826627764e-05, "loss": 0.736, "num_tokens": 2944459169.0, "step": 964 }, { "epoch": 1.4132625022897967, "grad_norm": 0.6891330381682266, "learning_rate": 3.432334138503242e-05, "loss": 0.7271, "num_tokens": 2947851010.0, "step": 965 }, { "epoch": 1.4147279721560726, "grad_norm": 0.6429726196876129, "learning_rate": 3.431089085558212e-05, "loss": 0.7572, "num_tokens": 2951004105.0, "step": 966 }, { "epoch": 1.4161934420223483, "grad_norm": 0.562623386368141, "learning_rate": 3.429842924947916e-05, "loss": 0.746, "num_tokens": 2954020786.0, "step": 967 }, { "epoch": 1.4176589118886243, "grad_norm": 0.607478971387216, "learning_rate": 3.428595657793578e-05, "loss": 0.7417, "num_tokens": 2957099154.0, "step": 968 }, { "epoch": 1.4191243817549002, "grad_norm": 0.5389546295318713, "learning_rate": 3.427347285217421e-05, "loss": 0.7351, "num_tokens": 2960108352.0, "step": 969 }, { "epoch": 1.420589851621176, "grad_norm": 0.5754693986416561, "learning_rate": 3.42609780834266e-05, "loss": 0.7389, "num_tokens": 2963035631.0, "step": 970 }, { "epoch": 1.4220553214874518, "grad_norm": 0.544263037545951, "learning_rate": 3.4248472282935065e-05, "loss": 0.7185, "num_tokens": 2966120311.0, "step": 971 }, { "epoch": 1.4235207913537278, "grad_norm": 0.46744224241911153, "learning_rate": 3.42359554619516e-05, "loss": 0.7361, "num_tokens": 2969538335.0, "step": 972 }, { "epoch": 1.4249862612200037, "grad_norm": 0.558341556330857, "learning_rate": 3.422342763173817e-05, "loss": 0.734, "num_tokens": 2972842448.0, "step": 973 }, { "epoch": 1.4264517310862797, "grad_norm": 0.5704332279149912, "learning_rate": 3.4210888803566586e-05, "loss": 0.7463, "num_tokens": 2975764776.0, "step": 974 }, { "epoch": 1.4279172009525554, "grad_norm": 0.5027732741836143, "learning_rate": 3.4198338988718606e-05, "loss": 0.7209, "num_tokens": 2978793947.0, "step": 975 }, { "epoch": 1.4293826708188313, "grad_norm": 0.4886380327133936, "learning_rate": 3.418577819848587e-05, "loss": 0.7454, "num_tokens": 2981910995.0, "step": 976 }, { "epoch": 1.4308481406851072, "grad_norm": 0.668011447770949, "learning_rate": 3.417320644416983e-05, "loss": 0.7591, "num_tokens": 2984863255.0, "step": 977 }, { "epoch": 1.432313610551383, "grad_norm": 0.4084779472147276, "learning_rate": 3.4160623737081886e-05, "loss": 0.7437, "num_tokens": 2987818764.0, "step": 978 }, { "epoch": 1.433779080417659, "grad_norm": 0.7710555219278353, "learning_rate": 3.414803008854324e-05, "loss": 0.7488, "num_tokens": 2991012288.0, "step": 979 }, { "epoch": 1.4352445502839348, "grad_norm": 0.4769981754981097, "learning_rate": 3.413542550988497e-05, "loss": 0.7411, "num_tokens": 2993879474.0, "step": 980 }, { "epoch": 1.4367100201502105, "grad_norm": 0.7442189314526665, "learning_rate": 3.412281001244796e-05, "loss": 0.7237, "num_tokens": 2996964695.0, "step": 981 }, { "epoch": 1.4381754900164865, "grad_norm": 0.5896864791581798, "learning_rate": 3.4110183607582924e-05, "loss": 0.734, "num_tokens": 2999946705.0, "step": 982 }, { "epoch": 1.4396409598827624, "grad_norm": 0.7182802812964383, "learning_rate": 3.4097546306650405e-05, "loss": 0.7266, "num_tokens": 3003064026.0, "step": 983 }, { "epoch": 1.4411064297490384, "grad_norm": 0.6154766894616646, "learning_rate": 3.4084898121020745e-05, "loss": 0.7309, "num_tokens": 3006311072.0, "step": 984 }, { "epoch": 1.4425718996153143, "grad_norm": 0.5629964853819998, "learning_rate": 3.407223906207407e-05, "loss": 0.7336, "num_tokens": 3009305560.0, "step": 985 }, { "epoch": 1.44403736948159, "grad_norm": 0.6747766488021395, "learning_rate": 3.40595691412003e-05, "loss": 0.7291, "num_tokens": 3012366568.0, "step": 986 }, { "epoch": 1.445502839347866, "grad_norm": 0.5401785966747638, "learning_rate": 3.404688836979911e-05, "loss": 0.748, "num_tokens": 3015395405.0, "step": 987 }, { "epoch": 1.4469683092141419, "grad_norm": 0.6958414132208053, "learning_rate": 3.403419675927997e-05, "loss": 0.7302, "num_tokens": 3018312192.0, "step": 988 }, { "epoch": 1.4484337790804176, "grad_norm": 0.7072724335636941, "learning_rate": 3.4021494321062075e-05, "loss": 0.7232, "num_tokens": 3021345414.0, "step": 989 }, { "epoch": 1.4498992489466935, "grad_norm": 0.619813952584958, "learning_rate": 3.4008781066574364e-05, "loss": 0.7384, "num_tokens": 3024209697.0, "step": 990 }, { "epoch": 1.4513647188129695, "grad_norm": 0.6326259284214336, "learning_rate": 3.3996057007255515e-05, "loss": 0.7201, "num_tokens": 3027275361.0, "step": 991 }, { "epoch": 1.4528301886792452, "grad_norm": 0.6490924108908067, "learning_rate": 3.3983322154553925e-05, "loss": 0.7453, "num_tokens": 3030208820.0, "step": 992 }, { "epoch": 1.4542956585455211, "grad_norm": 0.5454165487780195, "learning_rate": 3.397057651992772e-05, "loss": 0.7365, "num_tokens": 3033375472.0, "step": 993 }, { "epoch": 1.455761128411797, "grad_norm": 0.6814216328849936, "learning_rate": 3.3957820114844695e-05, "loss": 0.7269, "num_tokens": 3036382522.0, "step": 994 }, { "epoch": 1.4572265982780728, "grad_norm": 0.5297811835999773, "learning_rate": 3.3945052950782364e-05, "loss": 0.7474, "num_tokens": 3039228237.0, "step": 995 }, { "epoch": 1.4586920681443487, "grad_norm": 0.8430452099991067, "learning_rate": 3.39322750392279e-05, "loss": 0.7386, "num_tokens": 3042036354.0, "step": 996 }, { "epoch": 1.4601575380106246, "grad_norm": 0.618661490915788, "learning_rate": 3.391948639167814e-05, "loss": 0.7338, "num_tokens": 3044980209.0, "step": 997 }, { "epoch": 1.4616230078769006, "grad_norm": 0.8117242423674851, "learning_rate": 3.390668701963963e-05, "loss": 0.7414, "num_tokens": 3047910690.0, "step": 998 }, { "epoch": 1.4630884777431765, "grad_norm": 0.6036432914437648, "learning_rate": 3.389387693462849e-05, "loss": 0.7431, "num_tokens": 3051078641.0, "step": 999 }, { "epoch": 1.4645539476094522, "grad_norm": 0.7211648519234534, "learning_rate": 3.3881056148170555e-05, "loss": 0.7375, "num_tokens": 3054309968.0, "step": 1000 }, { "epoch": 1.4660194174757282, "grad_norm": 0.6218336784876674, "learning_rate": 3.386822467180124e-05, "loss": 0.7423, "num_tokens": 3057172620.0, "step": 1001 }, { "epoch": 1.467484887342004, "grad_norm": 0.5984309290919958, "learning_rate": 3.3855382517065596e-05, "loss": 0.7147, "num_tokens": 3060346309.0, "step": 1002 }, { "epoch": 1.4689503572082798, "grad_norm": 0.6385437456365122, "learning_rate": 3.384252969551827e-05, "loss": 0.745, "num_tokens": 3063391272.0, "step": 1003 }, { "epoch": 1.4704158270745558, "grad_norm": 0.6297622733277343, "learning_rate": 3.382966621872351e-05, "loss": 0.7568, "num_tokens": 3066181361.0, "step": 1004 }, { "epoch": 1.4718812969408317, "grad_norm": 0.5651789372439435, "learning_rate": 3.381679209825517e-05, "loss": 0.7291, "num_tokens": 3069156520.0, "step": 1005 }, { "epoch": 1.4733467668071074, "grad_norm": 0.5589722648461087, "learning_rate": 3.3803907345696646e-05, "loss": 0.7407, "num_tokens": 3072397598.0, "step": 1006 }, { "epoch": 1.4748122366733833, "grad_norm": 0.524079519668168, "learning_rate": 3.379101197264093e-05, "loss": 0.7324, "num_tokens": 3075555539.0, "step": 1007 }, { "epoch": 1.4762777065396593, "grad_norm": 0.6180532395165693, "learning_rate": 3.377810599069056e-05, "loss": 0.7352, "num_tokens": 3078543904.0, "step": 1008 }, { "epoch": 1.4777431764059352, "grad_norm": 0.5229208127610999, "learning_rate": 3.376518941145762e-05, "loss": 0.7158, "num_tokens": 3081761956.0, "step": 1009 }, { "epoch": 1.4792086462722112, "grad_norm": 0.6383453906067004, "learning_rate": 3.375226224656372e-05, "loss": 0.7273, "num_tokens": 3084811251.0, "step": 1010 }, { "epoch": 1.4806741161384869, "grad_norm": 0.49514539961983495, "learning_rate": 3.3739324507640015e-05, "loss": 0.7521, "num_tokens": 3087806764.0, "step": 1011 }, { "epoch": 1.4821395860047628, "grad_norm": 0.8699250735380287, "learning_rate": 3.372637620632715e-05, "loss": 0.7389, "num_tokens": 3090779784.0, "step": 1012 }, { "epoch": 1.4836050558710387, "grad_norm": 0.48935068934199033, "learning_rate": 3.371341735427528e-05, "loss": 0.7234, "num_tokens": 3094195816.0, "step": 1013 }, { "epoch": 1.4850705257373145, "grad_norm": 1.0395314552772876, "learning_rate": 3.370044796314407e-05, "loss": 0.7453, "num_tokens": 3097399998.0, "step": 1014 }, { "epoch": 1.4865359956035904, "grad_norm": 0.7646123752473198, "learning_rate": 3.3687468044602646e-05, "loss": 0.7234, "num_tokens": 3100545754.0, "step": 1015 }, { "epoch": 1.4880014654698663, "grad_norm": 0.8611666423619171, "learning_rate": 3.367447761032961e-05, "loss": 0.7499, "num_tokens": 3103484358.0, "step": 1016 }, { "epoch": 1.489466935336142, "grad_norm": 0.7014876343470275, "learning_rate": 3.366147667201304e-05, "loss": 0.7246, "num_tokens": 3106717553.0, "step": 1017 }, { "epoch": 1.490932405202418, "grad_norm": 0.8213018732023244, "learning_rate": 3.364846524135045e-05, "loss": 0.7328, "num_tokens": 3109883877.0, "step": 1018 }, { "epoch": 1.492397875068694, "grad_norm": 0.6751736098287298, "learning_rate": 3.36354433300488e-05, "loss": 0.7382, "num_tokens": 3112957536.0, "step": 1019 }, { "epoch": 1.4938633449349696, "grad_norm": 0.7140332273375319, "learning_rate": 3.3622410949824474e-05, "loss": 0.726, "num_tokens": 3116099381.0, "step": 1020 }, { "epoch": 1.4953288148012456, "grad_norm": 0.7338329659282037, "learning_rate": 3.360936811240328e-05, "loss": 0.7364, "num_tokens": 3119184190.0, "step": 1021 }, { "epoch": 1.4967942846675215, "grad_norm": 0.5879790484116247, "learning_rate": 3.3596314829520453e-05, "loss": 0.7443, "num_tokens": 3122460671.0, "step": 1022 }, { "epoch": 1.4982597545337974, "grad_norm": 0.6465434506831328, "learning_rate": 3.35832511129206e-05, "loss": 0.7413, "num_tokens": 3125486302.0, "step": 1023 }, { "epoch": 1.4997252244000734, "grad_norm": 0.6191655344923633, "learning_rate": 3.3570176974357714e-05, "loss": 0.7185, "num_tokens": 3128471232.0, "step": 1024 }, { "epoch": 1.501190694266349, "grad_norm": 0.5394981482070776, "learning_rate": 3.3557092425595176e-05, "loss": 0.7161, "num_tokens": 3131622502.0, "step": 1025 }, { "epoch": 1.502656164132625, "grad_norm": 0.5400216333786843, "learning_rate": 3.354399747840575e-05, "loss": 0.7179, "num_tokens": 3134599862.0, "step": 1026 }, { "epoch": 1.504121633998901, "grad_norm": 0.46173965699052566, "learning_rate": 3.353089214457152e-05, "loss": 0.721, "num_tokens": 3137434405.0, "step": 1027 }, { "epoch": 1.5055871038651767, "grad_norm": 0.5916615463438385, "learning_rate": 3.351777643588394e-05, "loss": 0.7391, "num_tokens": 3140414626.0, "step": 1028 }, { "epoch": 1.5070525737314526, "grad_norm": 0.4051633404578729, "learning_rate": 3.35046503641438e-05, "loss": 0.7105, "num_tokens": 3143608942.0, "step": 1029 }, { "epoch": 1.5085180435977286, "grad_norm": 0.6945349103596598, "learning_rate": 3.34915139411612e-05, "loss": 0.7415, "num_tokens": 3146615569.0, "step": 1030 }, { "epoch": 1.5099835134640043, "grad_norm": 0.524614205099497, "learning_rate": 3.347836717875556e-05, "loss": 0.7327, "num_tokens": 3149682923.0, "step": 1031 }, { "epoch": 1.5114489833302802, "grad_norm": 0.6678133863051265, "learning_rate": 3.346521008875561e-05, "loss": 0.7414, "num_tokens": 3152953914.0, "step": 1032 }, { "epoch": 1.5129144531965562, "grad_norm": 0.5011042569664438, "learning_rate": 3.345204268299936e-05, "loss": 0.7293, "num_tokens": 3155990327.0, "step": 1033 }, { "epoch": 1.5143799230628319, "grad_norm": 0.7013384833743773, "learning_rate": 3.343886497333411e-05, "loss": 0.724, "num_tokens": 3158890670.0, "step": 1034 }, { "epoch": 1.515845392929108, "grad_norm": 0.5873282531934446, "learning_rate": 3.342567697161642e-05, "loss": 0.7431, "num_tokens": 3161951741.0, "step": 1035 }, { "epoch": 1.5173108627953837, "grad_norm": 0.6791240518885524, "learning_rate": 3.341247868971213e-05, "loss": 0.733, "num_tokens": 3164976386.0, "step": 1036 }, { "epoch": 1.5187763326616597, "grad_norm": 0.5949906401251243, "learning_rate": 3.3399270139496306e-05, "loss": 0.7612, "num_tokens": 3167889904.0, "step": 1037 }, { "epoch": 1.5202418025279356, "grad_norm": 0.6706588465673512, "learning_rate": 3.338605133285327e-05, "loss": 0.7119, "num_tokens": 3171120853.0, "step": 1038 }, { "epoch": 1.5217072723942113, "grad_norm": 0.6429484704240662, "learning_rate": 3.3372822281676565e-05, "loss": 0.7201, "num_tokens": 3174232159.0, "step": 1039 }, { "epoch": 1.5231727422604873, "grad_norm": 0.5746702477684014, "learning_rate": 3.335958299786895e-05, "loss": 0.7596, "num_tokens": 3177062588.0, "step": 1040 }, { "epoch": 1.5246382121267632, "grad_norm": 0.545463085540361, "learning_rate": 3.3346333493342414e-05, "loss": 0.7378, "num_tokens": 3180182151.0, "step": 1041 }, { "epoch": 1.526103681993039, "grad_norm": 0.5929814988867435, "learning_rate": 3.3333073780018086e-05, "loss": 0.732, "num_tokens": 3183146742.0, "step": 1042 }, { "epoch": 1.5275691518593149, "grad_norm": 0.49123305976961607, "learning_rate": 3.3319803869826354e-05, "loss": 0.7251, "num_tokens": 3186262520.0, "step": 1043 }, { "epoch": 1.5290346217255908, "grad_norm": 0.6407367429991835, "learning_rate": 3.3306523774706715e-05, "loss": 0.7343, "num_tokens": 3189152324.0, "step": 1044 }, { "epoch": 1.5305000915918665, "grad_norm": 0.47801118859236186, "learning_rate": 3.329323350660787e-05, "loss": 0.7049, "num_tokens": 3192112782.0, "step": 1045 }, { "epoch": 1.5319655614581427, "grad_norm": 0.6416701796670824, "learning_rate": 3.327993307748766e-05, "loss": 0.723, "num_tokens": 3195045774.0, "step": 1046 }, { "epoch": 1.5334310313244184, "grad_norm": 0.5557665646133894, "learning_rate": 3.3266622499313066e-05, "loss": 0.7217, "num_tokens": 3198097635.0, "step": 1047 }, { "epoch": 1.5348965011906943, "grad_norm": 0.5704360442709387, "learning_rate": 3.3253301784060207e-05, "loss": 0.7299, "num_tokens": 3201147087.0, "step": 1048 }, { "epoch": 1.5363619710569703, "grad_norm": 0.49295754630428207, "learning_rate": 3.323997094371431e-05, "loss": 0.7339, "num_tokens": 3204216507.0, "step": 1049 }, { "epoch": 1.537827440923246, "grad_norm": 0.624907743916921, "learning_rate": 3.322662999026974e-05, "loss": 0.7335, "num_tokens": 3207563407.0, "step": 1050 }, { "epoch": 1.539292910789522, "grad_norm": 0.564770988258131, "learning_rate": 3.3213278935729924e-05, "loss": 0.7316, "num_tokens": 3210611152.0, "step": 1051 }, { "epoch": 1.5407583806557978, "grad_norm": 0.7179081314393227, "learning_rate": 3.31999177921074e-05, "loss": 0.7306, "num_tokens": 3213727576.0, "step": 1052 }, { "epoch": 1.5422238505220736, "grad_norm": 0.5848492695758731, "learning_rate": 3.318654657142379e-05, "loss": 0.7228, "num_tokens": 3216904552.0, "step": 1053 }, { "epoch": 1.5436893203883495, "grad_norm": 0.5371751676311499, "learning_rate": 3.317316528570976e-05, "loss": 0.7184, "num_tokens": 3219952403.0, "step": 1054 }, { "epoch": 1.5451547902546254, "grad_norm": 0.5781681539920612, "learning_rate": 3.3159773947005054e-05, "loss": 0.7258, "num_tokens": 3222983927.0, "step": 1055 }, { "epoch": 1.5466202601209011, "grad_norm": 0.4889391421591862, "learning_rate": 3.314637256735844e-05, "loss": 0.7217, "num_tokens": 3226075562.0, "step": 1056 }, { "epoch": 1.548085729987177, "grad_norm": 0.5275678672623512, "learning_rate": 3.3132961158827733e-05, "loss": 0.7301, "num_tokens": 3229168040.0, "step": 1057 }, { "epoch": 1.549551199853453, "grad_norm": 0.4633066536625952, "learning_rate": 3.3119539733479785e-05, "loss": 0.7547, "num_tokens": 3232171336.0, "step": 1058 }, { "epoch": 1.5510166697197287, "grad_norm": 0.5483144279440194, "learning_rate": 3.310610830339044e-05, "loss": 0.7212, "num_tokens": 3235061721.0, "step": 1059 }, { "epoch": 1.552482139586005, "grad_norm": 0.48880476646932947, "learning_rate": 3.3092666880644525e-05, "loss": 0.7243, "num_tokens": 3237903227.0, "step": 1060 }, { "epoch": 1.5539476094522806, "grad_norm": 0.46243286275003714, "learning_rate": 3.307921547733592e-05, "loss": 0.735, "num_tokens": 3240970329.0, "step": 1061 }, { "epoch": 1.5554130793185565, "grad_norm": 0.7132371324763109, "learning_rate": 3.306575410556742e-05, "loss": 0.723, "num_tokens": 3243978829.0, "step": 1062 }, { "epoch": 1.5568785491848325, "grad_norm": 0.4477501541427797, "learning_rate": 3.305228277745083e-05, "loss": 0.7218, "num_tokens": 3247116034.0, "step": 1063 }, { "epoch": 1.5583440190511082, "grad_norm": 0.6760630324645992, "learning_rate": 3.30388015051069e-05, "loss": 0.7285, "num_tokens": 3250381168.0, "step": 1064 }, { "epoch": 1.5598094889173841, "grad_norm": 0.5942818310541969, "learning_rate": 3.302531030066531e-05, "loss": 0.7117, "num_tokens": 3253460108.0, "step": 1065 }, { "epoch": 1.56127495878366, "grad_norm": 0.5297456474104453, "learning_rate": 3.301180917626471e-05, "loss": 0.7263, "num_tokens": 3256471161.0, "step": 1066 }, { "epoch": 1.5627404286499358, "grad_norm": 0.5573329355506744, "learning_rate": 3.299829814405265e-05, "loss": 0.7352, "num_tokens": 3259645447.0, "step": 1067 }, { "epoch": 1.5642058985162117, "grad_norm": 0.5511109591257657, "learning_rate": 3.298477721618561e-05, "loss": 0.7379, "num_tokens": 3262751134.0, "step": 1068 }, { "epoch": 1.5656713683824877, "grad_norm": 0.6279421627656148, "learning_rate": 3.297124640482896e-05, "loss": 0.7301, "num_tokens": 3265827560.0, "step": 1069 }, { "epoch": 1.5671368382487634, "grad_norm": 0.4783115147960427, "learning_rate": 3.295770572215697e-05, "loss": 0.7258, "num_tokens": 3268846708.0, "step": 1070 }, { "epoch": 1.5686023081150395, "grad_norm": 0.6547957915224174, "learning_rate": 3.2944155180352795e-05, "loss": 0.7408, "num_tokens": 3271910814.0, "step": 1071 }, { "epoch": 1.5700677779813152, "grad_norm": 0.45670962151841776, "learning_rate": 3.2930594791608454e-05, "loss": 0.7241, "num_tokens": 3274836099.0, "step": 1072 }, { "epoch": 1.5715332478475912, "grad_norm": 0.6451060056726136, "learning_rate": 3.291702456812483e-05, "loss": 0.7505, "num_tokens": 3277794487.0, "step": 1073 }, { "epoch": 1.5729987177138671, "grad_norm": 0.5679023782283982, "learning_rate": 3.290344452211164e-05, "loss": 0.7434, "num_tokens": 3281078531.0, "step": 1074 }, { "epoch": 1.5744641875801428, "grad_norm": 0.6597028958316802, "learning_rate": 3.2889854665787466e-05, "loss": 0.7179, "num_tokens": 3284324102.0, "step": 1075 }, { "epoch": 1.5759296574464188, "grad_norm": 0.56692146377955, "learning_rate": 3.2876255011379704e-05, "loss": 0.7169, "num_tokens": 3287622327.0, "step": 1076 }, { "epoch": 1.5773951273126947, "grad_norm": 0.6026189001325984, "learning_rate": 3.286264557112456e-05, "loss": 0.7486, "num_tokens": 3290657528.0, "step": 1077 }, { "epoch": 1.5788605971789704, "grad_norm": 0.5367689355581213, "learning_rate": 3.284902635726704e-05, "loss": 0.7389, "num_tokens": 3293587118.0, "step": 1078 }, { "epoch": 1.5803260670452464, "grad_norm": 0.47501642869900035, "learning_rate": 3.2835397382060966e-05, "loss": 0.7158, "num_tokens": 3296569848.0, "step": 1079 }, { "epoch": 1.5817915369115223, "grad_norm": 0.5319110225524265, "learning_rate": 3.282175865776892e-05, "loss": 0.7136, "num_tokens": 3299573942.0, "step": 1080 }, { "epoch": 1.583257006777798, "grad_norm": 0.5111563929543897, "learning_rate": 3.280811019666227e-05, "loss": 0.7254, "num_tokens": 3302835000.0, "step": 1081 }, { "epoch": 1.584722476644074, "grad_norm": 0.47152872247235283, "learning_rate": 3.279445201102114e-05, "loss": 0.7433, "num_tokens": 3305790323.0, "step": 1082 }, { "epoch": 1.5861879465103499, "grad_norm": 0.4350385103265398, "learning_rate": 3.278078411313439e-05, "loss": 0.7294, "num_tokens": 3308699330.0, "step": 1083 }, { "epoch": 1.5876534163766256, "grad_norm": 0.6991661751412634, "learning_rate": 3.276710651529966e-05, "loss": 0.7214, "num_tokens": 3311852068.0, "step": 1084 }, { "epoch": 1.5891188862429018, "grad_norm": 0.41310351036312054, "learning_rate": 3.275341922982326e-05, "loss": 0.7282, "num_tokens": 3314935902.0, "step": 1085 }, { "epoch": 1.5905843561091775, "grad_norm": 0.7182567783415769, "learning_rate": 3.2739722269020274e-05, "loss": 0.7288, "num_tokens": 3317773354.0, "step": 1086 }, { "epoch": 1.5920498259754534, "grad_norm": 0.5026910431209334, "learning_rate": 3.272601564521444e-05, "loss": 0.7187, "num_tokens": 3320777060.0, "step": 1087 }, { "epoch": 1.5935152958417294, "grad_norm": 0.7936574154842663, "learning_rate": 3.2712299370738216e-05, "loss": 0.7139, "num_tokens": 3323783788.0, "step": 1088 }, { "epoch": 1.594980765708005, "grad_norm": 0.5904362906427473, "learning_rate": 3.2698573457932756e-05, "loss": 0.7336, "num_tokens": 3327053346.0, "step": 1089 }, { "epoch": 1.596446235574281, "grad_norm": 0.6557313003915473, "learning_rate": 3.268483791914785e-05, "loss": 0.7172, "num_tokens": 3330003713.0, "step": 1090 }, { "epoch": 1.597911705440557, "grad_norm": 0.62142816696905, "learning_rate": 3.267109276674199e-05, "loss": 0.7163, "num_tokens": 3333253515.0, "step": 1091 }, { "epoch": 1.5993771753068327, "grad_norm": 0.5871516815498982, "learning_rate": 3.265733801308228e-05, "loss": 0.7301, "num_tokens": 3336608597.0, "step": 1092 }, { "epoch": 1.6008426451731086, "grad_norm": 0.5796492999924486, "learning_rate": 3.264357367054449e-05, "loss": 0.74, "num_tokens": 3339606789.0, "step": 1093 }, { "epoch": 1.6023081150393845, "grad_norm": 0.5926079023957083, "learning_rate": 3.2629799751512996e-05, "loss": 0.7332, "num_tokens": 3342709017.0, "step": 1094 }, { "epoch": 1.6037735849056602, "grad_norm": 0.5532692724452195, "learning_rate": 3.2616016268380815e-05, "loss": 0.7118, "num_tokens": 3345740170.0, "step": 1095 }, { "epoch": 1.6052390547719364, "grad_norm": 0.47389646118390866, "learning_rate": 3.2602223233549556e-05, "loss": 0.7194, "num_tokens": 3348752435.0, "step": 1096 }, { "epoch": 1.6067045246382121, "grad_norm": 0.49737385676964324, "learning_rate": 3.258842065942941e-05, "loss": 0.7443, "num_tokens": 3351883456.0, "step": 1097 }, { "epoch": 1.608169994504488, "grad_norm": 0.5505309311923081, "learning_rate": 3.257460855843917e-05, "loss": 0.7424, "num_tokens": 3354872328.0, "step": 1098 }, { "epoch": 1.609635464370764, "grad_norm": 0.5179267250505282, "learning_rate": 3.25607869430062e-05, "loss": 0.72, "num_tokens": 3357911934.0, "step": 1099 }, { "epoch": 1.6111009342370397, "grad_norm": 0.532374180336076, "learning_rate": 3.2546955825566404e-05, "loss": 0.724, "num_tokens": 3360937464.0, "step": 1100 }, { "epoch": 1.6125664041033156, "grad_norm": 0.4454089975890635, "learning_rate": 3.2533115218564265e-05, "loss": 0.7395, "num_tokens": 3363940797.0, "step": 1101 }, { "epoch": 1.6140318739695916, "grad_norm": 0.5161215969558626, "learning_rate": 3.2519265134452775e-05, "loss": 0.7724, "num_tokens": 3367099238.0, "step": 1102 }, { "epoch": 1.6154973438358673, "grad_norm": 0.4747798841869285, "learning_rate": 3.250540558569348e-05, "loss": 0.7602, "num_tokens": 3369999554.0, "step": 1103 }, { "epoch": 1.6169628137021432, "grad_norm": 0.4847535669381688, "learning_rate": 3.2491536584756416e-05, "loss": 0.7274, "num_tokens": 3372943501.0, "step": 1104 }, { "epoch": 1.6184282835684192, "grad_norm": 0.4115228114749518, "learning_rate": 3.247765814412014e-05, "loss": 0.7457, "num_tokens": 3375951689.0, "step": 1105 }, { "epoch": 1.6198937534346949, "grad_norm": 0.5336664125975602, "learning_rate": 3.246377027627171e-05, "loss": 0.7439, "num_tokens": 3379156676.0, "step": 1106 }, { "epoch": 1.6213592233009708, "grad_norm": 0.401389918543112, "learning_rate": 3.244987299370664e-05, "loss": 0.7501, "num_tokens": 3382240737.0, "step": 1107 }, { "epoch": 1.6228246931672468, "grad_norm": 0.7268465763963753, "learning_rate": 3.243596630892894e-05, "loss": 0.7243, "num_tokens": 3385196219.0, "step": 1108 }, { "epoch": 1.6242901630335225, "grad_norm": 0.5300005206436273, "learning_rate": 3.242205023445106e-05, "loss": 0.7368, "num_tokens": 3387967027.0, "step": 1109 }, { "epoch": 1.6257556328997986, "grad_norm": 0.7429931924769059, "learning_rate": 3.240812478279391e-05, "loss": 0.7206, "num_tokens": 3391236126.0, "step": 1110 }, { "epoch": 1.6272211027660743, "grad_norm": 0.5988607820965608, "learning_rate": 3.239418996648684e-05, "loss": 0.7397, "num_tokens": 3394493367.0, "step": 1111 }, { "epoch": 1.6286865726323503, "grad_norm": 0.7831629603886402, "learning_rate": 3.2380245798067614e-05, "loss": 0.7293, "num_tokens": 3397358120.0, "step": 1112 }, { "epoch": 1.6301520424986262, "grad_norm": 0.5698472888619368, "learning_rate": 3.236629229008241e-05, "loss": 0.7409, "num_tokens": 3400446904.0, "step": 1113 }, { "epoch": 1.631617512364902, "grad_norm": 0.7832338127587768, "learning_rate": 3.235232945508584e-05, "loss": 0.7277, "num_tokens": 3403780880.0, "step": 1114 }, { "epoch": 1.6330829822311779, "grad_norm": 0.5965913553926194, "learning_rate": 3.233835730564085e-05, "loss": 0.7287, "num_tokens": 3406895068.0, "step": 1115 }, { "epoch": 1.6345484520974538, "grad_norm": 0.8305549488545156, "learning_rate": 3.232437585431883e-05, "loss": 0.7454, "num_tokens": 3409902342.0, "step": 1116 }, { "epoch": 1.6360139219637295, "grad_norm": 0.6756638577677493, "learning_rate": 3.2310385113699496e-05, "loss": 0.7462, "num_tokens": 3412854131.0, "step": 1117 }, { "epoch": 1.6374793918300055, "grad_norm": 0.644462533790811, "learning_rate": 3.229638509637094e-05, "loss": 0.74, "num_tokens": 3415771153.0, "step": 1118 }, { "epoch": 1.6389448616962814, "grad_norm": 0.6646110036363029, "learning_rate": 3.228237581492959e-05, "loss": 0.7351, "num_tokens": 3418796312.0, "step": 1119 }, { "epoch": 1.6404103315625571, "grad_norm": 0.6225027121162595, "learning_rate": 3.226835728198023e-05, "loss": 0.7132, "num_tokens": 3421893847.0, "step": 1120 }, { "epoch": 1.6418758014288333, "grad_norm": 0.5222282192448345, "learning_rate": 3.225432951013593e-05, "loss": 0.7513, "num_tokens": 3424804513.0, "step": 1121 }, { "epoch": 1.643341271295109, "grad_norm": 0.6087105812141241, "learning_rate": 3.224029251201812e-05, "loss": 0.7275, "num_tokens": 3427716445.0, "step": 1122 }, { "epoch": 1.644806741161385, "grad_norm": 0.49733717478903516, "learning_rate": 3.2226246300256496e-05, "loss": 0.7306, "num_tokens": 3430705202.0, "step": 1123 }, { "epoch": 1.6462722110276609, "grad_norm": 0.6844653594208825, "learning_rate": 3.221219088748906e-05, "loss": 0.7158, "num_tokens": 3433730688.0, "step": 1124 }, { "epoch": 1.6477376808939366, "grad_norm": 0.54577534675777, "learning_rate": 3.219812628636208e-05, "loss": 0.7293, "num_tokens": 3436920229.0, "step": 1125 }, { "epoch": 1.6492031507602125, "grad_norm": 0.6712674045339279, "learning_rate": 3.2184052509530106e-05, "loss": 0.7122, "num_tokens": 3440037524.0, "step": 1126 }, { "epoch": 1.6506686206264884, "grad_norm": 0.5592617985393468, "learning_rate": 3.216996956965594e-05, "loss": 0.7292, "num_tokens": 3443366594.0, "step": 1127 }, { "epoch": 1.6521340904927642, "grad_norm": 0.6533970459944575, "learning_rate": 3.215587747941062e-05, "loss": 0.7368, "num_tokens": 3446537867.0, "step": 1128 }, { "epoch": 1.65359956035904, "grad_norm": 0.6034060954180704, "learning_rate": 3.214177625147342e-05, "loss": 0.7272, "num_tokens": 3449643899.0, "step": 1129 }, { "epoch": 1.655065030225316, "grad_norm": 0.5900912401335233, "learning_rate": 3.212766589853185e-05, "loss": 0.7237, "num_tokens": 3452601463.0, "step": 1130 }, { "epoch": 1.6565305000915918, "grad_norm": 0.5342510300640673, "learning_rate": 3.21135464332816e-05, "loss": 0.7171, "num_tokens": 3455723692.0, "step": 1131 }, { "epoch": 1.6579959699578677, "grad_norm": 0.5643493411202638, "learning_rate": 3.20994178684266e-05, "loss": 0.7256, "num_tokens": 3458622862.0, "step": 1132 }, { "epoch": 1.6594614398241436, "grad_norm": 0.5399316844760527, "learning_rate": 3.2085280216678923e-05, "loss": 0.7441, "num_tokens": 3461719654.0, "step": 1133 }, { "epoch": 1.6609269096904193, "grad_norm": 0.4921045738972708, "learning_rate": 3.207113349075885e-05, "loss": 0.716, "num_tokens": 3464702901.0, "step": 1134 }, { "epoch": 1.6623923795566955, "grad_norm": 0.4523643759793127, "learning_rate": 3.205697770339481e-05, "loss": 0.7316, "num_tokens": 3467766734.0, "step": 1135 }, { "epoch": 1.6638578494229712, "grad_norm": 0.6263062140338715, "learning_rate": 3.204281286732341e-05, "loss": 0.752, "num_tokens": 3470636718.0, "step": 1136 }, { "epoch": 1.6653233192892472, "grad_norm": 0.513645243389043, "learning_rate": 3.2028638995289345e-05, "loss": 0.7372, "num_tokens": 3473675088.0, "step": 1137 }, { "epoch": 1.666788789155523, "grad_norm": 0.588528659401566, "learning_rate": 3.2014456100045504e-05, "loss": 0.7195, "num_tokens": 3476699377.0, "step": 1138 }, { "epoch": 1.6682542590217988, "grad_norm": 0.5390585146923718, "learning_rate": 3.200026419435284e-05, "loss": 0.7398, "num_tokens": 3479592997.0, "step": 1139 }, { "epoch": 1.6697197288880747, "grad_norm": 0.5755487365152632, "learning_rate": 3.1986063290980457e-05, "loss": 0.7245, "num_tokens": 3482691964.0, "step": 1140 }, { "epoch": 1.6711851987543507, "grad_norm": 0.5244672541282115, "learning_rate": 3.1971853402705534e-05, "loss": 0.7271, "num_tokens": 3485731436.0, "step": 1141 }, { "epoch": 1.6726506686206264, "grad_norm": 0.6215608684517648, "learning_rate": 3.195763454231333e-05, "loss": 0.7117, "num_tokens": 3488714522.0, "step": 1142 }, { "epoch": 1.6741161384869023, "grad_norm": 0.5050276928827526, "learning_rate": 3.1943406722597186e-05, "loss": 0.7289, "num_tokens": 3491785471.0, "step": 1143 }, { "epoch": 1.6755816083531783, "grad_norm": 0.6954141112996651, "learning_rate": 3.192916995635849e-05, "loss": 0.7233, "num_tokens": 3494749318.0, "step": 1144 }, { "epoch": 1.677047078219454, "grad_norm": 0.5471953535700242, "learning_rate": 3.1914924256406706e-05, "loss": 0.7293, "num_tokens": 3497798166.0, "step": 1145 }, { "epoch": 1.6785125480857301, "grad_norm": 0.7340035788949208, "learning_rate": 3.1900669635559306e-05, "loss": 0.7105, "num_tokens": 3501072425.0, "step": 1146 }, { "epoch": 1.6799780179520059, "grad_norm": 0.6500835901284955, "learning_rate": 3.188640610664181e-05, "loss": 0.7288, "num_tokens": 3504045034.0, "step": 1147 }, { "epoch": 1.6814434878182818, "grad_norm": 0.5211505039482257, "learning_rate": 3.187213368248774e-05, "loss": 0.7361, "num_tokens": 3507246803.0, "step": 1148 }, { "epoch": 1.6829089576845577, "grad_norm": 0.7598502157506023, "learning_rate": 3.185785237593863e-05, "loss": 0.7378, "num_tokens": 3510298892.0, "step": 1149 }, { "epoch": 1.6843744275508334, "grad_norm": 0.4629403012605498, "learning_rate": 3.184356219984401e-05, "loss": 0.7326, "num_tokens": 3513347419.0, "step": 1150 }, { "epoch": 1.6858398974171094, "grad_norm": 0.7595370205256337, "learning_rate": 3.182926316706137e-05, "loss": 0.7475, "num_tokens": 3516145116.0, "step": 1151 }, { "epoch": 1.6873053672833853, "grad_norm": 0.6497827709852049, "learning_rate": 3.181495529045619e-05, "loss": 0.7423, "num_tokens": 3519224426.0, "step": 1152 }, { "epoch": 1.688770837149661, "grad_norm": 0.6208031171457175, "learning_rate": 3.18006385829019e-05, "loss": 0.7176, "num_tokens": 3522498714.0, "step": 1153 }, { "epoch": 1.690236307015937, "grad_norm": 0.5811303817958572, "learning_rate": 3.1786313057279864e-05, "loss": 0.7255, "num_tokens": 3525681709.0, "step": 1154 }, { "epoch": 1.691701776882213, "grad_norm": 0.49336714457331404, "learning_rate": 3.17719787264794e-05, "loss": 0.7217, "num_tokens": 3528737216.0, "step": 1155 }, { "epoch": 1.6931672467484886, "grad_norm": 0.5836098369806012, "learning_rate": 3.1757635603397735e-05, "loss": 0.7269, "num_tokens": 3531866415.0, "step": 1156 }, { "epoch": 1.6946327166147646, "grad_norm": 0.4697816778655563, "learning_rate": 3.174328370094002e-05, "loss": 0.7194, "num_tokens": 3534927930.0, "step": 1157 }, { "epoch": 1.6960981864810405, "grad_norm": 0.5838790882495166, "learning_rate": 3.1728923032019285e-05, "loss": 0.7397, "num_tokens": 3538129014.0, "step": 1158 }, { "epoch": 1.6975636563473162, "grad_norm": 0.5090830826740874, "learning_rate": 3.171455360955647e-05, "loss": 0.7302, "num_tokens": 3541246046.0, "step": 1159 }, { "epoch": 1.6990291262135924, "grad_norm": 0.6422400724318863, "learning_rate": 3.170017544648036e-05, "loss": 0.7278, "num_tokens": 3544645074.0, "step": 1160 }, { "epoch": 1.700494596079868, "grad_norm": 0.5383132944282785, "learning_rate": 3.1685788555727645e-05, "loss": 0.7397, "num_tokens": 3547839346.0, "step": 1161 }, { "epoch": 1.701960065946144, "grad_norm": 0.5343123575795636, "learning_rate": 3.1671392950242836e-05, "loss": 0.7354, "num_tokens": 3550811126.0, "step": 1162 }, { "epoch": 1.70342553581242, "grad_norm": 0.5795373498690242, "learning_rate": 3.165698864297829e-05, "loss": 0.7222, "num_tokens": 3553816084.0, "step": 1163 }, { "epoch": 1.7048910056786957, "grad_norm": 0.4559744537160204, "learning_rate": 3.164257564689421e-05, "loss": 0.7469, "num_tokens": 3556765666.0, "step": 1164 }, { "epoch": 1.7063564755449716, "grad_norm": 0.6471463380792962, "learning_rate": 3.162815397495861e-05, "loss": 0.7121, "num_tokens": 3559821001.0, "step": 1165 }, { "epoch": 1.7078219454112475, "grad_norm": 0.4850845905244907, "learning_rate": 3.1613723640147286e-05, "loss": 0.7298, "num_tokens": 3562873173.0, "step": 1166 }, { "epoch": 1.7092874152775233, "grad_norm": 0.6188422567781574, "learning_rate": 3.159928465544387e-05, "loss": 0.7242, "num_tokens": 3565849863.0, "step": 1167 }, { "epoch": 1.7107528851437992, "grad_norm": 0.4840749384988767, "learning_rate": 3.1584837033839736e-05, "loss": 0.7306, "num_tokens": 3568947848.0, "step": 1168 }, { "epoch": 1.7122183550100751, "grad_norm": 0.5596761016201692, "learning_rate": 3.1570380788334056e-05, "loss": 0.7105, "num_tokens": 3571882911.0, "step": 1169 }, { "epoch": 1.7136838248763508, "grad_norm": 0.4918055873183456, "learning_rate": 3.155591593193375e-05, "loss": 0.7288, "num_tokens": 3574990662.0, "step": 1170 }, { "epoch": 1.715149294742627, "grad_norm": 0.592534000360727, "learning_rate": 3.1541442477653504e-05, "loss": 0.7232, "num_tokens": 3577884351.0, "step": 1171 }, { "epoch": 1.7166147646089027, "grad_norm": 0.5114371719476067, "learning_rate": 3.15269604385157e-05, "loss": 0.7264, "num_tokens": 3580907849.0, "step": 1172 }, { "epoch": 1.7180802344751787, "grad_norm": 0.469456909143289, "learning_rate": 3.151246982755049e-05, "loss": 0.7071, "num_tokens": 3583984770.0, "step": 1173 }, { "epoch": 1.7195457043414546, "grad_norm": 0.4488381216659133, "learning_rate": 3.1497970657795704e-05, "loss": 0.7119, "num_tokens": 3587035519.0, "step": 1174 }, { "epoch": 1.7210111742077303, "grad_norm": 0.4706246660214865, "learning_rate": 3.1483462942296887e-05, "loss": 0.7301, "num_tokens": 3590095217.0, "step": 1175 }, { "epoch": 1.7224766440740062, "grad_norm": 0.48020071861280217, "learning_rate": 3.146894669410728e-05, "loss": 0.7354, "num_tokens": 3593060397.0, "step": 1176 }, { "epoch": 1.7239421139402822, "grad_norm": 0.4285191601994742, "learning_rate": 3.145442192628779e-05, "loss": 0.7279, "num_tokens": 3596249174.0, "step": 1177 }, { "epoch": 1.725407583806558, "grad_norm": 0.49624752039054004, "learning_rate": 3.143988865190699e-05, "loss": 0.7339, "num_tokens": 3599176168.0, "step": 1178 }, { "epoch": 1.7268730536728338, "grad_norm": 0.462328638319801, "learning_rate": 3.14253468840411e-05, "loss": 0.7211, "num_tokens": 3602206679.0, "step": 1179 }, { "epoch": 1.7283385235391098, "grad_norm": 0.41562889584227203, "learning_rate": 3.141079663577401e-05, "loss": 0.7066, "num_tokens": 3605320795.0, "step": 1180 }, { "epoch": 1.7298039934053855, "grad_norm": 0.527025598144562, "learning_rate": 3.1396237920197225e-05, "loss": 0.726, "num_tokens": 3608412928.0, "step": 1181 }, { "epoch": 1.7312694632716614, "grad_norm": 0.4446874218780101, "learning_rate": 3.1381670750409835e-05, "loss": 0.7399, "num_tokens": 3611443429.0, "step": 1182 }, { "epoch": 1.7327349331379374, "grad_norm": 0.5290020254165947, "learning_rate": 3.136709513951859e-05, "loss": 0.7404, "num_tokens": 3614404701.0, "step": 1183 }, { "epoch": 1.734200403004213, "grad_norm": 0.44132305155312357, "learning_rate": 3.1352511100637806e-05, "loss": 0.7327, "num_tokens": 3617466277.0, "step": 1184 }, { "epoch": 1.7356658728704892, "grad_norm": 0.4040477249423107, "learning_rate": 3.133791864688939e-05, "loss": 0.7279, "num_tokens": 3620521979.0, "step": 1185 }, { "epoch": 1.737131342736765, "grad_norm": 0.48742269882355593, "learning_rate": 3.132331779140281e-05, "loss": 0.7348, "num_tokens": 3623620004.0, "step": 1186 }, { "epoch": 1.7385968126030409, "grad_norm": 0.5296388508795381, "learning_rate": 3.130870854731511e-05, "loss": 0.7213, "num_tokens": 3626619270.0, "step": 1187 }, { "epoch": 1.7400622824693168, "grad_norm": 0.4144302858314123, "learning_rate": 3.129409092777087e-05, "loss": 0.7104, "num_tokens": 3629694777.0, "step": 1188 }, { "epoch": 1.7415277523355925, "grad_norm": 0.4799981364362493, "learning_rate": 3.1279464945922207e-05, "loss": 0.738, "num_tokens": 3632549218.0, "step": 1189 }, { "epoch": 1.7429932222018685, "grad_norm": 0.49866993256609593, "learning_rate": 3.126483061492876e-05, "loss": 0.7315, "num_tokens": 3635710550.0, "step": 1190 }, { "epoch": 1.7444586920681444, "grad_norm": 0.46700647493444075, "learning_rate": 3.125018794795769e-05, "loss": 0.737, "num_tokens": 3638794343.0, "step": 1191 }, { "epoch": 1.7459241619344201, "grad_norm": 0.5935741460610882, "learning_rate": 3.123553695818364e-05, "loss": 0.7356, "num_tokens": 3641788783.0, "step": 1192 }, { "epoch": 1.747389631800696, "grad_norm": 0.46757382601514824, "learning_rate": 3.1220877658788774e-05, "loss": 0.7267, "num_tokens": 3644594037.0, "step": 1193 }, { "epoch": 1.748855101666972, "grad_norm": 0.5719204535864542, "learning_rate": 3.120621006296269e-05, "loss": 0.7141, "num_tokens": 3647606571.0, "step": 1194 }, { "epoch": 1.7503205715332477, "grad_norm": 0.5712083735969787, "learning_rate": 3.119153418390249e-05, "loss": 0.7172, "num_tokens": 3650837850.0, "step": 1195 }, { "epoch": 1.7517860413995239, "grad_norm": 0.43977234853142266, "learning_rate": 3.1176850034812694e-05, "loss": 0.7488, "num_tokens": 3653885362.0, "step": 1196 }, { "epoch": 1.7532515112657996, "grad_norm": 0.6888557285643923, "learning_rate": 3.116215762890529e-05, "loss": 0.7265, "num_tokens": 3657005632.0, "step": 1197 }, { "epoch": 1.7547169811320755, "grad_norm": 0.4555981573824444, "learning_rate": 3.1147456979399686e-05, "loss": 0.7402, "num_tokens": 3660038114.0, "step": 1198 }, { "epoch": 1.7561824509983515, "grad_norm": 0.9133133219116264, "learning_rate": 3.113274809952271e-05, "loss": 0.7235, "num_tokens": 3663102093.0, "step": 1199 }, { "epoch": 1.7576479208646272, "grad_norm": 0.6001701307208664, "learning_rate": 3.111803100250858e-05, "loss": 0.7457, "num_tokens": 3666024379.0, "step": 1200 }, { "epoch": 1.7591133907309031, "grad_norm": 0.8691813575113173, "learning_rate": 3.110330570159892e-05, "loss": 0.7214, "num_tokens": 3669185912.0, "step": 1201 }, { "epoch": 1.760578860597179, "grad_norm": 0.7944104008325189, "learning_rate": 3.108857221004275e-05, "loss": 0.7133, "num_tokens": 3672169276.0, "step": 1202 }, { "epoch": 1.7620443304634548, "grad_norm": 0.7338254029358421, "learning_rate": 3.107383054109642e-05, "loss": 0.7189, "num_tokens": 3675174314.0, "step": 1203 }, { "epoch": 1.7635098003297307, "grad_norm": 0.7773729042515446, "learning_rate": 3.1059080708023674e-05, "loss": 0.7178, "num_tokens": 3678366344.0, "step": 1204 }, { "epoch": 1.7649752701960066, "grad_norm": 0.6663116985371427, "learning_rate": 3.104432272409559e-05, "loss": 0.7097, "num_tokens": 3681639036.0, "step": 1205 }, { "epoch": 1.7664407400622824, "grad_norm": 0.5910589152708715, "learning_rate": 3.102955660259058e-05, "loss": 0.7231, "num_tokens": 3684517526.0, "step": 1206 }, { "epoch": 1.7679062099285583, "grad_norm": 0.7038530116824047, "learning_rate": 3.101478235679437e-05, "loss": 0.7354, "num_tokens": 3687443110.0, "step": 1207 }, { "epoch": 1.7693716797948342, "grad_norm": 0.678595011949634, "learning_rate": 3.1e-05, "loss": 0.7257, "num_tokens": 3690614820.0, "step": 1208 }, { "epoch": 1.77083714966111, "grad_norm": 0.818009933965047, "learning_rate": 3.098520954550782e-05, "loss": 0.7275, "num_tokens": 3693621057.0, "step": 1209 }, { "epoch": 1.772302619527386, "grad_norm": 0.7179045363260441, "learning_rate": 3.0970411006625445e-05, "loss": 0.7043, "num_tokens": 3696534892.0, "step": 1210 }, { "epoch": 1.7737680893936618, "grad_norm": 0.6034110261983744, "learning_rate": 3.095560439666779e-05, "loss": 0.721, "num_tokens": 3699492714.0, "step": 1211 }, { "epoch": 1.7752335592599378, "grad_norm": 0.7017108899176389, "learning_rate": 3.094078972895699e-05, "loss": 0.7211, "num_tokens": 3702772210.0, "step": 1212 }, { "epoch": 1.7766990291262137, "grad_norm": 0.6500870497685952, "learning_rate": 3.0925967016822486e-05, "loss": 0.7374, "num_tokens": 3705916326.0, "step": 1213 }, { "epoch": 1.7781644989924894, "grad_norm": 0.5352470806940077, "learning_rate": 3.0911136273600906e-05, "loss": 0.7263, "num_tokens": 3708915957.0, "step": 1214 }, { "epoch": 1.7796299688587653, "grad_norm": 0.7690626070923308, "learning_rate": 3.089629751263614e-05, "loss": 0.719, "num_tokens": 3711971201.0, "step": 1215 }, { "epoch": 1.7810954387250413, "grad_norm": 0.5731324691113999, "learning_rate": 3.0881450747279255e-05, "loss": 0.728, "num_tokens": 3715236356.0, "step": 1216 }, { "epoch": 1.782560908591317, "grad_norm": 0.819380736545993, "learning_rate": 3.086659599088857e-05, "loss": 0.7065, "num_tokens": 3718341788.0, "step": 1217 }, { "epoch": 1.784026378457593, "grad_norm": 0.6858913601520322, "learning_rate": 3.085173325682955e-05, "loss": 0.7254, "num_tokens": 3721332499.0, "step": 1218 }, { "epoch": 1.7854918483238689, "grad_norm": 0.6445905221342578, "learning_rate": 3.0836862558474855e-05, "loss": 0.7249, "num_tokens": 3724273749.0, "step": 1219 }, { "epoch": 1.7869573181901446, "grad_norm": 0.574660481109089, "learning_rate": 3.0821983909204316e-05, "loss": 0.7195, "num_tokens": 3727425504.0, "step": 1220 }, { "epoch": 1.7884227880564207, "grad_norm": 0.75081816156552, "learning_rate": 3.08070973224049e-05, "loss": 0.7378, "num_tokens": 3730494884.0, "step": 1221 }, { "epoch": 1.7898882579226965, "grad_norm": 0.593475888457599, "learning_rate": 3.0792202811470735e-05, "loss": 0.7056, "num_tokens": 3733504552.0, "step": 1222 }, { "epoch": 1.7913537277889724, "grad_norm": 0.6755419208235262, "learning_rate": 3.077730038980307e-05, "loss": 0.7262, "num_tokens": 3736659452.0, "step": 1223 }, { "epoch": 1.7928191976552483, "grad_norm": 0.5849937750893893, "learning_rate": 3.0762390070810275e-05, "loss": 0.7366, "num_tokens": 3739684642.0, "step": 1224 }, { "epoch": 1.794284667521524, "grad_norm": 0.674225850230527, "learning_rate": 3.074747186790782e-05, "loss": 0.7394, "num_tokens": 3742597202.0, "step": 1225 }, { "epoch": 1.7957501373878, "grad_norm": 0.5711571637211587, "learning_rate": 3.0732545794518274e-05, "loss": 0.7314, "num_tokens": 3745617744.0, "step": 1226 }, { "epoch": 1.797215607254076, "grad_norm": 0.717277548086943, "learning_rate": 3.071761186407128e-05, "loss": 0.7375, "num_tokens": 3748811784.0, "step": 1227 }, { "epoch": 1.7986810771203516, "grad_norm": 0.5684042177698139, "learning_rate": 3.070267009000355e-05, "loss": 0.7162, "num_tokens": 3751778385.0, "step": 1228 }, { "epoch": 1.8001465469866276, "grad_norm": 0.6747659605905263, "learning_rate": 3.068772048575888e-05, "loss": 0.7134, "num_tokens": 3754704324.0, "step": 1229 }, { "epoch": 1.8016120168529035, "grad_norm": 0.6402452989130403, "learning_rate": 3.0672763064788074e-05, "loss": 0.7144, "num_tokens": 3757710461.0, "step": 1230 }, { "epoch": 1.8030774867191792, "grad_norm": 0.6290810914966188, "learning_rate": 3.065779784054898e-05, "loss": 0.7162, "num_tokens": 3760634495.0, "step": 1231 }, { "epoch": 1.8045429565854552, "grad_norm": 0.6439534028747723, "learning_rate": 3.064282482650648e-05, "loss": 0.7142, "num_tokens": 3763783677.0, "step": 1232 }, { "epoch": 1.806008426451731, "grad_norm": 0.5635779311708001, "learning_rate": 3.0627844036132466e-05, "loss": 0.7147, "num_tokens": 3766845635.0, "step": 1233 }, { "epoch": 1.8074738963180068, "grad_norm": 0.5929714627987843, "learning_rate": 3.06128554829058e-05, "loss": 0.7123, "num_tokens": 3770041904.0, "step": 1234 }, { "epoch": 1.808939366184283, "grad_norm": 0.5261616168233537, "learning_rate": 3.0597859180312344e-05, "loss": 0.7011, "num_tokens": 3773108762.0, "step": 1235 }, { "epoch": 1.8104048360505587, "grad_norm": 0.5032184727828983, "learning_rate": 3.0582855141844956e-05, "loss": 0.7171, "num_tokens": 3776214952.0, "step": 1236 }, { "epoch": 1.8118703059168346, "grad_norm": 0.6350475983716228, "learning_rate": 3.0567843381003416e-05, "loss": 0.7341, "num_tokens": 3779222343.0, "step": 1237 }, { "epoch": 1.8133357757831106, "grad_norm": 0.4474772270239477, "learning_rate": 3.055282391129446e-05, "loss": 0.7257, "num_tokens": 3782649879.0, "step": 1238 }, { "epoch": 1.8148012456493863, "grad_norm": 0.7076818118800702, "learning_rate": 3.0537796746231805e-05, "loss": 0.7336, "num_tokens": 3785569586.0, "step": 1239 }, { "epoch": 1.8162667155156622, "grad_norm": 0.6107843827660692, "learning_rate": 3.052276189933602e-05, "loss": 0.725, "num_tokens": 3788492150.0, "step": 1240 }, { "epoch": 1.8177321853819381, "grad_norm": 0.5603325806919623, "learning_rate": 3.050771938413463e-05, "loss": 0.7112, "num_tokens": 3791434690.0, "step": 1241 }, { "epoch": 1.8191976552482139, "grad_norm": 0.5556538930141467, "learning_rate": 3.049266921416205e-05, "loss": 0.7163, "num_tokens": 3794494914.0, "step": 1242 }, { "epoch": 1.8206631251144898, "grad_norm": 0.5650521624336602, "learning_rate": 3.0477611402959605e-05, "loss": 0.7281, "num_tokens": 3797463439.0, "step": 1243 }, { "epoch": 1.8221285949807657, "grad_norm": 0.5191335525390897, "learning_rate": 3.0462545964075433e-05, "loss": 0.7163, "num_tokens": 3800742071.0, "step": 1244 }, { "epoch": 1.8235940648470415, "grad_norm": 0.5726209233287751, "learning_rate": 3.0447472911064616e-05, "loss": 0.7384, "num_tokens": 3803769567.0, "step": 1245 }, { "epoch": 1.8250595347133176, "grad_norm": 0.4733174092666106, "learning_rate": 3.043239225748902e-05, "loss": 0.7347, "num_tokens": 3806715261.0, "step": 1246 }, { "epoch": 1.8265250045795933, "grad_norm": 0.6097679977025574, "learning_rate": 3.041730401691738e-05, "loss": 0.7359, "num_tokens": 3809710035.0, "step": 1247 }, { "epoch": 1.8279904744458693, "grad_norm": 0.4392833749665127, "learning_rate": 3.040220820292526e-05, "loss": 0.7129, "num_tokens": 3812762860.0, "step": 1248 }, { "epoch": 1.8294559443121452, "grad_norm": 0.642871805059476, "learning_rate": 3.038710482909503e-05, "loss": 0.7116, "num_tokens": 3815808336.0, "step": 1249 }, { "epoch": 1.830921414178421, "grad_norm": 0.473544473470459, "learning_rate": 3.0371993909015858e-05, "loss": 0.7303, "num_tokens": 3818893372.0, "step": 1250 }, { "epoch": 1.8323868840446969, "grad_norm": 0.5539277442754945, "learning_rate": 3.03568754562837e-05, "loss": 0.7301, "num_tokens": 3821947824.0, "step": 1251 }, { "epoch": 1.8338523539109728, "grad_norm": 0.46592363231241957, "learning_rate": 3.0341749484501316e-05, "loss": 0.7115, "num_tokens": 3825092070.0, "step": 1252 }, { "epoch": 1.8353178237772485, "grad_norm": 0.6924577231114566, "learning_rate": 3.03266160072782e-05, "loss": 0.7111, "num_tokens": 3827937601.0, "step": 1253 }, { "epoch": 1.8367832936435244, "grad_norm": 0.42918670883616356, "learning_rate": 3.0311475038230616e-05, "loss": 0.7249, "num_tokens": 3831020705.0, "step": 1254 }, { "epoch": 1.8382487635098004, "grad_norm": 0.660740856727847, "learning_rate": 3.029632659098155e-05, "loss": 0.7185, "num_tokens": 3834170742.0, "step": 1255 }, { "epoch": 1.839714233376076, "grad_norm": 0.5799651025885605, "learning_rate": 3.0281170679160743e-05, "loss": 0.725, "num_tokens": 3837245824.0, "step": 1256 }, { "epoch": 1.841179703242352, "grad_norm": 0.5447004998552158, "learning_rate": 3.0266007316404635e-05, "loss": 0.7119, "num_tokens": 3840314537.0, "step": 1257 }, { "epoch": 1.842645173108628, "grad_norm": 0.4921625001388072, "learning_rate": 3.025083651635636e-05, "loss": 0.7219, "num_tokens": 3843225950.0, "step": 1258 }, { "epoch": 1.8441106429749037, "grad_norm": 0.5500610059160009, "learning_rate": 3.0235658292665778e-05, "loss": 0.7111, "num_tokens": 3846310344.0, "step": 1259 }, { "epoch": 1.8455761128411798, "grad_norm": 0.4844317779243711, "learning_rate": 3.0220472658989407e-05, "loss": 0.7269, "num_tokens": 3849205778.0, "step": 1260 }, { "epoch": 1.8470415827074556, "grad_norm": 0.5577544085898276, "learning_rate": 3.0205279628990417e-05, "loss": 0.7139, "num_tokens": 3852222396.0, "step": 1261 }, { "epoch": 1.8485070525737315, "grad_norm": 0.4619759057969569, "learning_rate": 3.0190079216338656e-05, "loss": 0.7272, "num_tokens": 3855186567.0, "step": 1262 }, { "epoch": 1.8499725224400074, "grad_norm": 0.5868929021123546, "learning_rate": 3.0174871434710612e-05, "loss": 0.7234, "num_tokens": 3858432501.0, "step": 1263 }, { "epoch": 1.8514379923062831, "grad_norm": 0.5396731238726628, "learning_rate": 3.0159656297789396e-05, "loss": 0.7214, "num_tokens": 3861627148.0, "step": 1264 }, { "epoch": 1.852903462172559, "grad_norm": 0.5403695104336745, "learning_rate": 3.014443381926473e-05, "loss": 0.7019, "num_tokens": 3864632607.0, "step": 1265 }, { "epoch": 1.854368932038835, "grad_norm": 0.45004985065149816, "learning_rate": 3.0129204012832963e-05, "loss": 0.7052, "num_tokens": 3867717730.0, "step": 1266 }, { "epoch": 1.8558344019051107, "grad_norm": 0.6075585215305394, "learning_rate": 3.0113966892197026e-05, "loss": 0.7146, "num_tokens": 3870666202.0, "step": 1267 }, { "epoch": 1.8572998717713867, "grad_norm": 0.4328967180693791, "learning_rate": 3.0098722471066417e-05, "loss": 0.7001, "num_tokens": 3873591993.0, "step": 1268 }, { "epoch": 1.8587653416376626, "grad_norm": 0.539706477141303, "learning_rate": 3.008347076315722e-05, "loss": 0.7402, "num_tokens": 3876666875.0, "step": 1269 }, { "epoch": 1.8602308115039383, "grad_norm": 0.5367717342516909, "learning_rate": 3.0068211782192085e-05, "loss": 0.7028, "num_tokens": 3879833772.0, "step": 1270 }, { "epoch": 1.8616962813702145, "grad_norm": 0.4826607062342516, "learning_rate": 3.0052945541900175e-05, "loss": 0.7259, "num_tokens": 3882760931.0, "step": 1271 }, { "epoch": 1.8631617512364902, "grad_norm": 0.40969050374044647, "learning_rate": 3.00376720560172e-05, "loss": 0.7244, "num_tokens": 3885764262.0, "step": 1272 }, { "epoch": 1.864627221102766, "grad_norm": 0.691735874987434, "learning_rate": 3.0022391338285407e-05, "loss": 0.7328, "num_tokens": 3888743117.0, "step": 1273 }, { "epoch": 1.866092690969042, "grad_norm": 0.4928958006199086, "learning_rate": 3.000710340245352e-05, "loss": 0.7284, "num_tokens": 3891784760.0, "step": 1274 }, { "epoch": 1.8675581608353178, "grad_norm": 0.6032744021262607, "learning_rate": 2.9991808262276774e-05, "loss": 0.7248, "num_tokens": 3894970619.0, "step": 1275 }, { "epoch": 1.8690236307015937, "grad_norm": 0.6037351309961003, "learning_rate": 2.9976505931516878e-05, "loss": 0.7209, "num_tokens": 3897981649.0, "step": 1276 }, { "epoch": 1.8704891005678697, "grad_norm": 0.44614918124326197, "learning_rate": 2.9961196423942026e-05, "loss": 0.725, "num_tokens": 3901342269.0, "step": 1277 }, { "epoch": 1.8719545704341454, "grad_norm": 0.5684075035685395, "learning_rate": 2.9945879753326848e-05, "loss": 0.7283, "num_tokens": 3904568828.0, "step": 1278 }, { "epoch": 1.8734200403004213, "grad_norm": 0.434531047628026, "learning_rate": 2.9930555933452437e-05, "loss": 0.7067, "num_tokens": 3907811666.0, "step": 1279 }, { "epoch": 1.8748855101666972, "grad_norm": 0.500522785420292, "learning_rate": 2.9915224978106305e-05, "loss": 0.7126, "num_tokens": 3910784487.0, "step": 1280 }, { "epoch": 1.876350980032973, "grad_norm": 0.499460152151047, "learning_rate": 2.98998869010824e-05, "loss": 0.7148, "num_tokens": 3913881783.0, "step": 1281 }, { "epoch": 1.877816449899249, "grad_norm": 0.5777470029825018, "learning_rate": 2.988454171618105e-05, "loss": 0.7317, "num_tokens": 3916968803.0, "step": 1282 }, { "epoch": 1.8792819197655248, "grad_norm": 0.461422046466006, "learning_rate": 2.9869189437209017e-05, "loss": 0.7105, "num_tokens": 3920206115.0, "step": 1283 }, { "epoch": 1.8807473896318005, "grad_norm": 0.5384301075744651, "learning_rate": 2.985383007797942e-05, "loss": 0.7176, "num_tokens": 3923304339.0, "step": 1284 }, { "epoch": 1.8822128594980767, "grad_norm": 0.5054358006013112, "learning_rate": 2.983846365231174e-05, "loss": 0.7317, "num_tokens": 3926274071.0, "step": 1285 }, { "epoch": 1.8836783293643524, "grad_norm": 0.5318289539992627, "learning_rate": 2.9823090174031835e-05, "loss": 0.7285, "num_tokens": 3929194088.0, "step": 1286 }, { "epoch": 1.8851437992306284, "grad_norm": 0.4657849944244456, "learning_rate": 2.980770965697191e-05, "loss": 0.7193, "num_tokens": 3932347458.0, "step": 1287 }, { "epoch": 1.8866092690969043, "grad_norm": 0.4252315927345098, "learning_rate": 2.9792322114970495e-05, "loss": 0.7292, "num_tokens": 3935622990.0, "step": 1288 }, { "epoch": 1.88807473896318, "grad_norm": 0.5361154096964137, "learning_rate": 2.9776927561872434e-05, "loss": 0.7289, "num_tokens": 3938474829.0, "step": 1289 }, { "epoch": 1.889540208829456, "grad_norm": 0.46064933986285345, "learning_rate": 2.97615260115289e-05, "loss": 0.7231, "num_tokens": 3941721777.0, "step": 1290 }, { "epoch": 1.8910056786957319, "grad_norm": 0.44213175021100964, "learning_rate": 2.974611747779733e-05, "loss": 0.7281, "num_tokens": 3944819865.0, "step": 1291 }, { "epoch": 1.8924711485620076, "grad_norm": 0.5852228945647604, "learning_rate": 2.9730701974541478e-05, "loss": 0.7063, "num_tokens": 3948020936.0, "step": 1292 }, { "epoch": 1.8939366184282835, "grad_norm": 0.4811258644337281, "learning_rate": 2.9715279515631346e-05, "loss": 0.7156, "num_tokens": 3950942952.0, "step": 1293 }, { "epoch": 1.8954020882945595, "grad_norm": 0.46879300962403375, "learning_rate": 2.9699850114943208e-05, "loss": 0.7258, "num_tokens": 3953933033.0, "step": 1294 }, { "epoch": 1.8968675581608352, "grad_norm": 0.5570458912678902, "learning_rate": 2.9684413786359573e-05, "loss": 0.7066, "num_tokens": 3957025431.0, "step": 1295 }, { "epoch": 1.8983330280271113, "grad_norm": 0.4719145890572937, "learning_rate": 2.966897054376919e-05, "loss": 0.7164, "num_tokens": 3960114186.0, "step": 1296 }, { "epoch": 1.899798497893387, "grad_norm": 0.5369992859304673, "learning_rate": 2.965352040106703e-05, "loss": 0.7142, "num_tokens": 3962956785.0, "step": 1297 }, { "epoch": 1.9012639677596628, "grad_norm": 0.4683906897435623, "learning_rate": 2.9638063372154265e-05, "loss": 0.7235, "num_tokens": 3966035864.0, "step": 1298 }, { "epoch": 1.902729437625939, "grad_norm": 0.6021529583071882, "learning_rate": 2.9622599470938268e-05, "loss": 0.7167, "num_tokens": 3969163154.0, "step": 1299 }, { "epoch": 1.9041949074922147, "grad_norm": 0.46301091747249523, "learning_rate": 2.960712871133259e-05, "loss": 0.734, "num_tokens": 3972471784.0, "step": 1300 }, { "epoch": 1.9056603773584906, "grad_norm": 0.6292393409025439, "learning_rate": 2.959165110725697e-05, "loss": 0.7346, "num_tokens": 3975624326.0, "step": 1301 }, { "epoch": 1.9071258472247665, "grad_norm": 0.5152026860143738, "learning_rate": 2.957616667263728e-05, "loss": 0.7251, "num_tokens": 3978636213.0, "step": 1302 }, { "epoch": 1.9085913170910422, "grad_norm": 0.4502329354239532, "learning_rate": 2.956067542140555e-05, "loss": 0.7059, "num_tokens": 3981946784.0, "step": 1303 }, { "epoch": 1.9100567869573182, "grad_norm": 0.6162301579136784, "learning_rate": 2.954517736749996e-05, "loss": 0.7125, "num_tokens": 3985024580.0, "step": 1304 }, { "epoch": 1.9115222568235941, "grad_norm": 0.38837315418904206, "learning_rate": 2.952967252486477e-05, "loss": 0.7345, "num_tokens": 3988015338.0, "step": 1305 }, { "epoch": 1.9129877266898698, "grad_norm": 0.6197994483675844, "learning_rate": 2.9514160907450395e-05, "loss": 0.7308, "num_tokens": 3990995778.0, "step": 1306 }, { "epoch": 1.9144531965561458, "grad_norm": 0.483359263669142, "learning_rate": 2.9498642529213304e-05, "loss": 0.7055, "num_tokens": 3994217664.0, "step": 1307 }, { "epoch": 1.9159186664224217, "grad_norm": 0.5708665843435615, "learning_rate": 2.948311740411608e-05, "loss": 0.7335, "num_tokens": 3997315978.0, "step": 1308 }, { "epoch": 1.9173841362886974, "grad_norm": 0.5612096120423079, "learning_rate": 2.946758554612736e-05, "loss": 0.7115, "num_tokens": 4000347385.0, "step": 1309 }, { "epoch": 1.9188496061549736, "grad_norm": 0.47635266267691834, "learning_rate": 2.9452046969221845e-05, "loss": 0.7375, "num_tokens": 4003579959.0, "step": 1310 }, { "epoch": 1.9203150760212493, "grad_norm": 0.5346674676818062, "learning_rate": 2.9436501687380275e-05, "loss": 0.7078, "num_tokens": 4006684820.0, "step": 1311 }, { "epoch": 1.9217805458875252, "grad_norm": 0.4883809172026689, "learning_rate": 2.9420949714589438e-05, "loss": 0.7359, "num_tokens": 4009718818.0, "step": 1312 }, { "epoch": 1.9232460157538012, "grad_norm": 0.4297821439878945, "learning_rate": 2.9405391064842117e-05, "loss": 0.7364, "num_tokens": 4012738565.0, "step": 1313 }, { "epoch": 1.9247114856200769, "grad_norm": 0.5453164006040473, "learning_rate": 2.938982575213712e-05, "loss": 0.7071, "num_tokens": 4015733806.0, "step": 1314 }, { "epoch": 1.9261769554863528, "grad_norm": 0.49294531695059146, "learning_rate": 2.9374253790479264e-05, "loss": 0.7235, "num_tokens": 4018608291.0, "step": 1315 }, { "epoch": 1.9276424253526288, "grad_norm": 0.4571778837208386, "learning_rate": 2.9358675193879308e-05, "loss": 0.7289, "num_tokens": 4021508803.0, "step": 1316 }, { "epoch": 1.9291078952189045, "grad_norm": 0.4947590917261216, "learning_rate": 2.9343089976354008e-05, "loss": 0.7101, "num_tokens": 4024559584.0, "step": 1317 }, { "epoch": 1.9305733650851804, "grad_norm": 0.5507749069727216, "learning_rate": 2.932749815192609e-05, "loss": 0.6955, "num_tokens": 4027628747.0, "step": 1318 }, { "epoch": 1.9320388349514563, "grad_norm": 0.4632987655935729, "learning_rate": 2.9311899734624198e-05, "loss": 0.7274, "num_tokens": 4030826107.0, "step": 1319 }, { "epoch": 1.933504304817732, "grad_norm": 0.42026574409523243, "learning_rate": 2.9296294738482903e-05, "loss": 0.7171, "num_tokens": 4033947216.0, "step": 1320 }, { "epoch": 1.9349697746840082, "grad_norm": 0.5969106911659524, "learning_rate": 2.9280683177542733e-05, "loss": 0.7153, "num_tokens": 4036948374.0, "step": 1321 }, { "epoch": 1.936435244550284, "grad_norm": 0.3905694332410494, "learning_rate": 2.9265065065850086e-05, "loss": 0.7157, "num_tokens": 4039931936.0, "step": 1322 }, { "epoch": 1.9379007144165596, "grad_norm": 0.6594446442147556, "learning_rate": 2.9249440417457275e-05, "loss": 0.734, "num_tokens": 4042862775.0, "step": 1323 }, { "epoch": 1.9393661842828358, "grad_norm": 0.4448619573632073, "learning_rate": 2.9233809246422473e-05, "loss": 0.7223, "num_tokens": 4046131008.0, "step": 1324 }, { "epoch": 1.9408316541491115, "grad_norm": 0.6779458850258759, "learning_rate": 2.921817156680975e-05, "loss": 0.7184, "num_tokens": 4049216420.0, "step": 1325 }, { "epoch": 1.9422971240153875, "grad_norm": 0.6138260466188652, "learning_rate": 2.920252739268902e-05, "loss": 0.7032, "num_tokens": 4052411336.0, "step": 1326 }, { "epoch": 1.9437625938816634, "grad_norm": 0.6551602752722429, "learning_rate": 2.9186876738136018e-05, "loss": 0.7067, "num_tokens": 4055654355.0, "step": 1327 }, { "epoch": 1.945228063747939, "grad_norm": 0.5823608819089839, "learning_rate": 2.9171219617232348e-05, "loss": 0.7273, "num_tokens": 4058567839.0, "step": 1328 }, { "epoch": 1.946693533614215, "grad_norm": 0.648318666923373, "learning_rate": 2.915555604406541e-05, "loss": 0.7138, "num_tokens": 4061806942.0, "step": 1329 }, { "epoch": 1.948159003480491, "grad_norm": 0.557535463407278, "learning_rate": 2.913988603272841e-05, "loss": 0.7208, "num_tokens": 4065071761.0, "step": 1330 }, { "epoch": 1.9496244733467667, "grad_norm": 0.7194298036502226, "learning_rate": 2.9124209597320346e-05, "loss": 0.7381, "num_tokens": 4068299657.0, "step": 1331 }, { "epoch": 1.9510899432130426, "grad_norm": 0.5426991158983889, "learning_rate": 2.9108526751946006e-05, "loss": 0.7049, "num_tokens": 4071178407.0, "step": 1332 }, { "epoch": 1.9525554130793186, "grad_norm": 0.5672564952488988, "learning_rate": 2.9092837510715942e-05, "loss": 0.7543, "num_tokens": 4073992540.0, "step": 1333 }, { "epoch": 1.9540208829455943, "grad_norm": 0.4869103358013413, "learning_rate": 2.907714188774645e-05, "loss": 0.7175, "num_tokens": 4077328244.0, "step": 1334 }, { "epoch": 1.9554863528118704, "grad_norm": 0.5798219299973278, "learning_rate": 2.906143989715958e-05, "loss": 0.7508, "num_tokens": 4080237126.0, "step": 1335 }, { "epoch": 1.9569518226781462, "grad_norm": 0.46513049345236734, "learning_rate": 2.904573155308311e-05, "loss": 0.7069, "num_tokens": 4083358565.0, "step": 1336 }, { "epoch": 1.958417292544422, "grad_norm": 0.6001128590636814, "learning_rate": 2.9030016869650528e-05, "loss": 0.7336, "num_tokens": 4086227715.0, "step": 1337 }, { "epoch": 1.959882762410698, "grad_norm": 0.501079856171508, "learning_rate": 2.9014295861001023e-05, "loss": 0.7211, "num_tokens": 4089368015.0, "step": 1338 }, { "epoch": 1.9613482322769737, "grad_norm": 0.5602312410901916, "learning_rate": 2.899856854127949e-05, "loss": 0.7328, "num_tokens": 4092377960.0, "step": 1339 }, { "epoch": 1.9628137021432497, "grad_norm": 0.5319752150802948, "learning_rate": 2.8982834924636493e-05, "loss": 0.7221, "num_tokens": 4095492647.0, "step": 1340 }, { "epoch": 1.9642791720095256, "grad_norm": 0.5637811114221856, "learning_rate": 2.896709502522826e-05, "loss": 0.7255, "num_tokens": 4098470665.0, "step": 1341 }, { "epoch": 1.9657446418758013, "grad_norm": 0.5406982296451678, "learning_rate": 2.8951348857216675e-05, "loss": 0.7206, "num_tokens": 4101398879.0, "step": 1342 }, { "epoch": 1.9672101117420773, "grad_norm": 0.46364715576402576, "learning_rate": 2.8935596434769263e-05, "loss": 0.7041, "num_tokens": 4104234289.0, "step": 1343 }, { "epoch": 1.9686755816083532, "grad_norm": 0.5310161338306886, "learning_rate": 2.8919837772059173e-05, "loss": 0.7228, "num_tokens": 4107085693.0, "step": 1344 }, { "epoch": 1.970141051474629, "grad_norm": 0.530957934611928, "learning_rate": 2.8904072883265168e-05, "loss": 0.7117, "num_tokens": 4110242445.0, "step": 1345 }, { "epoch": 1.971606521340905, "grad_norm": 0.46089098970570264, "learning_rate": 2.8888301782571618e-05, "loss": 0.7296, "num_tokens": 4113351588.0, "step": 1346 }, { "epoch": 1.9730719912071808, "grad_norm": 0.4685933177800599, "learning_rate": 2.8872524484168482e-05, "loss": 0.7007, "num_tokens": 4116453159.0, "step": 1347 }, { "epoch": 1.9745374610734565, "grad_norm": 0.48712940482189737, "learning_rate": 2.885674100225128e-05, "loss": 0.7154, "num_tokens": 4119643279.0, "step": 1348 }, { "epoch": 1.9760029309397327, "grad_norm": 0.520129837952956, "learning_rate": 2.8840951351021133e-05, "loss": 0.727, "num_tokens": 4122618822.0, "step": 1349 }, { "epoch": 1.9774684008060084, "grad_norm": 0.39178646580988485, "learning_rate": 2.8825155544684665e-05, "loss": 0.7147, "num_tokens": 4125666917.0, "step": 1350 }, { "epoch": 1.9789338706722843, "grad_norm": 0.5280540580743738, "learning_rate": 2.8809353597454077e-05, "loss": 0.7011, "num_tokens": 4128858077.0, "step": 1351 }, { "epoch": 1.9803993405385603, "grad_norm": 0.5007308996893136, "learning_rate": 2.879354552354706e-05, "loss": 0.7139, "num_tokens": 4131961108.0, "step": 1352 }, { "epoch": 1.981864810404836, "grad_norm": 0.5251215565216402, "learning_rate": 2.8777731337186857e-05, "loss": 0.7508, "num_tokens": 4134905448.0, "step": 1353 }, { "epoch": 1.983330280271112, "grad_norm": 0.5004936037630865, "learning_rate": 2.876191105260218e-05, "loss": 0.7217, "num_tokens": 4137900805.0, "step": 1354 }, { "epoch": 1.9847957501373878, "grad_norm": 0.48371105608864456, "learning_rate": 2.874608468402724e-05, "loss": 0.7197, "num_tokens": 4140788207.0, "step": 1355 }, { "epoch": 1.9862612200036636, "grad_norm": 0.4816320746438291, "learning_rate": 2.8730252245701727e-05, "loss": 0.7147, "num_tokens": 4143859899.0, "step": 1356 }, { "epoch": 1.9877266898699395, "grad_norm": 0.4642541158788186, "learning_rate": 2.871441375187077e-05, "loss": 0.6947, "num_tokens": 4146853534.0, "step": 1357 }, { "epoch": 1.9891921597362154, "grad_norm": 0.450414638623165, "learning_rate": 2.8698569216784983e-05, "loss": 0.7215, "num_tokens": 4150035501.0, "step": 1358 }, { "epoch": 1.9906576296024912, "grad_norm": 0.47857178483603857, "learning_rate": 2.868271865470037e-05, "loss": 0.7286, "num_tokens": 4153000306.0, "step": 1359 }, { "epoch": 1.9921230994687673, "grad_norm": 0.4141431086182353, "learning_rate": 2.8666862079878403e-05, "loss": 0.7129, "num_tokens": 4155920615.0, "step": 1360 }, { "epoch": 1.993588569335043, "grad_norm": 0.4929319033580299, "learning_rate": 2.8650999506585938e-05, "loss": 0.7247, "num_tokens": 4159100804.0, "step": 1361 }, { "epoch": 1.995054039201319, "grad_norm": 0.45134386745122224, "learning_rate": 2.8635130949095225e-05, "loss": 0.6878, "num_tokens": 4162361276.0, "step": 1362 }, { "epoch": 1.996519509067595, "grad_norm": 0.44962153353337614, "learning_rate": 2.8619256421683913e-05, "loss": 0.742, "num_tokens": 4165501149.0, "step": 1363 }, { "epoch": 1.9979849789338706, "grad_norm": 0.3884680333290873, "learning_rate": 2.860337593863502e-05, "loss": 0.7382, "num_tokens": 4168617755.0, "step": 1364 }, { "epoch": 1.9994504488001466, "grad_norm": 0.5783997759046408, "learning_rate": 2.858748951423691e-05, "loss": 0.6998, "num_tokens": 4171870707.0, "step": 1365 }, { "epoch": 2.0, "grad_norm": 0.5783997759046408, "learning_rate": 2.8571597162783308e-05, "loss": 0.7226, "num_tokens": 4172753218.0, "step": 1366 }, { "epoch": 2.0014654698662757, "grad_norm": 0.7150832965250836, "learning_rate": 2.855569889857327e-05, "loss": 0.6991, "num_tokens": 4175680256.0, "step": 1367 }, { "epoch": 2.002930939732552, "grad_norm": 0.5789439210299216, "learning_rate": 2.853979473591115e-05, "loss": 0.702, "num_tokens": 4178589978.0, "step": 1368 }, { "epoch": 2.0043964095988276, "grad_norm": 0.5599640352542162, "learning_rate": 2.852388468910663e-05, "loss": 0.6984, "num_tokens": 4181835743.0, "step": 1369 }, { "epoch": 2.0058618794651033, "grad_norm": 0.498664619477045, "learning_rate": 2.85079687724747e-05, "loss": 0.705, "num_tokens": 4184805022.0, "step": 1370 }, { "epoch": 2.0073273493313795, "grad_norm": 0.5672060741625693, "learning_rate": 2.8492047000335597e-05, "loss": 0.7074, "num_tokens": 4187914648.0, "step": 1371 }, { "epoch": 2.008792819197655, "grad_norm": 0.4219184914583131, "learning_rate": 2.8476119387014848e-05, "loss": 0.7061, "num_tokens": 4190953717.0, "step": 1372 }, { "epoch": 2.0102582890639313, "grad_norm": 0.5301365665518536, "learning_rate": 2.846018594684323e-05, "loss": 0.6944, "num_tokens": 4194002720.0, "step": 1373 }, { "epoch": 2.011723758930207, "grad_norm": 0.4685033832184126, "learning_rate": 2.8444246694156762e-05, "loss": 0.6964, "num_tokens": 4196862452.0, "step": 1374 }, { "epoch": 2.0131892287964828, "grad_norm": 0.4569188103619466, "learning_rate": 2.84283016432967e-05, "loss": 0.7023, "num_tokens": 4199912036.0, "step": 1375 }, { "epoch": 2.014654698662759, "grad_norm": 0.5211571579293084, "learning_rate": 2.841235080860951e-05, "loss": 0.6997, "num_tokens": 4202976305.0, "step": 1376 }, { "epoch": 2.0161201685290346, "grad_norm": 0.43930487542771857, "learning_rate": 2.8396394204446857e-05, "loss": 0.6991, "num_tokens": 4206371547.0, "step": 1377 }, { "epoch": 2.0175856383953104, "grad_norm": 0.48306004230362376, "learning_rate": 2.8380431845165604e-05, "loss": 0.7084, "num_tokens": 4209439371.0, "step": 1378 }, { "epoch": 2.0190511082615865, "grad_norm": 0.41843159337181735, "learning_rate": 2.8364463745127794e-05, "loss": 0.6951, "num_tokens": 4212677246.0, "step": 1379 }, { "epoch": 2.0205165781278622, "grad_norm": 0.4495191244731692, "learning_rate": 2.8348489918700633e-05, "loss": 0.7029, "num_tokens": 4215691506.0, "step": 1380 }, { "epoch": 2.021982047994138, "grad_norm": 0.432235280660172, "learning_rate": 2.833251038025648e-05, "loss": 0.7074, "num_tokens": 4218741583.0, "step": 1381 }, { "epoch": 2.023447517860414, "grad_norm": 0.4259760710863493, "learning_rate": 2.8316525144172828e-05, "loss": 0.6949, "num_tokens": 4221794251.0, "step": 1382 }, { "epoch": 2.02491298772669, "grad_norm": 0.4622651259480561, "learning_rate": 2.83005342248323e-05, "loss": 0.6969, "num_tokens": 4224925634.0, "step": 1383 }, { "epoch": 2.0263784575929655, "grad_norm": 0.43563497029818465, "learning_rate": 2.8284537636622636e-05, "loss": 0.7299, "num_tokens": 4227833286.0, "step": 1384 }, { "epoch": 2.0278439274592417, "grad_norm": 0.4961594987745509, "learning_rate": 2.826853539393668e-05, "loss": 0.715, "num_tokens": 4230774701.0, "step": 1385 }, { "epoch": 2.0293093973255174, "grad_norm": 0.41209692322352337, "learning_rate": 2.8252527511172337e-05, "loss": 0.7219, "num_tokens": 4234000863.0, "step": 1386 }, { "epoch": 2.0307748671917936, "grad_norm": 0.41494590263460024, "learning_rate": 2.8236514002732628e-05, "loss": 0.7125, "num_tokens": 4236910019.0, "step": 1387 }, { "epoch": 2.0322403370580693, "grad_norm": 0.3950802833849933, "learning_rate": 2.8220494883025605e-05, "loss": 0.7042, "num_tokens": 4239994217.0, "step": 1388 }, { "epoch": 2.033705806924345, "grad_norm": 0.46091084797710363, "learning_rate": 2.820447016646438e-05, "loss": 0.7096, "num_tokens": 4243246290.0, "step": 1389 }, { "epoch": 2.035171276790621, "grad_norm": 0.4069316813675992, "learning_rate": 2.8188439867467098e-05, "loss": 0.7101, "num_tokens": 4246555547.0, "step": 1390 }, { "epoch": 2.036636746656897, "grad_norm": 0.46373275034903044, "learning_rate": 2.8172404000456923e-05, "loss": 0.7271, "num_tokens": 4249495182.0, "step": 1391 }, { "epoch": 2.0381022165231726, "grad_norm": 0.3986559874877942, "learning_rate": 2.8156362579862042e-05, "loss": 0.696, "num_tokens": 4252380137.0, "step": 1392 }, { "epoch": 2.0395676863894487, "grad_norm": 0.44316063703000513, "learning_rate": 2.8140315620115623e-05, "loss": 0.7057, "num_tokens": 4255473789.0, "step": 1393 }, { "epoch": 2.0410331562557245, "grad_norm": 0.4111133212307321, "learning_rate": 2.8124263135655826e-05, "loss": 0.7328, "num_tokens": 4258313168.0, "step": 1394 }, { "epoch": 2.042498626122, "grad_norm": 0.4413632764779973, "learning_rate": 2.8108205140925788e-05, "loss": 0.6971, "num_tokens": 4261448179.0, "step": 1395 }, { "epoch": 2.0439640959882763, "grad_norm": 0.44624448794342847, "learning_rate": 2.809214165037359e-05, "loss": 0.7005, "num_tokens": 4264375389.0, "step": 1396 }, { "epoch": 2.045429565854552, "grad_norm": 0.4615703494843691, "learning_rate": 2.8076072678452263e-05, "loss": 0.7002, "num_tokens": 4267441589.0, "step": 1397 }, { "epoch": 2.046895035720828, "grad_norm": 0.43548063319700353, "learning_rate": 2.8059998239619773e-05, "loss": 0.7143, "num_tokens": 4270537801.0, "step": 1398 }, { "epoch": 2.048360505587104, "grad_norm": 0.498436171096523, "learning_rate": 2.8043918348339006e-05, "loss": 0.703, "num_tokens": 4273751612.0, "step": 1399 }, { "epoch": 2.0498259754533796, "grad_norm": 0.459495179106195, "learning_rate": 2.8027833019077753e-05, "loss": 0.705, "num_tokens": 4276672844.0, "step": 1400 }, { "epoch": 2.051291445319656, "grad_norm": 0.49853715963840534, "learning_rate": 2.8011742266308686e-05, "loss": 0.7159, "num_tokens": 4279608101.0, "step": 1401 }, { "epoch": 2.0527569151859315, "grad_norm": 0.41049646140061313, "learning_rate": 2.7995646104509377e-05, "loss": 0.7155, "num_tokens": 4282851107.0, "step": 1402 }, { "epoch": 2.0542223850522072, "grad_norm": 0.49851252493865544, "learning_rate": 2.7979544548162246e-05, "loss": 0.7092, "num_tokens": 4285828000.0, "step": 1403 }, { "epoch": 2.0556878549184834, "grad_norm": 0.4051887579903802, "learning_rate": 2.7963437611754583e-05, "loss": 0.7036, "num_tokens": 4288586316.0, "step": 1404 }, { "epoch": 2.057153324784759, "grad_norm": 0.5329319472251645, "learning_rate": 2.7947325309778507e-05, "loss": 0.7174, "num_tokens": 4291923561.0, "step": 1405 }, { "epoch": 2.058618794651035, "grad_norm": 0.460651114307889, "learning_rate": 2.7931207656730963e-05, "loss": 0.6846, "num_tokens": 4295083241.0, "step": 1406 }, { "epoch": 2.060084264517311, "grad_norm": 0.486114312359634, "learning_rate": 2.791508466711372e-05, "loss": 0.7045, "num_tokens": 4298237914.0, "step": 1407 }, { "epoch": 2.0615497343835867, "grad_norm": 0.4987670456764497, "learning_rate": 2.7898956355433352e-05, "loss": 0.7144, "num_tokens": 4301268508.0, "step": 1408 }, { "epoch": 2.0630152042498624, "grad_norm": 0.500266060603412, "learning_rate": 2.7882822736201204e-05, "loss": 0.7085, "num_tokens": 4304381370.0, "step": 1409 }, { "epoch": 2.0644806741161386, "grad_norm": 0.4353557074583143, "learning_rate": 2.7866683823933406e-05, "loss": 0.6974, "num_tokens": 4307421839.0, "step": 1410 }, { "epoch": 2.0659461439824143, "grad_norm": 0.47008725586061084, "learning_rate": 2.7850539633150854e-05, "loss": 0.6849, "num_tokens": 4310347288.0, "step": 1411 }, { "epoch": 2.0674116138486904, "grad_norm": 0.5069092479458464, "learning_rate": 2.783439017837919e-05, "loss": 0.7049, "num_tokens": 4313449301.0, "step": 1412 }, { "epoch": 2.068877083714966, "grad_norm": 0.37045911630842837, "learning_rate": 2.7818235474148793e-05, "loss": 0.7089, "num_tokens": 4316493785.0, "step": 1413 }, { "epoch": 2.070342553581242, "grad_norm": 0.48734419235497006, "learning_rate": 2.7802075534994762e-05, "loss": 0.7122, "num_tokens": 4319571389.0, "step": 1414 }, { "epoch": 2.071808023447518, "grad_norm": 0.3875173556126372, "learning_rate": 2.7785910375456914e-05, "loss": 0.7205, "num_tokens": 4322582684.0, "step": 1415 }, { "epoch": 2.0732734933137937, "grad_norm": 0.46711045179092997, "learning_rate": 2.7769740010079752e-05, "loss": 0.7084, "num_tokens": 4325662864.0, "step": 1416 }, { "epoch": 2.0747389631800695, "grad_norm": 0.4230687596039707, "learning_rate": 2.7753564453412467e-05, "loss": 0.7025, "num_tokens": 4328763014.0, "step": 1417 }, { "epoch": 2.0762044330463456, "grad_norm": 0.49876048732424827, "learning_rate": 2.773738372000893e-05, "loss": 0.7117, "num_tokens": 4331931985.0, "step": 1418 }, { "epoch": 2.0776699029126213, "grad_norm": 0.4608663773168124, "learning_rate": 2.7721197824427654e-05, "loss": 0.7195, "num_tokens": 4334890165.0, "step": 1419 }, { "epoch": 2.079135372778897, "grad_norm": 0.4396921963271726, "learning_rate": 2.7705006781231816e-05, "loss": 0.718, "num_tokens": 4337979803.0, "step": 1420 }, { "epoch": 2.080600842645173, "grad_norm": 0.4800781680275101, "learning_rate": 2.7688810604989192e-05, "loss": 0.7168, "num_tokens": 4340877628.0, "step": 1421 }, { "epoch": 2.082066312511449, "grad_norm": 0.42113088750392697, "learning_rate": 2.7672609310272225e-05, "loss": 0.7167, "num_tokens": 4343880829.0, "step": 1422 }, { "epoch": 2.083531782377725, "grad_norm": 0.6151064757874548, "learning_rate": 2.7656402911657916e-05, "loss": 0.7201, "num_tokens": 4347124292.0, "step": 1423 }, { "epoch": 2.084997252244001, "grad_norm": 0.3903084005491772, "learning_rate": 2.764019142372788e-05, "loss": 0.6929, "num_tokens": 4350317771.0, "step": 1424 }, { "epoch": 2.0864627221102765, "grad_norm": 0.4241896334842566, "learning_rate": 2.7623974861068325e-05, "loss": 0.7061, "num_tokens": 4353409749.0, "step": 1425 }, { "epoch": 2.0879281919765527, "grad_norm": 0.5307331126530576, "learning_rate": 2.760775323826999e-05, "loss": 0.6962, "num_tokens": 4356315287.0, "step": 1426 }, { "epoch": 2.0893936618428284, "grad_norm": 0.45170129375128054, "learning_rate": 2.75915265699282e-05, "loss": 0.7093, "num_tokens": 4359469850.0, "step": 1427 }, { "epoch": 2.090859131709104, "grad_norm": 0.4296285528738621, "learning_rate": 2.7575294870642788e-05, "loss": 0.7104, "num_tokens": 4362633581.0, "step": 1428 }, { "epoch": 2.0923246015753802, "grad_norm": 0.46034044236783916, "learning_rate": 2.7559058155018156e-05, "loss": 0.7056, "num_tokens": 4365811768.0, "step": 1429 }, { "epoch": 2.093790071441656, "grad_norm": 0.48075567907737804, "learning_rate": 2.7542816437663174e-05, "loss": 0.7083, "num_tokens": 4368898502.0, "step": 1430 }, { "epoch": 2.0952555413079317, "grad_norm": 0.4018431277987484, "learning_rate": 2.752656973319124e-05, "loss": 0.7116, "num_tokens": 4371745065.0, "step": 1431 }, { "epoch": 2.096721011174208, "grad_norm": 0.3996716354520948, "learning_rate": 2.751031805622024e-05, "loss": 0.7054, "num_tokens": 4374797049.0, "step": 1432 }, { "epoch": 2.0981864810404836, "grad_norm": 0.48972108168234985, "learning_rate": 2.7494061421372516e-05, "loss": 0.7097, "num_tokens": 4377865245.0, "step": 1433 }, { "epoch": 2.0996519509067593, "grad_norm": 0.4190529698139894, "learning_rate": 2.7477799843274883e-05, "loss": 0.7062, "num_tokens": 4381044465.0, "step": 1434 }, { "epoch": 2.1011174207730354, "grad_norm": 0.3814217685185149, "learning_rate": 2.7461533336558602e-05, "loss": 0.71, "num_tokens": 4383879629.0, "step": 1435 }, { "epoch": 2.102582890639311, "grad_norm": 0.42597411237294686, "learning_rate": 2.7445261915859368e-05, "loss": 0.7172, "num_tokens": 4386979223.0, "step": 1436 }, { "epoch": 2.1040483605055873, "grad_norm": 0.4907231609040234, "learning_rate": 2.74289855958173e-05, "loss": 0.7013, "num_tokens": 4389983995.0, "step": 1437 }, { "epoch": 2.105513830371863, "grad_norm": 0.4435422173519564, "learning_rate": 2.7412704391076914e-05, "loss": 0.7137, "num_tokens": 4393091651.0, "step": 1438 }, { "epoch": 2.1069793002381387, "grad_norm": 0.40065940305539516, "learning_rate": 2.7396418316287147e-05, "loss": 0.7041, "num_tokens": 4396246794.0, "step": 1439 }, { "epoch": 2.108444770104415, "grad_norm": 0.559896009581261, "learning_rate": 2.7380127386101286e-05, "loss": 0.7418, "num_tokens": 4399129558.0, "step": 1440 }, { "epoch": 2.1099102399706906, "grad_norm": 0.4947013484164558, "learning_rate": 2.736383161517701e-05, "loss": 0.7082, "num_tokens": 4402121775.0, "step": 1441 }, { "epoch": 2.1113757098369663, "grad_norm": 0.4353515988323889, "learning_rate": 2.7347531018176328e-05, "loss": 0.7096, "num_tokens": 4405205601.0, "step": 1442 }, { "epoch": 2.1128411797032425, "grad_norm": 0.6034462994223836, "learning_rate": 2.7331225609765627e-05, "loss": 0.6952, "num_tokens": 4408553106.0, "step": 1443 }, { "epoch": 2.114306649569518, "grad_norm": 0.46473933307350507, "learning_rate": 2.731491540461559e-05, "loss": 0.7298, "num_tokens": 4411510861.0, "step": 1444 }, { "epoch": 2.115772119435794, "grad_norm": 0.6234698195596777, "learning_rate": 2.729860041740123e-05, "loss": 0.702, "num_tokens": 4414567370.0, "step": 1445 }, { "epoch": 2.11723758930207, "grad_norm": 0.4869956682434465, "learning_rate": 2.7282280662801875e-05, "loss": 0.7135, "num_tokens": 4417650261.0, "step": 1446 }, { "epoch": 2.118703059168346, "grad_norm": 0.5516773152750696, "learning_rate": 2.7265956155501127e-05, "loss": 0.7194, "num_tokens": 4420436912.0, "step": 1447 }, { "epoch": 2.120168529034622, "grad_norm": 0.424115026151848, "learning_rate": 2.724962691018685e-05, "loss": 0.7025, "num_tokens": 4423466190.0, "step": 1448 }, { "epoch": 2.1216339989008977, "grad_norm": 0.5130395934123798, "learning_rate": 2.7233292941551206e-05, "loss": 0.6927, "num_tokens": 4426574882.0, "step": 1449 }, { "epoch": 2.1230994687671734, "grad_norm": 0.39643696321888394, "learning_rate": 2.7216954264290585e-05, "loss": 0.721, "num_tokens": 4429442758.0, "step": 1450 }, { "epoch": 2.1245649386334495, "grad_norm": 0.4834044492577929, "learning_rate": 2.7200610893105603e-05, "loss": 0.6992, "num_tokens": 4432645594.0, "step": 1451 }, { "epoch": 2.1260304084997252, "grad_norm": 0.4545083425022002, "learning_rate": 2.718426284270113e-05, "loss": 0.7086, "num_tokens": 4435626799.0, "step": 1452 }, { "epoch": 2.127495878366001, "grad_norm": 0.49992934503077, "learning_rate": 2.7167910127786225e-05, "loss": 0.7033, "num_tokens": 4438856763.0, "step": 1453 }, { "epoch": 2.128961348232277, "grad_norm": 0.4287812255041137, "learning_rate": 2.715155276307415e-05, "loss": 0.7388, "num_tokens": 4441772945.0, "step": 1454 }, { "epoch": 2.130426818098553, "grad_norm": 0.472994373049365, "learning_rate": 2.713519076328234e-05, "loss": 0.7067, "num_tokens": 4444951065.0, "step": 1455 }, { "epoch": 2.1318922879648285, "grad_norm": 0.5468504578822707, "learning_rate": 2.7118824143132418e-05, "loss": 0.7021, "num_tokens": 4448191737.0, "step": 1456 }, { "epoch": 2.1333577578311047, "grad_norm": 0.4750744629594362, "learning_rate": 2.7102452917350155e-05, "loss": 0.7084, "num_tokens": 4451274622.0, "step": 1457 }, { "epoch": 2.1348232276973804, "grad_norm": 0.43552327613943814, "learning_rate": 2.708607710066546e-05, "loss": 0.715, "num_tokens": 4454363885.0, "step": 1458 }, { "epoch": 2.136288697563656, "grad_norm": 0.5272211124276347, "learning_rate": 2.706969670781239e-05, "loss": 0.7019, "num_tokens": 4457392721.0, "step": 1459 }, { "epoch": 2.1377541674299323, "grad_norm": 0.45237717868001254, "learning_rate": 2.70533117535291e-05, "loss": 0.7175, "num_tokens": 4460383747.0, "step": 1460 }, { "epoch": 2.139219637296208, "grad_norm": 0.5471667785200469, "learning_rate": 2.7036922252557864e-05, "loss": 0.7017, "num_tokens": 4463452054.0, "step": 1461 }, { "epoch": 2.140685107162484, "grad_norm": 0.40507183699805566, "learning_rate": 2.7020528219645025e-05, "loss": 0.6975, "num_tokens": 4466204417.0, "step": 1462 }, { "epoch": 2.14215057702876, "grad_norm": 0.5545717789795022, "learning_rate": 2.7004129669541043e-05, "loss": 0.6959, "num_tokens": 4469346772.0, "step": 1463 }, { "epoch": 2.1436160468950356, "grad_norm": 0.42769524467855713, "learning_rate": 2.6987726617000402e-05, "loss": 0.7149, "num_tokens": 4472292685.0, "step": 1464 }, { "epoch": 2.1450815167613118, "grad_norm": 0.4629481148897773, "learning_rate": 2.6971319076781655e-05, "loss": 0.7093, "num_tokens": 4475373653.0, "step": 1465 }, { "epoch": 2.1465469866275875, "grad_norm": 0.46920658665945847, "learning_rate": 2.6954907063647398e-05, "loss": 0.7189, "num_tokens": 4478564827.0, "step": 1466 }, { "epoch": 2.148012456493863, "grad_norm": 0.4349684646217136, "learning_rate": 2.6938490592364237e-05, "loss": 0.7033, "num_tokens": 4481389987.0, "step": 1467 }, { "epoch": 2.1494779263601393, "grad_norm": 0.4112801751522899, "learning_rate": 2.6922069677702804e-05, "loss": 0.718, "num_tokens": 4484338026.0, "step": 1468 }, { "epoch": 2.150943396226415, "grad_norm": 0.47651020160405994, "learning_rate": 2.690564433443771e-05, "loss": 0.718, "num_tokens": 4487434030.0, "step": 1469 }, { "epoch": 2.1524088660926908, "grad_norm": 0.4867578362602603, "learning_rate": 2.6889214577347576e-05, "loss": 0.7135, "num_tokens": 4490399652.0, "step": 1470 }, { "epoch": 2.153874335958967, "grad_norm": 0.35700012689753946, "learning_rate": 2.6872780421214973e-05, "loss": 0.7085, "num_tokens": 4493401596.0, "step": 1471 }, { "epoch": 2.1553398058252426, "grad_norm": 0.5453609352937585, "learning_rate": 2.685634188082644e-05, "loss": 0.7004, "num_tokens": 4496308185.0, "step": 1472 }, { "epoch": 2.156805275691519, "grad_norm": 0.4391571082705138, "learning_rate": 2.6839898970972456e-05, "loss": 0.7134, "num_tokens": 4499304149.0, "step": 1473 }, { "epoch": 2.1582707455577945, "grad_norm": 0.3989838507080752, "learning_rate": 2.6823451706447437e-05, "loss": 0.7002, "num_tokens": 4502305220.0, "step": 1474 }, { "epoch": 2.1597362154240702, "grad_norm": 0.5076727658096067, "learning_rate": 2.680700010204972e-05, "loss": 0.6906, "num_tokens": 4505395280.0, "step": 1475 }, { "epoch": 2.1612016852903464, "grad_norm": 0.46482056059113497, "learning_rate": 2.6790544172581524e-05, "loss": 0.6913, "num_tokens": 4508448842.0, "step": 1476 }, { "epoch": 2.162667155156622, "grad_norm": 0.39315894360191994, "learning_rate": 2.6774083932848988e-05, "loss": 0.7341, "num_tokens": 4511449747.0, "step": 1477 }, { "epoch": 2.164132625022898, "grad_norm": 0.4096665179245095, "learning_rate": 2.6757619397662112e-05, "loss": 0.6871, "num_tokens": 4514622716.0, "step": 1478 }, { "epoch": 2.165598094889174, "grad_norm": 0.391551317163508, "learning_rate": 2.674115058183477e-05, "loss": 0.7054, "num_tokens": 4517658281.0, "step": 1479 }, { "epoch": 2.1670635647554497, "grad_norm": 0.46982096394357575, "learning_rate": 2.672467750018469e-05, "loss": 0.6988, "num_tokens": 4520863503.0, "step": 1480 }, { "epoch": 2.1685290346217254, "grad_norm": 0.4046832299726708, "learning_rate": 2.670820016753342e-05, "loss": 0.7074, "num_tokens": 4523857466.0, "step": 1481 }, { "epoch": 2.1699945044880016, "grad_norm": 0.4515367388867664, "learning_rate": 2.6691718598706343e-05, "loss": 0.7136, "num_tokens": 4526887968.0, "step": 1482 }, { "epoch": 2.1714599743542773, "grad_norm": 0.3740509512074513, "learning_rate": 2.6675232808532666e-05, "loss": 0.7192, "num_tokens": 4529893348.0, "step": 1483 }, { "epoch": 2.172925444220553, "grad_norm": 0.5568171670414099, "learning_rate": 2.6658742811845377e-05, "loss": 0.7144, "num_tokens": 4533121756.0, "step": 1484 }, { "epoch": 2.174390914086829, "grad_norm": 0.4454143865159811, "learning_rate": 2.6642248623481256e-05, "loss": 0.7195, "num_tokens": 4536019859.0, "step": 1485 }, { "epoch": 2.175856383953105, "grad_norm": 0.5022680642208118, "learning_rate": 2.6625750258280852e-05, "loss": 0.7024, "num_tokens": 4539487786.0, "step": 1486 }, { "epoch": 2.177321853819381, "grad_norm": 0.4566353701294357, "learning_rate": 2.660924773108847e-05, "loss": 0.7182, "num_tokens": 4542315820.0, "step": 1487 }, { "epoch": 2.1787873236856568, "grad_norm": 0.4824052964454861, "learning_rate": 2.6592741056752172e-05, "loss": 0.7224, "num_tokens": 4545143690.0, "step": 1488 }, { "epoch": 2.1802527935519325, "grad_norm": 0.46144642592245877, "learning_rate": 2.6576230250123737e-05, "loss": 0.7191, "num_tokens": 4547967569.0, "step": 1489 }, { "epoch": 2.1817182634182086, "grad_norm": 0.4499005684772337, "learning_rate": 2.655971532605867e-05, "loss": 0.6857, "num_tokens": 4551101569.0, "step": 1490 }, { "epoch": 2.1831837332844843, "grad_norm": 0.42158175585861707, "learning_rate": 2.6543196299416178e-05, "loss": 0.7023, "num_tokens": 4554025894.0, "step": 1491 }, { "epoch": 2.18464920315076, "grad_norm": 0.46260129774360087, "learning_rate": 2.6526673185059156e-05, "loss": 0.7041, "num_tokens": 4557073969.0, "step": 1492 }, { "epoch": 2.186114673017036, "grad_norm": 0.38271699280905513, "learning_rate": 2.651014599785418e-05, "loss": 0.7101, "num_tokens": 4560149156.0, "step": 1493 }, { "epoch": 2.187580142883312, "grad_norm": 0.5187488912099569, "learning_rate": 2.6493614752671496e-05, "loss": 0.6977, "num_tokens": 4563138977.0, "step": 1494 }, { "epoch": 2.1890456127495876, "grad_norm": 0.43906544785007634, "learning_rate": 2.6477079464384994e-05, "loss": 0.724, "num_tokens": 4566257165.0, "step": 1495 }, { "epoch": 2.190511082615864, "grad_norm": 0.49988045967845707, "learning_rate": 2.64605401478722e-05, "loss": 0.7375, "num_tokens": 4569269646.0, "step": 1496 }, { "epoch": 2.1919765524821395, "grad_norm": 0.42744981173953817, "learning_rate": 2.6443996818014275e-05, "loss": 0.7084, "num_tokens": 4572346437.0, "step": 1497 }, { "epoch": 2.1934420223484157, "grad_norm": 0.5026916773678158, "learning_rate": 2.642744948969598e-05, "loss": 0.7178, "num_tokens": 4575512790.0, "step": 1498 }, { "epoch": 2.1949074922146914, "grad_norm": 0.4347661171067446, "learning_rate": 2.641089817780568e-05, "loss": 0.7091, "num_tokens": 4578404555.0, "step": 1499 }, { "epoch": 2.196372962080967, "grad_norm": 0.46608475793445053, "learning_rate": 2.6394342897235313e-05, "loss": 0.7034, "num_tokens": 4581373173.0, "step": 1500 }, { "epoch": 2.1978384319472433, "grad_norm": 0.4215216463291551, "learning_rate": 2.637778366288041e-05, "loss": 0.7131, "num_tokens": 4584220543.0, "step": 1501 }, { "epoch": 2.199303901813519, "grad_norm": 0.4236248838351213, "learning_rate": 2.6361220489640035e-05, "loss": 0.6885, "num_tokens": 4587205615.0, "step": 1502 }, { "epoch": 2.2007693716797947, "grad_norm": 0.3776607862012483, "learning_rate": 2.6344653392416812e-05, "loss": 0.7009, "num_tokens": 4590391814.0, "step": 1503 }, { "epoch": 2.202234841546071, "grad_norm": 0.4931212910627832, "learning_rate": 2.6328082386116896e-05, "loss": 0.7129, "num_tokens": 4593328534.0, "step": 1504 }, { "epoch": 2.2037003114123466, "grad_norm": 0.5261525985739094, "learning_rate": 2.631150748564994e-05, "loss": 0.6845, "num_tokens": 4596808432.0, "step": 1505 }, { "epoch": 2.2051657812786223, "grad_norm": 0.34419896421973223, "learning_rate": 2.629492870592913e-05, "loss": 0.7047, "num_tokens": 4599888220.0, "step": 1506 }, { "epoch": 2.2066312511448984, "grad_norm": 0.62009634919208, "learning_rate": 2.627834606187112e-05, "loss": 0.7181, "num_tokens": 4603018773.0, "step": 1507 }, { "epoch": 2.208096721011174, "grad_norm": 0.40849523678650623, "learning_rate": 2.6261759568396043e-05, "loss": 0.7003, "num_tokens": 4605994869.0, "step": 1508 }, { "epoch": 2.20956219087745, "grad_norm": 0.5019592793522145, "learning_rate": 2.624516924042751e-05, "loss": 0.7096, "num_tokens": 4609322052.0, "step": 1509 }, { "epoch": 2.211027660743726, "grad_norm": 0.4703685006069305, "learning_rate": 2.622857509289256e-05, "loss": 0.7021, "num_tokens": 4612192892.0, "step": 1510 }, { "epoch": 2.2124931306100017, "grad_norm": 0.47818667062498516, "learning_rate": 2.6211977140721696e-05, "loss": 0.7056, "num_tokens": 4615287610.0, "step": 1511 }, { "epoch": 2.213958600476278, "grad_norm": 0.4575909375597416, "learning_rate": 2.6195375398848823e-05, "loss": 0.7253, "num_tokens": 4618348670.0, "step": 1512 }, { "epoch": 2.2154240703425536, "grad_norm": 0.49874073229048155, "learning_rate": 2.6178769882211258e-05, "loss": 0.7091, "num_tokens": 4621590688.0, "step": 1513 }, { "epoch": 2.2168895402088293, "grad_norm": 0.4614959372438202, "learning_rate": 2.6162160605749716e-05, "loss": 0.7075, "num_tokens": 4624811553.0, "step": 1514 }, { "epoch": 2.2183550100751055, "grad_norm": 0.5292662513011022, "learning_rate": 2.614554758440831e-05, "loss": 0.6965, "num_tokens": 4628256482.0, "step": 1515 }, { "epoch": 2.219820479941381, "grad_norm": 0.46423765460771643, "learning_rate": 2.6128930833134494e-05, "loss": 0.6853, "num_tokens": 4631310222.0, "step": 1516 }, { "epoch": 2.221285949807657, "grad_norm": 0.47402951890962386, "learning_rate": 2.6112310366879102e-05, "loss": 0.719, "num_tokens": 4634292178.0, "step": 1517 }, { "epoch": 2.222751419673933, "grad_norm": 0.47173853329743476, "learning_rate": 2.6095686200596308e-05, "loss": 0.7069, "num_tokens": 4637265516.0, "step": 1518 }, { "epoch": 2.224216889540209, "grad_norm": 0.40933048509366193, "learning_rate": 2.6079058349243598e-05, "loss": 0.6891, "num_tokens": 4640375282.0, "step": 1519 }, { "epoch": 2.2256823594064845, "grad_norm": 0.4621371773273731, "learning_rate": 2.6062426827781793e-05, "loss": 0.7071, "num_tokens": 4643398067.0, "step": 1520 }, { "epoch": 2.2271478292727607, "grad_norm": 0.43312626084354955, "learning_rate": 2.6045791651175002e-05, "loss": 0.7168, "num_tokens": 4646284713.0, "step": 1521 }, { "epoch": 2.2286132991390364, "grad_norm": 0.4514745879032844, "learning_rate": 2.6029152834390635e-05, "loss": 0.7185, "num_tokens": 4649232533.0, "step": 1522 }, { "epoch": 2.2300787690053125, "grad_norm": 0.4268540691537877, "learning_rate": 2.6012510392399362e-05, "loss": 0.6973, "num_tokens": 4652360771.0, "step": 1523 }, { "epoch": 2.2315442388715883, "grad_norm": 0.4563440088845739, "learning_rate": 2.599586434017513e-05, "loss": 0.6998, "num_tokens": 4655269763.0, "step": 1524 }, { "epoch": 2.233009708737864, "grad_norm": 0.38187485764544354, "learning_rate": 2.597921469269513e-05, "loss": 0.695, "num_tokens": 4658393410.0, "step": 1525 }, { "epoch": 2.23447517860414, "grad_norm": 0.42925310462392985, "learning_rate": 2.5962561464939784e-05, "loss": 0.6866, "num_tokens": 4661531663.0, "step": 1526 }, { "epoch": 2.235940648470416, "grad_norm": 0.4136933855087547, "learning_rate": 2.5945904671892734e-05, "loss": 0.7121, "num_tokens": 4664562036.0, "step": 1527 }, { "epoch": 2.2374061183366916, "grad_norm": 0.4750444842705771, "learning_rate": 2.592924432854083e-05, "loss": 0.7163, "num_tokens": 4667274575.0, "step": 1528 }, { "epoch": 2.2388715882029677, "grad_norm": 0.3936626194420731, "learning_rate": 2.5912580449874128e-05, "loss": 0.7105, "num_tokens": 4670268030.0, "step": 1529 }, { "epoch": 2.2403370580692434, "grad_norm": 0.49449552868472935, "learning_rate": 2.5895913050885853e-05, "loss": 0.7003, "num_tokens": 4673271758.0, "step": 1530 }, { "epoch": 2.241802527935519, "grad_norm": 0.4138204838924853, "learning_rate": 2.5879242146572393e-05, "loss": 0.709, "num_tokens": 4676157079.0, "step": 1531 }, { "epoch": 2.2432679978017953, "grad_norm": 0.4813252943006639, "learning_rate": 2.5862567751933318e-05, "loss": 0.7008, "num_tokens": 4679380199.0, "step": 1532 }, { "epoch": 2.244733467668071, "grad_norm": 0.46217047253176013, "learning_rate": 2.5845889881971297e-05, "loss": 0.7148, "num_tokens": 4682210999.0, "step": 1533 }, { "epoch": 2.2461989375343467, "grad_norm": 0.4262143569993668, "learning_rate": 2.5829208551692154e-05, "loss": 0.702, "num_tokens": 4685386780.0, "step": 1534 }, { "epoch": 2.247664407400623, "grad_norm": 0.5126445835024396, "learning_rate": 2.5812523776104817e-05, "loss": 0.7073, "num_tokens": 4688314825.0, "step": 1535 }, { "epoch": 2.2491298772668986, "grad_norm": 0.3705290058926664, "learning_rate": 2.5795835570221327e-05, "loss": 0.7214, "num_tokens": 4691458716.0, "step": 1536 }, { "epoch": 2.2505953471331743, "grad_norm": 0.40531584365768175, "learning_rate": 2.5779143949056785e-05, "loss": 0.6937, "num_tokens": 4694633835.0, "step": 1537 }, { "epoch": 2.2520608169994505, "grad_norm": 0.47780126223279634, "learning_rate": 2.5762448927629383e-05, "loss": 0.6917, "num_tokens": 4697932335.0, "step": 1538 }, { "epoch": 2.253526286865726, "grad_norm": 0.4019289723190445, "learning_rate": 2.5745750520960377e-05, "loss": 0.6984, "num_tokens": 4700874782.0, "step": 1539 }, { "epoch": 2.2549917567320024, "grad_norm": 0.4392304072433179, "learning_rate": 2.5729048744074065e-05, "loss": 0.7176, "num_tokens": 4703963192.0, "step": 1540 }, { "epoch": 2.256457226598278, "grad_norm": 0.3942424069978266, "learning_rate": 2.5712343611997763e-05, "loss": 0.6814, "num_tokens": 4706901606.0, "step": 1541 }, { "epoch": 2.257922696464554, "grad_norm": 0.631590921534017, "learning_rate": 2.569563513976182e-05, "loss": 0.6993, "num_tokens": 4709676244.0, "step": 1542 }, { "epoch": 2.25938816633083, "grad_norm": 0.4449125280535514, "learning_rate": 2.5678923342399586e-05, "loss": 0.7284, "num_tokens": 4712611226.0, "step": 1543 }, { "epoch": 2.2608536361971057, "grad_norm": 0.5971225221636886, "learning_rate": 2.5662208234947415e-05, "loss": 0.6987, "num_tokens": 4715552972.0, "step": 1544 }, { "epoch": 2.2623191060633814, "grad_norm": 0.5233618672343574, "learning_rate": 2.564548983244461e-05, "loss": 0.7031, "num_tokens": 4718899810.0, "step": 1545 }, { "epoch": 2.2637845759296575, "grad_norm": 0.5113970821870213, "learning_rate": 2.5628768149933468e-05, "loss": 0.7278, "num_tokens": 4721818354.0, "step": 1546 }, { "epoch": 2.2652500457959333, "grad_norm": 0.6096047752396083, "learning_rate": 2.5612043202459224e-05, "loss": 0.6757, "num_tokens": 4724906045.0, "step": 1547 }, { "epoch": 2.2667155156622094, "grad_norm": 0.4600765189742594, "learning_rate": 2.559531500507005e-05, "loss": 0.689, "num_tokens": 4728105245.0, "step": 1548 }, { "epoch": 2.268180985528485, "grad_norm": 0.6494951961737271, "learning_rate": 2.557858357281705e-05, "loss": 0.6973, "num_tokens": 4731210768.0, "step": 1549 }, { "epoch": 2.269646455394761, "grad_norm": 0.4601967886926085, "learning_rate": 2.556184892075423e-05, "loss": 0.7006, "num_tokens": 4734185598.0, "step": 1550 }, { "epoch": 2.271111925261037, "grad_norm": 0.5967838727220792, "learning_rate": 2.5545111063938496e-05, "loss": 0.7, "num_tokens": 4737151000.0, "step": 1551 }, { "epoch": 2.2725773951273127, "grad_norm": 0.5146575347552436, "learning_rate": 2.552837001742963e-05, "loss": 0.692, "num_tokens": 4740369524.0, "step": 1552 }, { "epoch": 2.2740428649935884, "grad_norm": 0.4668041001226483, "learning_rate": 2.5511625796290314e-05, "loss": 0.7003, "num_tokens": 4743511147.0, "step": 1553 }, { "epoch": 2.2755083348598646, "grad_norm": 0.5075830399790876, "learning_rate": 2.5494878415586038e-05, "loss": 0.7105, "num_tokens": 4746688378.0, "step": 1554 }, { "epoch": 2.2769738047261403, "grad_norm": 0.3995088345431428, "learning_rate": 2.5478127890385174e-05, "loss": 0.6921, "num_tokens": 4749826435.0, "step": 1555 }, { "epoch": 2.278439274592416, "grad_norm": 0.5519960268752788, "learning_rate": 2.5461374235758907e-05, "loss": 0.7085, "num_tokens": 4752719778.0, "step": 1556 }, { "epoch": 2.279904744458692, "grad_norm": 0.4405535560249131, "learning_rate": 2.5444617466781246e-05, "loss": 0.7091, "num_tokens": 4755800998.0, "step": 1557 }, { "epoch": 2.281370214324968, "grad_norm": 0.5356627528981739, "learning_rate": 2.5427857598528988e-05, "loss": 0.706, "num_tokens": 4758878194.0, "step": 1558 }, { "epoch": 2.2828356841912436, "grad_norm": 0.5159859616617177, "learning_rate": 2.541109464608173e-05, "loss": 0.6924, "num_tokens": 4762149343.0, "step": 1559 }, { "epoch": 2.2843011540575198, "grad_norm": 0.4477858834521287, "learning_rate": 2.5394328624521846e-05, "loss": 0.7112, "num_tokens": 4765138501.0, "step": 1560 }, { "epoch": 2.2857666239237955, "grad_norm": 0.527033830404785, "learning_rate": 2.537755954893446e-05, "loss": 0.6973, "num_tokens": 4768263380.0, "step": 1561 }, { "epoch": 2.287232093790071, "grad_norm": 0.38730883261663307, "learning_rate": 2.5360787434407454e-05, "loss": 0.6989, "num_tokens": 4771231518.0, "step": 1562 }, { "epoch": 2.2886975636563474, "grad_norm": 0.5710216011318371, "learning_rate": 2.534401229603144e-05, "loss": 0.707, "num_tokens": 4774301939.0, "step": 1563 }, { "epoch": 2.290163033522623, "grad_norm": 0.42658426396905924, "learning_rate": 2.532723414889976e-05, "loss": 0.7184, "num_tokens": 4777108125.0, "step": 1564 }, { "epoch": 2.2916285033888992, "grad_norm": 0.48031632185990397, "learning_rate": 2.531045300810844e-05, "loss": 0.6995, "num_tokens": 4780285349.0, "step": 1565 }, { "epoch": 2.293093973255175, "grad_norm": 0.4910101340705258, "learning_rate": 2.529366888875622e-05, "loss": 0.6913, "num_tokens": 4783304659.0, "step": 1566 }, { "epoch": 2.2945594431214507, "grad_norm": 0.40827077273428575, "learning_rate": 2.527688180594451e-05, "loss": 0.7106, "num_tokens": 4786294098.0, "step": 1567 }, { "epoch": 2.296024912987727, "grad_norm": 0.5490585576812276, "learning_rate": 2.5260091774777406e-05, "loss": 0.7009, "num_tokens": 4789405194.0, "step": 1568 }, { "epoch": 2.2974903828540025, "grad_norm": 0.42890809911903044, "learning_rate": 2.5243298810361622e-05, "loss": 0.7098, "num_tokens": 4792473881.0, "step": 1569 }, { "epoch": 2.2989558527202782, "grad_norm": 0.5078086718368822, "learning_rate": 2.5226502927806546e-05, "loss": 0.6987, "num_tokens": 4795759964.0, "step": 1570 }, { "epoch": 2.3004213225865544, "grad_norm": 0.47328390655665037, "learning_rate": 2.520970414222417e-05, "loss": 0.7131, "num_tokens": 4798846170.0, "step": 1571 }, { "epoch": 2.30188679245283, "grad_norm": 0.422901477677502, "learning_rate": 2.5192902468729093e-05, "loss": 0.7225, "num_tokens": 4801812460.0, "step": 1572 }, { "epoch": 2.3033522623191063, "grad_norm": 0.49037466698442217, "learning_rate": 2.5176097922438537e-05, "loss": 0.6935, "num_tokens": 4804899800.0, "step": 1573 }, { "epoch": 2.304817732185382, "grad_norm": 0.41520483193175606, "learning_rate": 2.5159290518472297e-05, "loss": 0.6959, "num_tokens": 4807988317.0, "step": 1574 }, { "epoch": 2.3062832020516577, "grad_norm": 0.4440102539104402, "learning_rate": 2.514248027195273e-05, "loss": 0.701, "num_tokens": 4811072056.0, "step": 1575 }, { "epoch": 2.307748671917934, "grad_norm": 0.48360650396831395, "learning_rate": 2.512566719800475e-05, "loss": 0.7159, "num_tokens": 4814157145.0, "step": 1576 }, { "epoch": 2.3092141417842096, "grad_norm": 0.46976552211687617, "learning_rate": 2.5108851311755835e-05, "loss": 0.6882, "num_tokens": 4817345667.0, "step": 1577 }, { "epoch": 2.3106796116504853, "grad_norm": 0.4134040896116298, "learning_rate": 2.5092032628335986e-05, "loss": 0.692, "num_tokens": 4820549194.0, "step": 1578 }, { "epoch": 2.3121450815167615, "grad_norm": 0.4826129593980683, "learning_rate": 2.50752111628777e-05, "loss": 0.7106, "num_tokens": 4823537588.0, "step": 1579 }, { "epoch": 2.313610551383037, "grad_norm": 0.4011606785911005, "learning_rate": 2.5058386930516002e-05, "loss": 0.7172, "num_tokens": 4826543198.0, "step": 1580 }, { "epoch": 2.315076021249313, "grad_norm": 0.4201363852877886, "learning_rate": 2.50415599463884e-05, "loss": 0.6857, "num_tokens": 4829772695.0, "step": 1581 }, { "epoch": 2.316541491115589, "grad_norm": 0.40708394860164904, "learning_rate": 2.502473022563487e-05, "loss": 0.7065, "num_tokens": 4832793879.0, "step": 1582 }, { "epoch": 2.3180069609818648, "grad_norm": 0.43947573586368893, "learning_rate": 2.500789778339786e-05, "loss": 0.716, "num_tokens": 4835940572.0, "step": 1583 }, { "epoch": 2.3194724308481405, "grad_norm": 0.386665759483699, "learning_rate": 2.4991062634822257e-05, "loss": 0.7022, "num_tokens": 4839149225.0, "step": 1584 }, { "epoch": 2.3209379007144166, "grad_norm": 0.41471932244545606, "learning_rate": 2.497422479505539e-05, "loss": 0.7021, "num_tokens": 4841830529.0, "step": 1585 }, { "epoch": 2.3224033705806923, "grad_norm": 0.4244088347430158, "learning_rate": 2.495738427924701e-05, "loss": 0.712, "num_tokens": 4844732046.0, "step": 1586 }, { "epoch": 2.323868840446968, "grad_norm": 0.3936269678643711, "learning_rate": 2.4940541102549274e-05, "loss": 0.699, "num_tokens": 4847695548.0, "step": 1587 }, { "epoch": 2.3253343103132442, "grad_norm": 0.47417714097432423, "learning_rate": 2.492369528011673e-05, "loss": 0.7058, "num_tokens": 4850741647.0, "step": 1588 }, { "epoch": 2.32679978017952, "grad_norm": 0.4345149690779481, "learning_rate": 2.4906846827106313e-05, "loss": 0.6876, "num_tokens": 4853614086.0, "step": 1589 }, { "epoch": 2.328265250045796, "grad_norm": 0.4304813030682135, "learning_rate": 2.488999575867731e-05, "loss": 0.7022, "num_tokens": 4856680013.0, "step": 1590 }, { "epoch": 2.329730719912072, "grad_norm": 0.4593925584550838, "learning_rate": 2.4873142089991388e-05, "loss": 0.699, "num_tokens": 4859686497.0, "step": 1591 }, { "epoch": 2.3311961897783475, "grad_norm": 0.46948684905366594, "learning_rate": 2.4856285836212525e-05, "loss": 0.6895, "num_tokens": 4862695328.0, "step": 1592 }, { "epoch": 2.3326616596446237, "grad_norm": 0.37813928354894105, "learning_rate": 2.4839427012507047e-05, "loss": 0.7088, "num_tokens": 4865676776.0, "step": 1593 }, { "epoch": 2.3341271295108994, "grad_norm": 0.4251925757993123, "learning_rate": 2.482256563404357e-05, "loss": 0.6918, "num_tokens": 4868658389.0, "step": 1594 }, { "epoch": 2.335592599377175, "grad_norm": 0.4184711349236048, "learning_rate": 2.4805701715993042e-05, "loss": 0.6904, "num_tokens": 4871595173.0, "step": 1595 }, { "epoch": 2.3370580692434513, "grad_norm": 0.42153381504455484, "learning_rate": 2.4788835273528658e-05, "loss": 0.6986, "num_tokens": 4874800681.0, "step": 1596 }, { "epoch": 2.338523539109727, "grad_norm": 0.4317675602123301, "learning_rate": 2.47719663218259e-05, "loss": 0.7039, "num_tokens": 4877773000.0, "step": 1597 }, { "epoch": 2.339989008976003, "grad_norm": 0.46909322199986475, "learning_rate": 2.475509487606252e-05, "loss": 0.6894, "num_tokens": 4880941312.0, "step": 1598 }, { "epoch": 2.341454478842279, "grad_norm": 0.44854738056695603, "learning_rate": 2.4738220951418504e-05, "loss": 0.683, "num_tokens": 4884092210.0, "step": 1599 }, { "epoch": 2.3429199487085546, "grad_norm": 0.5999481479828603, "learning_rate": 2.472134456307606e-05, "loss": 0.7192, "num_tokens": 4887082679.0, "step": 1600 }, { "epoch": 2.3443854185748307, "grad_norm": 0.44290837918437054, "learning_rate": 2.4704465726219623e-05, "loss": 0.6946, "num_tokens": 4890236856.0, "step": 1601 }, { "epoch": 2.3458508884411065, "grad_norm": 0.7516365469189337, "learning_rate": 2.4687584456035834e-05, "loss": 0.7009, "num_tokens": 4893460982.0, "step": 1602 }, { "epoch": 2.347316358307382, "grad_norm": 0.5728080190258974, "learning_rate": 2.4670700767713517e-05, "loss": 0.7053, "num_tokens": 4896576079.0, "step": 1603 }, { "epoch": 2.3487818281736583, "grad_norm": 0.7198878208333858, "learning_rate": 2.4653814676443665e-05, "loss": 0.7091, "num_tokens": 4899471390.0, "step": 1604 }, { "epoch": 2.350247298039934, "grad_norm": 0.623252980083619, "learning_rate": 2.4636926197419445e-05, "loss": 0.7154, "num_tokens": 4902446164.0, "step": 1605 }, { "epoch": 2.3517127679062098, "grad_norm": 0.6355916390669742, "learning_rate": 2.462003534583617e-05, "loss": 0.6829, "num_tokens": 4905752212.0, "step": 1606 }, { "epoch": 2.353178237772486, "grad_norm": 0.5479898931986629, "learning_rate": 2.460314213689128e-05, "loss": 0.6986, "num_tokens": 4908606830.0, "step": 1607 }, { "epoch": 2.3546437076387616, "grad_norm": 0.6508796583730337, "learning_rate": 2.4586246585784352e-05, "loss": 0.7, "num_tokens": 4911688117.0, "step": 1608 }, { "epoch": 2.3561091775050373, "grad_norm": 0.537044909721635, "learning_rate": 2.456934870771705e-05, "loss": 0.7032, "num_tokens": 4914736118.0, "step": 1609 }, { "epoch": 2.3575746473713135, "grad_norm": 0.7071834570416751, "learning_rate": 2.455244851789315e-05, "loss": 0.7085, "num_tokens": 4917919508.0, "step": 1610 }, { "epoch": 2.359040117237589, "grad_norm": 0.559881708320018, "learning_rate": 2.453554603151849e-05, "loss": 0.6853, "num_tokens": 4920880512.0, "step": 1611 }, { "epoch": 2.360505587103865, "grad_norm": 0.7560345234773478, "learning_rate": 2.4518641263800996e-05, "loss": 0.6977, "num_tokens": 4924049676.0, "step": 1612 }, { "epoch": 2.361971056970141, "grad_norm": 0.6308929617902254, "learning_rate": 2.4501734229950623e-05, "loss": 0.6943, "num_tokens": 4927340029.0, "step": 1613 }, { "epoch": 2.363436526836417, "grad_norm": 0.6843937841346347, "learning_rate": 2.4484824945179382e-05, "loss": 0.6966, "num_tokens": 4930386666.0, "step": 1614 }, { "epoch": 2.364901996702693, "grad_norm": 0.6054878100395716, "learning_rate": 2.446791342470131e-05, "loss": 0.6846, "num_tokens": 4933479391.0, "step": 1615 }, { "epoch": 2.3663674665689687, "grad_norm": 0.6008603662918548, "learning_rate": 2.4450999683732443e-05, "loss": 0.7068, "num_tokens": 4936559134.0, "step": 1616 }, { "epoch": 2.3678329364352444, "grad_norm": 0.5441142453503164, "learning_rate": 2.443408373749082e-05, "loss": 0.6959, "num_tokens": 4939407688.0, "step": 1617 }, { "epoch": 2.3692984063015206, "grad_norm": 0.6613507741763264, "learning_rate": 2.441716560119647e-05, "loss": 0.6766, "num_tokens": 4942401419.0, "step": 1618 }, { "epoch": 2.3707638761677963, "grad_norm": 0.5502094135369707, "learning_rate": 2.440024529007138e-05, "loss": 0.6917, "num_tokens": 4945238043.0, "step": 1619 }, { "epoch": 2.372229346034072, "grad_norm": 0.7294829761149649, "learning_rate": 2.438332281933951e-05, "loss": 0.7021, "num_tokens": 4948276175.0, "step": 1620 }, { "epoch": 2.373694815900348, "grad_norm": 0.5920730310224475, "learning_rate": 2.4366398204226747e-05, "loss": 0.6899, "num_tokens": 4951371074.0, "step": 1621 }, { "epoch": 2.375160285766624, "grad_norm": 0.6548788248575416, "learning_rate": 2.4349471459960935e-05, "loss": 0.7138, "num_tokens": 4954345236.0, "step": 1622 }, { "epoch": 2.3766257556329, "grad_norm": 0.5823421828811941, "learning_rate": 2.433254260177179e-05, "loss": 0.6959, "num_tokens": 4957507241.0, "step": 1623 }, { "epoch": 2.3780912254991757, "grad_norm": 0.5657737679124532, "learning_rate": 2.4315611644890962e-05, "loss": 0.6917, "num_tokens": 4960675310.0, "step": 1624 }, { "epoch": 2.3795566953654514, "grad_norm": 0.500301905045223, "learning_rate": 2.4298678604551987e-05, "loss": 0.7202, "num_tokens": 4963633293.0, "step": 1625 }, { "epoch": 2.3810221652317276, "grad_norm": 0.6258679615658008, "learning_rate": 2.4281743495990265e-05, "loss": 0.7014, "num_tokens": 4966820549.0, "step": 1626 }, { "epoch": 2.3824876350980033, "grad_norm": 0.43036139990685973, "learning_rate": 2.4264806334443063e-05, "loss": 0.6837, "num_tokens": 4969868328.0, "step": 1627 }, { "epoch": 2.383953104964279, "grad_norm": 0.707392943102859, "learning_rate": 2.4247867135149492e-05, "loss": 0.6939, "num_tokens": 4972953368.0, "step": 1628 }, { "epoch": 2.385418574830555, "grad_norm": 0.5701379155261812, "learning_rate": 2.42309259133505e-05, "loss": 0.7108, "num_tokens": 4976035208.0, "step": 1629 }, { "epoch": 2.386884044696831, "grad_norm": 0.6526350427642641, "learning_rate": 2.4213982684288854e-05, "loss": 0.6972, "num_tokens": 4979018875.0, "step": 1630 }, { "epoch": 2.3883495145631066, "grad_norm": 0.5851435564536693, "learning_rate": 2.4197037463209125e-05, "loss": 0.7061, "num_tokens": 4982171477.0, "step": 1631 }, { "epoch": 2.389814984429383, "grad_norm": 0.515465607738937, "learning_rate": 2.4180090265357677e-05, "loss": 0.7094, "num_tokens": 4985252380.0, "step": 1632 }, { "epoch": 2.3912804542956585, "grad_norm": 0.5460414437995168, "learning_rate": 2.4163141105982654e-05, "loss": 0.7081, "num_tokens": 4988211886.0, "step": 1633 }, { "epoch": 2.392745924161934, "grad_norm": 0.5601054959392333, "learning_rate": 2.4146190000333963e-05, "loss": 0.7204, "num_tokens": 4991042932.0, "step": 1634 }, { "epoch": 2.3942113940282104, "grad_norm": 0.4462370193759463, "learning_rate": 2.412923696366326e-05, "loss": 0.7007, "num_tokens": 4994114342.0, "step": 1635 }, { "epoch": 2.395676863894486, "grad_norm": 0.6403216886203645, "learning_rate": 2.411228201122395e-05, "loss": 0.7231, "num_tokens": 4997190930.0, "step": 1636 }, { "epoch": 2.397142333760762, "grad_norm": 0.5212004733261729, "learning_rate": 2.4095325158271147e-05, "loss": 0.6925, "num_tokens": 5000483987.0, "step": 1637 }, { "epoch": 2.398607803627038, "grad_norm": 0.5929047369321417, "learning_rate": 2.407836642006168e-05, "loss": 0.7, "num_tokens": 5003424040.0, "step": 1638 }, { "epoch": 2.4000732734933137, "grad_norm": 0.562573654438875, "learning_rate": 2.4061405811854077e-05, "loss": 0.6756, "num_tokens": 5006555950.0, "step": 1639 }, { "epoch": 2.40153874335959, "grad_norm": 0.5650852948311534, "learning_rate": 2.4044443348908552e-05, "loss": 0.7066, "num_tokens": 5009595176.0, "step": 1640 }, { "epoch": 2.4030042132258655, "grad_norm": 0.49521276928984426, "learning_rate": 2.4027479046486973e-05, "loss": 0.6868, "num_tokens": 5012782271.0, "step": 1641 }, { "epoch": 2.4044696830921413, "grad_norm": 0.5544305326399391, "learning_rate": 2.401051291985288e-05, "loss": 0.7119, "num_tokens": 5015810449.0, "step": 1642 }, { "epoch": 2.4059351529584174, "grad_norm": 0.4821641040968821, "learning_rate": 2.3993544984271445e-05, "loss": 0.7057, "num_tokens": 5018756729.0, "step": 1643 }, { "epoch": 2.407400622824693, "grad_norm": 0.6098504832682118, "learning_rate": 2.3976575255009472e-05, "loss": 0.6941, "num_tokens": 5021596367.0, "step": 1644 }, { "epoch": 2.408866092690969, "grad_norm": 0.5536130451663451, "learning_rate": 2.3959603747335365e-05, "loss": 0.7022, "num_tokens": 5024755560.0, "step": 1645 }, { "epoch": 2.410331562557245, "grad_norm": 0.5351561881399429, "learning_rate": 2.3942630476519156e-05, "loss": 0.6997, "num_tokens": 5027873348.0, "step": 1646 }, { "epoch": 2.4117970324235207, "grad_norm": 0.534630250382874, "learning_rate": 2.392565545783244e-05, "loss": 0.7111, "num_tokens": 5030963769.0, "step": 1647 }, { "epoch": 2.413262502289797, "grad_norm": 0.5299766340975229, "learning_rate": 2.3908678706548386e-05, "loss": 0.6982, "num_tokens": 5034006383.0, "step": 1648 }, { "epoch": 2.4147279721560726, "grad_norm": 0.46516869549737344, "learning_rate": 2.3891700237941737e-05, "loss": 0.692, "num_tokens": 5037105465.0, "step": 1649 }, { "epoch": 2.4161934420223483, "grad_norm": 0.6530463712273142, "learning_rate": 2.3874720067288767e-05, "loss": 0.7158, "num_tokens": 5039962408.0, "step": 1650 }, { "epoch": 2.4176589118886245, "grad_norm": 0.5623690248547457, "learning_rate": 2.3857738209867284e-05, "loss": 0.6917, "num_tokens": 5043197782.0, "step": 1651 }, { "epoch": 2.4191243817549, "grad_norm": 0.6112477508786739, "learning_rate": 2.3840754680956618e-05, "loss": 0.7081, "num_tokens": 5046333919.0, "step": 1652 }, { "epoch": 2.420589851621176, "grad_norm": 0.6650381935697949, "learning_rate": 2.3823769495837606e-05, "loss": 0.6894, "num_tokens": 5049411444.0, "step": 1653 }, { "epoch": 2.422055321487452, "grad_norm": 0.4987206742555979, "learning_rate": 2.3806782669792558e-05, "loss": 0.6974, "num_tokens": 5052550110.0, "step": 1654 }, { "epoch": 2.4235207913537278, "grad_norm": 0.5586768913963881, "learning_rate": 2.3789794218105286e-05, "loss": 0.7135, "num_tokens": 5055481069.0, "step": 1655 }, { "epoch": 2.4249862612200035, "grad_norm": 0.4232436970320549, "learning_rate": 2.3772804156061038e-05, "loss": 0.6691, "num_tokens": 5058621117.0, "step": 1656 }, { "epoch": 2.4264517310862797, "grad_norm": 0.5546340569433037, "learning_rate": 2.3755812498946536e-05, "loss": 0.6892, "num_tokens": 5061339517.0, "step": 1657 }, { "epoch": 2.4279172009525554, "grad_norm": 0.4114748367994068, "learning_rate": 2.3738819262049916e-05, "loss": 0.7053, "num_tokens": 5064247144.0, "step": 1658 }, { "epoch": 2.429382670818831, "grad_norm": 0.47357288393711305, "learning_rate": 2.3721824460660745e-05, "loss": 0.7086, "num_tokens": 5067353206.0, "step": 1659 }, { "epoch": 2.4308481406851072, "grad_norm": 0.44768639244794906, "learning_rate": 2.370482811007e-05, "loss": 0.7034, "num_tokens": 5070369815.0, "step": 1660 }, { "epoch": 2.432313610551383, "grad_norm": 0.4271374396606263, "learning_rate": 2.368783022557005e-05, "loss": 0.7017, "num_tokens": 5073526215.0, "step": 1661 }, { "epoch": 2.4337790804176587, "grad_norm": 0.4502350928500912, "learning_rate": 2.3670830822454635e-05, "loss": 0.7076, "num_tokens": 5076585741.0, "step": 1662 }, { "epoch": 2.435244550283935, "grad_norm": 0.43177961736615933, "learning_rate": 2.365382991601888e-05, "loss": 0.6956, "num_tokens": 5079608999.0, "step": 1663 }, { "epoch": 2.4367100201502105, "grad_norm": 0.3908636891324485, "learning_rate": 2.363682752155925e-05, "loss": 0.7123, "num_tokens": 5082616950.0, "step": 1664 }, { "epoch": 2.4381754900164867, "grad_norm": 0.4679917280263215, "learning_rate": 2.361982365437354e-05, "loss": 0.7066, "num_tokens": 5085895965.0, "step": 1665 }, { "epoch": 2.4396409598827624, "grad_norm": 0.4532241232858204, "learning_rate": 2.36028183297609e-05, "loss": 0.7074, "num_tokens": 5088877120.0, "step": 1666 }, { "epoch": 2.441106429749038, "grad_norm": 0.4028034497483124, "learning_rate": 2.3585811563021754e-05, "loss": 0.7031, "num_tokens": 5091909265.0, "step": 1667 }, { "epoch": 2.4425718996153143, "grad_norm": 0.4667322255552734, "learning_rate": 2.356880336945785e-05, "loss": 0.6878, "num_tokens": 5095085879.0, "step": 1668 }, { "epoch": 2.44403736948159, "grad_norm": 0.3659754752343338, "learning_rate": 2.3551793764372213e-05, "loss": 0.6904, "num_tokens": 5098130086.0, "step": 1669 }, { "epoch": 2.4455028393478657, "grad_norm": 0.4789051098208179, "learning_rate": 2.353478276306913e-05, "loss": 0.6954, "num_tokens": 5101240176.0, "step": 1670 }, { "epoch": 2.446968309214142, "grad_norm": 0.4326893420828402, "learning_rate": 2.351777038085416e-05, "loss": 0.709, "num_tokens": 5104380830.0, "step": 1671 }, { "epoch": 2.4484337790804176, "grad_norm": 0.4736160950258135, "learning_rate": 2.3500756633034085e-05, "loss": 0.6838, "num_tokens": 5107118370.0, "step": 1672 }, { "epoch": 2.4498992489466938, "grad_norm": 0.36776651798126647, "learning_rate": 2.3483741534916935e-05, "loss": 0.6816, "num_tokens": 5110175639.0, "step": 1673 }, { "epoch": 2.4513647188129695, "grad_norm": 0.39959284191242234, "learning_rate": 2.3466725101811943e-05, "loss": 0.7102, "num_tokens": 5113498309.0, "step": 1674 }, { "epoch": 2.452830188679245, "grad_norm": 0.4932618347964198, "learning_rate": 2.344970734902954e-05, "loss": 0.6927, "num_tokens": 5116775774.0, "step": 1675 }, { "epoch": 2.4542956585455213, "grad_norm": 0.3618976055016676, "learning_rate": 2.3432688291881364e-05, "loss": 0.7032, "num_tokens": 5119803544.0, "step": 1676 }, { "epoch": 2.455761128411797, "grad_norm": 0.45775564343348013, "learning_rate": 2.3415667945680193e-05, "loss": 0.6973, "num_tokens": 5122860585.0, "step": 1677 }, { "epoch": 2.4572265982780728, "grad_norm": 0.4286152454645037, "learning_rate": 2.3398646325740004e-05, "loss": 0.7159, "num_tokens": 5125736092.0, "step": 1678 }, { "epoch": 2.458692068144349, "grad_norm": 0.4261392179623483, "learning_rate": 2.338162344737589e-05, "loss": 0.7173, "num_tokens": 5128728650.0, "step": 1679 }, { "epoch": 2.4601575380106246, "grad_norm": 0.42639584811684483, "learning_rate": 2.3364599325904096e-05, "loss": 0.6852, "num_tokens": 5131903574.0, "step": 1680 }, { "epoch": 2.4616230078769004, "grad_norm": 0.4571805749246758, "learning_rate": 2.3347573976641967e-05, "loss": 0.7089, "num_tokens": 5135062335.0, "step": 1681 }, { "epoch": 2.4630884777431765, "grad_norm": 0.4213473887983809, "learning_rate": 2.3330547414907968e-05, "loss": 0.6843, "num_tokens": 5137902574.0, "step": 1682 }, { "epoch": 2.4645539476094522, "grad_norm": 0.48983112933732914, "learning_rate": 2.3313519656021636e-05, "loss": 0.6974, "num_tokens": 5140844939.0, "step": 1683 }, { "epoch": 2.466019417475728, "grad_norm": 0.3893923429199581, "learning_rate": 2.3296490715303622e-05, "loss": 0.7073, "num_tokens": 5143718465.0, "step": 1684 }, { "epoch": 2.467484887342004, "grad_norm": 0.41429606875502134, "learning_rate": 2.32794606080756e-05, "loss": 0.7016, "num_tokens": 5146873373.0, "step": 1685 }, { "epoch": 2.46895035720828, "grad_norm": 0.3842309561278893, "learning_rate": 2.3262429349660312e-05, "loss": 0.7035, "num_tokens": 5149840982.0, "step": 1686 }, { "epoch": 2.4704158270745555, "grad_norm": 0.42592416558034174, "learning_rate": 2.324539695538154e-05, "loss": 0.7002, "num_tokens": 5152910877.0, "step": 1687 }, { "epoch": 2.4718812969408317, "grad_norm": 0.3741186707997887, "learning_rate": 2.322836344056408e-05, "loss": 0.7026, "num_tokens": 5156090266.0, "step": 1688 }, { "epoch": 2.4733467668071074, "grad_norm": 0.3770154200690991, "learning_rate": 2.3211328820533737e-05, "loss": 0.7015, "num_tokens": 5159062745.0, "step": 1689 }, { "epoch": 2.4748122366733836, "grad_norm": 0.4557063885567201, "learning_rate": 2.3194293110617302e-05, "loss": 0.6826, "num_tokens": 5162059475.0, "step": 1690 }, { "epoch": 2.4762777065396593, "grad_norm": 0.37808215438769005, "learning_rate": 2.3177256326142578e-05, "loss": 0.6901, "num_tokens": 5165172317.0, "step": 1691 }, { "epoch": 2.477743176405935, "grad_norm": 0.4257996289001036, "learning_rate": 2.3160218482438296e-05, "loss": 0.6911, "num_tokens": 5168235557.0, "step": 1692 }, { "epoch": 2.479208646272211, "grad_norm": 0.4490498053557484, "learning_rate": 2.314317959483416e-05, "loss": 0.7011, "num_tokens": 5171380314.0, "step": 1693 }, { "epoch": 2.480674116138487, "grad_norm": 0.39807884170215335, "learning_rate": 2.312613967866081e-05, "loss": 0.6953, "num_tokens": 5174354500.0, "step": 1694 }, { "epoch": 2.4821395860047626, "grad_norm": 0.43858665354529874, "learning_rate": 2.310909874924982e-05, "loss": 0.6969, "num_tokens": 5177534841.0, "step": 1695 }, { "epoch": 2.4836050558710387, "grad_norm": 0.39430986308107313, "learning_rate": 2.3092056821933655e-05, "loss": 0.6832, "num_tokens": 5180601199.0, "step": 1696 }, { "epoch": 2.4850705257373145, "grad_norm": 0.4398628004232293, "learning_rate": 2.3075013912045695e-05, "loss": 0.698, "num_tokens": 5183935316.0, "step": 1697 }, { "epoch": 2.4865359956035906, "grad_norm": 0.39109478350342175, "learning_rate": 2.305797003492021e-05, "loss": 0.6905, "num_tokens": 5186949660.0, "step": 1698 }, { "epoch": 2.4880014654698663, "grad_norm": 0.5184648301653152, "learning_rate": 2.304092520589232e-05, "loss": 0.7016, "num_tokens": 5190005585.0, "step": 1699 }, { "epoch": 2.489466935336142, "grad_norm": 0.39290488353167197, "learning_rate": 2.3023879440298006e-05, "loss": 0.7048, "num_tokens": 5192907436.0, "step": 1700 }, { "epoch": 2.490932405202418, "grad_norm": 0.477744882231662, "learning_rate": 2.3006832753474105e-05, "loss": 0.7205, "num_tokens": 5195806349.0, "step": 1701 }, { "epoch": 2.492397875068694, "grad_norm": 0.4152970304027692, "learning_rate": 2.2989785160758268e-05, "loss": 0.7154, "num_tokens": 5198886653.0, "step": 1702 }, { "epoch": 2.4938633449349696, "grad_norm": 0.490282393976516, "learning_rate": 2.2972736677488972e-05, "loss": 0.7165, "num_tokens": 5201814735.0, "step": 1703 }, { "epoch": 2.495328814801246, "grad_norm": 0.427275582242966, "learning_rate": 2.2955687319005495e-05, "loss": 0.6924, "num_tokens": 5204874261.0, "step": 1704 }, { "epoch": 2.4967942846675215, "grad_norm": 0.44740514325574476, "learning_rate": 2.2938637100647893e-05, "loss": 0.6948, "num_tokens": 5207905406.0, "step": 1705 }, { "epoch": 2.4982597545337972, "grad_norm": 0.40128117085505505, "learning_rate": 2.2921586037757004e-05, "loss": 0.6953, "num_tokens": 5210908059.0, "step": 1706 }, { "epoch": 2.4997252244000734, "grad_norm": 0.4319433461674659, "learning_rate": 2.2904534145674414e-05, "loss": 0.7169, "num_tokens": 5214098635.0, "step": 1707 }, { "epoch": 2.501190694266349, "grad_norm": 0.42759555494130985, "learning_rate": 2.2887481439742473e-05, "loss": 0.7144, "num_tokens": 5217162954.0, "step": 1708 }, { "epoch": 2.502656164132625, "grad_norm": 0.4252207273843664, "learning_rate": 2.2870427935304257e-05, "loss": 0.6839, "num_tokens": 5220274712.0, "step": 1709 }, { "epoch": 2.504121633998901, "grad_norm": 0.44018302759285016, "learning_rate": 2.2853373647703547e-05, "loss": 0.6866, "num_tokens": 5222996767.0, "step": 1710 }, { "epoch": 2.5055871038651767, "grad_norm": 0.35725160423238894, "learning_rate": 2.2836318592284847e-05, "loss": 0.713, "num_tokens": 5225987737.0, "step": 1711 }, { "epoch": 2.5070525737314524, "grad_norm": 0.45228040489835597, "learning_rate": 2.281926278439335e-05, "loss": 0.6976, "num_tokens": 5229086541.0, "step": 1712 }, { "epoch": 2.5085180435977286, "grad_norm": 0.3852933112978403, "learning_rate": 2.2802206239374906e-05, "loss": 0.6985, "num_tokens": 5232170082.0, "step": 1713 }, { "epoch": 2.5099835134640043, "grad_norm": 0.4734746106179736, "learning_rate": 2.2785148972576052e-05, "loss": 0.7017, "num_tokens": 5235463301.0, "step": 1714 }, { "epoch": 2.51144898333028, "grad_norm": 0.4309946167740658, "learning_rate": 2.276809099934396e-05, "loss": 0.7017, "num_tokens": 5238456703.0, "step": 1715 }, { "epoch": 2.512914453196556, "grad_norm": 0.4516134152131566, "learning_rate": 2.275103233502645e-05, "loss": 0.7238, "num_tokens": 5241540472.0, "step": 1716 }, { "epoch": 2.514379923062832, "grad_norm": 0.42861159158040885, "learning_rate": 2.2733972994971944e-05, "loss": 0.723, "num_tokens": 5244701257.0, "step": 1717 }, { "epoch": 2.515845392929108, "grad_norm": 0.4321770221433927, "learning_rate": 2.2716912994529493e-05, "loss": 0.6975, "num_tokens": 5247814451.0, "step": 1718 }, { "epoch": 2.5173108627953837, "grad_norm": 0.4773590465118812, "learning_rate": 2.2699852349048737e-05, "loss": 0.6926, "num_tokens": 5250944383.0, "step": 1719 }, { "epoch": 2.51877633266166, "grad_norm": 0.4210834166454949, "learning_rate": 2.2682791073879885e-05, "loss": 0.6955, "num_tokens": 5254017490.0, "step": 1720 }, { "epoch": 2.5202418025279356, "grad_norm": 0.46231394185287894, "learning_rate": 2.2665729184373728e-05, "loss": 0.6885, "num_tokens": 5257067037.0, "step": 1721 }, { "epoch": 2.5217072723942113, "grad_norm": 0.413395191394943, "learning_rate": 2.2648666695881594e-05, "loss": 0.6932, "num_tokens": 5259987698.0, "step": 1722 }, { "epoch": 2.5231727422604875, "grad_norm": 0.4302812291449578, "learning_rate": 2.263160362375536e-05, "loss": 0.69, "num_tokens": 5262896645.0, "step": 1723 }, { "epoch": 2.524638212126763, "grad_norm": 0.41170690795787557, "learning_rate": 2.2614539983347425e-05, "loss": 0.7038, "num_tokens": 5266013894.0, "step": 1724 }, { "epoch": 2.526103681993039, "grad_norm": 0.47109836332524, "learning_rate": 2.2597475790010706e-05, "loss": 0.6931, "num_tokens": 5268965193.0, "step": 1725 }, { "epoch": 2.527569151859315, "grad_norm": 0.3831334459134184, "learning_rate": 2.2580411059098615e-05, "loss": 0.6842, "num_tokens": 5272026640.0, "step": 1726 }, { "epoch": 2.529034621725591, "grad_norm": 0.5457375082552042, "learning_rate": 2.256334580596503e-05, "loss": 0.7046, "num_tokens": 5274948614.0, "step": 1727 }, { "epoch": 2.5305000915918665, "grad_norm": 0.41519285498910474, "learning_rate": 2.2546280045964327e-05, "loss": 0.6981, "num_tokens": 5277909807.0, "step": 1728 }, { "epoch": 2.5319655614581427, "grad_norm": 0.46697041524216876, "learning_rate": 2.2529213794451317e-05, "loss": 0.6927, "num_tokens": 5281150736.0, "step": 1729 }, { "epoch": 2.5334310313244184, "grad_norm": 0.5287569033034807, "learning_rate": 2.251214706678127e-05, "loss": 0.6949, "num_tokens": 5283921672.0, "step": 1730 }, { "epoch": 2.534896501190694, "grad_norm": 0.3837027940308178, "learning_rate": 2.249507987830986e-05, "loss": 0.6931, "num_tokens": 5287039342.0, "step": 1731 }, { "epoch": 2.5363619710569703, "grad_norm": 0.5449338532038152, "learning_rate": 2.2478012244393207e-05, "loss": 0.692, "num_tokens": 5290142128.0, "step": 1732 }, { "epoch": 2.537827440923246, "grad_norm": 0.4387671705981872, "learning_rate": 2.246094418038781e-05, "loss": 0.7046, "num_tokens": 5292818568.0, "step": 1733 }, { "epoch": 2.5392929107895217, "grad_norm": 0.4243455517852554, "learning_rate": 2.2443875701650555e-05, "loss": 0.7018, "num_tokens": 5295912168.0, "step": 1734 }, { "epoch": 2.540758380655798, "grad_norm": 0.47978551005261766, "learning_rate": 2.242680682353871e-05, "loss": 0.7027, "num_tokens": 5299081450.0, "step": 1735 }, { "epoch": 2.5422238505220736, "grad_norm": 0.38962393793596006, "learning_rate": 2.2409737561409905e-05, "loss": 0.6863, "num_tokens": 5302223258.0, "step": 1736 }, { "epoch": 2.5436893203883493, "grad_norm": 0.40803281708973477, "learning_rate": 2.2392667930622105e-05, "loss": 0.6944, "num_tokens": 5305234427.0, "step": 1737 }, { "epoch": 2.5451547902546254, "grad_norm": 0.42172256957755233, "learning_rate": 2.2375597946533604e-05, "loss": 0.6916, "num_tokens": 5308139897.0, "step": 1738 }, { "epoch": 2.546620260120901, "grad_norm": 0.3851222393626624, "learning_rate": 2.2358527624503033e-05, "loss": 0.7006, "num_tokens": 5311130614.0, "step": 1739 }, { "epoch": 2.548085729987177, "grad_norm": 0.4159949505676457, "learning_rate": 2.234145697988932e-05, "loss": 0.694, "num_tokens": 5314286173.0, "step": 1740 }, { "epoch": 2.549551199853453, "grad_norm": 0.3953443808609083, "learning_rate": 2.2324386028051655e-05, "loss": 0.6774, "num_tokens": 5317176163.0, "step": 1741 }, { "epoch": 2.5510166697197287, "grad_norm": 0.35991533607107334, "learning_rate": 2.230731478434955e-05, "loss": 0.6973, "num_tokens": 5320262176.0, "step": 1742 }, { "epoch": 2.552482139586005, "grad_norm": 0.4835340817744485, "learning_rate": 2.229024326414275e-05, "loss": 0.6875, "num_tokens": 5323407802.0, "step": 1743 }, { "epoch": 2.5539476094522806, "grad_norm": 0.38014384561880427, "learning_rate": 2.227317148279125e-05, "loss": 0.7047, "num_tokens": 5326330881.0, "step": 1744 }, { "epoch": 2.5554130793185568, "grad_norm": 0.4086776774011518, "learning_rate": 2.2256099455655295e-05, "loss": 0.7168, "num_tokens": 5329366826.0, "step": 1745 }, { "epoch": 2.5568785491848325, "grad_norm": 0.4002383270441421, "learning_rate": 2.2239027198095344e-05, "loss": 0.6983, "num_tokens": 5332498326.0, "step": 1746 }, { "epoch": 2.558344019051108, "grad_norm": 0.41980371497316715, "learning_rate": 2.2221954725472053e-05, "loss": 0.6841, "num_tokens": 5335455858.0, "step": 1747 }, { "epoch": 2.5598094889173844, "grad_norm": 0.38530416294195546, "learning_rate": 2.2204882053146278e-05, "loss": 0.694, "num_tokens": 5338458041.0, "step": 1748 }, { "epoch": 2.56127495878366, "grad_norm": 0.4452669570570983, "learning_rate": 2.2187809196479076e-05, "loss": 0.6905, "num_tokens": 5341521467.0, "step": 1749 }, { "epoch": 2.562740428649936, "grad_norm": 0.3812132328470332, "learning_rate": 2.217073617083163e-05, "loss": 0.714, "num_tokens": 5344442054.0, "step": 1750 }, { "epoch": 2.564205898516212, "grad_norm": 0.45277523551930565, "learning_rate": 2.2153662991565313e-05, "loss": 0.6977, "num_tokens": 5347643496.0, "step": 1751 }, { "epoch": 2.5656713683824877, "grad_norm": 0.4420879986386117, "learning_rate": 2.2136589674041603e-05, "loss": 0.685, "num_tokens": 5350782441.0, "step": 1752 }, { "epoch": 2.5671368382487634, "grad_norm": 0.37851210046543343, "learning_rate": 2.2119516233622127e-05, "loss": 0.6912, "num_tokens": 5353838752.0, "step": 1753 }, { "epoch": 2.5686023081150395, "grad_norm": 0.4195134406106909, "learning_rate": 2.2102442685668612e-05, "loss": 0.7112, "num_tokens": 5356932787.0, "step": 1754 }, { "epoch": 2.5700677779813152, "grad_norm": 0.3928975146385568, "learning_rate": 2.2085369045542887e-05, "loss": 0.7135, "num_tokens": 5359720144.0, "step": 1755 }, { "epoch": 2.571533247847591, "grad_norm": 0.4119050901772562, "learning_rate": 2.206829532860686e-05, "loss": 0.7014, "num_tokens": 5362939231.0, "step": 1756 }, { "epoch": 2.572998717713867, "grad_norm": 0.3538439833749421, "learning_rate": 2.20512215502225e-05, "loss": 0.7209, "num_tokens": 5366051538.0, "step": 1757 }, { "epoch": 2.574464187580143, "grad_norm": 0.40435964804803937, "learning_rate": 2.2034147725751858e-05, "loss": 0.6833, "num_tokens": 5368924107.0, "step": 1758 }, { "epoch": 2.5759296574464186, "grad_norm": 0.38603606136999563, "learning_rate": 2.2017073870556986e-05, "loss": 0.7111, "num_tokens": 5371984696.0, "step": 1759 }, { "epoch": 2.5773951273126947, "grad_norm": 0.41242163844300006, "learning_rate": 2.2000000000000003e-05, "loss": 0.7063, "num_tokens": 5374816634.0, "step": 1760 }, { "epoch": 2.5788605971789704, "grad_norm": 0.35169022483313644, "learning_rate": 2.1982926129443026e-05, "loss": 0.6735, "num_tokens": 5377841465.0, "step": 1761 }, { "epoch": 2.580326067045246, "grad_norm": 0.36207685864554656, "learning_rate": 2.1965852274248155e-05, "loss": 0.6847, "num_tokens": 5380959479.0, "step": 1762 }, { "epoch": 2.5817915369115223, "grad_norm": 0.37897119335917434, "learning_rate": 2.194877844977751e-05, "loss": 0.6991, "num_tokens": 5383841763.0, "step": 1763 }, { "epoch": 2.583257006777798, "grad_norm": 0.3815462715395564, "learning_rate": 2.1931704671393154e-05, "loss": 0.6963, "num_tokens": 5386711891.0, "step": 1764 }, { "epoch": 2.5847224766440737, "grad_norm": 0.352353635425442, "learning_rate": 2.1914630954457122e-05, "loss": 0.7225, "num_tokens": 5389859131.0, "step": 1765 }, { "epoch": 2.58618794651035, "grad_norm": 0.40112907509807566, "learning_rate": 2.1897557314331393e-05, "loss": 0.7244, "num_tokens": 5392961727.0, "step": 1766 }, { "epoch": 2.5876534163766256, "grad_norm": 0.4226198297504579, "learning_rate": 2.1880483766377886e-05, "loss": 0.6996, "num_tokens": 5395811060.0, "step": 1767 }, { "epoch": 2.5891188862429018, "grad_norm": 0.39483269218651457, "learning_rate": 2.1863410325958406e-05, "loss": 0.6961, "num_tokens": 5398810836.0, "step": 1768 }, { "epoch": 2.5905843561091775, "grad_norm": 0.4029231323130329, "learning_rate": 2.18463370084347e-05, "loss": 0.6979, "num_tokens": 5401936970.0, "step": 1769 }, { "epoch": 2.5920498259754536, "grad_norm": 0.37324675287376885, "learning_rate": 2.1829263829168372e-05, "loss": 0.6857, "num_tokens": 5404893409.0, "step": 1770 }, { "epoch": 2.5935152958417294, "grad_norm": 0.3943844612362309, "learning_rate": 2.1812190803520936e-05, "loss": 0.7134, "num_tokens": 5407687620.0, "step": 1771 }, { "epoch": 2.594980765708005, "grad_norm": 0.3984205688295169, "learning_rate": 2.1795117946853724e-05, "loss": 0.7132, "num_tokens": 5410805330.0, "step": 1772 }, { "epoch": 2.5964462355742812, "grad_norm": 0.3795056158888877, "learning_rate": 2.177804527452796e-05, "loss": 0.7003, "num_tokens": 5413818981.0, "step": 1773 }, { "epoch": 2.597911705440557, "grad_norm": 0.3540162489850375, "learning_rate": 2.1760972801904666e-05, "loss": 0.7204, "num_tokens": 5416739527.0, "step": 1774 }, { "epoch": 2.5993771753068327, "grad_norm": 0.3900492395667007, "learning_rate": 2.1743900544344718e-05, "loss": 0.69, "num_tokens": 5419965309.0, "step": 1775 }, { "epoch": 2.600842645173109, "grad_norm": 0.35087922048268094, "learning_rate": 2.1726828517208753e-05, "loss": 0.7029, "num_tokens": 5422932178.0, "step": 1776 }, { "epoch": 2.6023081150393845, "grad_norm": 0.3462316482646767, "learning_rate": 2.1709756735857264e-05, "loss": 0.7047, "num_tokens": 5426243883.0, "step": 1777 }, { "epoch": 2.6037735849056602, "grad_norm": 0.37080165293383144, "learning_rate": 2.1692685215650457e-05, "loss": 0.6986, "num_tokens": 5429327654.0, "step": 1778 }, { "epoch": 2.6052390547719364, "grad_norm": 0.3509337018028498, "learning_rate": 2.167561397194835e-05, "loss": 0.6917, "num_tokens": 5432430222.0, "step": 1779 }, { "epoch": 2.606704524638212, "grad_norm": 0.3808514092369401, "learning_rate": 2.1658543020110687e-05, "loss": 0.692, "num_tokens": 5435391517.0, "step": 1780 }, { "epoch": 2.608169994504488, "grad_norm": 0.3866933141864262, "learning_rate": 2.1641472375496973e-05, "loss": 0.7037, "num_tokens": 5438286691.0, "step": 1781 }, { "epoch": 2.609635464370764, "grad_norm": 0.39788507690008107, "learning_rate": 2.16244020534664e-05, "loss": 0.7102, "num_tokens": 5441318897.0, "step": 1782 }, { "epoch": 2.6111009342370397, "grad_norm": 0.4216778813061623, "learning_rate": 2.1607332069377904e-05, "loss": 0.6946, "num_tokens": 5444477122.0, "step": 1783 }, { "epoch": 2.6125664041033154, "grad_norm": 0.3993955932403623, "learning_rate": 2.15902624385901e-05, "loss": 0.6913, "num_tokens": 5447574224.0, "step": 1784 }, { "epoch": 2.6140318739695916, "grad_norm": 0.4038131848765513, "learning_rate": 2.15731931764613e-05, "loss": 0.7098, "num_tokens": 5450607876.0, "step": 1785 }, { "epoch": 2.6154973438358673, "grad_norm": 0.44514487786936924, "learning_rate": 2.1556124298349454e-05, "loss": 0.714, "num_tokens": 5453702027.0, "step": 1786 }, { "epoch": 2.616962813702143, "grad_norm": 0.4219765937577303, "learning_rate": 2.1539055819612202e-05, "loss": 0.6972, "num_tokens": 5456774227.0, "step": 1787 }, { "epoch": 2.618428283568419, "grad_norm": 0.40362396332004413, "learning_rate": 2.1521987755606795e-05, "loss": 0.7139, "num_tokens": 5459875307.0, "step": 1788 }, { "epoch": 2.619893753434695, "grad_norm": 0.4551726443459678, "learning_rate": 2.1504920121690147e-05, "loss": 0.6956, "num_tokens": 5462919027.0, "step": 1789 }, { "epoch": 2.6213592233009706, "grad_norm": 0.4075526551445301, "learning_rate": 2.148785293321874e-05, "loss": 0.7113, "num_tokens": 5466043971.0, "step": 1790 }, { "epoch": 2.6228246931672468, "grad_norm": 0.4018130600523857, "learning_rate": 2.1470786205548692e-05, "loss": 0.7056, "num_tokens": 5468859575.0, "step": 1791 }, { "epoch": 2.6242901630335225, "grad_norm": 0.45756838576783526, "learning_rate": 2.145371995403568e-05, "loss": 0.7036, "num_tokens": 5472004094.0, "step": 1792 }, { "epoch": 2.6257556328997986, "grad_norm": 0.3928693030898483, "learning_rate": 2.143665419403498e-05, "loss": 0.6883, "num_tokens": 5474896038.0, "step": 1793 }, { "epoch": 2.6272211027660743, "grad_norm": 0.41922446579264494, "learning_rate": 2.1419588940901394e-05, "loss": 0.7026, "num_tokens": 5478119123.0, "step": 1794 }, { "epoch": 2.6286865726323505, "grad_norm": 0.3672995555651936, "learning_rate": 2.14025242099893e-05, "loss": 0.6952, "num_tokens": 5481086729.0, "step": 1795 }, { "epoch": 2.630152042498626, "grad_norm": 0.4107603458910209, "learning_rate": 2.138546001665258e-05, "loss": 0.711, "num_tokens": 5484401277.0, "step": 1796 }, { "epoch": 2.631617512364902, "grad_norm": 0.37399207291427383, "learning_rate": 2.1368396376244652e-05, "loss": 0.6819, "num_tokens": 5487447872.0, "step": 1797 }, { "epoch": 2.633082982231178, "grad_norm": 0.3861560819726002, "learning_rate": 2.1351333304118415e-05, "loss": 0.6765, "num_tokens": 5490382272.0, "step": 1798 }, { "epoch": 2.634548452097454, "grad_norm": 0.3865982944097399, "learning_rate": 2.133427081562628e-05, "loss": 0.7088, "num_tokens": 5493473157.0, "step": 1799 }, { "epoch": 2.6360139219637295, "grad_norm": 0.3922135141945683, "learning_rate": 2.1317208926120117e-05, "loss": 0.6707, "num_tokens": 5496846991.0, "step": 1800 }, { "epoch": 2.6374793918300057, "grad_norm": 0.4133871026661409, "learning_rate": 2.1300147650951266e-05, "loss": 0.7022, "num_tokens": 5499843470.0, "step": 1801 }, { "epoch": 2.6389448616962814, "grad_norm": 0.39347843996125587, "learning_rate": 2.1283087005470512e-05, "loss": 0.6822, "num_tokens": 5503094514.0, "step": 1802 }, { "epoch": 2.640410331562557, "grad_norm": 0.35761310039194905, "learning_rate": 2.1266027005028062e-05, "loss": 0.6841, "num_tokens": 5506150519.0, "step": 1803 }, { "epoch": 2.6418758014288333, "grad_norm": 0.35598576319078123, "learning_rate": 2.124896766497356e-05, "loss": 0.6694, "num_tokens": 5509417285.0, "step": 1804 }, { "epoch": 2.643341271295109, "grad_norm": 0.4052146642496105, "learning_rate": 2.1231909000656044e-05, "loss": 0.6916, "num_tokens": 5512551490.0, "step": 1805 }, { "epoch": 2.6448067411613847, "grad_norm": 0.3494288417198552, "learning_rate": 2.1214851027423954e-05, "loss": 0.7086, "num_tokens": 5515598834.0, "step": 1806 }, { "epoch": 2.646272211027661, "grad_norm": 0.39636487396000797, "learning_rate": 2.1197793760625097e-05, "loss": 0.6926, "num_tokens": 5518888111.0, "step": 1807 }, { "epoch": 2.6477376808939366, "grad_norm": 0.353916515149785, "learning_rate": 2.118073721560666e-05, "loss": 0.7119, "num_tokens": 5522004678.0, "step": 1808 }, { "epoch": 2.6492031507602123, "grad_norm": 0.3332388046949698, "learning_rate": 2.1163681407715155e-05, "loss": 0.6994, "num_tokens": 5525014561.0, "step": 1809 }, { "epoch": 2.6506686206264884, "grad_norm": 0.3926047733788625, "learning_rate": 2.114662635229646e-05, "loss": 0.68, "num_tokens": 5528249058.0, "step": 1810 }, { "epoch": 2.652134090492764, "grad_norm": 0.32867142274955646, "learning_rate": 2.112957206469575e-05, "loss": 0.7051, "num_tokens": 5531216271.0, "step": 1811 }, { "epoch": 2.65359956035904, "grad_norm": 0.3901645980901081, "learning_rate": 2.1112518560257536e-05, "loss": 0.6957, "num_tokens": 5534103824.0, "step": 1812 }, { "epoch": 2.655065030225316, "grad_norm": 0.3529613171246676, "learning_rate": 2.1095465854325595e-05, "loss": 0.7047, "num_tokens": 5537041946.0, "step": 1813 }, { "epoch": 2.6565305000915918, "grad_norm": 0.4390300160654016, "learning_rate": 2.107841396224301e-05, "loss": 0.7053, "num_tokens": 5540194317.0, "step": 1814 }, { "epoch": 2.6579959699578675, "grad_norm": 0.33388418473388254, "learning_rate": 2.1061362899352113e-05, "loss": 0.6875, "num_tokens": 5543143168.0, "step": 1815 }, { "epoch": 2.6594614398241436, "grad_norm": 0.4324288965771675, "learning_rate": 2.1044312680994517e-05, "loss": 0.697, "num_tokens": 5546325640.0, "step": 1816 }, { "epoch": 2.6609269096904193, "grad_norm": 0.36462957981234934, "learning_rate": 2.1027263322511034e-05, "loss": 0.6901, "num_tokens": 5549427067.0, "step": 1817 }, { "epoch": 2.6623923795566955, "grad_norm": 0.49675386738857186, "learning_rate": 2.101021483924174e-05, "loss": 0.69, "num_tokens": 5552496473.0, "step": 1818 }, { "epoch": 2.663857849422971, "grad_norm": 0.3456516159140831, "learning_rate": 2.09931672465259e-05, "loss": 0.7108, "num_tokens": 5555449379.0, "step": 1819 }, { "epoch": 2.6653233192892474, "grad_norm": 0.44287645498830064, "learning_rate": 2.0976120559702e-05, "loss": 0.6944, "num_tokens": 5558723986.0, "step": 1820 }, { "epoch": 2.666788789155523, "grad_norm": 0.37872291517836526, "learning_rate": 2.0959074794107685e-05, "loss": 0.7063, "num_tokens": 5561709036.0, "step": 1821 }, { "epoch": 2.668254259021799, "grad_norm": 0.40924023003541127, "learning_rate": 2.0942029965079795e-05, "loss": 0.6852, "num_tokens": 5564909277.0, "step": 1822 }, { "epoch": 2.669719728888075, "grad_norm": 0.35888719145064746, "learning_rate": 2.09249860879543e-05, "loss": 0.6826, "num_tokens": 5567730994.0, "step": 1823 }, { "epoch": 2.6711851987543507, "grad_norm": 0.41781673870284436, "learning_rate": 2.0907943178066347e-05, "loss": 0.7062, "num_tokens": 5570652587.0, "step": 1824 }, { "epoch": 2.6726506686206264, "grad_norm": 0.3846080508440028, "learning_rate": 2.0890901250750182e-05, "loss": 0.6894, "num_tokens": 5573812149.0, "step": 1825 }, { "epoch": 2.6741161384869025, "grad_norm": 0.3963136899058783, "learning_rate": 2.0873860321339198e-05, "loss": 0.7051, "num_tokens": 5576743662.0, "step": 1826 }, { "epoch": 2.6755816083531783, "grad_norm": 0.3377951819800646, "learning_rate": 2.085682040516585e-05, "loss": 0.6921, "num_tokens": 5579923593.0, "step": 1827 }, { "epoch": 2.677047078219454, "grad_norm": 0.40164867568003276, "learning_rate": 2.083978151756171e-05, "loss": 0.6812, "num_tokens": 5582995374.0, "step": 1828 }, { "epoch": 2.67851254808573, "grad_norm": 0.37262522384475627, "learning_rate": 2.0822743673857424e-05, "loss": 0.6843, "num_tokens": 5586071446.0, "step": 1829 }, { "epoch": 2.679978017952006, "grad_norm": 0.3707030834020138, "learning_rate": 2.08057068893827e-05, "loss": 0.7068, "num_tokens": 5589461796.0, "step": 1830 }, { "epoch": 2.6814434878182816, "grad_norm": 0.3968189777496934, "learning_rate": 2.078867117946627e-05, "loss": 0.7003, "num_tokens": 5592591289.0, "step": 1831 }, { "epoch": 2.6829089576845577, "grad_norm": 0.36790201680087165, "learning_rate": 2.0771636559435925e-05, "loss": 0.6868, "num_tokens": 5595949016.0, "step": 1832 }, { "epoch": 2.6843744275508334, "grad_norm": 0.33842322862070695, "learning_rate": 2.075460304461846e-05, "loss": 0.7103, "num_tokens": 5599161111.0, "step": 1833 }, { "epoch": 2.685839897417109, "grad_norm": 0.4620805082759224, "learning_rate": 2.0737570650339697e-05, "loss": 0.7053, "num_tokens": 5602338595.0, "step": 1834 }, { "epoch": 2.6873053672833853, "grad_norm": 0.38798298967771305, "learning_rate": 2.0720539391924404e-05, "loss": 0.6827, "num_tokens": 5605594684.0, "step": 1835 }, { "epoch": 2.688770837149661, "grad_norm": 0.41064092560633614, "learning_rate": 2.0703509284696387e-05, "loss": 0.7185, "num_tokens": 5608642184.0, "step": 1836 }, { "epoch": 2.6902363070159367, "grad_norm": 0.3895576422553758, "learning_rate": 2.0686480343978367e-05, "loss": 0.7058, "num_tokens": 5611508375.0, "step": 1837 }, { "epoch": 2.691701776882213, "grad_norm": 0.33863716781512265, "learning_rate": 2.0669452585092045e-05, "loss": 0.6907, "num_tokens": 5614632883.0, "step": 1838 }, { "epoch": 2.6931672467484886, "grad_norm": 0.38078708994935273, "learning_rate": 2.065242602335804e-05, "loss": 0.7108, "num_tokens": 5617714705.0, "step": 1839 }, { "epoch": 2.6946327166147643, "grad_norm": 0.37187279184922245, "learning_rate": 2.0635400674095917e-05, "loss": 0.6899, "num_tokens": 5620987663.0, "step": 1840 }, { "epoch": 2.6960981864810405, "grad_norm": 0.4285827455168878, "learning_rate": 2.0618376552624112e-05, "loss": 0.6825, "num_tokens": 5623945959.0, "step": 1841 }, { "epoch": 2.697563656347316, "grad_norm": 0.3398789045751483, "learning_rate": 2.060135367426e-05, "loss": 0.7081, "num_tokens": 5626818341.0, "step": 1842 }, { "epoch": 2.6990291262135924, "grad_norm": 0.4522142093140537, "learning_rate": 2.0584332054319813e-05, "loss": 0.6995, "num_tokens": 5629863945.0, "step": 1843 }, { "epoch": 2.700494596079868, "grad_norm": 0.3718638179894877, "learning_rate": 2.0567311708118648e-05, "loss": 0.6896, "num_tokens": 5632785397.0, "step": 1844 }, { "epoch": 2.7019600659461442, "grad_norm": 0.47562345054329325, "learning_rate": 2.0550292650970465e-05, "loss": 0.716, "num_tokens": 5635752125.0, "step": 1845 }, { "epoch": 2.70342553581242, "grad_norm": 0.34135866770631185, "learning_rate": 2.0533274898188063e-05, "loss": 0.7054, "num_tokens": 5638668172.0, "step": 1846 }, { "epoch": 2.7048910056786957, "grad_norm": 0.4531980762127052, "learning_rate": 2.0516258465083078e-05, "loss": 0.6767, "num_tokens": 5641784699.0, "step": 1847 }, { "epoch": 2.706356475544972, "grad_norm": 0.3765670349169559, "learning_rate": 2.049924336696592e-05, "loss": 0.6985, "num_tokens": 5644911110.0, "step": 1848 }, { "epoch": 2.7078219454112475, "grad_norm": 0.4503306914335617, "learning_rate": 2.048222961914585e-05, "loss": 0.7102, "num_tokens": 5647943900.0, "step": 1849 }, { "epoch": 2.7092874152775233, "grad_norm": 0.4077906759209094, "learning_rate": 2.0465217236930872e-05, "loss": 0.6954, "num_tokens": 5650905017.0, "step": 1850 }, { "epoch": 2.7107528851437994, "grad_norm": 0.39058169235816986, "learning_rate": 2.0448206235627793e-05, "loss": 0.6723, "num_tokens": 5654030374.0, "step": 1851 }, { "epoch": 2.712218355010075, "grad_norm": 0.35262572872567305, "learning_rate": 2.0431196630542152e-05, "loss": 0.6993, "num_tokens": 5656926120.0, "step": 1852 }, { "epoch": 2.713683824876351, "grad_norm": 0.4320281864467972, "learning_rate": 2.041418843697826e-05, "loss": 0.7013, "num_tokens": 5660071546.0, "step": 1853 }, { "epoch": 2.715149294742627, "grad_norm": 0.34058345320231265, "learning_rate": 2.039718167023911e-05, "loss": 0.6831, "num_tokens": 5663124417.0, "step": 1854 }, { "epoch": 2.7166147646089027, "grad_norm": 0.42107414267544746, "learning_rate": 2.0380176345626464e-05, "loss": 0.7094, "num_tokens": 5666231747.0, "step": 1855 }, { "epoch": 2.7180802344751784, "grad_norm": 0.34553029213117037, "learning_rate": 2.0363172478440755e-05, "loss": 0.6903, "num_tokens": 5669198435.0, "step": 1856 }, { "epoch": 2.7195457043414546, "grad_norm": 0.47405852912933555, "learning_rate": 2.0346170083981128e-05, "loss": 0.7004, "num_tokens": 5672072496.0, "step": 1857 }, { "epoch": 2.7210111742077303, "grad_norm": 0.37886933445204674, "learning_rate": 2.032916917754537e-05, "loss": 0.6853, "num_tokens": 5675283184.0, "step": 1858 }, { "epoch": 2.722476644074006, "grad_norm": 0.4496114130762444, "learning_rate": 2.0312169774429962e-05, "loss": 0.6776, "num_tokens": 5678586334.0, "step": 1859 }, { "epoch": 2.723942113940282, "grad_norm": 0.4045274483148738, "learning_rate": 2.029517188993e-05, "loss": 0.7075, "num_tokens": 5681679719.0, "step": 1860 }, { "epoch": 2.725407583806558, "grad_norm": 0.46062080862925814, "learning_rate": 2.0278175539339268e-05, "loss": 0.6849, "num_tokens": 5684783448.0, "step": 1861 }, { "epoch": 2.7268730536728336, "grad_norm": 0.3912130015570455, "learning_rate": 2.0261180737950093e-05, "loss": 0.6839, "num_tokens": 5687697997.0, "step": 1862 }, { "epoch": 2.7283385235391098, "grad_norm": 0.4263455334127441, "learning_rate": 2.0244187501053473e-05, "loss": 0.6986, "num_tokens": 5690641954.0, "step": 1863 }, { "epoch": 2.7298039934053855, "grad_norm": 0.42211758936665417, "learning_rate": 2.0227195843938964e-05, "loss": 0.7131, "num_tokens": 5693682210.0, "step": 1864 }, { "epoch": 2.731269463271661, "grad_norm": 0.42200797281752933, "learning_rate": 2.021020578189472e-05, "loss": 0.7104, "num_tokens": 5696710995.0, "step": 1865 }, { "epoch": 2.7327349331379374, "grad_norm": 0.4083485467131711, "learning_rate": 2.0193217330207444e-05, "loss": 0.6877, "num_tokens": 5699915350.0, "step": 1866 }, { "epoch": 2.734200403004213, "grad_norm": 0.4368797719595614, "learning_rate": 2.0176230504162406e-05, "loss": 0.7094, "num_tokens": 5702878393.0, "step": 1867 }, { "epoch": 2.7356658728704892, "grad_norm": 0.35656118083370464, "learning_rate": 2.0159245319043384e-05, "loss": 0.697, "num_tokens": 5705795546.0, "step": 1868 }, { "epoch": 2.737131342736765, "grad_norm": 0.3790384057967855, "learning_rate": 2.014226179013272e-05, "loss": 0.679, "num_tokens": 5708817354.0, "step": 1869 }, { "epoch": 2.738596812603041, "grad_norm": 0.3697500515685886, "learning_rate": 2.0125279932711236e-05, "loss": 0.6821, "num_tokens": 5711896696.0, "step": 1870 }, { "epoch": 2.740062282469317, "grad_norm": 0.3864257412733465, "learning_rate": 2.0108299762058272e-05, "loss": 0.7067, "num_tokens": 5715207251.0, "step": 1871 }, { "epoch": 2.7415277523355925, "grad_norm": 0.3450370998987545, "learning_rate": 2.009132129345162e-05, "loss": 0.6932, "num_tokens": 5718113176.0, "step": 1872 }, { "epoch": 2.7429932222018687, "grad_norm": 0.4143403147958288, "learning_rate": 2.007434454216757e-05, "loss": 0.6894, "num_tokens": 5721111377.0, "step": 1873 }, { "epoch": 2.7444586920681444, "grad_norm": 0.36533215122779955, "learning_rate": 2.0057369523480846e-05, "loss": 0.6892, "num_tokens": 5724135704.0, "step": 1874 }, { "epoch": 2.74592416193442, "grad_norm": 0.43584583309456276, "learning_rate": 2.0040396252664644e-05, "loss": 0.6832, "num_tokens": 5727201969.0, "step": 1875 }, { "epoch": 2.7473896318006963, "grad_norm": 0.37863836415657054, "learning_rate": 2.002342474499054e-05, "loss": 0.6821, "num_tokens": 5730277975.0, "step": 1876 }, { "epoch": 2.748855101666972, "grad_norm": 0.40322304847383567, "learning_rate": 2.0006455015728564e-05, "loss": 0.7056, "num_tokens": 5733137079.0, "step": 1877 }, { "epoch": 2.7503205715332477, "grad_norm": 0.35187076225357466, "learning_rate": 1.998948708014712e-05, "loss": 0.689, "num_tokens": 5736379579.0, "step": 1878 }, { "epoch": 2.751786041399524, "grad_norm": 0.4319404264863892, "learning_rate": 1.997252095351303e-05, "loss": 0.6988, "num_tokens": 5739469536.0, "step": 1879 }, { "epoch": 2.7532515112657996, "grad_norm": 0.38328696671974516, "learning_rate": 1.9955556651091454e-05, "loss": 0.6815, "num_tokens": 5742433345.0, "step": 1880 }, { "epoch": 2.7547169811320753, "grad_norm": 0.45781096018396555, "learning_rate": 1.993859418814593e-05, "loss": 0.7084, "num_tokens": 5745325060.0, "step": 1881 }, { "epoch": 2.7561824509983515, "grad_norm": 0.3699073595019125, "learning_rate": 1.9921633579938322e-05, "loss": 0.6812, "num_tokens": 5748362798.0, "step": 1882 }, { "epoch": 2.757647920864627, "grad_norm": 0.41551368502428265, "learning_rate": 1.9904674841728856e-05, "loss": 0.7089, "num_tokens": 5751412711.0, "step": 1883 }, { "epoch": 2.759113390730903, "grad_norm": 0.37668531385344006, "learning_rate": 1.988771798877605e-05, "loss": 0.6855, "num_tokens": 5754600002.0, "step": 1884 }, { "epoch": 2.760578860597179, "grad_norm": 0.4048852438805406, "learning_rate": 1.9870763036336743e-05, "loss": 0.7192, "num_tokens": 5757610235.0, "step": 1885 }, { "epoch": 2.7620443304634548, "grad_norm": 0.39428828621174894, "learning_rate": 1.9853809999666046e-05, "loss": 0.6845, "num_tokens": 5760854231.0, "step": 1886 }, { "epoch": 2.7635098003297305, "grad_norm": 0.4023068092442269, "learning_rate": 1.983685889401735e-05, "loss": 0.6885, "num_tokens": 5763905463.0, "step": 1887 }, { "epoch": 2.7649752701960066, "grad_norm": 0.3736588372317174, "learning_rate": 1.9819909734642332e-05, "loss": 0.7003, "num_tokens": 5766912652.0, "step": 1888 }, { "epoch": 2.7664407400622824, "grad_norm": 0.4326364510277143, "learning_rate": 1.980296253679088e-05, "loss": 0.6911, "num_tokens": 5769856498.0, "step": 1889 }, { "epoch": 2.767906209928558, "grad_norm": 0.37565820521286686, "learning_rate": 1.9786017315711158e-05, "loss": 0.6775, "num_tokens": 5772896066.0, "step": 1890 }, { "epoch": 2.7693716797948342, "grad_norm": 0.43593184614905583, "learning_rate": 1.9769074086649508e-05, "loss": 0.705, "num_tokens": 5775979870.0, "step": 1891 }, { "epoch": 2.77083714966111, "grad_norm": 0.35682828413699147, "learning_rate": 1.975213286485052e-05, "loss": 0.6851, "num_tokens": 5778932355.0, "step": 1892 }, { "epoch": 2.772302619527386, "grad_norm": 0.38432362196373404, "learning_rate": 1.973519366555694e-05, "loss": 0.6881, "num_tokens": 5782202904.0, "step": 1893 }, { "epoch": 2.773768089393662, "grad_norm": 0.35040578237667497, "learning_rate": 1.9718256504009744e-05, "loss": 0.6661, "num_tokens": 5785224805.0, "step": 1894 }, { "epoch": 2.775233559259938, "grad_norm": 0.37705310585506896, "learning_rate": 1.9701321395448022e-05, "loss": 0.6659, "num_tokens": 5788305409.0, "step": 1895 }, { "epoch": 2.7766990291262137, "grad_norm": 0.36256851603745943, "learning_rate": 1.9684388355109044e-05, "loss": 0.6916, "num_tokens": 5791701268.0, "step": 1896 }, { "epoch": 2.7781644989924894, "grad_norm": 0.40394284613345866, "learning_rate": 1.9667457398228214e-05, "loss": 0.6936, "num_tokens": 5794655552.0, "step": 1897 }, { "epoch": 2.7796299688587656, "grad_norm": 0.3346330967971263, "learning_rate": 1.9650528540039077e-05, "loss": 0.6799, "num_tokens": 5797605044.0, "step": 1898 }, { "epoch": 2.7810954387250413, "grad_norm": 0.43005260249305177, "learning_rate": 1.9633601795773255e-05, "loss": 0.6869, "num_tokens": 5800634529.0, "step": 1899 }, { "epoch": 2.782560908591317, "grad_norm": 0.3601654604379806, "learning_rate": 1.9616677180660498e-05, "loss": 0.6967, "num_tokens": 5803531440.0, "step": 1900 }, { "epoch": 2.784026378457593, "grad_norm": 0.37389592472348404, "learning_rate": 1.9599754709928626e-05, "loss": 0.6903, "num_tokens": 5806750210.0, "step": 1901 }, { "epoch": 2.785491848323869, "grad_norm": 0.3291190453167584, "learning_rate": 1.9582834398803543e-05, "loss": 0.6988, "num_tokens": 5809871378.0, "step": 1902 }, { "epoch": 2.7869573181901446, "grad_norm": 0.4093555448024012, "learning_rate": 1.9565916262509187e-05, "loss": 0.6841, "num_tokens": 5812906445.0, "step": 1903 }, { "epoch": 2.7884227880564207, "grad_norm": 0.33756077789977046, "learning_rate": 1.954900031626757e-05, "loss": 0.6983, "num_tokens": 5816048697.0, "step": 1904 }, { "epoch": 2.7898882579226965, "grad_norm": 0.40976163975175744, "learning_rate": 1.9532086575298693e-05, "loss": 0.685, "num_tokens": 5819070232.0, "step": 1905 }, { "epoch": 2.791353727788972, "grad_norm": 0.35350587358661595, "learning_rate": 1.9515175054820627e-05, "loss": 0.6828, "num_tokens": 5822151339.0, "step": 1906 }, { "epoch": 2.7928191976552483, "grad_norm": 0.36815085340035886, "learning_rate": 1.949826577004938e-05, "loss": 0.6746, "num_tokens": 5824986913.0, "step": 1907 }, { "epoch": 2.794284667521524, "grad_norm": 0.35416890338570767, "learning_rate": 1.9481358736199013e-05, "loss": 0.6866, "num_tokens": 5828083147.0, "step": 1908 }, { "epoch": 2.7957501373877998, "grad_norm": 0.4006922627003764, "learning_rate": 1.946445396848151e-05, "loss": 0.6878, "num_tokens": 5830979362.0, "step": 1909 }, { "epoch": 2.797215607254076, "grad_norm": 0.32327354560521077, "learning_rate": 1.9447551482106853e-05, "loss": 0.6986, "num_tokens": 5834092904.0, "step": 1910 }, { "epoch": 2.7986810771203516, "grad_norm": 0.3705055243280932, "learning_rate": 1.943065129228295e-05, "loss": 0.6937, "num_tokens": 5837060766.0, "step": 1911 }, { "epoch": 2.8001465469866273, "grad_norm": 0.38551516967273763, "learning_rate": 1.9413753414215657e-05, "loss": 0.7102, "num_tokens": 5840018691.0, "step": 1912 }, { "epoch": 2.8016120168529035, "grad_norm": 0.35223400051176984, "learning_rate": 1.9396857863108724e-05, "loss": 0.7185, "num_tokens": 5842919140.0, "step": 1913 }, { "epoch": 2.8030774867191792, "grad_norm": 0.35570064466087653, "learning_rate": 1.937996465416384e-05, "loss": 0.6994, "num_tokens": 5845823397.0, "step": 1914 }, { "epoch": 2.804542956585455, "grad_norm": 0.3761013512147621, "learning_rate": 1.9363073802580557e-05, "loss": 0.7083, "num_tokens": 5848883525.0, "step": 1915 }, { "epoch": 2.806008426451731, "grad_norm": 0.363491056932675, "learning_rate": 1.9346185323556344e-05, "loss": 0.6864, "num_tokens": 5852063416.0, "step": 1916 }, { "epoch": 2.807473896318007, "grad_norm": 0.3532499632175771, "learning_rate": 1.932929923228649e-05, "loss": 0.6975, "num_tokens": 5855151078.0, "step": 1917 }, { "epoch": 2.808939366184283, "grad_norm": 0.346419028538593, "learning_rate": 1.931241554396417e-05, "loss": 0.7022, "num_tokens": 5858223609.0, "step": 1918 }, { "epoch": 2.8104048360505587, "grad_norm": 0.3598536867518936, "learning_rate": 1.9295534273780373e-05, "loss": 0.6932, "num_tokens": 5861434296.0, "step": 1919 }, { "epoch": 2.811870305916835, "grad_norm": 0.33538570146648855, "learning_rate": 1.9278655436923944e-05, "loss": 0.6935, "num_tokens": 5864616857.0, "step": 1920 }, { "epoch": 2.8133357757831106, "grad_norm": 0.3407397523661818, "learning_rate": 1.92617790485815e-05, "loss": 0.7038, "num_tokens": 5867683769.0, "step": 1921 }, { "epoch": 2.8148012456493863, "grad_norm": 0.3474651121837004, "learning_rate": 1.9244905123937484e-05, "loss": 0.6706, "num_tokens": 5870808683.0, "step": 1922 }, { "epoch": 2.8162667155156624, "grad_norm": 0.3535642898357413, "learning_rate": 1.9228033678174107e-05, "loss": 0.668, "num_tokens": 5874097110.0, "step": 1923 }, { "epoch": 2.817732185381938, "grad_norm": 0.40673062592993187, "learning_rate": 1.921116472647135e-05, "loss": 0.6682, "num_tokens": 5877249775.0, "step": 1924 }, { "epoch": 2.819197655248214, "grad_norm": 0.3230262762962952, "learning_rate": 1.9194298284006964e-05, "loss": 0.6922, "num_tokens": 5880316687.0, "step": 1925 }, { "epoch": 2.82066312511449, "grad_norm": 0.4042774708173968, "learning_rate": 1.917743436595643e-05, "loss": 0.6857, "num_tokens": 5883403526.0, "step": 1926 }, { "epoch": 2.8221285949807657, "grad_norm": 0.3053629983996727, "learning_rate": 1.9160572987492966e-05, "loss": 0.7041, "num_tokens": 5886376312.0, "step": 1927 }, { "epoch": 2.8235940648470415, "grad_norm": 0.37037867053268025, "learning_rate": 1.9143714163787477e-05, "loss": 0.6912, "num_tokens": 5889510427.0, "step": 1928 }, { "epoch": 2.8250595347133176, "grad_norm": 0.36391403257767957, "learning_rate": 1.912685791000862e-05, "loss": 0.695, "num_tokens": 5892878037.0, "step": 1929 }, { "epoch": 2.8265250045795933, "grad_norm": 0.35445728795796444, "learning_rate": 1.9110004241322695e-05, "loss": 0.6884, "num_tokens": 5895947896.0, "step": 1930 }, { "epoch": 2.827990474445869, "grad_norm": 0.3449886754584593, "learning_rate": 1.90931531728937e-05, "loss": 0.7061, "num_tokens": 5898861553.0, "step": 1931 }, { "epoch": 2.829455944312145, "grad_norm": 0.3570554916034151, "learning_rate": 1.9076304719883273e-05, "loss": 0.6845, "num_tokens": 5901914756.0, "step": 1932 }, { "epoch": 2.830921414178421, "grad_norm": 0.35699459082912244, "learning_rate": 1.9059458897450738e-05, "loss": 0.6829, "num_tokens": 5904657656.0, "step": 1933 }, { "epoch": 2.8323868840446966, "grad_norm": 0.3651182972677928, "learning_rate": 1.9042615720752997e-05, "loss": 0.6986, "num_tokens": 5907802966.0, "step": 1934 }, { "epoch": 2.833852353910973, "grad_norm": 0.3402541539487093, "learning_rate": 1.9025775204944617e-05, "loss": 0.6946, "num_tokens": 5910862409.0, "step": 1935 }, { "epoch": 2.8353178237772485, "grad_norm": 0.33018596619363494, "learning_rate": 1.9008937365177752e-05, "loss": 0.6739, "num_tokens": 5913906448.0, "step": 1936 }, { "epoch": 2.836783293643524, "grad_norm": 0.3910154366124698, "learning_rate": 1.8992102216602153e-05, "loss": 0.7064, "num_tokens": 5917064374.0, "step": 1937 }, { "epoch": 2.8382487635098004, "grad_norm": 0.37557458167084157, "learning_rate": 1.8975269774365133e-05, "loss": 0.7019, "num_tokens": 5920305901.0, "step": 1938 }, { "epoch": 2.839714233376076, "grad_norm": 0.32694739275000034, "learning_rate": 1.895844005361161e-05, "loss": 0.6813, "num_tokens": 5923514490.0, "step": 1939 }, { "epoch": 2.841179703242352, "grad_norm": 0.35576440389585745, "learning_rate": 1.8941613069484e-05, "loss": 0.6651, "num_tokens": 5926727802.0, "step": 1940 }, { "epoch": 2.842645173108628, "grad_norm": 0.3168482585406132, "learning_rate": 1.892478883712231e-05, "loss": 0.6801, "num_tokens": 5929957229.0, "step": 1941 }, { "epoch": 2.8441106429749037, "grad_norm": 0.3821920714964589, "learning_rate": 1.8907967371664023e-05, "loss": 0.6867, "num_tokens": 5932848511.0, "step": 1942 }, { "epoch": 2.84557611284118, "grad_norm": 0.392316780120927, "learning_rate": 1.889114868824417e-05, "loss": 0.689, "num_tokens": 5935814222.0, "step": 1943 }, { "epoch": 2.8470415827074556, "grad_norm": 0.3658397844712153, "learning_rate": 1.8874332801995258e-05, "loss": 0.701, "num_tokens": 5938491297.0, "step": 1944 }, { "epoch": 2.8485070525737317, "grad_norm": 0.38956888785417565, "learning_rate": 1.8857519728047284e-05, "loss": 0.6845, "num_tokens": 5941700614.0, "step": 1945 }, { "epoch": 2.8499725224400074, "grad_norm": 0.34260603257937944, "learning_rate": 1.884070948152771e-05, "loss": 0.7164, "num_tokens": 5944705336.0, "step": 1946 }, { "epoch": 2.851437992306283, "grad_norm": 0.32792243192687826, "learning_rate": 1.8823902077561472e-05, "loss": 0.6882, "num_tokens": 5947620870.0, "step": 1947 }, { "epoch": 2.8529034621725593, "grad_norm": 0.37292528363446426, "learning_rate": 1.880709753127091e-05, "loss": 0.6987, "num_tokens": 5950601305.0, "step": 1948 }, { "epoch": 2.854368932038835, "grad_norm": 0.2923390203809084, "learning_rate": 1.8790295857775844e-05, "loss": 0.7039, "num_tokens": 5953704643.0, "step": 1949 }, { "epoch": 2.8558344019051107, "grad_norm": 0.3581424940987141, "learning_rate": 1.877349707219346e-05, "loss": 0.6787, "num_tokens": 5956684637.0, "step": 1950 }, { "epoch": 2.857299871771387, "grad_norm": 0.3570182333752147, "learning_rate": 1.8756701189638384e-05, "loss": 0.6913, "num_tokens": 5959658750.0, "step": 1951 }, { "epoch": 2.8587653416376626, "grad_norm": 0.34086946700270954, "learning_rate": 1.87399082252226e-05, "loss": 0.6794, "num_tokens": 5962764476.0, "step": 1952 }, { "epoch": 2.8602308115039383, "grad_norm": 0.3629568152454884, "learning_rate": 1.8723118194055492e-05, "loss": 0.6734, "num_tokens": 5965903691.0, "step": 1953 }, { "epoch": 2.8616962813702145, "grad_norm": 0.35540443474987515, "learning_rate": 1.8706331111243784e-05, "loss": 0.6666, "num_tokens": 5969137440.0, "step": 1954 }, { "epoch": 2.86316175123649, "grad_norm": 0.34529871490478364, "learning_rate": 1.8689546991891563e-05, "loss": 0.6929, "num_tokens": 5972245303.0, "step": 1955 }, { "epoch": 2.864627221102766, "grad_norm": 0.3537956442922439, "learning_rate": 1.8672765851100246e-05, "loss": 0.6868, "num_tokens": 5975304556.0, "step": 1956 }, { "epoch": 2.866092690969042, "grad_norm": 0.3412992197419074, "learning_rate": 1.8655987703968564e-05, "loss": 0.6976, "num_tokens": 5978383642.0, "step": 1957 }, { "epoch": 2.867558160835318, "grad_norm": 0.3625105789656368, "learning_rate": 1.8639212565592548e-05, "loss": 0.688, "num_tokens": 5981742498.0, "step": 1958 }, { "epoch": 2.8690236307015935, "grad_norm": 0.35385510737549086, "learning_rate": 1.8622440451065545e-05, "loss": 0.7053, "num_tokens": 5984854190.0, "step": 1959 }, { "epoch": 2.8704891005678697, "grad_norm": 0.36185960735870565, "learning_rate": 1.860567137547816e-05, "loss": 0.6856, "num_tokens": 5987966447.0, "step": 1960 }, { "epoch": 2.8719545704341454, "grad_norm": 0.333693003472386, "learning_rate": 1.8588905353918275e-05, "loss": 0.6979, "num_tokens": 5991227331.0, "step": 1961 }, { "epoch": 2.873420040300421, "grad_norm": 0.37576610367029845, "learning_rate": 1.8572142401471018e-05, "loss": 0.6902, "num_tokens": 5994234249.0, "step": 1962 }, { "epoch": 2.8748855101666972, "grad_norm": 0.3426411176108004, "learning_rate": 1.8555382533218763e-05, "loss": 0.6835, "num_tokens": 5997329850.0, "step": 1963 }, { "epoch": 2.876350980032973, "grad_norm": 0.3663376674511217, "learning_rate": 1.8538625764241095e-05, "loss": 0.6777, "num_tokens": 6000224490.0, "step": 1964 }, { "epoch": 2.8778164498992487, "grad_norm": 0.33963546920435034, "learning_rate": 1.8521872109614835e-05, "loss": 0.6739, "num_tokens": 6003499083.0, "step": 1965 }, { "epoch": 2.879281919765525, "grad_norm": 0.3559222916732817, "learning_rate": 1.8505121584413964e-05, "loss": 0.6932, "num_tokens": 6006607123.0, "step": 1966 }, { "epoch": 2.8807473896318005, "grad_norm": 0.37887861916783216, "learning_rate": 1.8488374203709692e-05, "loss": 0.7119, "num_tokens": 6009692730.0, "step": 1967 }, { "epoch": 2.8822128594980767, "grad_norm": 0.3509608164424842, "learning_rate": 1.8471629982570376e-05, "loss": 0.702, "num_tokens": 6013038845.0, "step": 1968 }, { "epoch": 2.8836783293643524, "grad_norm": 0.3841466536592505, "learning_rate": 1.845488893606151e-05, "loss": 0.6878, "num_tokens": 6015935749.0, "step": 1969 }, { "epoch": 2.8851437992306286, "grad_norm": 0.36735969129977003, "learning_rate": 1.8438151079245775e-05, "loss": 0.6906, "num_tokens": 6018830876.0, "step": 1970 }, { "epoch": 2.8866092690969043, "grad_norm": 0.38382536465969336, "learning_rate": 1.8421416427182954e-05, "loss": 0.6766, "num_tokens": 6022137368.0, "step": 1971 }, { "epoch": 2.88807473896318, "grad_norm": 0.33074056182233375, "learning_rate": 1.8404684994929955e-05, "loss": 0.7241, "num_tokens": 6025242862.0, "step": 1972 }, { "epoch": 2.889540208829456, "grad_norm": 0.46333707425506526, "learning_rate": 1.838795679754078e-05, "loss": 0.7069, "num_tokens": 6028378641.0, "step": 1973 }, { "epoch": 2.891005678695732, "grad_norm": 0.4007889857057646, "learning_rate": 1.837123185006654e-05, "loss": 0.699, "num_tokens": 6031696806.0, "step": 1974 }, { "epoch": 2.8924711485620076, "grad_norm": 0.41913592586509835, "learning_rate": 1.8354510167555398e-05, "loss": 0.6846, "num_tokens": 6034870308.0, "step": 1975 }, { "epoch": 2.8939366184282838, "grad_norm": 0.4263626095747065, "learning_rate": 1.8337791765052597e-05, "loss": 0.6857, "num_tokens": 6037896141.0, "step": 1976 }, { "epoch": 2.8954020882945595, "grad_norm": 0.42329805523919944, "learning_rate": 1.8321076657600416e-05, "loss": 0.6789, "num_tokens": 6040892132.0, "step": 1977 }, { "epoch": 2.896867558160835, "grad_norm": 0.4092090936670306, "learning_rate": 1.8304364860238193e-05, "loss": 0.6651, "num_tokens": 6043928449.0, "step": 1978 }, { "epoch": 2.8983330280271113, "grad_norm": 0.3751279328037842, "learning_rate": 1.8287656388002246e-05, "loss": 0.6877, "num_tokens": 6046908766.0, "step": 1979 }, { "epoch": 2.899798497893387, "grad_norm": 0.38055453401862305, "learning_rate": 1.8270951255925948e-05, "loss": 0.7023, "num_tokens": 6050153868.0, "step": 1980 }, { "epoch": 2.9012639677596628, "grad_norm": 0.312525578205032, "learning_rate": 1.8254249479039625e-05, "loss": 0.6967, "num_tokens": 6053352635.0, "step": 1981 }, { "epoch": 2.902729437625939, "grad_norm": 0.38618325070426335, "learning_rate": 1.8237551072370626e-05, "loss": 0.6889, "num_tokens": 6056252968.0, "step": 1982 }, { "epoch": 2.9041949074922147, "grad_norm": 0.339837013539051, "learning_rate": 1.8220856050943224e-05, "loss": 0.6977, "num_tokens": 6059508239.0, "step": 1983 }, { "epoch": 2.9056603773584904, "grad_norm": 0.34698121123570486, "learning_rate": 1.8204164429778686e-05, "loss": 0.693, "num_tokens": 6062613842.0, "step": 1984 }, { "epoch": 2.9071258472247665, "grad_norm": 0.38807600758497707, "learning_rate": 1.8187476223895185e-05, "loss": 0.6826, "num_tokens": 6065697777.0, "step": 1985 }, { "epoch": 2.9085913170910422, "grad_norm": 0.3276864806259998, "learning_rate": 1.8170791448307848e-05, "loss": 0.7038, "num_tokens": 6068962297.0, "step": 1986 }, { "epoch": 2.910056786957318, "grad_norm": 0.40051535514351794, "learning_rate": 1.8154110118028705e-05, "loss": 0.7096, "num_tokens": 6072009212.0, "step": 1987 }, { "epoch": 2.911522256823594, "grad_norm": 0.3323727306371662, "learning_rate": 1.813743224806669e-05, "loss": 0.7101, "num_tokens": 6075007107.0, "step": 1988 }, { "epoch": 2.91298772668987, "grad_norm": 0.4334141586499341, "learning_rate": 1.8120757853427606e-05, "loss": 0.7052, "num_tokens": 6078205322.0, "step": 1989 }, { "epoch": 2.9144531965561455, "grad_norm": 0.3140357045600773, "learning_rate": 1.810408694911415e-05, "loss": 0.7002, "num_tokens": 6081104034.0, "step": 1990 }, { "epoch": 2.9159186664224217, "grad_norm": 0.38700248973075596, "learning_rate": 1.8087419550125874e-05, "loss": 0.6748, "num_tokens": 6083958702.0, "step": 1991 }, { "epoch": 2.9173841362886974, "grad_norm": 0.3490434040026488, "learning_rate": 1.8070755671459175e-05, "loss": 0.6891, "num_tokens": 6087077720.0, "step": 1992 }, { "epoch": 2.9188496061549736, "grad_norm": 0.3498098134972655, "learning_rate": 1.8054095328107272e-05, "loss": 0.6696, "num_tokens": 6090072360.0, "step": 1993 }, { "epoch": 2.9203150760212493, "grad_norm": 0.3949658650311826, "learning_rate": 1.8037438535060225e-05, "loss": 0.6876, "num_tokens": 6092950922.0, "step": 1994 }, { "epoch": 2.9217805458875254, "grad_norm": 0.3161783163196648, "learning_rate": 1.8020785307304872e-05, "loss": 0.7002, "num_tokens": 6095918763.0, "step": 1995 }, { "epoch": 2.923246015753801, "grad_norm": 0.38556120222194146, "learning_rate": 1.8004135659824875e-05, "loss": 0.6808, "num_tokens": 6098822898.0, "step": 1996 }, { "epoch": 2.924711485620077, "grad_norm": 0.3587199835861605, "learning_rate": 1.798748960760064e-05, "loss": 0.6744, "num_tokens": 6101945080.0, "step": 1997 }, { "epoch": 2.926176955486353, "grad_norm": 0.3271502966881546, "learning_rate": 1.7970847165609374e-05, "loss": 0.6721, "num_tokens": 6104777350.0, "step": 1998 }, { "epoch": 2.9276424253526288, "grad_norm": 0.4228446267314715, "learning_rate": 1.7954208348824997e-05, "loss": 0.6942, "num_tokens": 6107646001.0, "step": 1999 }, { "epoch": 2.9291078952189045, "grad_norm": 0.3133220781095222, "learning_rate": 1.7937573172218212e-05, "loss": 0.6926, "num_tokens": 6110615904.0, "step": 2000 }, { "epoch": 2.9305733650851806, "grad_norm": 0.3775104048144457, "learning_rate": 1.79209416507564e-05, "loss": 0.6886, "num_tokens": 6113656397.0, "step": 2001 }, { "epoch": 2.9320388349514563, "grad_norm": 0.35694742680102304, "learning_rate": 1.7904313799403698e-05, "loss": 0.6917, "num_tokens": 6116749959.0, "step": 2002 }, { "epoch": 2.933504304817732, "grad_norm": 0.40851525346760326, "learning_rate": 1.78876896331209e-05, "loss": 0.708, "num_tokens": 6119604315.0, "step": 2003 }, { "epoch": 2.934969774684008, "grad_norm": 0.3761625313848583, "learning_rate": 1.787106916686551e-05, "loss": 0.6972, "num_tokens": 6122878239.0, "step": 2004 }, { "epoch": 2.936435244550284, "grad_norm": 0.38867963173723463, "learning_rate": 1.78544524155917e-05, "loss": 0.7012, "num_tokens": 6125973777.0, "step": 2005 }, { "epoch": 2.9379007144165596, "grad_norm": 0.34962605390690465, "learning_rate": 1.783783939425029e-05, "loss": 0.7014, "num_tokens": 6129142680.0, "step": 2006 }, { "epoch": 2.939366184282836, "grad_norm": 0.3546333441676004, "learning_rate": 1.7821230117788754e-05, "loss": 0.6779, "num_tokens": 6132180228.0, "step": 2007 }, { "epoch": 2.9408316541491115, "grad_norm": 0.3276804886197234, "learning_rate": 1.7804624601151186e-05, "loss": 0.7016, "num_tokens": 6135365645.0, "step": 2008 }, { "epoch": 2.9422971240153872, "grad_norm": 0.38118667908784354, "learning_rate": 1.778802285927831e-05, "loss": 0.7014, "num_tokens": 6138301537.0, "step": 2009 }, { "epoch": 2.9437625938816634, "grad_norm": 0.3030655580025739, "learning_rate": 1.777142490710744e-05, "loss": 0.6984, "num_tokens": 6141476876.0, "step": 2010 }, { "epoch": 2.945228063747939, "grad_norm": 0.3238988627748957, "learning_rate": 1.77548307595725e-05, "loss": 0.6728, "num_tokens": 6144538754.0, "step": 2011 }, { "epoch": 2.946693533614215, "grad_norm": 0.35759059886743016, "learning_rate": 1.7738240431603963e-05, "loss": 0.6748, "num_tokens": 6147628620.0, "step": 2012 }, { "epoch": 2.948159003480491, "grad_norm": 0.35907742703413315, "learning_rate": 1.772165393812889e-05, "loss": 0.6958, "num_tokens": 6150645389.0, "step": 2013 }, { "epoch": 2.9496244733467667, "grad_norm": 0.30403079460782306, "learning_rate": 1.770507129407087e-05, "loss": 0.6859, "num_tokens": 6153921611.0, "step": 2014 }, { "epoch": 2.9510899432130424, "grad_norm": 0.37598170318169066, "learning_rate": 1.7688492514350068e-05, "loss": 0.6762, "num_tokens": 6156983485.0, "step": 2015 }, { "epoch": 2.9525554130793186, "grad_norm": 0.31697226414770346, "learning_rate": 1.7671917613883113e-05, "loss": 0.6914, "num_tokens": 6160061482.0, "step": 2016 }, { "epoch": 2.9540208829455943, "grad_norm": 0.37827740261772685, "learning_rate": 1.7655346607583194e-05, "loss": 0.6735, "num_tokens": 6163266924.0, "step": 2017 }, { "epoch": 2.9554863528118704, "grad_norm": 0.33509526288167346, "learning_rate": 1.763877951035997e-05, "loss": 0.701, "num_tokens": 6166246755.0, "step": 2018 }, { "epoch": 2.956951822678146, "grad_norm": 0.37339587042726974, "learning_rate": 1.7622216337119603e-05, "loss": 0.7046, "num_tokens": 6169065787.0, "step": 2019 }, { "epoch": 2.9584172925444223, "grad_norm": 0.34883979666738685, "learning_rate": 1.7605657102764696e-05, "loss": 0.6978, "num_tokens": 6172068175.0, "step": 2020 }, { "epoch": 2.959882762410698, "grad_norm": 0.36818699995555954, "learning_rate": 1.7589101822194333e-05, "loss": 0.6866, "num_tokens": 6175135136.0, "step": 2021 }, { "epoch": 2.9613482322769737, "grad_norm": 0.3541876725547167, "learning_rate": 1.7572550510304023e-05, "loss": 0.6726, "num_tokens": 6178472990.0, "step": 2022 }, { "epoch": 2.96281370214325, "grad_norm": 0.3502493600297295, "learning_rate": 1.7556003181985738e-05, "loss": 0.6948, "num_tokens": 6181375729.0, "step": 2023 }, { "epoch": 2.9642791720095256, "grad_norm": 0.3074632576421066, "learning_rate": 1.7539459852127808e-05, "loss": 0.6878, "num_tokens": 6184304808.0, "step": 2024 }, { "epoch": 2.9657446418758013, "grad_norm": 0.35466937291046186, "learning_rate": 1.7522920535615012e-05, "loss": 0.6838, "num_tokens": 6187334838.0, "step": 2025 }, { "epoch": 2.9672101117420775, "grad_norm": 0.3562676360141291, "learning_rate": 1.7506385247328506e-05, "loss": 0.6932, "num_tokens": 6190255504.0, "step": 2026 }, { "epoch": 2.968675581608353, "grad_norm": 0.303503744960109, "learning_rate": 1.7489854002145827e-05, "loss": 0.6974, "num_tokens": 6193288467.0, "step": 2027 }, { "epoch": 2.970141051474629, "grad_norm": 0.32562598526387065, "learning_rate": 1.747332681494085e-05, "loss": 0.6909, "num_tokens": 6196569499.0, "step": 2028 }, { "epoch": 2.971606521340905, "grad_norm": 0.33373255373100813, "learning_rate": 1.7456803700583834e-05, "loss": 0.6863, "num_tokens": 6199619526.0, "step": 2029 }, { "epoch": 2.973071991207181, "grad_norm": 0.3198568304440413, "learning_rate": 1.7440284673941333e-05, "loss": 0.6825, "num_tokens": 6202637993.0, "step": 2030 }, { "epoch": 2.9745374610734565, "grad_norm": 0.33763520976388306, "learning_rate": 1.742376974987627e-05, "loss": 0.672, "num_tokens": 6206081580.0, "step": 2031 }, { "epoch": 2.9760029309397327, "grad_norm": 0.3070274246440076, "learning_rate": 1.740725894324783e-05, "loss": 0.7014, "num_tokens": 6209300732.0, "step": 2032 }, { "epoch": 2.9774684008060084, "grad_norm": 0.34349860040159286, "learning_rate": 1.7390752268911536e-05, "loss": 0.6913, "num_tokens": 6212277976.0, "step": 2033 }, { "epoch": 2.978933870672284, "grad_norm": 0.4143081879824312, "learning_rate": 1.7374249741719157e-05, "loss": 0.6789, "num_tokens": 6215416573.0, "step": 2034 }, { "epoch": 2.9803993405385603, "grad_norm": 0.37676670897175035, "learning_rate": 1.7357751376518753e-05, "loss": 0.6819, "num_tokens": 6218531261.0, "step": 2035 }, { "epoch": 2.981864810404836, "grad_norm": 0.36030950039593773, "learning_rate": 1.7341257188154625e-05, "loss": 0.6899, "num_tokens": 6221342276.0, "step": 2036 }, { "epoch": 2.9833302802711117, "grad_norm": 0.3505728891330053, "learning_rate": 1.7324767191467346e-05, "loss": 0.6976, "num_tokens": 6224625446.0, "step": 2037 }, { "epoch": 2.984795750137388, "grad_norm": 0.3465075304757723, "learning_rate": 1.730828140129366e-05, "loss": 0.6875, "num_tokens": 6228127319.0, "step": 2038 }, { "epoch": 2.9862612200036636, "grad_norm": 0.3715512861827909, "learning_rate": 1.7291799832466593e-05, "loss": 0.6915, "num_tokens": 6231009456.0, "step": 2039 }, { "epoch": 2.9877266898699393, "grad_norm": 0.34542116605574746, "learning_rate": 1.727532249981531e-05, "loss": 0.6912, "num_tokens": 6234280381.0, "step": 2040 }, { "epoch": 2.9891921597362154, "grad_norm": 0.36454647320853734, "learning_rate": 1.7258849418165233e-05, "loss": 0.6982, "num_tokens": 6237307926.0, "step": 2041 }, { "epoch": 2.990657629602491, "grad_norm": 0.4067892269676075, "learning_rate": 1.7242380602337883e-05, "loss": 0.6788, "num_tokens": 6240224395.0, "step": 2042 }, { "epoch": 2.9921230994687673, "grad_norm": 0.3676539478700517, "learning_rate": 1.7225916067151018e-05, "loss": 0.6815, "num_tokens": 6243319959.0, "step": 2043 }, { "epoch": 2.993588569335043, "grad_norm": 0.3914655111034985, "learning_rate": 1.7209455827418482e-05, "loss": 0.6983, "num_tokens": 6246489593.0, "step": 2044 }, { "epoch": 2.995054039201319, "grad_norm": 0.40855158212096393, "learning_rate": 1.7192999897950287e-05, "loss": 0.6935, "num_tokens": 6249433834.0, "step": 2045 }, { "epoch": 2.996519509067595, "grad_norm": 0.35085973761937456, "learning_rate": 1.717654829355256e-05, "loss": 0.6889, "num_tokens": 6252616084.0, "step": 2046 }, { "epoch": 2.9979849789338706, "grad_norm": 0.356228228313392, "learning_rate": 1.7160101029027546e-05, "loss": 0.6708, "num_tokens": 6255602719.0, "step": 2047 }, { "epoch": 2.9994504488001468, "grad_norm": 0.33861350832732645, "learning_rate": 1.7143658119173566e-05, "loss": 0.7048, "num_tokens": 6258454926.0, "step": 2048 }, { "epoch": 3.0, "grad_norm": 0.5031557411193512, "learning_rate": 1.7127219578785036e-05, "loss": 0.7186, "num_tokens": 6259067011.0, "step": 2049 }, { "epoch": 3.0014654698662757, "grad_norm": 0.4224423264160683, "learning_rate": 1.7110785422652437e-05, "loss": 0.6814, "num_tokens": 6262206132.0, "step": 2050 }, { "epoch": 3.002930939732552, "grad_norm": 0.33422512397609005, "learning_rate": 1.7094355665562295e-05, "loss": 0.6667, "num_tokens": 6265118234.0, "step": 2051 }, { "epoch": 3.0043964095988276, "grad_norm": 0.3297955699437232, "learning_rate": 1.707793032229721e-05, "loss": 0.6822, "num_tokens": 6268176043.0, "step": 2052 }, { "epoch": 3.0058618794651033, "grad_norm": 0.323332935817789, "learning_rate": 1.706150940763577e-05, "loss": 0.6894, "num_tokens": 6271240226.0, "step": 2053 }, { "epoch": 3.0073273493313795, "grad_norm": 0.3350721172132651, "learning_rate": 1.7045092936352614e-05, "loss": 0.6912, "num_tokens": 6274072923.0, "step": 2054 }, { "epoch": 3.008792819197655, "grad_norm": 0.3351606184640879, "learning_rate": 1.7028680923218354e-05, "loss": 0.6783, "num_tokens": 6277083057.0, "step": 2055 }, { "epoch": 3.0102582890639313, "grad_norm": 0.3660419633573275, "learning_rate": 1.701227338299961e-05, "loss": 0.6935, "num_tokens": 6280156156.0, "step": 2056 }, { "epoch": 3.011723758930207, "grad_norm": 0.3312951438971161, "learning_rate": 1.6995870330458966e-05, "loss": 0.6802, "num_tokens": 6282982212.0, "step": 2057 }, { "epoch": 3.0131892287964828, "grad_norm": 0.33722951689594466, "learning_rate": 1.6979471780354977e-05, "loss": 0.6915, "num_tokens": 6285921397.0, "step": 2058 }, { "epoch": 3.014654698662759, "grad_norm": 0.3505891508562686, "learning_rate": 1.6963077747442148e-05, "loss": 0.6774, "num_tokens": 6288870625.0, "step": 2059 }, { "epoch": 3.0161201685290346, "grad_norm": 0.30274043418187113, "learning_rate": 1.694668824647091e-05, "loss": 0.6595, "num_tokens": 6291850259.0, "step": 2060 }, { "epoch": 3.0175856383953104, "grad_norm": 0.32988057400965043, "learning_rate": 1.6930303292187614e-05, "loss": 0.6675, "num_tokens": 6294686873.0, "step": 2061 }, { "epoch": 3.0190511082615865, "grad_norm": 0.34627987479833816, "learning_rate": 1.6913922899334546e-05, "loss": 0.6872, "num_tokens": 6297683748.0, "step": 2062 }, { "epoch": 3.0205165781278622, "grad_norm": 0.3434529888154081, "learning_rate": 1.6897547082649854e-05, "loss": 0.6844, "num_tokens": 6300936948.0, "step": 2063 }, { "epoch": 3.021982047994138, "grad_norm": 0.3539262401078821, "learning_rate": 1.6881175856867595e-05, "loss": 0.6832, "num_tokens": 6304278502.0, "step": 2064 }, { "epoch": 3.023447517860414, "grad_norm": 0.3311790003944426, "learning_rate": 1.6864809236717668e-05, "loss": 0.6843, "num_tokens": 6307095446.0, "step": 2065 }, { "epoch": 3.02491298772669, "grad_norm": 0.36789046890051574, "learning_rate": 1.684844723692586e-05, "loss": 0.6806, "num_tokens": 6310047278.0, "step": 2066 }, { "epoch": 3.0263784575929655, "grad_norm": 0.3470119623491541, "learning_rate": 1.6832089872213777e-05, "loss": 0.6914, "num_tokens": 6313076626.0, "step": 2067 }, { "epoch": 3.0278439274592417, "grad_norm": 0.33487147311455095, "learning_rate": 1.6815737157298877e-05, "loss": 0.6916, "num_tokens": 6316073842.0, "step": 2068 }, { "epoch": 3.0293093973255174, "grad_norm": 0.3230807977427358, "learning_rate": 1.6799389106894403e-05, "loss": 0.6868, "num_tokens": 6319301453.0, "step": 2069 }, { "epoch": 3.0307748671917936, "grad_norm": 0.3180722086456074, "learning_rate": 1.6783045735709428e-05, "loss": 0.6921, "num_tokens": 6322401931.0, "step": 2070 }, { "epoch": 3.0322403370580693, "grad_norm": 0.33670671117367174, "learning_rate": 1.6766707058448803e-05, "loss": 0.671, "num_tokens": 6325585536.0, "step": 2071 }, { "epoch": 3.033705806924345, "grad_norm": 0.31408224949763724, "learning_rate": 1.6750373089813154e-05, "loss": 0.6832, "num_tokens": 6328722959.0, "step": 2072 }, { "epoch": 3.035171276790621, "grad_norm": 0.3642184167764613, "learning_rate": 1.6734043844498876e-05, "loss": 0.6788, "num_tokens": 6331680020.0, "step": 2073 }, { "epoch": 3.036636746656897, "grad_norm": 0.32962595743150236, "learning_rate": 1.6717719337198127e-05, "loss": 0.6519, "num_tokens": 6334828135.0, "step": 2074 }, { "epoch": 3.0381022165231726, "grad_norm": 0.3232330604857763, "learning_rate": 1.670139958259877e-05, "loss": 0.6785, "num_tokens": 6337839080.0, "step": 2075 }, { "epoch": 3.0395676863894487, "grad_norm": 0.3250695566597705, "learning_rate": 1.668508459538442e-05, "loss": 0.6935, "num_tokens": 6340754944.0, "step": 2076 }, { "epoch": 3.0410331562557245, "grad_norm": 0.3221266200601106, "learning_rate": 1.666877439023438e-05, "loss": 0.6901, "num_tokens": 6343819200.0, "step": 2077 }, { "epoch": 3.042498626122, "grad_norm": 0.3296352686180719, "learning_rate": 1.6652468981823685e-05, "loss": 0.6816, "num_tokens": 6347125703.0, "step": 2078 }, { "epoch": 3.0439640959882763, "grad_norm": 0.31356938928695366, "learning_rate": 1.6636168384823003e-05, "loss": 0.6911, "num_tokens": 6350092301.0, "step": 2079 }, { "epoch": 3.045429565854552, "grad_norm": 0.40497840693334636, "learning_rate": 1.661987261389872e-05, "loss": 0.6837, "num_tokens": 6353382573.0, "step": 2080 }, { "epoch": 3.046895035720828, "grad_norm": 0.3694915380389864, "learning_rate": 1.6603581683712855e-05, "loss": 0.6703, "num_tokens": 6356579080.0, "step": 2081 }, { "epoch": 3.048360505587104, "grad_norm": 0.327444954656064, "learning_rate": 1.6587295608923088e-05, "loss": 0.6476, "num_tokens": 6359627012.0, "step": 2082 }, { "epoch": 3.0498259754533796, "grad_norm": 0.39278171655316585, "learning_rate": 1.6571014404182702e-05, "loss": 0.6722, "num_tokens": 6362720294.0, "step": 2083 }, { "epoch": 3.051291445319656, "grad_norm": 0.3442239662301714, "learning_rate": 1.655473808414064e-05, "loss": 0.7035, "num_tokens": 6365757347.0, "step": 2084 }, { "epoch": 3.0527569151859315, "grad_norm": 0.3972954608234565, "learning_rate": 1.6538466663441407e-05, "loss": 0.674, "num_tokens": 6368761836.0, "step": 2085 }, { "epoch": 3.0542223850522072, "grad_norm": 0.3375812877720009, "learning_rate": 1.6522200156725126e-05, "loss": 0.679, "num_tokens": 6371778786.0, "step": 2086 }, { "epoch": 3.0556878549184834, "grad_norm": 0.4437058608617698, "learning_rate": 1.6505938578627486e-05, "loss": 0.6882, "num_tokens": 6374853527.0, "step": 2087 }, { "epoch": 3.057153324784759, "grad_norm": 0.42743427608449025, "learning_rate": 1.648968194377977e-05, "loss": 0.6872, "num_tokens": 6378049482.0, "step": 2088 }, { "epoch": 3.058618794651035, "grad_norm": 0.3538406193531048, "learning_rate": 1.647343026680876e-05, "loss": 0.6726, "num_tokens": 6381104689.0, "step": 2089 }, { "epoch": 3.060084264517311, "grad_norm": 0.38825422674801074, "learning_rate": 1.645718356233683e-05, "loss": 0.6723, "num_tokens": 6384326919.0, "step": 2090 }, { "epoch": 3.0615497343835867, "grad_norm": 0.3440679508909914, "learning_rate": 1.644094184498185e-05, "loss": 0.6862, "num_tokens": 6387468403.0, "step": 2091 }, { "epoch": 3.0630152042498624, "grad_norm": 0.3543595185449668, "learning_rate": 1.6424705129357214e-05, "loss": 0.6829, "num_tokens": 6390672195.0, "step": 2092 }, { "epoch": 3.0644806741161386, "grad_norm": 0.3415495368643185, "learning_rate": 1.6408473430071814e-05, "loss": 0.6558, "num_tokens": 6393826183.0, "step": 2093 }, { "epoch": 3.0659461439824143, "grad_norm": 0.33432570703215225, "learning_rate": 1.639224676173002e-05, "loss": 0.6706, "num_tokens": 6396915085.0, "step": 2094 }, { "epoch": 3.0674116138486904, "grad_norm": 0.35526691787573805, "learning_rate": 1.6376025138931688e-05, "loss": 0.6661, "num_tokens": 6399857711.0, "step": 2095 }, { "epoch": 3.068877083714966, "grad_norm": 0.3647927096302436, "learning_rate": 1.6359808576272124e-05, "loss": 0.6579, "num_tokens": 6402908286.0, "step": 2096 }, { "epoch": 3.070342553581242, "grad_norm": 0.3258342159765199, "learning_rate": 1.6343597088342093e-05, "loss": 0.6921, "num_tokens": 6405979136.0, "step": 2097 }, { "epoch": 3.071808023447518, "grad_norm": 0.35594299253606976, "learning_rate": 1.6327390689727784e-05, "loss": 0.6772, "num_tokens": 6408767712.0, "step": 2098 }, { "epoch": 3.0732734933137937, "grad_norm": 0.34662255840313994, "learning_rate": 1.6311189395010814e-05, "loss": 0.669, "num_tokens": 6411914456.0, "step": 2099 }, { "epoch": 3.0747389631800695, "grad_norm": 0.3758124609997256, "learning_rate": 1.629499321876819e-05, "loss": 0.6908, "num_tokens": 6414927012.0, "step": 2100 }, { "epoch": 3.0762044330463456, "grad_norm": 0.3420976452784462, "learning_rate": 1.6278802175572352e-05, "loss": 0.678, "num_tokens": 6417999351.0, "step": 2101 }, { "epoch": 3.0776699029126213, "grad_norm": 0.3836269349315167, "learning_rate": 1.6262616279991074e-05, "loss": 0.6719, "num_tokens": 6420917759.0, "step": 2102 }, { "epoch": 3.079135372778897, "grad_norm": 0.3188623340869601, "learning_rate": 1.624643554658754e-05, "loss": 0.6833, "num_tokens": 6423957561.0, "step": 2103 }, { "epoch": 3.080600842645173, "grad_norm": 0.4172948146299732, "learning_rate": 1.6230259989920254e-05, "loss": 0.691, "num_tokens": 6426877110.0, "step": 2104 }, { "epoch": 3.082066312511449, "grad_norm": 0.34842503086110554, "learning_rate": 1.62140896245431e-05, "loss": 0.6812, "num_tokens": 6430061820.0, "step": 2105 }, { "epoch": 3.083531782377725, "grad_norm": 0.38123910517037546, "learning_rate": 1.6197924465005247e-05, "loss": 0.6688, "num_tokens": 6433416768.0, "step": 2106 }, { "epoch": 3.084997252244001, "grad_norm": 0.3791992936226009, "learning_rate": 1.6181764525851216e-05, "loss": 0.6792, "num_tokens": 6436431491.0, "step": 2107 }, { "epoch": 3.0864627221102765, "grad_norm": 0.40668472630323815, "learning_rate": 1.6165609821620815e-05, "loss": 0.6841, "num_tokens": 6439566711.0, "step": 2108 }, { "epoch": 3.0879281919765527, "grad_norm": 0.359017610469667, "learning_rate": 1.614946036684916e-05, "loss": 0.6677, "num_tokens": 6442767207.0, "step": 2109 }, { "epoch": 3.0893936618428284, "grad_norm": 0.38278484880385877, "learning_rate": 1.6133316176066603e-05, "loss": 0.6836, "num_tokens": 6445713428.0, "step": 2110 }, { "epoch": 3.090859131709104, "grad_norm": 0.38137948980533215, "learning_rate": 1.6117177263798805e-05, "loss": 0.6844, "num_tokens": 6448675661.0, "step": 2111 }, { "epoch": 3.0923246015753802, "grad_norm": 0.41628865869698056, "learning_rate": 1.6101043644566657e-05, "loss": 0.6807, "num_tokens": 6451842076.0, "step": 2112 }, { "epoch": 3.093790071441656, "grad_norm": 0.3883440551540658, "learning_rate": 1.6084915332886283e-05, "loss": 0.6741, "num_tokens": 6454854709.0, "step": 2113 }, { "epoch": 3.0952555413079317, "grad_norm": 0.40255646083210295, "learning_rate": 1.606879234326904e-05, "loss": 0.6677, "num_tokens": 6458010447.0, "step": 2114 }, { "epoch": 3.096721011174208, "grad_norm": 0.39704534371332556, "learning_rate": 1.6052674690221502e-05, "loss": 0.6613, "num_tokens": 6461066162.0, "step": 2115 }, { "epoch": 3.0981864810404836, "grad_norm": 0.3886346324544583, "learning_rate": 1.603656238824542e-05, "loss": 0.6941, "num_tokens": 6464197889.0, "step": 2116 }, { "epoch": 3.0996519509067593, "grad_norm": 0.38252522020183527, "learning_rate": 1.6020455451837756e-05, "loss": 0.6782, "num_tokens": 6467157698.0, "step": 2117 }, { "epoch": 3.1011174207730354, "grad_norm": 0.36211673855804943, "learning_rate": 1.6004353895490625e-05, "loss": 0.6642, "num_tokens": 6470210237.0, "step": 2118 }, { "epoch": 3.102582890639311, "grad_norm": 0.3856766803522645, "learning_rate": 1.5988257733691316e-05, "loss": 0.6758, "num_tokens": 6473268077.0, "step": 2119 }, { "epoch": 3.1040483605055873, "grad_norm": 0.3481586993218587, "learning_rate": 1.597216698092225e-05, "loss": 0.6709, "num_tokens": 6476528160.0, "step": 2120 }, { "epoch": 3.105513830371863, "grad_norm": 0.3590460316913547, "learning_rate": 1.5956081651660996e-05, "loss": 0.6675, "num_tokens": 6479578714.0, "step": 2121 }, { "epoch": 3.1069793002381387, "grad_norm": 0.33615313011749387, "learning_rate": 1.594000176038023e-05, "loss": 0.6663, "num_tokens": 6482705944.0, "step": 2122 }, { "epoch": 3.108444770104415, "grad_norm": 0.3236756249977203, "learning_rate": 1.5923927321547746e-05, "loss": 0.6808, "num_tokens": 6485871488.0, "step": 2123 }, { "epoch": 3.1099102399706906, "grad_norm": 0.3695252183278405, "learning_rate": 1.5907858349626417e-05, "loss": 0.6799, "num_tokens": 6489058166.0, "step": 2124 }, { "epoch": 3.1113757098369663, "grad_norm": 0.32963711587246763, "learning_rate": 1.5891794859074218e-05, "loss": 0.6847, "num_tokens": 6492103149.0, "step": 2125 }, { "epoch": 3.1128411797032425, "grad_norm": 0.41839204547089254, "learning_rate": 1.5875736864344173e-05, "loss": 0.6871, "num_tokens": 6495176950.0, "step": 2126 }, { "epoch": 3.114306649569518, "grad_norm": 0.3512569925923776, "learning_rate": 1.5859684379884386e-05, "loss": 0.6808, "num_tokens": 6498176489.0, "step": 2127 }, { "epoch": 3.115772119435794, "grad_norm": 0.36063105420447306, "learning_rate": 1.5843637420137964e-05, "loss": 0.6768, "num_tokens": 6501396030.0, "step": 2128 }, { "epoch": 3.11723758930207, "grad_norm": 0.32621973050160546, "learning_rate": 1.5827595999543083e-05, "loss": 0.6923, "num_tokens": 6504690118.0, "step": 2129 }, { "epoch": 3.118703059168346, "grad_norm": 0.35149157548091964, "learning_rate": 1.581156013253291e-05, "loss": 0.6793, "num_tokens": 6507700325.0, "step": 2130 }, { "epoch": 3.120168529034622, "grad_norm": 0.3335168983206523, "learning_rate": 1.5795529833535624e-05, "loss": 0.6809, "num_tokens": 6510775395.0, "step": 2131 }, { "epoch": 3.1216339989008977, "grad_norm": 0.37697926199316717, "learning_rate": 1.5779505116974397e-05, "loss": 0.6794, "num_tokens": 6513730601.0, "step": 2132 }, { "epoch": 3.1230994687671734, "grad_norm": 0.3394315115708865, "learning_rate": 1.5763485997267374e-05, "loss": 0.6643, "num_tokens": 6516914142.0, "step": 2133 }, { "epoch": 3.1245649386334495, "grad_norm": 0.3729353334078359, "learning_rate": 1.574747248882767e-05, "loss": 0.6672, "num_tokens": 6520170174.0, "step": 2134 }, { "epoch": 3.1260304084997252, "grad_norm": 0.3185867953721918, "learning_rate": 1.573146460606333e-05, "loss": 0.6844, "num_tokens": 6523208772.0, "step": 2135 }, { "epoch": 3.127495878366001, "grad_norm": 0.351362358762775, "learning_rate": 1.5715462363377373e-05, "loss": 0.6655, "num_tokens": 6526316228.0, "step": 2136 }, { "epoch": 3.128961348232277, "grad_norm": 0.3100315556746897, "learning_rate": 1.569946577516771e-05, "loss": 0.655, "num_tokens": 6529481065.0, "step": 2137 }, { "epoch": 3.130426818098553, "grad_norm": 0.3353223156940941, "learning_rate": 1.5683474855827184e-05, "loss": 0.6568, "num_tokens": 6532737114.0, "step": 2138 }, { "epoch": 3.1318922879648285, "grad_norm": 0.33532223811041706, "learning_rate": 1.5667489619743526e-05, "loss": 0.6734, "num_tokens": 6535935843.0, "step": 2139 }, { "epoch": 3.1333577578311047, "grad_norm": 0.30606383231230366, "learning_rate": 1.565151008129938e-05, "loss": 0.6701, "num_tokens": 6538881922.0, "step": 2140 }, { "epoch": 3.1348232276973804, "grad_norm": 0.3307091758533152, "learning_rate": 1.5635536254872215e-05, "loss": 0.6826, "num_tokens": 6541895270.0, "step": 2141 }, { "epoch": 3.136288697563656, "grad_norm": 0.37348146903267926, "learning_rate": 1.561956815483441e-05, "loss": 0.6797, "num_tokens": 6544947524.0, "step": 2142 }, { "epoch": 3.1377541674299323, "grad_norm": 0.33153329253197217, "learning_rate": 1.5603605795553152e-05, "loss": 0.6729, "num_tokens": 6548143802.0, "step": 2143 }, { "epoch": 3.139219637296208, "grad_norm": 0.32796179700895456, "learning_rate": 1.5587649191390503e-05, "loss": 0.6777, "num_tokens": 6551071640.0, "step": 2144 }, { "epoch": 3.140685107162484, "grad_norm": 0.3464505620525035, "learning_rate": 1.5571698356703304e-05, "loss": 0.692, "num_tokens": 6553747257.0, "step": 2145 }, { "epoch": 3.14215057702876, "grad_norm": 0.37393892056616373, "learning_rate": 1.5555753305843247e-05, "loss": 0.6665, "num_tokens": 6556941452.0, "step": 2146 }, { "epoch": 3.1436160468950356, "grad_norm": 0.3572027370281763, "learning_rate": 1.5539814053156774e-05, "loss": 0.686, "num_tokens": 6560015505.0, "step": 2147 }, { "epoch": 3.1450815167613118, "grad_norm": 0.358789914330423, "learning_rate": 1.5523880612985158e-05, "loss": 0.6911, "num_tokens": 6563094871.0, "step": 2148 }, { "epoch": 3.1465469866275875, "grad_norm": 0.35403223539985357, "learning_rate": 1.550795299966441e-05, "loss": 0.6885, "num_tokens": 6566200574.0, "step": 2149 }, { "epoch": 3.148012456493863, "grad_norm": 0.3435954417692675, "learning_rate": 1.549203122752531e-05, "loss": 0.6803, "num_tokens": 6569444759.0, "step": 2150 }, { "epoch": 3.1494779263601393, "grad_norm": 0.40243043510734866, "learning_rate": 1.5476115310893374e-05, "loss": 0.6894, "num_tokens": 6572517975.0, "step": 2151 }, { "epoch": 3.150943396226415, "grad_norm": 0.3551990424124929, "learning_rate": 1.546020526408886e-05, "loss": 0.6778, "num_tokens": 6575689139.0, "step": 2152 }, { "epoch": 3.1524088660926908, "grad_norm": 0.38906533579795216, "learning_rate": 1.544430110142674e-05, "loss": 0.6802, "num_tokens": 6578720664.0, "step": 2153 }, { "epoch": 3.153874335958967, "grad_norm": 0.3334290684495408, "learning_rate": 1.5428402837216698e-05, "loss": 0.6581, "num_tokens": 6581812094.0, "step": 2154 }, { "epoch": 3.1553398058252426, "grad_norm": 0.3580881405786349, "learning_rate": 1.541251048576309e-05, "loss": 0.6662, "num_tokens": 6584681433.0, "step": 2155 }, { "epoch": 3.156805275691519, "grad_norm": 0.33514221294765534, "learning_rate": 1.5396624061364986e-05, "loss": 0.6768, "num_tokens": 6587922659.0, "step": 2156 }, { "epoch": 3.1582707455577945, "grad_norm": 0.37803928258551445, "learning_rate": 1.538074357831609e-05, "loss": 0.6958, "num_tokens": 6591068884.0, "step": 2157 }, { "epoch": 3.1597362154240702, "grad_norm": 0.3396774408281775, "learning_rate": 1.5364869050904784e-05, "loss": 0.6746, "num_tokens": 6594124495.0, "step": 2158 }, { "epoch": 3.1612016852903464, "grad_norm": 0.3978873284791709, "learning_rate": 1.5349000493414064e-05, "loss": 0.6887, "num_tokens": 6597349994.0, "step": 2159 }, { "epoch": 3.162667155156622, "grad_norm": 0.3549172474043443, "learning_rate": 1.5333137920121603e-05, "loss": 0.6834, "num_tokens": 6600381722.0, "step": 2160 }, { "epoch": 3.164132625022898, "grad_norm": 0.4270583315333125, "learning_rate": 1.531728134529963e-05, "loss": 0.6929, "num_tokens": 6603341483.0, "step": 2161 }, { "epoch": 3.165598094889174, "grad_norm": 0.36780095211023944, "learning_rate": 1.530143078321503e-05, "loss": 0.6703, "num_tokens": 6606571024.0, "step": 2162 }, { "epoch": 3.1670635647554497, "grad_norm": 0.36676550355380316, "learning_rate": 1.528558624812923e-05, "loss": 0.6731, "num_tokens": 6609643142.0, "step": 2163 }, { "epoch": 3.1685290346217254, "grad_norm": 0.4319633664953416, "learning_rate": 1.5269747754298282e-05, "loss": 0.6894, "num_tokens": 6612827609.0, "step": 2164 }, { "epoch": 3.1699945044880016, "grad_norm": 0.32463116295474903, "learning_rate": 1.525391531597276e-05, "loss": 0.6818, "num_tokens": 6616032672.0, "step": 2165 }, { "epoch": 3.1714599743542773, "grad_norm": 0.37076375695490305, "learning_rate": 1.5238088947397821e-05, "loss": 0.6664, "num_tokens": 6619006262.0, "step": 2166 }, { "epoch": 3.172925444220553, "grad_norm": 0.3090841548748596, "learning_rate": 1.5222268662813147e-05, "loss": 0.6942, "num_tokens": 6622019797.0, "step": 2167 }, { "epoch": 3.174390914086829, "grad_norm": 0.39116074570869747, "learning_rate": 1.5206454476452943e-05, "loss": 0.6796, "num_tokens": 6625136846.0, "step": 2168 }, { "epoch": 3.175856383953105, "grad_norm": 0.33154282508782873, "learning_rate": 1.5190646402545932e-05, "loss": 0.6762, "num_tokens": 6628316094.0, "step": 2169 }, { "epoch": 3.177321853819381, "grad_norm": 0.3709285273692325, "learning_rate": 1.5174844455315337e-05, "loss": 0.6626, "num_tokens": 6631297128.0, "step": 2170 }, { "epoch": 3.1787873236856568, "grad_norm": 0.34144743802179606, "learning_rate": 1.5159048648978871e-05, "loss": 0.6745, "num_tokens": 6634506548.0, "step": 2171 }, { "epoch": 3.1802527935519325, "grad_norm": 0.3489259524261533, "learning_rate": 1.514325899774872e-05, "loss": 0.6913, "num_tokens": 6637528234.0, "step": 2172 }, { "epoch": 3.1817182634182086, "grad_norm": 0.37937038611210316, "learning_rate": 1.5127475515831518e-05, "loss": 0.6732, "num_tokens": 6640722083.0, "step": 2173 }, { "epoch": 3.1831837332844843, "grad_norm": 0.36287961485271486, "learning_rate": 1.5111698217428385e-05, "loss": 0.6968, "num_tokens": 6643702838.0, "step": 2174 }, { "epoch": 3.18464920315076, "grad_norm": 0.34096981213879085, "learning_rate": 1.5095927116734843e-05, "loss": 0.6967, "num_tokens": 6646717972.0, "step": 2175 }, { "epoch": 3.186114673017036, "grad_norm": 0.3358867655740543, "learning_rate": 1.5080162227940833e-05, "loss": 0.6856, "num_tokens": 6649613437.0, "step": 2176 }, { "epoch": 3.187580142883312, "grad_norm": 0.3395614003074484, "learning_rate": 1.5064403565230748e-05, "loss": 0.6881, "num_tokens": 6652592234.0, "step": 2177 }, { "epoch": 3.1890456127495876, "grad_norm": 0.375174015105079, "learning_rate": 1.5048651142783329e-05, "loss": 0.6751, "num_tokens": 6655710912.0, "step": 2178 }, { "epoch": 3.190511082615864, "grad_norm": 0.32186965779647103, "learning_rate": 1.5032904974771745e-05, "loss": 0.6874, "num_tokens": 6658867016.0, "step": 2179 }, { "epoch": 3.1919765524821395, "grad_norm": 0.3462342411753982, "learning_rate": 1.501716507536351e-05, "loss": 0.6901, "num_tokens": 6661876024.0, "step": 2180 }, { "epoch": 3.1934420223484157, "grad_norm": 0.31813234056432127, "learning_rate": 1.500143145872052e-05, "loss": 0.693, "num_tokens": 6665051133.0, "step": 2181 }, { "epoch": 3.1949074922146914, "grad_norm": 0.33435101040820614, "learning_rate": 1.4985704138998985e-05, "loss": 0.6772, "num_tokens": 6668097226.0, "step": 2182 }, { "epoch": 3.196372962080967, "grad_norm": 0.3261309598554135, "learning_rate": 1.4969983130349484e-05, "loss": 0.7026, "num_tokens": 6671169908.0, "step": 2183 }, { "epoch": 3.1978384319472433, "grad_norm": 0.3273073177324331, "learning_rate": 1.4954268446916898e-05, "loss": 0.702, "num_tokens": 6674136747.0, "step": 2184 }, { "epoch": 3.199303901813519, "grad_norm": 0.3377485054824053, "learning_rate": 1.4938560102840427e-05, "loss": 0.6635, "num_tokens": 6677220274.0, "step": 2185 }, { "epoch": 3.2007693716797947, "grad_norm": 0.34524027547480124, "learning_rate": 1.4922858112253557e-05, "loss": 0.6862, "num_tokens": 6680431798.0, "step": 2186 }, { "epoch": 3.202234841546071, "grad_norm": 0.3507686079267351, "learning_rate": 1.4907162489284069e-05, "loss": 0.6934, "num_tokens": 6683435069.0, "step": 2187 }, { "epoch": 3.2037003114123466, "grad_norm": 0.32999794628298756, "learning_rate": 1.4891473248053996e-05, "loss": 0.6973, "num_tokens": 6686566828.0, "step": 2188 }, { "epoch": 3.2051657812786223, "grad_norm": 0.3136046924382425, "learning_rate": 1.4875790402679661e-05, "loss": 0.6854, "num_tokens": 6689782224.0, "step": 2189 }, { "epoch": 3.2066312511448984, "grad_norm": 0.32515951032017804, "learning_rate": 1.4860113967271596e-05, "loss": 0.6856, "num_tokens": 6692846309.0, "step": 2190 }, { "epoch": 3.208096721011174, "grad_norm": 0.31650135833595877, "learning_rate": 1.4844443955934598e-05, "loss": 0.668, "num_tokens": 6695809367.0, "step": 2191 }, { "epoch": 3.20956219087745, "grad_norm": 0.40281201083599827, "learning_rate": 1.4828780382767653e-05, "loss": 0.6798, "num_tokens": 6698745000.0, "step": 2192 }, { "epoch": 3.211027660743726, "grad_norm": 0.3463172416490409, "learning_rate": 1.4813123261863985e-05, "loss": 0.6805, "num_tokens": 6701836956.0, "step": 2193 }, { "epoch": 3.2124931306100017, "grad_norm": 0.38024279603097016, "learning_rate": 1.4797472607310987e-05, "loss": 0.673, "num_tokens": 6704810294.0, "step": 2194 }, { "epoch": 3.213958600476278, "grad_norm": 0.33588207172315354, "learning_rate": 1.4781828433190254e-05, "loss": 0.6856, "num_tokens": 6707824469.0, "step": 2195 }, { "epoch": 3.2154240703425536, "grad_norm": 0.3703923094204619, "learning_rate": 1.4766190753577529e-05, "loss": 0.6889, "num_tokens": 6710885465.0, "step": 2196 }, { "epoch": 3.2168895402088293, "grad_norm": 0.3703740975686388, "learning_rate": 1.4750559582542736e-05, "loss": 0.6688, "num_tokens": 6713851628.0, "step": 2197 }, { "epoch": 3.2183550100751055, "grad_norm": 0.3144649156416061, "learning_rate": 1.4734934934149916e-05, "loss": 0.6801, "num_tokens": 6716764443.0, "step": 2198 }, { "epoch": 3.219820479941381, "grad_norm": 0.38900381526559935, "learning_rate": 1.4719316822457273e-05, "loss": 0.678, "num_tokens": 6719999859.0, "step": 2199 }, { "epoch": 3.221285949807657, "grad_norm": 0.33524956255487665, "learning_rate": 1.4703705261517099e-05, "loss": 0.6844, "num_tokens": 6722962041.0, "step": 2200 }, { "epoch": 3.222751419673933, "grad_norm": 0.3484951221426812, "learning_rate": 1.4688100265375811e-05, "loss": 0.6928, "num_tokens": 6725972779.0, "step": 2201 }, { "epoch": 3.224216889540209, "grad_norm": 0.329299307887011, "learning_rate": 1.4672501848073912e-05, "loss": 0.6747, "num_tokens": 6729014712.0, "step": 2202 }, { "epoch": 3.2256823594064845, "grad_norm": 0.3329634018053515, "learning_rate": 1.4656910023645995e-05, "loss": 0.6745, "num_tokens": 6732201336.0, "step": 2203 }, { "epoch": 3.2271478292727607, "grad_norm": 0.322502127550236, "learning_rate": 1.4641324806120696e-05, "loss": 0.6781, "num_tokens": 6735071700.0, "step": 2204 }, { "epoch": 3.2286132991390364, "grad_norm": 0.3185411588236215, "learning_rate": 1.4625746209520749e-05, "loss": 0.6826, "num_tokens": 6738066161.0, "step": 2205 }, { "epoch": 3.2300787690053125, "grad_norm": 0.31764081750882994, "learning_rate": 1.4610174247862876e-05, "loss": 0.6809, "num_tokens": 6740974558.0, "step": 2206 }, { "epoch": 3.2315442388715883, "grad_norm": 0.29053229419035065, "learning_rate": 1.4594608935157885e-05, "loss": 0.6694, "num_tokens": 6744063295.0, "step": 2207 }, { "epoch": 3.233009708737864, "grad_norm": 0.29376965575001546, "learning_rate": 1.4579050285410566e-05, "loss": 0.6819, "num_tokens": 6747030380.0, "step": 2208 }, { "epoch": 3.23447517860414, "grad_norm": 0.31032965858496087, "learning_rate": 1.456349831261973e-05, "loss": 0.7037, "num_tokens": 6749760356.0, "step": 2209 }, { "epoch": 3.235940648470416, "grad_norm": 0.293484953578673, "learning_rate": 1.4547953030778162e-05, "loss": 0.6918, "num_tokens": 6752687510.0, "step": 2210 }, { "epoch": 3.2374061183366916, "grad_norm": 0.3403444945496453, "learning_rate": 1.4532414453872644e-05, "loss": 0.6764, "num_tokens": 6755756932.0, "step": 2211 }, { "epoch": 3.2388715882029677, "grad_norm": 0.29676807109039155, "learning_rate": 1.4516882595883922e-05, "loss": 0.667, "num_tokens": 6758615049.0, "step": 2212 }, { "epoch": 3.2403370580692434, "grad_norm": 0.35634955344277586, "learning_rate": 1.4501357470786701e-05, "loss": 0.6778, "num_tokens": 6761802248.0, "step": 2213 }, { "epoch": 3.241802527935519, "grad_norm": 0.3307800128661071, "learning_rate": 1.448583909254961e-05, "loss": 0.6829, "num_tokens": 6765019175.0, "step": 2214 }, { "epoch": 3.2432679978017953, "grad_norm": 0.3741646783456708, "learning_rate": 1.4470327475135237e-05, "loss": 0.6812, "num_tokens": 6768091832.0, "step": 2215 }, { "epoch": 3.244733467668071, "grad_norm": 0.32907694948207883, "learning_rate": 1.4454822632500057e-05, "loss": 0.67, "num_tokens": 6771247479.0, "step": 2216 }, { "epoch": 3.2461989375343467, "grad_norm": 0.36233687617378973, "learning_rate": 1.4439324578594449e-05, "loss": 0.6689, "num_tokens": 6774254048.0, "step": 2217 }, { "epoch": 3.247664407400623, "grad_norm": 0.36024035839932406, "learning_rate": 1.4423833327362733e-05, "loss": 0.6776, "num_tokens": 6777318945.0, "step": 2218 }, { "epoch": 3.2491298772668986, "grad_norm": 0.33130001635500456, "learning_rate": 1.4408348892743038e-05, "loss": 0.6778, "num_tokens": 6780468893.0, "step": 2219 }, { "epoch": 3.2505953471331743, "grad_norm": 0.3529350825995393, "learning_rate": 1.4392871288667415e-05, "loss": 0.6866, "num_tokens": 6783364596.0, "step": 2220 }, { "epoch": 3.2520608169994505, "grad_norm": 0.31452154488881984, "learning_rate": 1.437740052906174e-05, "loss": 0.6714, "num_tokens": 6786094906.0, "step": 2221 }, { "epoch": 3.253526286865726, "grad_norm": 0.33598379488235836, "learning_rate": 1.4361936627845743e-05, "loss": 0.6772, "num_tokens": 6789169180.0, "step": 2222 }, { "epoch": 3.2549917567320024, "grad_norm": 0.351336948034945, "learning_rate": 1.434647959893297e-05, "loss": 0.6863, "num_tokens": 6792383107.0, "step": 2223 }, { "epoch": 3.256457226598278, "grad_norm": 0.3585054378367536, "learning_rate": 1.4331029456230815e-05, "loss": 0.6828, "num_tokens": 6795587375.0, "step": 2224 }, { "epoch": 3.257922696464554, "grad_norm": 0.3414079607236522, "learning_rate": 1.431558621364043e-05, "loss": 0.6763, "num_tokens": 6798888543.0, "step": 2225 }, { "epoch": 3.25938816633083, "grad_norm": 0.3327998028330822, "learning_rate": 1.4300149885056796e-05, "loss": 0.6807, "num_tokens": 6801804347.0, "step": 2226 }, { "epoch": 3.2608536361971057, "grad_norm": 0.3167086853747828, "learning_rate": 1.428472048436866e-05, "loss": 0.667, "num_tokens": 6804831921.0, "step": 2227 }, { "epoch": 3.2623191060633814, "grad_norm": 0.3213085026847831, "learning_rate": 1.426929802545853e-05, "loss": 0.6764, "num_tokens": 6807934073.0, "step": 2228 }, { "epoch": 3.2637845759296575, "grad_norm": 0.3155746741670374, "learning_rate": 1.4253882522202678e-05, "loss": 0.682, "num_tokens": 6810736900.0, "step": 2229 }, { "epoch": 3.2652500457959333, "grad_norm": 0.3429565517590045, "learning_rate": 1.4238473988471114e-05, "loss": 0.6845, "num_tokens": 6813960313.0, "step": 2230 }, { "epoch": 3.2667155156622094, "grad_norm": 0.31215201273172505, "learning_rate": 1.4223072438127568e-05, "loss": 0.687, "num_tokens": 6816901576.0, "step": 2231 }, { "epoch": 3.268180985528485, "grad_norm": 0.3655182025342804, "learning_rate": 1.4207677885029515e-05, "loss": 0.6779, "num_tokens": 6819882404.0, "step": 2232 }, { "epoch": 3.269646455394761, "grad_norm": 0.343901315186695, "learning_rate": 1.4192290343028092e-05, "loss": 0.6814, "num_tokens": 6823063949.0, "step": 2233 }, { "epoch": 3.271111925261037, "grad_norm": 0.3649952117226686, "learning_rate": 1.4176909825968169e-05, "loss": 0.6695, "num_tokens": 6826207270.0, "step": 2234 }, { "epoch": 3.2725773951273127, "grad_norm": 0.3150627365726366, "learning_rate": 1.416153634768827e-05, "loss": 0.6821, "num_tokens": 6829268598.0, "step": 2235 }, { "epoch": 3.2740428649935884, "grad_norm": 0.3448130724101046, "learning_rate": 1.4146169922020593e-05, "loss": 0.6811, "num_tokens": 6832319359.0, "step": 2236 }, { "epoch": 3.2755083348598646, "grad_norm": 0.3204323429495144, "learning_rate": 1.413081056279098e-05, "loss": 0.679, "num_tokens": 6835395884.0, "step": 2237 }, { "epoch": 3.2769738047261403, "grad_norm": 0.3449419329255864, "learning_rate": 1.4115458283818954e-05, "loss": 0.684, "num_tokens": 6838550890.0, "step": 2238 }, { "epoch": 3.278439274592416, "grad_norm": 0.29644439843354614, "learning_rate": 1.4100113098917607e-05, "loss": 0.684, "num_tokens": 6841579138.0, "step": 2239 }, { "epoch": 3.279904744458692, "grad_norm": 0.294865551902728, "learning_rate": 1.4084775021893694e-05, "loss": 0.6981, "num_tokens": 6844736173.0, "step": 2240 }, { "epoch": 3.281370214324968, "grad_norm": 0.34089358510716355, "learning_rate": 1.4069444066547569e-05, "loss": 0.6879, "num_tokens": 6847846050.0, "step": 2241 }, { "epoch": 3.2828356841912436, "grad_norm": 0.29380175599429265, "learning_rate": 1.4054120246673155e-05, "loss": 0.6705, "num_tokens": 6851128281.0, "step": 2242 }, { "epoch": 3.2843011540575198, "grad_norm": 0.3099267246707363, "learning_rate": 1.4038803576057985e-05, "loss": 0.6539, "num_tokens": 6854044981.0, "step": 2243 }, { "epoch": 3.2857666239237955, "grad_norm": 0.3317476191532662, "learning_rate": 1.4023494068483126e-05, "loss": 0.6929, "num_tokens": 6857093702.0, "step": 2244 }, { "epoch": 3.287232093790071, "grad_norm": 0.3342460252328149, "learning_rate": 1.4008191737723231e-05, "loss": 0.6808, "num_tokens": 6860160096.0, "step": 2245 }, { "epoch": 3.2886975636563474, "grad_norm": 0.31467113768075833, "learning_rate": 1.3992896597546488e-05, "loss": 0.6666, "num_tokens": 6863407765.0, "step": 2246 }, { "epoch": 3.290163033522623, "grad_norm": 0.3233439638387, "learning_rate": 1.3977608661714597e-05, "loss": 0.6777, "num_tokens": 6866334946.0, "step": 2247 }, { "epoch": 3.2916285033888992, "grad_norm": 0.3252819294460103, "learning_rate": 1.3962327943982801e-05, "loss": 0.7073, "num_tokens": 6869325918.0, "step": 2248 }, { "epoch": 3.293093973255175, "grad_norm": 0.33384092195596515, "learning_rate": 1.3947054458099834e-05, "loss": 0.6836, "num_tokens": 6872365514.0, "step": 2249 }, { "epoch": 3.2945594431214507, "grad_norm": 0.3030650556123793, "learning_rate": 1.3931788217807922e-05, "loss": 0.699, "num_tokens": 6875384995.0, "step": 2250 }, { "epoch": 3.296024912987727, "grad_norm": 0.30004336519608776, "learning_rate": 1.3916529236842778e-05, "loss": 0.6613, "num_tokens": 6878390591.0, "step": 2251 }, { "epoch": 3.2974903828540025, "grad_norm": 0.3235219113121467, "learning_rate": 1.390127752893359e-05, "loss": 0.6699, "num_tokens": 6881471440.0, "step": 2252 }, { "epoch": 3.2989558527202782, "grad_norm": 0.3392067338882081, "learning_rate": 1.3886033107802978e-05, "loss": 0.6939, "num_tokens": 6884511463.0, "step": 2253 }, { "epoch": 3.3004213225865544, "grad_norm": 0.3615886904528167, "learning_rate": 1.3870795987167038e-05, "loss": 0.6648, "num_tokens": 6887523540.0, "step": 2254 }, { "epoch": 3.30188679245283, "grad_norm": 0.34664615274847677, "learning_rate": 1.3855566180735274e-05, "loss": 0.6865, "num_tokens": 6890635083.0, "step": 2255 }, { "epoch": 3.3033522623191063, "grad_norm": 0.36629244917730075, "learning_rate": 1.3840343702210613e-05, "loss": 0.6852, "num_tokens": 6893632234.0, "step": 2256 }, { "epoch": 3.304817732185382, "grad_norm": 0.36467042089702134, "learning_rate": 1.3825128565289397e-05, "loss": 0.6884, "num_tokens": 6896381691.0, "step": 2257 }, { "epoch": 3.3062832020516577, "grad_norm": 0.3308979470901881, "learning_rate": 1.380992078366135e-05, "loss": 0.6677, "num_tokens": 6899525654.0, "step": 2258 }, { "epoch": 3.307748671917934, "grad_norm": 0.34347012678799804, "learning_rate": 1.379472037100959e-05, "loss": 0.6549, "num_tokens": 6902535258.0, "step": 2259 }, { "epoch": 3.3092141417842096, "grad_norm": 0.33596519839731187, "learning_rate": 1.3779527341010604e-05, "loss": 0.7001, "num_tokens": 6905336734.0, "step": 2260 }, { "epoch": 3.3106796116504853, "grad_norm": 0.31491775577342085, "learning_rate": 1.3764341707334228e-05, "loss": 0.6823, "num_tokens": 6908403416.0, "step": 2261 }, { "epoch": 3.3121450815167615, "grad_norm": 0.2886369860619765, "learning_rate": 1.374916348364364e-05, "loss": 0.6919, "num_tokens": 6911201668.0, "step": 2262 }, { "epoch": 3.313610551383037, "grad_norm": 0.3340409051853242, "learning_rate": 1.3733992683595383e-05, "loss": 0.6796, "num_tokens": 6914253734.0, "step": 2263 }, { "epoch": 3.315076021249313, "grad_norm": 0.29590315770498204, "learning_rate": 1.3718829320839264e-05, "loss": 0.6678, "num_tokens": 6917493336.0, "step": 2264 }, { "epoch": 3.316541491115589, "grad_norm": 0.3194743323388789, "learning_rate": 1.3703673409018454e-05, "loss": 0.6881, "num_tokens": 6920427891.0, "step": 2265 }, { "epoch": 3.3180069609818648, "grad_norm": 0.3250552601122571, "learning_rate": 1.3688524961769396e-05, "loss": 0.6652, "num_tokens": 6923427226.0, "step": 2266 }, { "epoch": 3.3194724308481405, "grad_norm": 0.3056619755452292, "learning_rate": 1.3673383992721805e-05, "loss": 0.6851, "num_tokens": 6926285418.0, "step": 2267 }, { "epoch": 3.3209379007144166, "grad_norm": 0.3689544438387145, "learning_rate": 1.365825051549868e-05, "loss": 0.6944, "num_tokens": 6929304850.0, "step": 2268 }, { "epoch": 3.3224033705806923, "grad_norm": 0.3307073899899554, "learning_rate": 1.3643124543716306e-05, "loss": 0.6833, "num_tokens": 6932252100.0, "step": 2269 }, { "epoch": 3.323868840446968, "grad_norm": 0.322901940669808, "learning_rate": 1.362800609098415e-05, "loss": 0.6703, "num_tokens": 6935438106.0, "step": 2270 }, { "epoch": 3.3253343103132442, "grad_norm": 0.3295309240327121, "learning_rate": 1.3612895170904978e-05, "loss": 0.6671, "num_tokens": 6938392196.0, "step": 2271 }, { "epoch": 3.32679978017952, "grad_norm": 0.3130084215237398, "learning_rate": 1.3597791797074747e-05, "loss": 0.6879, "num_tokens": 6941452086.0, "step": 2272 }, { "epoch": 3.328265250045796, "grad_norm": 0.31682734957264447, "learning_rate": 1.3582695983082628e-05, "loss": 0.6954, "num_tokens": 6944669372.0, "step": 2273 }, { "epoch": 3.329730719912072, "grad_norm": 0.3219980580465248, "learning_rate": 1.356760774251099e-05, "loss": 0.6681, "num_tokens": 6947734641.0, "step": 2274 }, { "epoch": 3.3311961897783475, "grad_norm": 0.3010604462563766, "learning_rate": 1.3552527088935398e-05, "loss": 0.6838, "num_tokens": 6950914445.0, "step": 2275 }, { "epoch": 3.3326616596446237, "grad_norm": 0.34640704965083374, "learning_rate": 1.353745403592457e-05, "loss": 0.6705, "num_tokens": 6954093926.0, "step": 2276 }, { "epoch": 3.3341271295108994, "grad_norm": 0.2888482138610229, "learning_rate": 1.3522388597040414e-05, "loss": 0.651, "num_tokens": 6957087740.0, "step": 2277 }, { "epoch": 3.335592599377175, "grad_norm": 0.31745608301724065, "learning_rate": 1.3507330785837951e-05, "loss": 0.6707, "num_tokens": 6960164577.0, "step": 2278 }, { "epoch": 3.3370580692434513, "grad_norm": 0.32897975693350096, "learning_rate": 1.3492280615865379e-05, "loss": 0.6619, "num_tokens": 6963272086.0, "step": 2279 }, { "epoch": 3.338523539109727, "grad_norm": 0.4181102507337895, "learning_rate": 1.347723810066399e-05, "loss": 0.6743, "num_tokens": 6966349528.0, "step": 2280 }, { "epoch": 3.339989008976003, "grad_norm": 0.36363650305439443, "learning_rate": 1.3462203253768208e-05, "loss": 0.7085, "num_tokens": 6969187044.0, "step": 2281 }, { "epoch": 3.341454478842279, "grad_norm": 0.4097173039169629, "learning_rate": 1.3447176088705534e-05, "loss": 0.6689, "num_tokens": 6972410726.0, "step": 2282 }, { "epoch": 3.3429199487085546, "grad_norm": 0.34231706961005426, "learning_rate": 1.3432156618996595e-05, "loss": 0.6737, "num_tokens": 6975349627.0, "step": 2283 }, { "epoch": 3.3443854185748307, "grad_norm": 0.3729967767997746, "learning_rate": 1.3417144858155047e-05, "loss": 0.6815, "num_tokens": 6978431669.0, "step": 2284 }, { "epoch": 3.3458508884411065, "grad_norm": 0.39461721416163326, "learning_rate": 1.3402140819687657e-05, "loss": 0.6717, "num_tokens": 6981741050.0, "step": 2285 }, { "epoch": 3.347316358307382, "grad_norm": 0.3784022594140698, "learning_rate": 1.338714451709421e-05, "loss": 0.6783, "num_tokens": 6984720001.0, "step": 2286 }, { "epoch": 3.3487818281736583, "grad_norm": 0.3981178606238951, "learning_rate": 1.3372155963867543e-05, "loss": 0.6753, "num_tokens": 6987675448.0, "step": 2287 }, { "epoch": 3.350247298039934, "grad_norm": 0.355069531791707, "learning_rate": 1.3357175173493521e-05, "loss": 0.6695, "num_tokens": 6990732502.0, "step": 2288 }, { "epoch": 3.3517127679062098, "grad_norm": 0.3193229119423912, "learning_rate": 1.3342202159451028e-05, "loss": 0.667, "num_tokens": 6993940342.0, "step": 2289 }, { "epoch": 3.353178237772486, "grad_norm": 0.3056923199213101, "learning_rate": 1.3327236935211932e-05, "loss": 0.677, "num_tokens": 6997146027.0, "step": 2290 }, { "epoch": 3.3546437076387616, "grad_norm": 0.29958428572163304, "learning_rate": 1.3312279514241131e-05, "loss": 0.6651, "num_tokens": 7000152649.0, "step": 2291 }, { "epoch": 3.3561091775050373, "grad_norm": 0.3467557574344487, "learning_rate": 1.3297329909996448e-05, "loss": 0.6646, "num_tokens": 7003532383.0, "step": 2292 }, { "epoch": 3.3575746473713135, "grad_norm": 0.3267572385991316, "learning_rate": 1.3282388135928728e-05, "loss": 0.6885, "num_tokens": 7006607278.0, "step": 2293 }, { "epoch": 3.359040117237589, "grad_norm": 0.3206475661589553, "learning_rate": 1.3267454205481733e-05, "loss": 0.687, "num_tokens": 7009382798.0, "step": 2294 }, { "epoch": 3.360505587103865, "grad_norm": 0.32368929997365975, "learning_rate": 1.3252528132092187e-05, "loss": 0.6833, "num_tokens": 7012298136.0, "step": 2295 }, { "epoch": 3.361971056970141, "grad_norm": 0.3094890614542978, "learning_rate": 1.323760992918972e-05, "loss": 0.699, "num_tokens": 7015232330.0, "step": 2296 }, { "epoch": 3.363436526836417, "grad_norm": 0.3487273707901496, "learning_rate": 1.3222699610196937e-05, "loss": 0.6736, "num_tokens": 7018172998.0, "step": 2297 }, { "epoch": 3.364901996702693, "grad_norm": 0.3515686272653105, "learning_rate": 1.3207797188529266e-05, "loss": 0.6887, "num_tokens": 7021184101.0, "step": 2298 }, { "epoch": 3.3663674665689687, "grad_norm": 0.29650476465713105, "learning_rate": 1.3192902677595103e-05, "loss": 0.6703, "num_tokens": 7024244543.0, "step": 2299 }, { "epoch": 3.3678329364352444, "grad_norm": 0.33592640509617094, "learning_rate": 1.3178016090795698e-05, "loss": 0.6844, "num_tokens": 7027028689.0, "step": 2300 }, { "epoch": 3.3692984063015206, "grad_norm": 0.3194273158667683, "learning_rate": 1.3163137441525149e-05, "loss": 0.6698, "num_tokens": 7030264147.0, "step": 2301 }, { "epoch": 3.3707638761677963, "grad_norm": 0.3753814485627654, "learning_rate": 1.3148266743170455e-05, "loss": 0.6836, "num_tokens": 7033089490.0, "step": 2302 }, { "epoch": 3.372229346034072, "grad_norm": 0.34706312204489564, "learning_rate": 1.3133404009111435e-05, "loss": 0.6832, "num_tokens": 7036143714.0, "step": 2303 }, { "epoch": 3.373694815900348, "grad_norm": 0.331858292708562, "learning_rate": 1.3118549252720749e-05, "loss": 0.6656, "num_tokens": 7039128864.0, "step": 2304 }, { "epoch": 3.375160285766624, "grad_norm": 0.3721720300610116, "learning_rate": 1.3103702487363874e-05, "loss": 0.6817, "num_tokens": 7042456485.0, "step": 2305 }, { "epoch": 3.3766257556329, "grad_norm": 0.3209218557529914, "learning_rate": 1.3088863726399104e-05, "loss": 0.6604, "num_tokens": 7045541868.0, "step": 2306 }, { "epoch": 3.3780912254991757, "grad_norm": 0.34855914856981385, "learning_rate": 1.307403298317752e-05, "loss": 0.6556, "num_tokens": 7048531803.0, "step": 2307 }, { "epoch": 3.3795566953654514, "grad_norm": 0.3174212365677777, "learning_rate": 1.305921027104301e-05, "loss": 0.6729, "num_tokens": 7051528726.0, "step": 2308 }, { "epoch": 3.3810221652317276, "grad_norm": 0.39004927569528114, "learning_rate": 1.3044395603332218e-05, "loss": 0.6715, "num_tokens": 7054592015.0, "step": 2309 }, { "epoch": 3.3824876350980033, "grad_norm": 0.3151158737092963, "learning_rate": 1.3029588993374555e-05, "loss": 0.6728, "num_tokens": 7057649264.0, "step": 2310 }, { "epoch": 3.383953104964279, "grad_norm": 0.3654467531907248, "learning_rate": 1.3014790454492188e-05, "loss": 0.6908, "num_tokens": 7060582662.0, "step": 2311 }, { "epoch": 3.385418574830555, "grad_norm": 0.3082319094156999, "learning_rate": 1.3000000000000006e-05, "loss": 0.666, "num_tokens": 7063631182.0, "step": 2312 }, { "epoch": 3.386884044696831, "grad_norm": 0.39086602685852867, "learning_rate": 1.2985217643205635e-05, "loss": 0.6874, "num_tokens": 7066681921.0, "step": 2313 }, { "epoch": 3.3883495145631066, "grad_norm": 0.35380598996053286, "learning_rate": 1.2970443397409429e-05, "loss": 0.6687, "num_tokens": 7069724670.0, "step": 2314 }, { "epoch": 3.389814984429383, "grad_norm": 0.3958640733466791, "learning_rate": 1.295567727590441e-05, "loss": 0.6802, "num_tokens": 7072597507.0, "step": 2315 }, { "epoch": 3.3912804542956585, "grad_norm": 0.3614978076753916, "learning_rate": 1.2940919291976328e-05, "loss": 0.6904, "num_tokens": 7075664618.0, "step": 2316 }, { "epoch": 3.392745924161934, "grad_norm": 0.3723945975632481, "learning_rate": 1.2926169458903586e-05, "loss": 0.6831, "num_tokens": 7078742799.0, "step": 2317 }, { "epoch": 3.3942113940282104, "grad_norm": 0.3735268200488837, "learning_rate": 1.291142778995726e-05, "loss": 0.6679, "num_tokens": 7081768854.0, "step": 2318 }, { "epoch": 3.395676863894486, "grad_norm": 0.2986545121820524, "learning_rate": 1.2896694298401084e-05, "loss": 0.6657, "num_tokens": 7085080146.0, "step": 2319 }, { "epoch": 3.397142333760762, "grad_norm": 0.3623872259314723, "learning_rate": 1.288196899749143e-05, "loss": 0.6667, "num_tokens": 7087831548.0, "step": 2320 }, { "epoch": 3.398607803627038, "grad_norm": 0.32018000226353216, "learning_rate": 1.2867251900477293e-05, "loss": 0.6823, "num_tokens": 7090654567.0, "step": 2321 }, { "epoch": 3.4000732734933137, "grad_norm": 0.31583539861420573, "learning_rate": 1.2852543020600321e-05, "loss": 0.6719, "num_tokens": 7093790563.0, "step": 2322 }, { "epoch": 3.40153874335959, "grad_norm": 0.2811197944488693, "learning_rate": 1.2837842371094713e-05, "loss": 0.6674, "num_tokens": 7096858180.0, "step": 2323 }, { "epoch": 3.4030042132258655, "grad_norm": 0.32856016218185036, "learning_rate": 1.2823149965187311e-05, "loss": 0.6946, "num_tokens": 7100064381.0, "step": 2324 }, { "epoch": 3.4044696830921413, "grad_norm": 0.2769950924241463, "learning_rate": 1.2808465816097522e-05, "loss": 0.6804, "num_tokens": 7102999829.0, "step": 2325 }, { "epoch": 3.4059351529584174, "grad_norm": 0.34330362374469703, "learning_rate": 1.2793789937037317e-05, "loss": 0.6695, "num_tokens": 7106216312.0, "step": 2326 }, { "epoch": 3.407400622824693, "grad_norm": 0.2856780301131587, "learning_rate": 1.2779122341211232e-05, "loss": 0.6643, "num_tokens": 7109321156.0, "step": 2327 }, { "epoch": 3.408866092690969, "grad_norm": 0.30505365461867, "learning_rate": 1.2764463041816368e-05, "loss": 0.6877, "num_tokens": 7112020274.0, "step": 2328 }, { "epoch": 3.410331562557245, "grad_norm": 0.29084219421018226, "learning_rate": 1.2749812052042318e-05, "loss": 0.6564, "num_tokens": 7115113898.0, "step": 2329 }, { "epoch": 3.4117970324235207, "grad_norm": 0.33118526188579545, "learning_rate": 1.2735169385071249e-05, "loss": 0.6818, "num_tokens": 7118094291.0, "step": 2330 }, { "epoch": 3.413262502289797, "grad_norm": 0.27904179651743855, "learning_rate": 1.2720535054077802e-05, "loss": 0.6641, "num_tokens": 7121203622.0, "step": 2331 }, { "epoch": 3.4147279721560726, "grad_norm": 0.3406836944996538, "learning_rate": 1.2705909072229136e-05, "loss": 0.6671, "num_tokens": 7124401148.0, "step": 2332 }, { "epoch": 3.4161934420223483, "grad_norm": 0.30258100250347625, "learning_rate": 1.2691291452684897e-05, "loss": 0.6485, "num_tokens": 7127684983.0, "step": 2333 }, { "epoch": 3.4176589118886245, "grad_norm": 0.34755552490240693, "learning_rate": 1.2676682208597195e-05, "loss": 0.6836, "num_tokens": 7130540166.0, "step": 2334 }, { "epoch": 3.4191243817549, "grad_norm": 0.28949444078834957, "learning_rate": 1.266208135311061e-05, "loss": 0.6498, "num_tokens": 7133701750.0, "step": 2335 }, { "epoch": 3.420589851621176, "grad_norm": 0.3960667964491369, "learning_rate": 1.2647488899362202e-05, "loss": 0.6699, "num_tokens": 7136741270.0, "step": 2336 }, { "epoch": 3.422055321487452, "grad_norm": 0.3280653859184688, "learning_rate": 1.2632904860481411e-05, "loss": 0.6993, "num_tokens": 7139718746.0, "step": 2337 }, { "epoch": 3.4235207913537278, "grad_norm": 0.3576538292986628, "learning_rate": 1.2618329249590169e-05, "loss": 0.6735, "num_tokens": 7142678424.0, "step": 2338 }, { "epoch": 3.4249862612200035, "grad_norm": 0.32212034405204354, "learning_rate": 1.2603762079802786e-05, "loss": 0.6802, "num_tokens": 7145654006.0, "step": 2339 }, { "epoch": 3.4264517310862797, "grad_norm": 0.30672463488346974, "learning_rate": 1.258920336422599e-05, "loss": 0.6578, "num_tokens": 7149145769.0, "step": 2340 }, { "epoch": 3.4279172009525554, "grad_norm": 0.31720751614403514, "learning_rate": 1.2574653115958898e-05, "loss": 0.6972, "num_tokens": 7152094042.0, "step": 2341 }, { "epoch": 3.429382670818831, "grad_norm": 0.3450719675940326, "learning_rate": 1.2560111348093018e-05, "loss": 0.6594, "num_tokens": 7155080596.0, "step": 2342 }, { "epoch": 3.4308481406851072, "grad_norm": 0.328726912495686, "learning_rate": 1.254557807371222e-05, "loss": 0.6911, "num_tokens": 7158240618.0, "step": 2343 }, { "epoch": 3.432313610551383, "grad_norm": 0.3068959105484197, "learning_rate": 1.2531053305892722e-05, "loss": 0.6576, "num_tokens": 7161595894.0, "step": 2344 }, { "epoch": 3.4337790804176587, "grad_norm": 0.3230000235002226, "learning_rate": 1.2516537057703119e-05, "loss": 0.6728, "num_tokens": 7164862625.0, "step": 2345 }, { "epoch": 3.435244550283935, "grad_norm": 0.3214012284629574, "learning_rate": 1.25020293422043e-05, "loss": 0.6818, "num_tokens": 7167991999.0, "step": 2346 }, { "epoch": 3.4367100201502105, "grad_norm": 0.3171803346470209, "learning_rate": 1.2487530172449515e-05, "loss": 0.6823, "num_tokens": 7171136798.0, "step": 2347 }, { "epoch": 3.4381754900164867, "grad_norm": 0.3027568047694281, "learning_rate": 1.2473039561484304e-05, "loss": 0.6846, "num_tokens": 7174491603.0, "step": 2348 }, { "epoch": 3.4396409598827624, "grad_norm": 0.31532837114211915, "learning_rate": 1.2458557522346505e-05, "loss": 0.6769, "num_tokens": 7177585508.0, "step": 2349 }, { "epoch": 3.441106429749038, "grad_norm": 0.3139428931768803, "learning_rate": 1.2444084068066253e-05, "loss": 0.673, "num_tokens": 7180847162.0, "step": 2350 }, { "epoch": 3.4425718996153143, "grad_norm": 0.29392171220174806, "learning_rate": 1.2429619211665953e-05, "loss": 0.6568, "num_tokens": 7184024099.0, "step": 2351 }, { "epoch": 3.44403736948159, "grad_norm": 0.32066426359223765, "learning_rate": 1.241516296616027e-05, "loss": 0.6988, "num_tokens": 7187054010.0, "step": 2352 }, { "epoch": 3.4455028393478657, "grad_norm": 0.30459298795233797, "learning_rate": 1.2400715344556141e-05, "loss": 0.6726, "num_tokens": 7190059261.0, "step": 2353 }, { "epoch": 3.446968309214142, "grad_norm": 0.33554619800277674, "learning_rate": 1.2386276359852718e-05, "loss": 0.644, "num_tokens": 7193071246.0, "step": 2354 }, { "epoch": 3.4484337790804176, "grad_norm": 0.301480308805606, "learning_rate": 1.23718460250414e-05, "loss": 0.6552, "num_tokens": 7196267637.0, "step": 2355 }, { "epoch": 3.4498992489466938, "grad_norm": 0.3305896278720203, "learning_rate": 1.2357424353105793e-05, "loss": 0.6692, "num_tokens": 7199245819.0, "step": 2356 }, { "epoch": 3.4513647188129695, "grad_norm": 0.3000067324698446, "learning_rate": 1.2343011357021718e-05, "loss": 0.6819, "num_tokens": 7202174867.0, "step": 2357 }, { "epoch": 3.452830188679245, "grad_norm": 0.331108974525821, "learning_rate": 1.232860704975717e-05, "loss": 0.6832, "num_tokens": 7204906384.0, "step": 2358 }, { "epoch": 3.4542956585455213, "grad_norm": 0.30822440479392427, "learning_rate": 1.2314211444272365e-05, "loss": 0.6863, "num_tokens": 7208019892.0, "step": 2359 }, { "epoch": 3.455761128411797, "grad_norm": 0.31457760557042735, "learning_rate": 1.2299824553519644e-05, "loss": 0.6889, "num_tokens": 7210873575.0, "step": 2360 }, { "epoch": 3.4572265982780728, "grad_norm": 0.278716029425566, "learning_rate": 1.2285446390443539e-05, "loss": 0.6714, "num_tokens": 7214120061.0, "step": 2361 }, { "epoch": 3.458692068144349, "grad_norm": 0.285286088938773, "learning_rate": 1.227107696798072e-05, "loss": 0.6666, "num_tokens": 7217238963.0, "step": 2362 }, { "epoch": 3.4601575380106246, "grad_norm": 0.28648517598618223, "learning_rate": 1.2256716299059986e-05, "loss": 0.6695, "num_tokens": 7220460767.0, "step": 2363 }, { "epoch": 3.4616230078769004, "grad_norm": 0.3052140797961659, "learning_rate": 1.2242364396602269e-05, "loss": 0.6835, "num_tokens": 7223662508.0, "step": 2364 }, { "epoch": 3.4630884777431765, "grad_norm": 0.2901735753817344, "learning_rate": 1.222802127352061e-05, "loss": 0.6598, "num_tokens": 7226556205.0, "step": 2365 }, { "epoch": 3.4645539476094522, "grad_norm": 0.31674110575252773, "learning_rate": 1.2213686942720141e-05, "loss": 0.6534, "num_tokens": 7229653626.0, "step": 2366 }, { "epoch": 3.466019417475728, "grad_norm": 0.3014376617658758, "learning_rate": 1.2199361417098107e-05, "loss": 0.6705, "num_tokens": 7232741107.0, "step": 2367 }, { "epoch": 3.467484887342004, "grad_norm": 0.31605317001348443, "learning_rate": 1.2185044709543816e-05, "loss": 0.6674, "num_tokens": 7235890740.0, "step": 2368 }, { "epoch": 3.46895035720828, "grad_norm": 0.32170857203914577, "learning_rate": 1.2170736832938634e-05, "loss": 0.6867, "num_tokens": 7239034835.0, "step": 2369 }, { "epoch": 3.4704158270745555, "grad_norm": 0.34563186676956076, "learning_rate": 1.2156437800155998e-05, "loss": 0.6765, "num_tokens": 7241847331.0, "step": 2370 }, { "epoch": 3.4718812969408317, "grad_norm": 0.3511241479423847, "learning_rate": 1.2142147624061375e-05, "loss": 0.6682, "num_tokens": 7244759779.0, "step": 2371 }, { "epoch": 3.4733467668071074, "grad_norm": 0.3399586216182509, "learning_rate": 1.212786631751226e-05, "loss": 0.6627, "num_tokens": 7247840593.0, "step": 2372 }, { "epoch": 3.4748122366733836, "grad_norm": 0.29951225287561806, "learning_rate": 1.2113593893358202e-05, "loss": 0.6753, "num_tokens": 7251033643.0, "step": 2373 }, { "epoch": 3.4762777065396593, "grad_norm": 0.34699683368497464, "learning_rate": 1.2099330364440698e-05, "loss": 0.6834, "num_tokens": 7254183466.0, "step": 2374 }, { "epoch": 3.477743176405935, "grad_norm": 0.30406767950367236, "learning_rate": 1.20850757435933e-05, "loss": 0.6811, "num_tokens": 7257176008.0, "step": 2375 }, { "epoch": 3.479208646272211, "grad_norm": 0.3147466196484179, "learning_rate": 1.2070830043641513e-05, "loss": 0.6594, "num_tokens": 7260275012.0, "step": 2376 }, { "epoch": 3.480674116138487, "grad_norm": 0.34258867428165707, "learning_rate": 1.2056593277402821e-05, "loss": 0.6711, "num_tokens": 7263432607.0, "step": 2377 }, { "epoch": 3.4821395860047626, "grad_norm": 0.31976791048027403, "learning_rate": 1.2042365457686673e-05, "loss": 0.6647, "num_tokens": 7266396860.0, "step": 2378 }, { "epoch": 3.4836050558710387, "grad_norm": 0.2988654469084843, "learning_rate": 1.202814659729447e-05, "loss": 0.6572, "num_tokens": 7269705808.0, "step": 2379 }, { "epoch": 3.4850705257373145, "grad_norm": 0.2983857252247023, "learning_rate": 1.2013936709019537e-05, "loss": 0.6885, "num_tokens": 7273062104.0, "step": 2380 }, { "epoch": 3.4865359956035906, "grad_norm": 0.3258846155478573, "learning_rate": 1.1999735805647165e-05, "loss": 0.6684, "num_tokens": 7276142321.0, "step": 2381 }, { "epoch": 3.4880014654698663, "grad_norm": 0.30700234880976096, "learning_rate": 1.1985543899954512e-05, "loss": 0.6663, "num_tokens": 7279211511.0, "step": 2382 }, { "epoch": 3.489466935336142, "grad_norm": 0.29632272611490373, "learning_rate": 1.197136100471066e-05, "loss": 0.6895, "num_tokens": 7282053955.0, "step": 2383 }, { "epoch": 3.490932405202418, "grad_norm": 0.3172831140223887, "learning_rate": 1.1957187132676602e-05, "loss": 0.68, "num_tokens": 7285150067.0, "step": 2384 }, { "epoch": 3.492397875068694, "grad_norm": 0.3228199075746972, "learning_rate": 1.1943022296605191e-05, "loss": 0.6647, "num_tokens": 7288194947.0, "step": 2385 }, { "epoch": 3.4938633449349696, "grad_norm": 0.3149587595933546, "learning_rate": 1.1928866509241157e-05, "loss": 0.6756, "num_tokens": 7291377348.0, "step": 2386 }, { "epoch": 3.495328814801246, "grad_norm": 0.3406084635985525, "learning_rate": 1.1914719783321082e-05, "loss": 0.668, "num_tokens": 7294474689.0, "step": 2387 }, { "epoch": 3.4967942846675215, "grad_norm": 0.31293647254906365, "learning_rate": 1.190058213157341e-05, "loss": 0.6849, "num_tokens": 7297689793.0, "step": 2388 }, { "epoch": 3.4982597545337972, "grad_norm": 0.3087122610479668, "learning_rate": 1.1886453566718398e-05, "loss": 0.6907, "num_tokens": 7300621503.0, "step": 2389 }, { "epoch": 3.4997252244000734, "grad_norm": 0.31231545309716685, "learning_rate": 1.1872334101468162e-05, "loss": 0.6629, "num_tokens": 7303762136.0, "step": 2390 }, { "epoch": 3.501190694266349, "grad_norm": 0.34081748392374056, "learning_rate": 1.1858223748526582e-05, "loss": 0.6739, "num_tokens": 7306830187.0, "step": 2391 }, { "epoch": 3.502656164132625, "grad_norm": 0.2956876890680326, "learning_rate": 1.1844122520589388e-05, "loss": 0.6908, "num_tokens": 7309979745.0, "step": 2392 }, { "epoch": 3.504121633998901, "grad_norm": 0.33504728375560827, "learning_rate": 1.1830030430344066e-05, "loss": 0.6715, "num_tokens": 7313217919.0, "step": 2393 }, { "epoch": 3.5055871038651767, "grad_norm": 0.3228847026551916, "learning_rate": 1.1815947490469898e-05, "loss": 0.6847, "num_tokens": 7316187662.0, "step": 2394 }, { "epoch": 3.5070525737314524, "grad_norm": 0.3190855505102043, "learning_rate": 1.1801873713637929e-05, "loss": 0.6568, "num_tokens": 7318928023.0, "step": 2395 }, { "epoch": 3.5085180435977286, "grad_norm": 0.33169212129944275, "learning_rate": 1.1787809112510953e-05, "loss": 0.697, "num_tokens": 7321736489.0, "step": 2396 }, { "epoch": 3.5099835134640043, "grad_norm": 0.3186227794810296, "learning_rate": 1.1773753699743508e-05, "loss": 0.6561, "num_tokens": 7324627649.0, "step": 2397 }, { "epoch": 3.51144898333028, "grad_norm": 0.32800268371295815, "learning_rate": 1.1759707487981884e-05, "loss": 0.6846, "num_tokens": 7327823174.0, "step": 2398 }, { "epoch": 3.512914453196556, "grad_norm": 0.3117434465143724, "learning_rate": 1.1745670489864071e-05, "loss": 0.6916, "num_tokens": 7330682225.0, "step": 2399 }, { "epoch": 3.514379923062832, "grad_norm": 0.30612016979345497, "learning_rate": 1.1731642718019778e-05, "loss": 0.6662, "num_tokens": 7333644351.0, "step": 2400 }, { "epoch": 3.515845392929108, "grad_norm": 0.29859776607888855, "learning_rate": 1.1717624185070412e-05, "loss": 0.6857, "num_tokens": 7336580978.0, "step": 2401 }, { "epoch": 3.5173108627953837, "grad_norm": 0.2996909880032615, "learning_rate": 1.1703614903629066e-05, "loss": 0.6651, "num_tokens": 7339585122.0, "step": 2402 }, { "epoch": 3.51877633266166, "grad_norm": 0.3009302885592069, "learning_rate": 1.1689614886300503e-05, "loss": 0.6741, "num_tokens": 7342597151.0, "step": 2403 }, { "epoch": 3.5202418025279356, "grad_norm": 0.3100609929259285, "learning_rate": 1.1675624145681177e-05, "loss": 0.6752, "num_tokens": 7345771108.0, "step": 2404 }, { "epoch": 3.5217072723942113, "grad_norm": 0.308843106369383, "learning_rate": 1.1661642694359148e-05, "loss": 0.6908, "num_tokens": 7349000205.0, "step": 2405 }, { "epoch": 3.5231727422604875, "grad_norm": 0.3209577737539571, "learning_rate": 1.1647670544914169e-05, "loss": 0.6782, "num_tokens": 7352112966.0, "step": 2406 }, { "epoch": 3.524638212126763, "grad_norm": 0.3110953464933069, "learning_rate": 1.1633707709917592e-05, "loss": 0.6953, "num_tokens": 7354978452.0, "step": 2407 }, { "epoch": 3.526103681993039, "grad_norm": 0.3279755786806314, "learning_rate": 1.1619754201932395e-05, "loss": 0.6656, "num_tokens": 7358149721.0, "step": 2408 }, { "epoch": 3.527569151859315, "grad_norm": 0.2974767408723198, "learning_rate": 1.1605810033513167e-05, "loss": 0.6735, "num_tokens": 7361086351.0, "step": 2409 }, { "epoch": 3.529034621725591, "grad_norm": 0.31000145223830883, "learning_rate": 1.1591875217206097e-05, "loss": 0.6777, "num_tokens": 7364153661.0, "step": 2410 }, { "epoch": 3.5305000915918665, "grad_norm": 0.3091034998688398, "learning_rate": 1.1577949765548942e-05, "loss": 0.6523, "num_tokens": 7367416224.0, "step": 2411 }, { "epoch": 3.5319655614581427, "grad_norm": 0.34048417431596073, "learning_rate": 1.1564033691071064e-05, "loss": 0.6784, "num_tokens": 7370570256.0, "step": 2412 }, { "epoch": 3.5334310313244184, "grad_norm": 0.3040636017422998, "learning_rate": 1.1550127006293359e-05, "loss": 0.6653, "num_tokens": 7373561120.0, "step": 2413 }, { "epoch": 3.534896501190694, "grad_norm": 0.3352297732946215, "learning_rate": 1.1536229723728293e-05, "loss": 0.6723, "num_tokens": 7376555616.0, "step": 2414 }, { "epoch": 3.5363619710569703, "grad_norm": 0.32107690588335847, "learning_rate": 1.152234185587986e-05, "loss": 0.6742, "num_tokens": 7379513107.0, "step": 2415 }, { "epoch": 3.537827440923246, "grad_norm": 0.3583952252603583, "learning_rate": 1.1508463415243591e-05, "loss": 0.6693, "num_tokens": 7382653405.0, "step": 2416 }, { "epoch": 3.5392929107895217, "grad_norm": 0.31521458709650757, "learning_rate": 1.1494594414306524e-05, "loss": 0.6672, "num_tokens": 7385621914.0, "step": 2417 }, { "epoch": 3.540758380655798, "grad_norm": 0.3245981497297923, "learning_rate": 1.1480734865547232e-05, "loss": 0.6776, "num_tokens": 7388830318.0, "step": 2418 }, { "epoch": 3.5422238505220736, "grad_norm": 0.31857414184122734, "learning_rate": 1.146688478143574e-05, "loss": 0.6751, "num_tokens": 7392015572.0, "step": 2419 }, { "epoch": 3.5436893203883493, "grad_norm": 0.2989734199140742, "learning_rate": 1.1453044174433599e-05, "loss": 0.6681, "num_tokens": 7394975426.0, "step": 2420 }, { "epoch": 3.5451547902546254, "grad_norm": 0.3106820756724418, "learning_rate": 1.1439213056993807e-05, "loss": 0.6805, "num_tokens": 7397879200.0, "step": 2421 }, { "epoch": 3.546620260120901, "grad_norm": 0.29264808049099134, "learning_rate": 1.1425391441560833e-05, "loss": 0.6845, "num_tokens": 7401004071.0, "step": 2422 }, { "epoch": 3.548085729987177, "grad_norm": 0.3355958791831573, "learning_rate": 1.1411579340570596e-05, "loss": 0.6744, "num_tokens": 7404079357.0, "step": 2423 }, { "epoch": 3.549551199853453, "grad_norm": 0.2741205355097215, "learning_rate": 1.1397776766450455e-05, "loss": 0.6916, "num_tokens": 7407225537.0, "step": 2424 }, { "epoch": 3.5510166697197287, "grad_norm": 0.3292161958600772, "learning_rate": 1.1383983731619189e-05, "loss": 0.659, "num_tokens": 7410238758.0, "step": 2425 }, { "epoch": 3.552482139586005, "grad_norm": 0.30334474024653624, "learning_rate": 1.1370200248487005e-05, "loss": 0.6636, "num_tokens": 7413382124.0, "step": 2426 }, { "epoch": 3.5539476094522806, "grad_norm": 0.3122897834918295, "learning_rate": 1.1356426329455523e-05, "loss": 0.6712, "num_tokens": 7416631877.0, "step": 2427 }, { "epoch": 3.5554130793185568, "grad_norm": 0.28251662334328553, "learning_rate": 1.1342661986917727e-05, "loss": 0.6758, "num_tokens": 7419706477.0, "step": 2428 }, { "epoch": 3.5568785491848325, "grad_norm": 0.3302209485935383, "learning_rate": 1.1328907233258017e-05, "loss": 0.6882, "num_tokens": 7422693437.0, "step": 2429 }, { "epoch": 3.558344019051108, "grad_norm": 0.27725619159336623, "learning_rate": 1.1315162080852153e-05, "loss": 0.6865, "num_tokens": 7425882887.0, "step": 2430 }, { "epoch": 3.5598094889173844, "grad_norm": 0.3052826145787302, "learning_rate": 1.1301426542067253e-05, "loss": 0.6623, "num_tokens": 7428948520.0, "step": 2431 }, { "epoch": 3.56127495878366, "grad_norm": 0.31759531740749314, "learning_rate": 1.1287700629261788e-05, "loss": 0.6733, "num_tokens": 7432021257.0, "step": 2432 }, { "epoch": 3.562740428649936, "grad_norm": 0.29637352126862826, "learning_rate": 1.1273984354785573e-05, "loss": 0.6781, "num_tokens": 7435379613.0, "step": 2433 }, { "epoch": 3.564205898516212, "grad_norm": 0.29905448620210534, "learning_rate": 1.1260277730979732e-05, "loss": 0.6955, "num_tokens": 7438327137.0, "step": 2434 }, { "epoch": 3.5656713683824877, "grad_norm": 0.32013836190467143, "learning_rate": 1.1246580770176747e-05, "loss": 0.6859, "num_tokens": 7441549909.0, "step": 2435 }, { "epoch": 3.5671368382487634, "grad_norm": 0.2640364992423809, "learning_rate": 1.123289348470035e-05, "loss": 0.6578, "num_tokens": 7444871233.0, "step": 2436 }, { "epoch": 3.5686023081150395, "grad_norm": 0.2914409791179919, "learning_rate": 1.121921588686561e-05, "loss": 0.6975, "num_tokens": 7448054549.0, "step": 2437 }, { "epoch": 3.5700677779813152, "grad_norm": 0.263412748692145, "learning_rate": 1.1205547988978872e-05, "loss": 0.6695, "num_tokens": 7451107463.0, "step": 2438 }, { "epoch": 3.571533247847591, "grad_norm": 0.2846600980766393, "learning_rate": 1.1191889803337742e-05, "loss": 0.6721, "num_tokens": 7454246500.0, "step": 2439 }, { "epoch": 3.572998717713867, "grad_norm": 0.3014154428012928, "learning_rate": 1.1178241342231084e-05, "loss": 0.6636, "num_tokens": 7457131636.0, "step": 2440 }, { "epoch": 3.574464187580143, "grad_norm": 0.2978661191119342, "learning_rate": 1.1164602617939048e-05, "loss": 0.6708, "num_tokens": 7460246918.0, "step": 2441 }, { "epoch": 3.5759296574464186, "grad_norm": 0.30076357611194116, "learning_rate": 1.1150973642732966e-05, "loss": 0.6549, "num_tokens": 7463576084.0, "step": 2442 }, { "epoch": 3.5773951273126947, "grad_norm": 0.2863629321850154, "learning_rate": 1.1137354428875451e-05, "loss": 0.6756, "num_tokens": 7466606469.0, "step": 2443 }, { "epoch": 3.5788605971789704, "grad_norm": 0.3082620695239256, "learning_rate": 1.11237449886203e-05, "loss": 0.6931, "num_tokens": 7469575060.0, "step": 2444 }, { "epoch": 3.580326067045246, "grad_norm": 0.32662559781020256, "learning_rate": 1.1110145334212537e-05, "loss": 0.6819, "num_tokens": 7472543091.0, "step": 2445 }, { "epoch": 3.5817915369115223, "grad_norm": 0.3066812971474287, "learning_rate": 1.1096555477888364e-05, "loss": 0.6869, "num_tokens": 7475404389.0, "step": 2446 }, { "epoch": 3.583257006777798, "grad_norm": 0.3185435722184899, "learning_rate": 1.108297543187518e-05, "loss": 0.6643, "num_tokens": 7478495716.0, "step": 2447 }, { "epoch": 3.5847224766440737, "grad_norm": 0.29316325614853733, "learning_rate": 1.1069405208391548e-05, "loss": 0.684, "num_tokens": 7481638522.0, "step": 2448 }, { "epoch": 3.58618794651035, "grad_norm": 0.33682574734278803, "learning_rate": 1.1055844819647212e-05, "loss": 0.6668, "num_tokens": 7484687624.0, "step": 2449 }, { "epoch": 3.5876534163766256, "grad_norm": 0.3218032931995025, "learning_rate": 1.1042294277843029e-05, "loss": 0.6593, "num_tokens": 7487937718.0, "step": 2450 }, { "epoch": 3.5891188862429018, "grad_norm": 0.2893656187391702, "learning_rate": 1.1028753595171043e-05, "loss": 0.6679, "num_tokens": 7491168376.0, "step": 2451 }, { "epoch": 3.5905843561091775, "grad_norm": 0.29788363342234875, "learning_rate": 1.1015222783814394e-05, "loss": 0.6789, "num_tokens": 7494038205.0, "step": 2452 }, { "epoch": 3.5920498259754536, "grad_norm": 0.30472083432686636, "learning_rate": 1.100170185594735e-05, "loss": 0.667, "num_tokens": 7497146229.0, "step": 2453 }, { "epoch": 3.5935152958417294, "grad_norm": 0.2826467265773505, "learning_rate": 1.0988190823735296e-05, "loss": 0.6531, "num_tokens": 7500386224.0, "step": 2454 }, { "epoch": 3.594980765708005, "grad_norm": 0.3026899390097473, "learning_rate": 1.0974689699334698e-05, "loss": 0.6546, "num_tokens": 7503485044.0, "step": 2455 }, { "epoch": 3.5964462355742812, "grad_norm": 0.31469939831440097, "learning_rate": 1.0961198494893103e-05, "loss": 0.6753, "num_tokens": 7506564051.0, "step": 2456 }, { "epoch": 3.597911705440557, "grad_norm": 0.2710537488846357, "learning_rate": 1.094771722254917e-05, "loss": 0.6941, "num_tokens": 7509809911.0, "step": 2457 }, { "epoch": 3.5993771753068327, "grad_norm": 0.30738783109323087, "learning_rate": 1.093424589443258e-05, "loss": 0.6907, "num_tokens": 7512748299.0, "step": 2458 }, { "epoch": 3.600842645173109, "grad_norm": 0.30891779221321897, "learning_rate": 1.0920784522664082e-05, "loss": 0.6777, "num_tokens": 7515576216.0, "step": 2459 }, { "epoch": 3.6023081150393845, "grad_norm": 0.30926510200579055, "learning_rate": 1.0907333119355474e-05, "loss": 0.6856, "num_tokens": 7518771846.0, "step": 2460 }, { "epoch": 3.6037735849056602, "grad_norm": 0.29555837648463, "learning_rate": 1.0893891696609574e-05, "loss": 0.6648, "num_tokens": 7521650681.0, "step": 2461 }, { "epoch": 3.6052390547719364, "grad_norm": 0.29179950038306074, "learning_rate": 1.0880460266520217e-05, "loss": 0.6712, "num_tokens": 7524771409.0, "step": 2462 }, { "epoch": 3.606704524638212, "grad_norm": 0.2949680669282753, "learning_rate": 1.086703884117227e-05, "loss": 0.6651, "num_tokens": 7527884436.0, "step": 2463 }, { "epoch": 3.608169994504488, "grad_norm": 0.3113228417189421, "learning_rate": 1.0853627432641574e-05, "loss": 0.6759, "num_tokens": 7531128415.0, "step": 2464 }, { "epoch": 3.609635464370764, "grad_norm": 0.2990122120864179, "learning_rate": 1.0840226052994956e-05, "loss": 0.6881, "num_tokens": 7533996387.0, "step": 2465 }, { "epoch": 3.6111009342370397, "grad_norm": 0.31610036410732795, "learning_rate": 1.082683471429025e-05, "loss": 0.6611, "num_tokens": 7537218327.0, "step": 2466 }, { "epoch": 3.6125664041033154, "grad_norm": 0.34876853132124036, "learning_rate": 1.0813453428576217e-05, "loss": 0.6779, "num_tokens": 7540367016.0, "step": 2467 }, { "epoch": 3.6140318739695916, "grad_norm": 0.2868164608089626, "learning_rate": 1.0800082207892606e-05, "loss": 0.6631, "num_tokens": 7543395617.0, "step": 2468 }, { "epoch": 3.6154973438358673, "grad_norm": 0.35731004208776446, "learning_rate": 1.0786721064270082e-05, "loss": 0.6811, "num_tokens": 7546296839.0, "step": 2469 }, { "epoch": 3.616962813702143, "grad_norm": 0.3502379833139834, "learning_rate": 1.0773370009730271e-05, "loss": 0.6819, "num_tokens": 7549352517.0, "step": 2470 }, { "epoch": 3.618428283568419, "grad_norm": 0.33556830058437254, "learning_rate": 1.0760029056285688e-05, "loss": 0.6779, "num_tokens": 7552249430.0, "step": 2471 }, { "epoch": 3.619893753434695, "grad_norm": 0.3368153165093765, "learning_rate": 1.0746698215939804e-05, "loss": 0.6802, "num_tokens": 7555371589.0, "step": 2472 }, { "epoch": 3.6213592233009706, "grad_norm": 0.28041658644882733, "learning_rate": 1.0733377500686938e-05, "loss": 0.6748, "num_tokens": 7558335168.0, "step": 2473 }, { "epoch": 3.6228246931672468, "grad_norm": 0.33339076221456904, "learning_rate": 1.0720066922512344e-05, "loss": 0.6723, "num_tokens": 7561172497.0, "step": 2474 }, { "epoch": 3.6242901630335225, "grad_norm": 0.2751221705904446, "learning_rate": 1.0706766493392134e-05, "loss": 0.6884, "num_tokens": 7564065905.0, "step": 2475 }, { "epoch": 3.6257556328997986, "grad_norm": 0.31540842089342086, "learning_rate": 1.0693476225293291e-05, "loss": 0.6844, "num_tokens": 7567103046.0, "step": 2476 }, { "epoch": 3.6272211027660743, "grad_norm": 0.30369311040829944, "learning_rate": 1.0680196130173657e-05, "loss": 0.6697, "num_tokens": 7570251950.0, "step": 2477 }, { "epoch": 3.6286865726323505, "grad_norm": 0.29919512986829705, "learning_rate": 1.066692621998192e-05, "loss": 0.6719, "num_tokens": 7573398511.0, "step": 2478 }, { "epoch": 3.630152042498626, "grad_norm": 0.29264263824919257, "learning_rate": 1.0653666506657594e-05, "loss": 0.6817, "num_tokens": 7576314315.0, "step": 2479 }, { "epoch": 3.631617512364902, "grad_norm": 0.31460981691262546, "learning_rate": 1.0640417002131056e-05, "loss": 0.6869, "num_tokens": 7579252136.0, "step": 2480 }, { "epoch": 3.633082982231178, "grad_norm": 0.291306390631656, "learning_rate": 1.0627177718323442e-05, "loss": 0.6741, "num_tokens": 7582322163.0, "step": 2481 }, { "epoch": 3.634548452097454, "grad_norm": 0.2832672676801926, "learning_rate": 1.0613948667146739e-05, "loss": 0.6797, "num_tokens": 7585103430.0, "step": 2482 }, { "epoch": 3.6360139219637295, "grad_norm": 0.34654531627354435, "learning_rate": 1.0600729860503705e-05, "loss": 0.676, "num_tokens": 7588264433.0, "step": 2483 }, { "epoch": 3.6374793918300057, "grad_norm": 0.30732383194519486, "learning_rate": 1.0587521310287883e-05, "loss": 0.6926, "num_tokens": 7591342894.0, "step": 2484 }, { "epoch": 3.6389448616962814, "grad_norm": 0.31001511455355674, "learning_rate": 1.0574323028383582e-05, "loss": 0.6609, "num_tokens": 7594464048.0, "step": 2485 }, { "epoch": 3.640410331562557, "grad_norm": 0.30941668234959, "learning_rate": 1.05611350266659e-05, "loss": 0.6832, "num_tokens": 7597610959.0, "step": 2486 }, { "epoch": 3.6418758014288333, "grad_norm": 0.31385530730910494, "learning_rate": 1.0547957317000641e-05, "loss": 0.6648, "num_tokens": 7600612628.0, "step": 2487 }, { "epoch": 3.643341271295109, "grad_norm": 0.306982708913995, "learning_rate": 1.0534789911244392e-05, "loss": 0.6981, "num_tokens": 7603397486.0, "step": 2488 }, { "epoch": 3.6448067411613847, "grad_norm": 0.3005703017667938, "learning_rate": 1.0521632821244442e-05, "loss": 0.6543, "num_tokens": 7606346131.0, "step": 2489 }, { "epoch": 3.646272211027661, "grad_norm": 0.28183605119915317, "learning_rate": 1.0508486058838803e-05, "loss": 0.6716, "num_tokens": 7609593654.0, "step": 2490 }, { "epoch": 3.6477376808939366, "grad_norm": 0.3359298387773411, "learning_rate": 1.0495349635856205e-05, "loss": 0.6876, "num_tokens": 7612537919.0, "step": 2491 }, { "epoch": 3.6492031507602123, "grad_norm": 0.2796982497662434, "learning_rate": 1.0482223564116064e-05, "loss": 0.6667, "num_tokens": 7615614940.0, "step": 2492 }, { "epoch": 3.6506686206264884, "grad_norm": 0.2967855074456007, "learning_rate": 1.0469107855428482e-05, "loss": 0.6852, "num_tokens": 7618601798.0, "step": 2493 }, { "epoch": 3.652134090492764, "grad_norm": 0.30041487183434823, "learning_rate": 1.0456002521594261e-05, "loss": 0.6717, "num_tokens": 7621712576.0, "step": 2494 }, { "epoch": 3.65359956035904, "grad_norm": 0.29115252309196177, "learning_rate": 1.0442907574404826e-05, "loss": 0.6879, "num_tokens": 7624793878.0, "step": 2495 }, { "epoch": 3.655065030225316, "grad_norm": 0.28438649447782227, "learning_rate": 1.0429823025642292e-05, "loss": 0.6843, "num_tokens": 7627788658.0, "step": 2496 }, { "epoch": 3.6565305000915918, "grad_norm": 0.31406830667174906, "learning_rate": 1.0416748887079409e-05, "loss": 0.6833, "num_tokens": 7630923206.0, "step": 2497 }, { "epoch": 3.6579959699578675, "grad_norm": 0.27413164244969046, "learning_rate": 1.0403685170479549e-05, "loss": 0.6821, "num_tokens": 7634064556.0, "step": 2498 }, { "epoch": 3.6594614398241436, "grad_norm": 0.29466619893687507, "learning_rate": 1.0390631887596711e-05, "loss": 0.6666, "num_tokens": 7636916978.0, "step": 2499 }, { "epoch": 3.6609269096904193, "grad_norm": 0.2872211689665231, "learning_rate": 1.0377589050175537e-05, "loss": 0.6653, "num_tokens": 7639909487.0, "step": 2500 }, { "epoch": 3.6623923795566955, "grad_norm": 0.2927302764129567, "learning_rate": 1.0364556669951206e-05, "loss": 0.69, "num_tokens": 7643011121.0, "step": 2501 }, { "epoch": 3.663857849422971, "grad_norm": 0.2831122315463431, "learning_rate": 1.0351534758649556e-05, "loss": 0.6845, "num_tokens": 7646324069.0, "step": 2502 }, { "epoch": 3.6653233192892474, "grad_norm": 0.29366516489507044, "learning_rate": 1.0338523327986967e-05, "loss": 0.6732, "num_tokens": 7649483651.0, "step": 2503 }, { "epoch": 3.666788789155523, "grad_norm": 0.2555832022964601, "learning_rate": 1.0325522389670396e-05, "loss": 0.6706, "num_tokens": 7652300475.0, "step": 2504 }, { "epoch": 3.668254259021799, "grad_norm": 0.29264155225706545, "learning_rate": 1.0312531955397365e-05, "loss": 0.6793, "num_tokens": 7655265789.0, "step": 2505 }, { "epoch": 3.669719728888075, "grad_norm": 0.2891594987700585, "learning_rate": 1.0299552036855936e-05, "loss": 0.6791, "num_tokens": 7658195221.0, "step": 2506 }, { "epoch": 3.6711851987543507, "grad_norm": 0.2918687059484172, "learning_rate": 1.028658264572472e-05, "loss": 0.68, "num_tokens": 7661239568.0, "step": 2507 }, { "epoch": 3.6726506686206264, "grad_norm": 0.28918619491959674, "learning_rate": 1.0273623793672855e-05, "loss": 0.6808, "num_tokens": 7664256989.0, "step": 2508 }, { "epoch": 3.6741161384869025, "grad_norm": 0.2843455615943383, "learning_rate": 1.0260675492359988e-05, "loss": 0.6761, "num_tokens": 7667611927.0, "step": 2509 }, { "epoch": 3.6755816083531783, "grad_norm": 0.26475771415499977, "learning_rate": 1.0247737753436276e-05, "loss": 0.6853, "num_tokens": 7670963744.0, "step": 2510 }, { "epoch": 3.677047078219454, "grad_norm": 0.26538236612581934, "learning_rate": 1.0234810588542388e-05, "loss": 0.6824, "num_tokens": 7673908057.0, "step": 2511 }, { "epoch": 3.67851254808573, "grad_norm": 0.2567292074101507, "learning_rate": 1.022189400930944e-05, "loss": 0.6852, "num_tokens": 7676966949.0, "step": 2512 }, { "epoch": 3.679978017952006, "grad_norm": 0.2891340531331051, "learning_rate": 1.0208988027359075e-05, "loss": 0.6922, "num_tokens": 7679820437.0, "step": 2513 }, { "epoch": 3.6814434878182816, "grad_norm": 0.2880442655690957, "learning_rate": 1.0196092654303365e-05, "loss": 0.6794, "num_tokens": 7682844348.0, "step": 2514 }, { "epoch": 3.6829089576845577, "grad_norm": 0.29895084942396183, "learning_rate": 1.0183207901744847e-05, "loss": 0.6711, "num_tokens": 7685760633.0, "step": 2515 }, { "epoch": 3.6843744275508334, "grad_norm": 0.26905366572962364, "learning_rate": 1.0170333781276494e-05, "loss": 0.6623, "num_tokens": 7688843262.0, "step": 2516 }, { "epoch": 3.685839897417109, "grad_norm": 0.28872555424826135, "learning_rate": 1.0157470304481744e-05, "loss": 0.6651, "num_tokens": 7692110176.0, "step": 2517 }, { "epoch": 3.6873053672833853, "grad_norm": 0.2914233824247382, "learning_rate": 1.014461748293441e-05, "loss": 0.6824, "num_tokens": 7695282037.0, "step": 2518 }, { "epoch": 3.688770837149661, "grad_norm": 0.2730801279675292, "learning_rate": 1.013177532819876e-05, "loss": 0.6576, "num_tokens": 7698127309.0, "step": 2519 }, { "epoch": 3.6902363070159367, "grad_norm": 0.2674863229120733, "learning_rate": 1.0118943851829446e-05, "loss": 0.6676, "num_tokens": 7701199759.0, "step": 2520 }, { "epoch": 3.691701776882213, "grad_norm": 0.29227892466431765, "learning_rate": 1.0106123065371514e-05, "loss": 0.6686, "num_tokens": 7704332832.0, "step": 2521 }, { "epoch": 3.6931672467484886, "grad_norm": 0.2773566555537987, "learning_rate": 1.0093312980360385e-05, "loss": 0.6634, "num_tokens": 7707543670.0, "step": 2522 }, { "epoch": 3.6946327166147643, "grad_norm": 0.32721425206433763, "learning_rate": 1.0080513608321865e-05, "loss": 0.689, "num_tokens": 7710868371.0, "step": 2523 }, { "epoch": 3.6960981864810405, "grad_norm": 0.3006430914753327, "learning_rate": 1.0067724960772109e-05, "loss": 0.6819, "num_tokens": 7713747537.0, "step": 2524 }, { "epoch": 3.697563656347316, "grad_norm": 0.3150755745836954, "learning_rate": 1.0054947049217647e-05, "loss": 0.6927, "num_tokens": 7716859228.0, "step": 2525 }, { "epoch": 3.6990291262135924, "grad_norm": 0.2972694992312387, "learning_rate": 1.0042179885155303e-05, "loss": 0.6625, "num_tokens": 7719677400.0, "step": 2526 }, { "epoch": 3.700494596079868, "grad_norm": 0.31489504056310763, "learning_rate": 1.002942348007228e-05, "loss": 0.6872, "num_tokens": 7722783491.0, "step": 2527 }, { "epoch": 3.7019600659461442, "grad_norm": 0.3223807469437241, "learning_rate": 1.0016677845446072e-05, "loss": 0.6906, "num_tokens": 7725820317.0, "step": 2528 }, { "epoch": 3.70342553581242, "grad_norm": 0.3321982269495967, "learning_rate": 1.0003942992744489e-05, "loss": 0.6602, "num_tokens": 7729025141.0, "step": 2529 }, { "epoch": 3.7048910056786957, "grad_norm": 0.3077796869089496, "learning_rate": 9.991218933425637e-06, "loss": 0.6722, "num_tokens": 7731945509.0, "step": 2530 }, { "epoch": 3.706356475544972, "grad_norm": 0.28365465149349534, "learning_rate": 9.978505678937934e-06, "loss": 0.6643, "num_tokens": 7735129106.0, "step": 2531 }, { "epoch": 3.7078219454112475, "grad_norm": 0.31121861394148304, "learning_rate": 9.965803240720028e-06, "loss": 0.6653, "num_tokens": 7738178206.0, "step": 2532 }, { "epoch": 3.7092874152775233, "grad_norm": 0.2974056411017123, "learning_rate": 9.953111630200886e-06, "loss": 0.6745, "num_tokens": 7741122687.0, "step": 2533 }, { "epoch": 3.7107528851437994, "grad_norm": 0.27072787689200517, "learning_rate": 9.940430858799705e-06, "loss": 0.6716, "num_tokens": 7744424145.0, "step": 2534 }, { "epoch": 3.712218355010075, "grad_norm": 0.30139556517477184, "learning_rate": 9.927760937925932e-06, "loss": 0.6839, "num_tokens": 7747397341.0, "step": 2535 }, { "epoch": 3.713683824876351, "grad_norm": 0.30327818503478193, "learning_rate": 9.915101878979261e-06, "loss": 0.6774, "num_tokens": 7750702233.0, "step": 2536 }, { "epoch": 3.715149294742627, "grad_norm": 0.25574910927636846, "learning_rate": 9.9024536933496e-06, "loss": 0.6892, "num_tokens": 7753571529.0, "step": 2537 }, { "epoch": 3.7166147646089027, "grad_norm": 0.29541167592518347, "learning_rate": 9.889816392417078e-06, "loss": 0.6732, "num_tokens": 7756710866.0, "step": 2538 }, { "epoch": 3.7180802344751784, "grad_norm": 0.297537095831004, "learning_rate": 9.877189987552053e-06, "loss": 0.6689, "num_tokens": 7759680899.0, "step": 2539 }, { "epoch": 3.7195457043414546, "grad_norm": 0.2915933035084471, "learning_rate": 9.864574490115035e-06, "loss": 0.6853, "num_tokens": 7763040808.0, "step": 2540 }, { "epoch": 3.7210111742077303, "grad_norm": 0.29951436246811586, "learning_rate": 9.851969911456759e-06, "loss": 0.67, "num_tokens": 7766170276.0, "step": 2541 }, { "epoch": 3.722476644074006, "grad_norm": 0.2810187376628007, "learning_rate": 9.839376262918117e-06, "loss": 0.6673, "num_tokens": 7769305589.0, "step": 2542 }, { "epoch": 3.723942113940282, "grad_norm": 0.2948032027989562, "learning_rate": 9.826793555830174e-06, "loss": 0.6759, "num_tokens": 7772396648.0, "step": 2543 }, { "epoch": 3.725407583806558, "grad_norm": 0.30046004929788206, "learning_rate": 9.81422180151414e-06, "loss": 0.6976, "num_tokens": 7775346899.0, "step": 2544 }, { "epoch": 3.7268730536728336, "grad_norm": 0.3019977849771077, "learning_rate": 9.801661011281394e-06, "loss": 0.6644, "num_tokens": 7778381715.0, "step": 2545 }, { "epoch": 3.7283385235391098, "grad_norm": 0.27941845967068596, "learning_rate": 9.789111196433414e-06, "loss": 0.683, "num_tokens": 7781592312.0, "step": 2546 }, { "epoch": 3.7298039934053855, "grad_norm": 0.292500854200043, "learning_rate": 9.77657236826184e-06, "loss": 0.6943, "num_tokens": 7784621078.0, "step": 2547 }, { "epoch": 3.731269463271661, "grad_norm": 0.29110615446361826, "learning_rate": 9.764044538048407e-06, "loss": 0.6792, "num_tokens": 7787659503.0, "step": 2548 }, { "epoch": 3.7327349331379374, "grad_norm": 0.2659996585450849, "learning_rate": 9.751527717064942e-06, "loss": 0.6758, "num_tokens": 7790621153.0, "step": 2549 }, { "epoch": 3.734200403004213, "grad_norm": 0.29733080179630234, "learning_rate": 9.739021916573402e-06, "loss": 0.6848, "num_tokens": 7793720104.0, "step": 2550 }, { "epoch": 3.7356658728704892, "grad_norm": 0.28355092022416384, "learning_rate": 9.726527147825799e-06, "loss": 0.6688, "num_tokens": 7796633974.0, "step": 2551 }, { "epoch": 3.737131342736765, "grad_norm": 0.2975335758827942, "learning_rate": 9.71404342206423e-06, "loss": 0.6579, "num_tokens": 7799587608.0, "step": 2552 }, { "epoch": 3.738596812603041, "grad_norm": 0.3070679896608045, "learning_rate": 9.701570750520855e-06, "loss": 0.6789, "num_tokens": 7802427390.0, "step": 2553 }, { "epoch": 3.740062282469317, "grad_norm": 0.29657927091874214, "learning_rate": 9.689109144417889e-06, "loss": 0.6724, "num_tokens": 7805660850.0, "step": 2554 }, { "epoch": 3.7415277523355925, "grad_norm": 0.30825631635089884, "learning_rate": 9.676658614967584e-06, "loss": 0.6689, "num_tokens": 7808748317.0, "step": 2555 }, { "epoch": 3.7429932222018687, "grad_norm": 0.28548605008654426, "learning_rate": 9.664219173372245e-06, "loss": 0.6787, "num_tokens": 7811664244.0, "step": 2556 }, { "epoch": 3.7444586920681444, "grad_norm": 0.28015007793959407, "learning_rate": 9.651790830824168e-06, "loss": 0.6911, "num_tokens": 7814645384.0, "step": 2557 }, { "epoch": 3.74592416193442, "grad_norm": 0.2722052814509414, "learning_rate": 9.639373598505702e-06, "loss": 0.6695, "num_tokens": 7817794683.0, "step": 2558 }, { "epoch": 3.7473896318006963, "grad_norm": 0.2763262908984455, "learning_rate": 9.626967487589167e-06, "loss": 0.6853, "num_tokens": 7820752026.0, "step": 2559 }, { "epoch": 3.748855101666972, "grad_norm": 0.2889935609300939, "learning_rate": 9.614572509236895e-06, "loss": 0.7023, "num_tokens": 7823674673.0, "step": 2560 }, { "epoch": 3.7503205715332477, "grad_norm": 0.2935161488609743, "learning_rate": 9.602188674601188e-06, "loss": 0.6813, "num_tokens": 7826649529.0, "step": 2561 }, { "epoch": 3.751786041399524, "grad_norm": 0.27044019496273075, "learning_rate": 9.589815994824345e-06, "loss": 0.6623, "num_tokens": 7829642816.0, "step": 2562 }, { "epoch": 3.7532515112657996, "grad_norm": 0.28101090033973114, "learning_rate": 9.577454481038596e-06, "loss": 0.664, "num_tokens": 7832603851.0, "step": 2563 }, { "epoch": 3.7547169811320753, "grad_norm": 0.2960425904130176, "learning_rate": 9.565104144366151e-06, "loss": 0.6756, "num_tokens": 7835760789.0, "step": 2564 }, { "epoch": 3.7561824509983515, "grad_norm": 0.3035027146822713, "learning_rate": 9.552764995919153e-06, "loss": 0.6712, "num_tokens": 7838844373.0, "step": 2565 }, { "epoch": 3.757647920864627, "grad_norm": 0.26592183053554036, "learning_rate": 9.540437046799679e-06, "loss": 0.6613, "num_tokens": 7841749140.0, "step": 2566 }, { "epoch": 3.759113390730903, "grad_norm": 0.26646455789193796, "learning_rate": 9.528120308099726e-06, "loss": 0.6749, "num_tokens": 7844907280.0, "step": 2567 }, { "epoch": 3.760578860597179, "grad_norm": 0.25245195035740703, "learning_rate": 9.515814790901214e-06, "loss": 0.6744, "num_tokens": 7847925479.0, "step": 2568 }, { "epoch": 3.7620443304634548, "grad_norm": 0.328246444084621, "learning_rate": 9.503520506275949e-06, "loss": 0.6635, "num_tokens": 7851023636.0, "step": 2569 }, { "epoch": 3.7635098003297305, "grad_norm": 0.2806172477929138, "learning_rate": 9.491237465285662e-06, "loss": 0.6685, "num_tokens": 7854072023.0, "step": 2570 }, { "epoch": 3.7649752701960066, "grad_norm": 0.2850073594375248, "learning_rate": 9.478965678981927e-06, "loss": 0.6604, "num_tokens": 7856935884.0, "step": 2571 }, { "epoch": 3.7664407400622824, "grad_norm": 0.25297965713546156, "learning_rate": 9.466705158406227e-06, "loss": 0.6623, "num_tokens": 7860276372.0, "step": 2572 }, { "epoch": 3.767906209928558, "grad_norm": 0.3036682880571809, "learning_rate": 9.45445591458989e-06, "loss": 0.6746, "num_tokens": 7863415985.0, "step": 2573 }, { "epoch": 3.7693716797948342, "grad_norm": 0.28536263786229527, "learning_rate": 9.442217958554104e-06, "loss": 0.6833, "num_tokens": 7866513850.0, "step": 2574 }, { "epoch": 3.77083714966111, "grad_norm": 0.28249421324657004, "learning_rate": 9.429991301309889e-06, "loss": 0.6783, "num_tokens": 7869742061.0, "step": 2575 }, { "epoch": 3.772302619527386, "grad_norm": 0.3116289323411193, "learning_rate": 9.417775953858127e-06, "loss": 0.6598, "num_tokens": 7872835156.0, "step": 2576 }, { "epoch": 3.773768089393662, "grad_norm": 0.24902493548603707, "learning_rate": 9.405571927189487e-06, "loss": 0.6721, "num_tokens": 7875927673.0, "step": 2577 }, { "epoch": 3.775233559259938, "grad_norm": 0.296037283137463, "learning_rate": 9.393379232284483e-06, "loss": 0.6797, "num_tokens": 7879081655.0, "step": 2578 }, { "epoch": 3.7766990291262137, "grad_norm": 0.2744546560702928, "learning_rate": 9.381197880113419e-06, "loss": 0.6789, "num_tokens": 7882219855.0, "step": 2579 }, { "epoch": 3.7781644989924894, "grad_norm": 0.2955318821509196, "learning_rate": 9.369027881636396e-06, "loss": 0.6674, "num_tokens": 7885255085.0, "step": 2580 }, { "epoch": 3.7796299688587656, "grad_norm": 0.2887703794558178, "learning_rate": 9.3568692478033e-06, "loss": 0.6983, "num_tokens": 7887839264.0, "step": 2581 }, { "epoch": 3.7810954387250413, "grad_norm": 0.2872755939280624, "learning_rate": 9.344721989553789e-06, "loss": 0.6654, "num_tokens": 7890916670.0, "step": 2582 }, { "epoch": 3.782560908591317, "grad_norm": 0.3037807160168398, "learning_rate": 9.332586117817285e-06, "loss": 0.6529, "num_tokens": 7894014031.0, "step": 2583 }, { "epoch": 3.784026378457593, "grad_norm": 0.32136928938617815, "learning_rate": 9.320461643512982e-06, "loss": 0.6852, "num_tokens": 7896855265.0, "step": 2584 }, { "epoch": 3.785491848323869, "grad_norm": 0.29581564476892874, "learning_rate": 9.308348577549788e-06, "loss": 0.6755, "num_tokens": 7899835481.0, "step": 2585 }, { "epoch": 3.7869573181901446, "grad_norm": 0.29806016747146313, "learning_rate": 9.296246930826374e-06, "loss": 0.6835, "num_tokens": 7902829335.0, "step": 2586 }, { "epoch": 3.7884227880564207, "grad_norm": 0.28195275753103555, "learning_rate": 9.284156714231127e-06, "loss": 0.7039, "num_tokens": 7905715629.0, "step": 2587 }, { "epoch": 3.7898882579226965, "grad_norm": 0.3118629792590862, "learning_rate": 9.272077938642147e-06, "loss": 0.6948, "num_tokens": 7908678897.0, "step": 2588 }, { "epoch": 3.791353727788972, "grad_norm": 0.28072850622434986, "learning_rate": 9.260010614927241e-06, "loss": 0.6752, "num_tokens": 7911765069.0, "step": 2589 }, { "epoch": 3.7928191976552483, "grad_norm": 0.29140090401129704, "learning_rate": 9.247954753943916e-06, "loss": 0.6711, "num_tokens": 7914928632.0, "step": 2590 }, { "epoch": 3.794284667521524, "grad_norm": 0.31322006725359225, "learning_rate": 9.23591036653936e-06, "loss": 0.683, "num_tokens": 7917996260.0, "step": 2591 }, { "epoch": 3.7957501373877998, "grad_norm": 0.2892032627857932, "learning_rate": 9.22387746355044e-06, "loss": 0.6614, "num_tokens": 7920989003.0, "step": 2592 }, { "epoch": 3.797215607254076, "grad_norm": 0.27280792997736203, "learning_rate": 9.211856055803701e-06, "loss": 0.6776, "num_tokens": 7923981971.0, "step": 2593 }, { "epoch": 3.7986810771203516, "grad_norm": 0.27115764240706564, "learning_rate": 9.199846154115316e-06, "loss": 0.6526, "num_tokens": 7927088409.0, "step": 2594 }, { "epoch": 3.8001465469866273, "grad_norm": 0.2761122233967579, "learning_rate": 9.187847769291142e-06, "loss": 0.6627, "num_tokens": 7930403463.0, "step": 2595 }, { "epoch": 3.8016120168529035, "grad_norm": 0.2857152570343955, "learning_rate": 9.175860912126649e-06, "loss": 0.6693, "num_tokens": 7933485282.0, "step": 2596 }, { "epoch": 3.8030774867191792, "grad_norm": 0.3010174165815394, "learning_rate": 9.163885593406944e-06, "loss": 0.6823, "num_tokens": 7936530184.0, "step": 2597 }, { "epoch": 3.804542956585455, "grad_norm": 0.2939410946823734, "learning_rate": 9.15192182390675e-06, "loss": 0.6777, "num_tokens": 7939588216.0, "step": 2598 }, { "epoch": 3.806008426451731, "grad_norm": 0.295328243524195, "learning_rate": 9.139969614390396e-06, "loss": 0.6677, "num_tokens": 7942700811.0, "step": 2599 }, { "epoch": 3.807473896318007, "grad_norm": 0.3287908728101239, "learning_rate": 9.12802897561181e-06, "loss": 0.6736, "num_tokens": 7945771185.0, "step": 2600 }, { "epoch": 3.808939366184283, "grad_norm": 0.2752500052547036, "learning_rate": 9.116099918314532e-06, "loss": 0.6718, "num_tokens": 7948830238.0, "step": 2601 }, { "epoch": 3.8104048360505587, "grad_norm": 0.30811031253723264, "learning_rate": 9.104182453231635e-06, "loss": 0.6967, "num_tokens": 7951881977.0, "step": 2602 }, { "epoch": 3.811870305916835, "grad_norm": 0.30485405634493695, "learning_rate": 9.092276591085812e-06, "loss": 0.6686, "num_tokens": 7954773548.0, "step": 2603 }, { "epoch": 3.8133357757831106, "grad_norm": 0.2848357175627662, "learning_rate": 9.080382342589282e-06, "loss": 0.6815, "num_tokens": 7957798664.0, "step": 2604 }, { "epoch": 3.8148012456493863, "grad_norm": 0.30458103130061936, "learning_rate": 9.068499718443829e-06, "loss": 0.6881, "num_tokens": 7960839633.0, "step": 2605 }, { "epoch": 3.8162667155156624, "grad_norm": 0.29631685637194255, "learning_rate": 9.056628729340767e-06, "loss": 0.6658, "num_tokens": 7963647591.0, "step": 2606 }, { "epoch": 3.817732185381938, "grad_norm": 0.2761036544596787, "learning_rate": 9.044769385960968e-06, "loss": 0.6928, "num_tokens": 7966823472.0, "step": 2607 }, { "epoch": 3.819197655248214, "grad_norm": 0.2957423191406887, "learning_rate": 9.032921698974785e-06, "loss": 0.6577, "num_tokens": 7969964587.0, "step": 2608 }, { "epoch": 3.82066312511449, "grad_norm": 0.2751329376583761, "learning_rate": 9.021085679042124e-06, "loss": 0.6849, "num_tokens": 7972915043.0, "step": 2609 }, { "epoch": 3.8221285949807657, "grad_norm": 0.2961086114512179, "learning_rate": 9.009261336812369e-06, "loss": 0.6778, "num_tokens": 7975774983.0, "step": 2610 }, { "epoch": 3.8235940648470415, "grad_norm": 0.24370241441125515, "learning_rate": 8.9974486829244e-06, "loss": 0.6878, "num_tokens": 7978807654.0, "step": 2611 }, { "epoch": 3.8250595347133176, "grad_norm": 0.29066016590327115, "learning_rate": 8.98564772800659e-06, "loss": 0.6765, "num_tokens": 7981864652.0, "step": 2612 }, { "epoch": 3.8265250045795933, "grad_norm": 0.29122387613682843, "learning_rate": 8.973858482676776e-06, "loss": 0.6656, "num_tokens": 7984885876.0, "step": 2613 }, { "epoch": 3.827990474445869, "grad_norm": 0.2958314936624019, "learning_rate": 8.962080957542256e-06, "loss": 0.685, "num_tokens": 7987866785.0, "step": 2614 }, { "epoch": 3.829455944312145, "grad_norm": 0.2902433712908369, "learning_rate": 8.95031516319981e-06, "loss": 0.6681, "num_tokens": 7990952752.0, "step": 2615 }, { "epoch": 3.830921414178421, "grad_norm": 0.2823962411543531, "learning_rate": 8.938561110235617e-06, "loss": 0.6765, "num_tokens": 7994212607.0, "step": 2616 }, { "epoch": 3.8323868840446966, "grad_norm": 0.2572386320017982, "learning_rate": 8.92681880922534e-06, "loss": 0.6701, "num_tokens": 7997329609.0, "step": 2617 }, { "epoch": 3.833852353910973, "grad_norm": 0.2894794057598482, "learning_rate": 8.915088270734038e-06, "loss": 0.644, "num_tokens": 8000307583.0, "step": 2618 }, { "epoch": 3.8353178237772485, "grad_norm": 0.2787709802110485, "learning_rate": 8.903369505316194e-06, "loss": 0.6661, "num_tokens": 8003354856.0, "step": 2619 }, { "epoch": 3.836783293643524, "grad_norm": 0.3280560367382823, "learning_rate": 8.891662523515694e-06, "loss": 0.6755, "num_tokens": 8006625836.0, "step": 2620 }, { "epoch": 3.8382487635098004, "grad_norm": 0.31177549594042875, "learning_rate": 8.879967335865844e-06, "loss": 0.6797, "num_tokens": 8009820820.0, "step": 2621 }, { "epoch": 3.839714233376076, "grad_norm": 0.2886129992099771, "learning_rate": 8.868283952889302e-06, "loss": 0.6638, "num_tokens": 8012838990.0, "step": 2622 }, { "epoch": 3.841179703242352, "grad_norm": 0.2674566208355911, "learning_rate": 8.856612385098133e-06, "loss": 0.6743, "num_tokens": 8015640257.0, "step": 2623 }, { "epoch": 3.842645173108628, "grad_norm": 0.2766055524635886, "learning_rate": 8.844952642993767e-06, "loss": 0.6741, "num_tokens": 8018877173.0, "step": 2624 }, { "epoch": 3.8441106429749037, "grad_norm": 0.2785310173598924, "learning_rate": 8.833304737066983e-06, "loss": 0.6822, "num_tokens": 8021783938.0, "step": 2625 }, { "epoch": 3.84557611284118, "grad_norm": 0.28413870221980897, "learning_rate": 8.821668677797917e-06, "loss": 0.6631, "num_tokens": 8025014715.0, "step": 2626 }, { "epoch": 3.8470415827074556, "grad_norm": 0.26530037810688034, "learning_rate": 8.810044475656048e-06, "loss": 0.6821, "num_tokens": 8028230011.0, "step": 2627 }, { "epoch": 3.8485070525737317, "grad_norm": 0.2675244971737039, "learning_rate": 8.798432141100178e-06, "loss": 0.6878, "num_tokens": 8031281072.0, "step": 2628 }, { "epoch": 3.8499725224400074, "grad_norm": 0.25790948540107633, "learning_rate": 8.786831684578452e-06, "loss": 0.6791, "num_tokens": 8034183257.0, "step": 2629 }, { "epoch": 3.851437992306283, "grad_norm": 0.27899670873736865, "learning_rate": 8.7752431165283e-06, "loss": 0.6709, "num_tokens": 8037175102.0, "step": 2630 }, { "epoch": 3.8529034621725593, "grad_norm": 0.28037591661662437, "learning_rate": 8.763666447376471e-06, "loss": 0.6643, "num_tokens": 8040361089.0, "step": 2631 }, { "epoch": 3.854368932038835, "grad_norm": 0.262876543930627, "learning_rate": 8.752101687539017e-06, "loss": 0.678, "num_tokens": 8043264778.0, "step": 2632 }, { "epoch": 3.8558344019051107, "grad_norm": 0.2775689870643678, "learning_rate": 8.740548847421244e-06, "loss": 0.6828, "num_tokens": 8046122589.0, "step": 2633 }, { "epoch": 3.857299871771387, "grad_norm": 0.2572539992890923, "learning_rate": 8.72900793741777e-06, "loss": 0.675, "num_tokens": 8049338598.0, "step": 2634 }, { "epoch": 3.8587653416376626, "grad_norm": 0.27585388716884945, "learning_rate": 8.717478967912456e-06, "loss": 0.6782, "num_tokens": 8052485229.0, "step": 2635 }, { "epoch": 3.8602308115039383, "grad_norm": 0.2755765636373551, "learning_rate": 8.705961949278424e-06, "loss": 0.6761, "num_tokens": 8055196581.0, "step": 2636 }, { "epoch": 3.8616962813702145, "grad_norm": 0.2966791818815388, "learning_rate": 8.694456891878042e-06, "loss": 0.6621, "num_tokens": 8058332186.0, "step": 2637 }, { "epoch": 3.86316175123649, "grad_norm": 0.27662932774519583, "learning_rate": 8.682963806062934e-06, "loss": 0.6679, "num_tokens": 8061156469.0, "step": 2638 }, { "epoch": 3.864627221102766, "grad_norm": 0.2852690323028242, "learning_rate": 8.671482702173914e-06, "loss": 0.6835, "num_tokens": 8064292607.0, "step": 2639 }, { "epoch": 3.866092690969042, "grad_norm": 0.2996124478406475, "learning_rate": 8.660013590541058e-06, "loss": 0.663, "num_tokens": 8067650995.0, "step": 2640 }, { "epoch": 3.867558160835318, "grad_norm": 0.2865163220228784, "learning_rate": 8.648556481483626e-06, "loss": 0.6796, "num_tokens": 8070566911.0, "step": 2641 }, { "epoch": 3.8690236307015935, "grad_norm": 0.29368594627340305, "learning_rate": 8.637111385310081e-06, "loss": 0.6766, "num_tokens": 8073547075.0, "step": 2642 }, { "epoch": 3.8704891005678697, "grad_norm": 0.2564187908110803, "learning_rate": 8.62567831231809e-06, "loss": 0.6833, "num_tokens": 8076700238.0, "step": 2643 }, { "epoch": 3.8719545704341454, "grad_norm": 0.29680717760487557, "learning_rate": 8.614257272794485e-06, "loss": 0.6826, "num_tokens": 8079645729.0, "step": 2644 }, { "epoch": 3.873420040300421, "grad_norm": 0.28371769892819854, "learning_rate": 8.602848277015283e-06, "loss": 0.6769, "num_tokens": 8082524907.0, "step": 2645 }, { "epoch": 3.8748855101666972, "grad_norm": 0.271141989777032, "learning_rate": 8.591451335245676e-06, "loss": 0.6709, "num_tokens": 8085568230.0, "step": 2646 }, { "epoch": 3.876350980032973, "grad_norm": 0.27165224538294513, "learning_rate": 8.580066457739969e-06, "loss": 0.6628, "num_tokens": 8088553239.0, "step": 2647 }, { "epoch": 3.8778164498992487, "grad_norm": 0.27534220747304594, "learning_rate": 8.568693654741663e-06, "loss": 0.7004, "num_tokens": 8091671276.0, "step": 2648 }, { "epoch": 3.879281919765525, "grad_norm": 0.29122425588477363, "learning_rate": 8.557332936483363e-06, "loss": 0.6572, "num_tokens": 8094769582.0, "step": 2649 }, { "epoch": 3.8807473896318005, "grad_norm": 0.27091399049704856, "learning_rate": 8.545984313186807e-06, "loss": 0.6852, "num_tokens": 8097715309.0, "step": 2650 }, { "epoch": 3.8822128594980767, "grad_norm": 0.2635147918111219, "learning_rate": 8.534647795062854e-06, "loss": 0.6714, "num_tokens": 8100806666.0, "step": 2651 }, { "epoch": 3.8836783293643524, "grad_norm": 0.26563613792863067, "learning_rate": 8.523323392311481e-06, "loss": 0.6852, "num_tokens": 8103660070.0, "step": 2652 }, { "epoch": 3.8851437992306286, "grad_norm": 0.28845707342552124, "learning_rate": 8.512011115121734e-06, "loss": 0.6589, "num_tokens": 8106896808.0, "step": 2653 }, { "epoch": 3.8866092690969043, "grad_norm": 0.28105316591364743, "learning_rate": 8.500710973671785e-06, "loss": 0.6917, "num_tokens": 8109830260.0, "step": 2654 }, { "epoch": 3.88807473896318, "grad_norm": 0.25233624958217415, "learning_rate": 8.489422978128866e-06, "loss": 0.672, "num_tokens": 8113078918.0, "step": 2655 }, { "epoch": 3.889540208829456, "grad_norm": 0.28944307034709277, "learning_rate": 8.478147138649285e-06, "loss": 0.6737, "num_tokens": 8116115483.0, "step": 2656 }, { "epoch": 3.891005678695732, "grad_norm": 0.26756053575908045, "learning_rate": 8.466883465378411e-06, "loss": 0.6781, "num_tokens": 8118972483.0, "step": 2657 }, { "epoch": 3.8924711485620076, "grad_norm": 0.29212649347832187, "learning_rate": 8.45563196845067e-06, "loss": 0.6887, "num_tokens": 8121794379.0, "step": 2658 }, { "epoch": 3.8939366184282838, "grad_norm": 0.26520099862955143, "learning_rate": 8.444392657989528e-06, "loss": 0.6698, "num_tokens": 8124854157.0, "step": 2659 }, { "epoch": 3.8954020882945595, "grad_norm": 0.27292468852138146, "learning_rate": 8.433165544107507e-06, "loss": 0.6614, "num_tokens": 8128044483.0, "step": 2660 }, { "epoch": 3.896867558160835, "grad_norm": 0.29125596078158467, "learning_rate": 8.42195063690611e-06, "loss": 0.6709, "num_tokens": 8130977878.0, "step": 2661 }, { "epoch": 3.8983330280271113, "grad_norm": 0.2713717567242872, "learning_rate": 8.410747946475904e-06, "loss": 0.6718, "num_tokens": 8134115439.0, "step": 2662 }, { "epoch": 3.899798497893387, "grad_norm": 0.252182453619379, "learning_rate": 8.399557482896438e-06, "loss": 0.6925, "num_tokens": 8137020312.0, "step": 2663 }, { "epoch": 3.9012639677596628, "grad_norm": 0.2631827515113229, "learning_rate": 8.388379256236269e-06, "loss": 0.6796, "num_tokens": 8139916933.0, "step": 2664 }, { "epoch": 3.902729437625939, "grad_norm": 0.27019753372277, "learning_rate": 8.377213276552938e-06, "loss": 0.6622, "num_tokens": 8143257060.0, "step": 2665 }, { "epoch": 3.9041949074922147, "grad_norm": 0.267678346606212, "learning_rate": 8.366059553892981e-06, "loss": 0.6825, "num_tokens": 8146165938.0, "step": 2666 }, { "epoch": 3.9056603773584904, "grad_norm": 0.26550430172064926, "learning_rate": 8.35491809829188e-06, "loss": 0.6765, "num_tokens": 8149232621.0, "step": 2667 }, { "epoch": 3.9071258472247665, "grad_norm": 0.27956038242878367, "learning_rate": 8.343788919774104e-06, "loss": 0.6714, "num_tokens": 8152266089.0, "step": 2668 }, { "epoch": 3.9085913170910422, "grad_norm": 0.2886528899403416, "learning_rate": 8.332672028353068e-06, "loss": 0.6697, "num_tokens": 8155351182.0, "step": 2669 }, { "epoch": 3.910056786957318, "grad_norm": 0.2848214444533986, "learning_rate": 8.321567434031129e-06, "loss": 0.6638, "num_tokens": 8158759182.0, "step": 2670 }, { "epoch": 3.911522256823594, "grad_norm": 0.2826495909714783, "learning_rate": 8.310475146799578e-06, "loss": 0.6829, "num_tokens": 8161982305.0, "step": 2671 }, { "epoch": 3.91298772668987, "grad_norm": 0.2734369572083052, "learning_rate": 8.299395176638638e-06, "loss": 0.672, "num_tokens": 8164927054.0, "step": 2672 }, { "epoch": 3.9144531965561455, "grad_norm": 0.28618864419998835, "learning_rate": 8.288327533517448e-06, "loss": 0.6945, "num_tokens": 8168098906.0, "step": 2673 }, { "epoch": 3.9159186664224217, "grad_norm": 0.28473972828008887, "learning_rate": 8.277272227394058e-06, "loss": 0.6724, "num_tokens": 8171009114.0, "step": 2674 }, { "epoch": 3.9173841362886974, "grad_norm": 0.2955752577152905, "learning_rate": 8.266229268215414e-06, "loss": 0.6751, "num_tokens": 8173971784.0, "step": 2675 }, { "epoch": 3.9188496061549736, "grad_norm": 0.29317270873316065, "learning_rate": 8.255198665917348e-06, "loss": 0.6627, "num_tokens": 8177242688.0, "step": 2676 }, { "epoch": 3.9203150760212493, "grad_norm": 0.27711415016209684, "learning_rate": 8.244180430424598e-06, "loss": 0.6801, "num_tokens": 8180112205.0, "step": 2677 }, { "epoch": 3.9217805458875254, "grad_norm": 0.30285404368057467, "learning_rate": 8.233174571650733e-06, "loss": 0.6575, "num_tokens": 8183186080.0, "step": 2678 }, { "epoch": 3.923246015753801, "grad_norm": 0.30574464741790747, "learning_rate": 8.222181099498232e-06, "loss": 0.6727, "num_tokens": 8186184573.0, "step": 2679 }, { "epoch": 3.924711485620077, "grad_norm": 0.24377703140438342, "learning_rate": 8.2112000238584e-06, "loss": 0.685, "num_tokens": 8189185770.0, "step": 2680 }, { "epoch": 3.926176955486353, "grad_norm": 0.2908946105420151, "learning_rate": 8.200231354611392e-06, "loss": 0.6569, "num_tokens": 8192151854.0, "step": 2681 }, { "epoch": 3.9276424253526288, "grad_norm": 0.27033060870625786, "learning_rate": 8.189275101626208e-06, "loss": 0.6643, "num_tokens": 8195097159.0, "step": 2682 }, { "epoch": 3.9291078952189045, "grad_norm": 0.28739385192497846, "learning_rate": 8.178331274760679e-06, "loss": 0.6855, "num_tokens": 8197984058.0, "step": 2683 }, { "epoch": 3.9305733650851806, "grad_norm": 0.2814848220229086, "learning_rate": 8.167399883861435e-06, "loss": 0.6812, "num_tokens": 8200676091.0, "step": 2684 }, { "epoch": 3.9320388349514563, "grad_norm": 0.28565660559527944, "learning_rate": 8.156480938763946e-06, "loss": 0.6875, "num_tokens": 8203488571.0, "step": 2685 }, { "epoch": 3.933504304817732, "grad_norm": 0.28735350334703336, "learning_rate": 8.145574449292468e-06, "loss": 0.6608, "num_tokens": 8206355904.0, "step": 2686 }, { "epoch": 3.934969774684008, "grad_norm": 0.26294996728443243, "learning_rate": 8.134680425260042e-06, "loss": 0.6733, "num_tokens": 8209330366.0, "step": 2687 }, { "epoch": 3.936435244550284, "grad_norm": 0.28986874912566435, "learning_rate": 8.123798876468514e-06, "loss": 0.6805, "num_tokens": 8212358483.0, "step": 2688 }, { "epoch": 3.9379007144165596, "grad_norm": 0.2527607986748959, "learning_rate": 8.112929812708488e-06, "loss": 0.658, "num_tokens": 8215469086.0, "step": 2689 }, { "epoch": 3.939366184282836, "grad_norm": 0.2831970824250484, "learning_rate": 8.102073243759336e-06, "loss": 0.6821, "num_tokens": 8218712626.0, "step": 2690 }, { "epoch": 3.9408316541491115, "grad_norm": 0.30981527380054547, "learning_rate": 8.091229179389209e-06, "loss": 0.6875, "num_tokens": 8221504098.0, "step": 2691 }, { "epoch": 3.9422971240153872, "grad_norm": 0.2743504406539703, "learning_rate": 8.080397629354973e-06, "loss": 0.6848, "num_tokens": 8224313899.0, "step": 2692 }, { "epoch": 3.9437625938816634, "grad_norm": 0.30681434989527984, "learning_rate": 8.069578603402266e-06, "loss": 0.6704, "num_tokens": 8227316252.0, "step": 2693 }, { "epoch": 3.945228063747939, "grad_norm": 0.2643501452346204, "learning_rate": 8.058772111265436e-06, "loss": 0.679, "num_tokens": 8230398772.0, "step": 2694 }, { "epoch": 3.946693533614215, "grad_norm": 0.28777457740519835, "learning_rate": 8.047978162667566e-06, "loss": 0.6674, "num_tokens": 8233555152.0, "step": 2695 }, { "epoch": 3.948159003480491, "grad_norm": 0.296933485932885, "learning_rate": 8.037196767320442e-06, "loss": 0.6841, "num_tokens": 8236482699.0, "step": 2696 }, { "epoch": 3.9496244733467667, "grad_norm": 0.2551988686917353, "learning_rate": 8.026427934924576e-06, "loss": 0.6641, "num_tokens": 8239753512.0, "step": 2697 }, { "epoch": 3.9510899432130424, "grad_norm": 0.3147732459808143, "learning_rate": 8.015671675169145e-06, "loss": 0.6666, "num_tokens": 8242884192.0, "step": 2698 }, { "epoch": 3.9525554130793186, "grad_norm": 0.27216797620622435, "learning_rate": 8.004927997732044e-06, "loss": 0.6651, "num_tokens": 8246073370.0, "step": 2699 }, { "epoch": 3.9540208829455943, "grad_norm": 0.2983248033143554, "learning_rate": 7.994196912279832e-06, "loss": 0.6954, "num_tokens": 8248875273.0, "step": 2700 }, { "epoch": 3.9554863528118704, "grad_norm": 0.3097831974006926, "learning_rate": 7.983478428467738e-06, "loss": 0.6588, "num_tokens": 8251993853.0, "step": 2701 }, { "epoch": 3.956951822678146, "grad_norm": 0.28790171837437484, "learning_rate": 7.972772555939654e-06, "loss": 0.6894, "num_tokens": 8255279431.0, "step": 2702 }, { "epoch": 3.9584172925444223, "grad_norm": 0.32784542248287957, "learning_rate": 7.962079304328132e-06, "loss": 0.6848, "num_tokens": 8258356073.0, "step": 2703 }, { "epoch": 3.959882762410698, "grad_norm": 0.3085463965001779, "learning_rate": 7.951398683254354e-06, "loss": 0.6731, "num_tokens": 8261600675.0, "step": 2704 }, { "epoch": 3.9613482322769737, "grad_norm": 0.26266428658570223, "learning_rate": 7.940730702328165e-06, "loss": 0.6784, "num_tokens": 8264520375.0, "step": 2705 }, { "epoch": 3.96281370214325, "grad_norm": 0.292800762644355, "learning_rate": 7.930075371147996e-06, "loss": 0.6828, "num_tokens": 8267455973.0, "step": 2706 }, { "epoch": 3.9642791720095256, "grad_norm": 0.2855124226713279, "learning_rate": 7.91943269930094e-06, "loss": 0.6802, "num_tokens": 8270174899.0, "step": 2707 }, { "epoch": 3.9657446418758013, "grad_norm": 0.2591585490487283, "learning_rate": 7.908802696362666e-06, "loss": 0.6765, "num_tokens": 8273329226.0, "step": 2708 }, { "epoch": 3.9672101117420775, "grad_norm": 0.2522413383355082, "learning_rate": 7.898185371897465e-06, "loss": 0.6671, "num_tokens": 8276515983.0, "step": 2709 }, { "epoch": 3.968675581608353, "grad_norm": 0.2818748314538766, "learning_rate": 7.887580735458202e-06, "loss": 0.6861, "num_tokens": 8279707138.0, "step": 2710 }, { "epoch": 3.970141051474629, "grad_norm": 0.26344856502568376, "learning_rate": 7.876988796586355e-06, "loss": 0.6698, "num_tokens": 8282539039.0, "step": 2711 }, { "epoch": 3.971606521340905, "grad_norm": 0.2663220145376771, "learning_rate": 7.866409564811941e-06, "loss": 0.6541, "num_tokens": 8285552351.0, "step": 2712 }, { "epoch": 3.973071991207181, "grad_norm": 0.24150203982237475, "learning_rate": 7.855843049653577e-06, "loss": 0.6617, "num_tokens": 8288581585.0, "step": 2713 }, { "epoch": 3.9745374610734565, "grad_norm": 0.33394931652875476, "learning_rate": 7.84528926061842e-06, "loss": 0.6604, "num_tokens": 8291979499.0, "step": 2714 }, { "epoch": 3.9760029309397327, "grad_norm": 0.2589091075068548, "learning_rate": 7.834748207202171e-06, "loss": 0.6588, "num_tokens": 8295457376.0, "step": 2715 }, { "epoch": 3.9774684008060084, "grad_norm": 0.2774736726001636, "learning_rate": 7.824219898889093e-06, "loss": 0.6744, "num_tokens": 8298613024.0, "step": 2716 }, { "epoch": 3.978933870672284, "grad_norm": 0.259413832309309, "learning_rate": 7.813704345151966e-06, "loss": 0.6677, "num_tokens": 8301688640.0, "step": 2717 }, { "epoch": 3.9803993405385603, "grad_norm": 0.2595762142389086, "learning_rate": 7.8032015554521e-06, "loss": 0.6682, "num_tokens": 8304730302.0, "step": 2718 }, { "epoch": 3.981864810404836, "grad_norm": 0.28604162698048496, "learning_rate": 7.792711539239317e-06, "loss": 0.6835, "num_tokens": 8307781206.0, "step": 2719 }, { "epoch": 3.9833302802711117, "grad_norm": 0.27218237848258076, "learning_rate": 7.78223430595195e-06, "loss": 0.6598, "num_tokens": 8310588815.0, "step": 2720 }, { "epoch": 3.984795750137388, "grad_norm": 0.2666228702670444, "learning_rate": 7.771769865016821e-06, "loss": 0.6684, "num_tokens": 8313842554.0, "step": 2721 }, { "epoch": 3.9862612200036636, "grad_norm": 0.29323205318879547, "learning_rate": 7.761318225849266e-06, "loss": 0.651, "num_tokens": 8317098795.0, "step": 2722 }, { "epoch": 3.9877266898699393, "grad_norm": 0.27684218849403797, "learning_rate": 7.750879397853074e-06, "loss": 0.6646, "num_tokens": 8320447160.0, "step": 2723 }, { "epoch": 3.9891921597362154, "grad_norm": 0.2898667035902747, "learning_rate": 7.740453390420526e-06, "loss": 0.6761, "num_tokens": 8323494051.0, "step": 2724 }, { "epoch": 3.990657629602491, "grad_norm": 0.2894839920674067, "learning_rate": 7.73004021293236e-06, "loss": 0.6761, "num_tokens": 8326654926.0, "step": 2725 }, { "epoch": 3.9921230994687673, "grad_norm": 0.25343461168804654, "learning_rate": 7.71963987475777e-06, "loss": 0.6919, "num_tokens": 8329677946.0, "step": 2726 }, { "epoch": 3.993588569335043, "grad_norm": 0.29682184585993415, "learning_rate": 7.709252385254397e-06, "loss": 0.6849, "num_tokens": 8332626454.0, "step": 2727 }, { "epoch": 3.995054039201319, "grad_norm": 0.2911049582580313, "learning_rate": 7.698877753768342e-06, "loss": 0.6723, "num_tokens": 8335773538.0, "step": 2728 }, { "epoch": 3.996519509067595, "grad_norm": 0.26461608495369726, "learning_rate": 7.688515989634094e-06, "loss": 0.6755, "num_tokens": 8338733043.0, "step": 2729 }, { "epoch": 3.9979849789338706, "grad_norm": 0.3181464556943696, "learning_rate": 7.67816710217461e-06, "loss": 0.6518, "num_tokens": 8341589989.0, "step": 2730 }, { "epoch": 3.9994504488001468, "grad_norm": 0.30348912511808096, "learning_rate": 7.667831100701231e-06, "loss": 0.6718, "num_tokens": 8344725181.0, "step": 2731 }, { "epoch": 4.0, "grad_norm": 0.30348912511808096, "learning_rate": 7.65750799451372e-06, "loss": 0.7002, "num_tokens": 8345433648.0, "step": 2732 }, { "epoch": 4.001465469866276, "grad_norm": 0.508376068031645, "learning_rate": 7.647197792900229e-06, "loss": 0.6838, "num_tokens": 8348399184.0, "step": 2733 }, { "epoch": 4.002930939732551, "grad_norm": 0.3144344002958086, "learning_rate": 7.636900505137303e-06, "loss": 0.6579, "num_tokens": 8351412987.0, "step": 2734 }, { "epoch": 4.004396409598828, "grad_norm": 0.3084916306748364, "learning_rate": 7.626616140489862e-06, "loss": 0.66, "num_tokens": 8354150387.0, "step": 2735 }, { "epoch": 4.005861879465104, "grad_norm": 0.27382554103019996, "learning_rate": 7.616344708211219e-06, "loss": 0.6728, "num_tokens": 8357185099.0, "step": 2736 }, { "epoch": 4.007327349331379, "grad_norm": 0.3215479218309215, "learning_rate": 7.606086217543012e-06, "loss": 0.6528, "num_tokens": 8360297951.0, "step": 2737 }, { "epoch": 4.008792819197655, "grad_norm": 0.29619854562154374, "learning_rate": 7.595840677715278e-06, "loss": 0.6689, "num_tokens": 8363408275.0, "step": 2738 }, { "epoch": 4.010258289063931, "grad_norm": 0.3249265795661397, "learning_rate": 7.585608097946378e-06, "loss": 0.6641, "num_tokens": 8366476373.0, "step": 2739 }, { "epoch": 4.011723758930207, "grad_norm": 0.330551640084049, "learning_rate": 7.5753884874430116e-06, "loss": 0.676, "num_tokens": 8369410282.0, "step": 2740 }, { "epoch": 4.013189228796483, "grad_norm": 0.2716452197475487, "learning_rate": 7.56518185540021e-06, "loss": 0.6631, "num_tokens": 8372631249.0, "step": 2741 }, { "epoch": 4.014654698662759, "grad_norm": 0.31013797919860586, "learning_rate": 7.554988211001349e-06, "loss": 0.6616, "num_tokens": 8375791215.0, "step": 2742 }, { "epoch": 4.016120168529034, "grad_norm": 0.25794433983908893, "learning_rate": 7.5448075634180774e-06, "loss": 0.6649, "num_tokens": 8378640904.0, "step": 2743 }, { "epoch": 4.01758563839531, "grad_norm": 0.2977701886352675, "learning_rate": 7.534639921810393e-06, "loss": 0.6713, "num_tokens": 8381717076.0, "step": 2744 }, { "epoch": 4.0190511082615865, "grad_norm": 0.2847131601267182, "learning_rate": 7.5244852953265645e-06, "loss": 0.6835, "num_tokens": 8384624240.0, "step": 2745 }, { "epoch": 4.020516578127863, "grad_norm": 0.25013769792327667, "learning_rate": 7.51434369310316e-06, "loss": 0.6633, "num_tokens": 8387770443.0, "step": 2746 }, { "epoch": 4.021982047994138, "grad_norm": 0.278537532010583, "learning_rate": 7.504215124265024e-06, "loss": 0.652, "num_tokens": 8390844504.0, "step": 2747 }, { "epoch": 4.023447517860414, "grad_norm": 0.24889276249252337, "learning_rate": 7.494099597925282e-06, "loss": 0.682, "num_tokens": 8393801605.0, "step": 2748 }, { "epoch": 4.02491298772669, "grad_norm": 0.266188825280029, "learning_rate": 7.4839971231853135e-06, "loss": 0.6382, "num_tokens": 8397032028.0, "step": 2749 }, { "epoch": 4.0263784575929655, "grad_norm": 0.25704780519505965, "learning_rate": 7.4739077091347736e-06, "loss": 0.6723, "num_tokens": 8400041175.0, "step": 2750 }, { "epoch": 4.027843927459242, "grad_norm": 0.25329141996211546, "learning_rate": 7.4638313648515394e-06, "loss": 0.6713, "num_tokens": 8402967808.0, "step": 2751 }, { "epoch": 4.029309397325518, "grad_norm": 0.2709114702585647, "learning_rate": 7.453768099401753e-06, "loss": 0.6779, "num_tokens": 8406118987.0, "step": 2752 }, { "epoch": 4.030774867191793, "grad_norm": 0.2783717176225457, "learning_rate": 7.443717921839777e-06, "loss": 0.6781, "num_tokens": 8409062428.0, "step": 2753 }, { "epoch": 4.032240337058069, "grad_norm": 0.2454565881005608, "learning_rate": 7.433680841208201e-06, "loss": 0.6689, "num_tokens": 8412253350.0, "step": 2754 }, { "epoch": 4.033705806924345, "grad_norm": 0.26045533324812575, "learning_rate": 7.4236568665378284e-06, "loss": 0.6532, "num_tokens": 8415584326.0, "step": 2755 }, { "epoch": 4.035171276790621, "grad_norm": 0.28096285529281234, "learning_rate": 7.413646006847673e-06, "loss": 0.6627, "num_tokens": 8418621493.0, "step": 2756 }, { "epoch": 4.036636746656897, "grad_norm": 0.23311111814616522, "learning_rate": 7.403648271144948e-06, "loss": 0.6571, "num_tokens": 8421767486.0, "step": 2757 }, { "epoch": 4.038102216523173, "grad_norm": 0.2762367998712191, "learning_rate": 7.393663668425053e-06, "loss": 0.691, "num_tokens": 8424680695.0, "step": 2758 }, { "epoch": 4.039567686389448, "grad_norm": 0.26476568984680576, "learning_rate": 7.383692207671591e-06, "loss": 0.6568, "num_tokens": 8427832208.0, "step": 2759 }, { "epoch": 4.0410331562557245, "grad_norm": 0.2675081083730221, "learning_rate": 7.373733897856305e-06, "loss": 0.6718, "num_tokens": 8430643530.0, "step": 2760 }, { "epoch": 4.042498626122001, "grad_norm": 0.28654333786659736, "learning_rate": 7.363788747939145e-06, "loss": 0.6726, "num_tokens": 8433629583.0, "step": 2761 }, { "epoch": 4.043964095988276, "grad_norm": 0.27260832203740254, "learning_rate": 7.3538567668681945e-06, "loss": 0.6528, "num_tokens": 8436798377.0, "step": 2762 }, { "epoch": 4.045429565854552, "grad_norm": 0.2781331400079584, "learning_rate": 7.343937963579695e-06, "loss": 0.6687, "num_tokens": 8439942152.0, "step": 2763 }, { "epoch": 4.046895035720828, "grad_norm": 0.29798966960144907, "learning_rate": 7.334032346998034e-06, "loss": 0.6613, "num_tokens": 8443170238.0, "step": 2764 }, { "epoch": 4.0483605055871035, "grad_norm": 0.2603043296538314, "learning_rate": 7.324139926035731e-06, "loss": 0.6812, "num_tokens": 8446493634.0, "step": 2765 }, { "epoch": 4.04982597545338, "grad_norm": 0.27220310310696, "learning_rate": 7.3142607095934305e-06, "loss": 0.6703, "num_tokens": 8449560608.0, "step": 2766 }, { "epoch": 4.051291445319656, "grad_norm": 0.26175798864093436, "learning_rate": 7.304394706559912e-06, "loss": 0.6653, "num_tokens": 8452731619.0, "step": 2767 }, { "epoch": 4.052756915185931, "grad_norm": 0.26363203450344047, "learning_rate": 7.294541925812045e-06, "loss": 0.6471, "num_tokens": 8456066377.0, "step": 2768 }, { "epoch": 4.054222385052207, "grad_norm": 0.2686557795862108, "learning_rate": 7.284702376214816e-06, "loss": 0.6627, "num_tokens": 8459107661.0, "step": 2769 }, { "epoch": 4.055687854918483, "grad_norm": 0.28977322650579684, "learning_rate": 7.274876066621299e-06, "loss": 0.6892, "num_tokens": 8462107866.0, "step": 2770 }, { "epoch": 4.0571533247847595, "grad_norm": 0.2869120462509204, "learning_rate": 7.2650630058726655e-06, "loss": 0.6562, "num_tokens": 8465169898.0, "step": 2771 }, { "epoch": 4.058618794651035, "grad_norm": 0.2384630582317283, "learning_rate": 7.255263202798146e-06, "loss": 0.6889, "num_tokens": 8468094534.0, "step": 2772 }, { "epoch": 4.060084264517311, "grad_norm": 0.2800094729451123, "learning_rate": 7.245476666215079e-06, "loss": 0.6715, "num_tokens": 8471037895.0, "step": 2773 }, { "epoch": 4.061549734383587, "grad_norm": 0.2678315392760878, "learning_rate": 7.23570340492882e-06, "loss": 0.6607, "num_tokens": 8474024951.0, "step": 2774 }, { "epoch": 4.063015204249862, "grad_norm": 0.27369581622668937, "learning_rate": 7.225943427732821e-06, "loss": 0.66, "num_tokens": 8477048924.0, "step": 2775 }, { "epoch": 4.064480674116139, "grad_norm": 0.2720273948972367, "learning_rate": 7.2161967434085615e-06, "loss": 0.6561, "num_tokens": 8480109703.0, "step": 2776 }, { "epoch": 4.065946143982415, "grad_norm": 0.2752478001555263, "learning_rate": 7.206463360725557e-06, "loss": 0.6685, "num_tokens": 8483388100.0, "step": 2777 }, { "epoch": 4.06741161384869, "grad_norm": 0.28040905587354104, "learning_rate": 7.1967432884413715e-06, "loss": 0.677, "num_tokens": 8486352183.0, "step": 2778 }, { "epoch": 4.068877083714966, "grad_norm": 0.2504779867523387, "learning_rate": 7.187036535301579e-06, "loss": 0.6693, "num_tokens": 8489535049.0, "step": 2779 }, { "epoch": 4.070342553581242, "grad_norm": 0.28816548136254017, "learning_rate": 7.17734311003977e-06, "loss": 0.6458, "num_tokens": 8492683871.0, "step": 2780 }, { "epoch": 4.071808023447518, "grad_norm": 0.25485690748991907, "learning_rate": 7.1676630213775535e-06, "loss": 0.6677, "num_tokens": 8495722829.0, "step": 2781 }, { "epoch": 4.073273493313794, "grad_norm": 0.25307589836860095, "learning_rate": 7.1579962780245345e-06, "loss": 0.6499, "num_tokens": 8498962950.0, "step": 2782 }, { "epoch": 4.07473896318007, "grad_norm": 0.26571228843267947, "learning_rate": 7.148342888678308e-06, "loss": 0.6564, "num_tokens": 8502227546.0, "step": 2783 }, { "epoch": 4.076204433046345, "grad_norm": 0.2691535135822236, "learning_rate": 7.138702862024452e-06, "loss": 0.6465, "num_tokens": 8505296443.0, "step": 2784 }, { "epoch": 4.077669902912621, "grad_norm": 0.27704830843265577, "learning_rate": 7.129076206736527e-06, "loss": 0.6756, "num_tokens": 8508376837.0, "step": 2785 }, { "epoch": 4.0791353727788975, "grad_norm": 0.2727565064513333, "learning_rate": 7.119462931476056e-06, "loss": 0.6802, "num_tokens": 8511385097.0, "step": 2786 }, { "epoch": 4.080600842645173, "grad_norm": 0.25546163663204835, "learning_rate": 7.109863044892539e-06, "loss": 0.6651, "num_tokens": 8514332038.0, "step": 2787 }, { "epoch": 4.082066312511449, "grad_norm": 0.2925241338954218, "learning_rate": 7.100276555623402e-06, "loss": 0.6779, "num_tokens": 8517413361.0, "step": 2788 }, { "epoch": 4.083531782377725, "grad_norm": 0.2691651755761821, "learning_rate": 7.090703472294047e-06, "loss": 0.6654, "num_tokens": 8520505156.0, "step": 2789 }, { "epoch": 4.084997252244, "grad_norm": 0.28101539473689946, "learning_rate": 7.081143803517795e-06, "loss": 0.6715, "num_tokens": 8523606749.0, "step": 2790 }, { "epoch": 4.0864627221102765, "grad_norm": 0.2551357394655886, "learning_rate": 7.071597557895903e-06, "loss": 0.6778, "num_tokens": 8526591258.0, "step": 2791 }, { "epoch": 4.087928191976553, "grad_norm": 0.2877777014996474, "learning_rate": 7.062064744017549e-06, "loss": 0.672, "num_tokens": 8529705104.0, "step": 2792 }, { "epoch": 4.089393661842828, "grad_norm": 0.28732767977322776, "learning_rate": 7.052545370459829e-06, "loss": 0.6709, "num_tokens": 8532854574.0, "step": 2793 }, { "epoch": 4.090859131709104, "grad_norm": 0.2562060466659093, "learning_rate": 7.043039445787738e-06, "loss": 0.6689, "num_tokens": 8535883563.0, "step": 2794 }, { "epoch": 4.09232460157538, "grad_norm": 0.27597843091009455, "learning_rate": 7.0335469785541845e-06, "loss": 0.6681, "num_tokens": 8538956887.0, "step": 2795 }, { "epoch": 4.093790071441656, "grad_norm": 0.2520921805910092, "learning_rate": 7.0240679772999644e-06, "loss": 0.6668, "num_tokens": 8542050016.0, "step": 2796 }, { "epoch": 4.095255541307932, "grad_norm": 0.2688655414760752, "learning_rate": 7.014602450553738e-06, "loss": 0.6677, "num_tokens": 8545002503.0, "step": 2797 }, { "epoch": 4.096721011174208, "grad_norm": 0.2525052421507422, "learning_rate": 7.005150406832072e-06, "loss": 0.6625, "num_tokens": 8548222591.0, "step": 2798 }, { "epoch": 4.098186481040484, "grad_norm": 0.27306484967325173, "learning_rate": 6.995711854639384e-06, "loss": 0.674, "num_tokens": 8551180054.0, "step": 2799 }, { "epoch": 4.099651950906759, "grad_norm": 0.2573022679759736, "learning_rate": 6.986286802467955e-06, "loss": 0.6532, "num_tokens": 8554225194.0, "step": 2800 }, { "epoch": 4.101117420773035, "grad_norm": 0.24682498994688504, "learning_rate": 6.9768752587979236e-06, "loss": 0.6608, "num_tokens": 8557174742.0, "step": 2801 }, { "epoch": 4.102582890639312, "grad_norm": 0.25770724829122393, "learning_rate": 6.967477232097271e-06, "loss": 0.6638, "num_tokens": 8560076199.0, "step": 2802 }, { "epoch": 4.104048360505587, "grad_norm": 0.2766893510700863, "learning_rate": 6.9580927308218126e-06, "loss": 0.6577, "num_tokens": 8563106130.0, "step": 2803 }, { "epoch": 4.105513830371863, "grad_norm": 0.2668644049813433, "learning_rate": 6.948721763415209e-06, "loss": 0.6637, "num_tokens": 8566291199.0, "step": 2804 }, { "epoch": 4.106979300238139, "grad_norm": 0.24544302409094326, "learning_rate": 6.9393643383089206e-06, "loss": 0.6773, "num_tokens": 8569385724.0, "step": 2805 }, { "epoch": 4.1084447701044144, "grad_norm": 0.25711325814094943, "learning_rate": 6.930020463922248e-06, "loss": 0.6445, "num_tokens": 8572235763.0, "step": 2806 }, { "epoch": 4.109910239970691, "grad_norm": 0.2745625023578714, "learning_rate": 6.920690148662285e-06, "loss": 0.6659, "num_tokens": 8575382442.0, "step": 2807 }, { "epoch": 4.111375709836967, "grad_norm": 0.28046579867527816, "learning_rate": 6.9113734009239246e-06, "loss": 0.6521, "num_tokens": 8578473083.0, "step": 2808 }, { "epoch": 4.112841179703242, "grad_norm": 0.27516697154458775, "learning_rate": 6.902070229089861e-06, "loss": 0.6823, "num_tokens": 8581550469.0, "step": 2809 }, { "epoch": 4.114306649569518, "grad_norm": 0.28370926594353296, "learning_rate": 6.892780641530565e-06, "loss": 0.6581, "num_tokens": 8584598882.0, "step": 2810 }, { "epoch": 4.115772119435794, "grad_norm": 0.2935151083423503, "learning_rate": 6.88350464660429e-06, "loss": 0.6895, "num_tokens": 8587514006.0, "step": 2811 }, { "epoch": 4.11723758930207, "grad_norm": 0.27530642435058994, "learning_rate": 6.874242252657062e-06, "loss": 0.6741, "num_tokens": 8590242141.0, "step": 2812 }, { "epoch": 4.118703059168346, "grad_norm": 0.2804765048147015, "learning_rate": 6.864993468022666e-06, "loss": 0.6473, "num_tokens": 8593551261.0, "step": 2813 }, { "epoch": 4.120168529034622, "grad_norm": 0.2819955620599147, "learning_rate": 6.855758301022641e-06, "loss": 0.6588, "num_tokens": 8596582826.0, "step": 2814 }, { "epoch": 4.121633998900897, "grad_norm": 0.25105784008232, "learning_rate": 6.8465367599662745e-06, "loss": 0.6601, "num_tokens": 8599685975.0, "step": 2815 }, { "epoch": 4.123099468767173, "grad_norm": 0.26715117170841746, "learning_rate": 6.837328853150594e-06, "loss": 0.6747, "num_tokens": 8602655170.0, "step": 2816 }, { "epoch": 4.1245649386334495, "grad_norm": 0.2908835889941841, "learning_rate": 6.828134588860358e-06, "loss": 0.6684, "num_tokens": 8605736562.0, "step": 2817 }, { "epoch": 4.126030408499725, "grad_norm": 0.26599414503431307, "learning_rate": 6.818953975368061e-06, "loss": 0.6757, "num_tokens": 8608775792.0, "step": 2818 }, { "epoch": 4.127495878366001, "grad_norm": 0.27629417984564264, "learning_rate": 6.809787020933896e-06, "loss": 0.6834, "num_tokens": 8611967538.0, "step": 2819 }, { "epoch": 4.128961348232277, "grad_norm": 0.2509651557802986, "learning_rate": 6.800633733805786e-06, "loss": 0.6945, "num_tokens": 8614899672.0, "step": 2820 }, { "epoch": 4.130426818098553, "grad_norm": 0.29996320575949453, "learning_rate": 6.791494122219343e-06, "loss": 0.6672, "num_tokens": 8618060382.0, "step": 2821 }, { "epoch": 4.1318922879648285, "grad_norm": 0.256748611839633, "learning_rate": 6.782368194397881e-06, "loss": 0.6623, "num_tokens": 8621093792.0, "step": 2822 }, { "epoch": 4.133357757831105, "grad_norm": 0.2624601956029337, "learning_rate": 6.773255958552401e-06, "loss": 0.6722, "num_tokens": 8624242371.0, "step": 2823 }, { "epoch": 4.134823227697381, "grad_norm": 0.2581501662833647, "learning_rate": 6.7641574228815854e-06, "loss": 0.6801, "num_tokens": 8627004038.0, "step": 2824 }, { "epoch": 4.136288697563656, "grad_norm": 0.2549748239321414, "learning_rate": 6.755072595571781e-06, "loss": 0.6774, "num_tokens": 8630118940.0, "step": 2825 }, { "epoch": 4.137754167429932, "grad_norm": 0.2601687660352944, "learning_rate": 6.74600148479702e-06, "loss": 0.6697, "num_tokens": 8633338909.0, "step": 2826 }, { "epoch": 4.1392196372962085, "grad_norm": 0.25277188567077785, "learning_rate": 6.736944098718978e-06, "loss": 0.6749, "num_tokens": 8636207710.0, "step": 2827 }, { "epoch": 4.140685107162484, "grad_norm": 0.2607393759310363, "learning_rate": 6.727900445486983e-06, "loss": 0.6769, "num_tokens": 8639401699.0, "step": 2828 }, { "epoch": 4.14215057702876, "grad_norm": 0.24285268823983108, "learning_rate": 6.718870533238011e-06, "loss": 0.6766, "num_tokens": 8642491244.0, "step": 2829 }, { "epoch": 4.143616046895036, "grad_norm": 0.262592554246135, "learning_rate": 6.709854370096672e-06, "loss": 0.6607, "num_tokens": 8645718000.0, "step": 2830 }, { "epoch": 4.145081516761311, "grad_norm": 0.25758937452229047, "learning_rate": 6.700851964175206e-06, "loss": 0.67, "num_tokens": 8648763538.0, "step": 2831 }, { "epoch": 4.1465469866275875, "grad_norm": 0.24844567767016623, "learning_rate": 6.691863323573482e-06, "loss": 0.6618, "num_tokens": 8651611884.0, "step": 2832 }, { "epoch": 4.148012456493864, "grad_norm": 0.26467954844147473, "learning_rate": 6.682888456378966e-06, "loss": 0.6547, "num_tokens": 8654621547.0, "step": 2833 }, { "epoch": 4.149477926360139, "grad_norm": 0.2741363742067731, "learning_rate": 6.673927370666753e-06, "loss": 0.6553, "num_tokens": 8657819976.0, "step": 2834 }, { "epoch": 4.150943396226415, "grad_norm": 0.24974125668223987, "learning_rate": 6.664980074499524e-06, "loss": 0.6659, "num_tokens": 8660939684.0, "step": 2835 }, { "epoch": 4.152408866092691, "grad_norm": 0.2615721039527689, "learning_rate": 6.656046575927557e-06, "loss": 0.6565, "num_tokens": 8663893867.0, "step": 2836 }, { "epoch": 4.1538743359589665, "grad_norm": 0.27134340688799297, "learning_rate": 6.647126882988715e-06, "loss": 0.6632, "num_tokens": 8667133857.0, "step": 2837 }, { "epoch": 4.155339805825243, "grad_norm": 0.2591575113788516, "learning_rate": 6.638221003708441e-06, "loss": 0.6774, "num_tokens": 8670073016.0, "step": 2838 }, { "epoch": 4.156805275691519, "grad_norm": 0.2598364447796385, "learning_rate": 6.6293289460997466e-06, "loss": 0.6545, "num_tokens": 8673070846.0, "step": 2839 }, { "epoch": 4.158270745557794, "grad_norm": 0.26041529014438575, "learning_rate": 6.620450718163207e-06, "loss": 0.6548, "num_tokens": 8676158455.0, "step": 2840 }, { "epoch": 4.15973621542407, "grad_norm": 0.23040494689385213, "learning_rate": 6.611586327886965e-06, "loss": 0.6503, "num_tokens": 8679130398.0, "step": 2841 }, { "epoch": 4.161201685290346, "grad_norm": 0.2619688483576692, "learning_rate": 6.6027357832466935e-06, "loss": 0.6638, "num_tokens": 8682027738.0, "step": 2842 }, { "epoch": 4.162667155156622, "grad_norm": 0.28026662814851494, "learning_rate": 6.593899092205627e-06, "loss": 0.6805, "num_tokens": 8685076316.0, "step": 2843 }, { "epoch": 4.164132625022898, "grad_norm": 0.2330718353126558, "learning_rate": 6.585076262714523e-06, "loss": 0.6484, "num_tokens": 8688143963.0, "step": 2844 }, { "epoch": 4.165598094889174, "grad_norm": 0.27017623742026037, "learning_rate": 6.576267302711674e-06, "loss": 0.6536, "num_tokens": 8691420125.0, "step": 2845 }, { "epoch": 4.16706356475545, "grad_norm": 0.23910696414953064, "learning_rate": 6.567472220122888e-06, "loss": 0.6639, "num_tokens": 8694567921.0, "step": 2846 }, { "epoch": 4.168529034621725, "grad_norm": 0.2404539323189428, "learning_rate": 6.5586910228614895e-06, "loss": 0.6605, "num_tokens": 8697466357.0, "step": 2847 }, { "epoch": 4.169994504488002, "grad_norm": 0.26139040310734346, "learning_rate": 6.549923718828306e-06, "loss": 0.6366, "num_tokens": 8700709023.0, "step": 2848 }, { "epoch": 4.171459974354278, "grad_norm": 0.2483274716151525, "learning_rate": 6.541170315911685e-06, "loss": 0.6591, "num_tokens": 8703447761.0, "step": 2849 }, { "epoch": 4.172925444220553, "grad_norm": 0.25282619636532216, "learning_rate": 6.532430821987426e-06, "loss": 0.6573, "num_tokens": 8706505608.0, "step": 2850 }, { "epoch": 4.174390914086829, "grad_norm": 0.2580020252215847, "learning_rate": 6.523705244918856e-06, "loss": 0.658, "num_tokens": 8709830150.0, "step": 2851 }, { "epoch": 4.175856383953105, "grad_norm": 0.2828820598737101, "learning_rate": 6.514993592556757e-06, "loss": 0.6582, "num_tokens": 8712884175.0, "step": 2852 }, { "epoch": 4.177321853819381, "grad_norm": 0.2502952083921611, "learning_rate": 6.506295872739388e-06, "loss": 0.6598, "num_tokens": 8715834191.0, "step": 2853 }, { "epoch": 4.178787323685657, "grad_norm": 0.24524123441380385, "learning_rate": 6.497612093292466e-06, "loss": 0.6504, "num_tokens": 8718560140.0, "step": 2854 }, { "epoch": 4.180252793551933, "grad_norm": 0.2611638560558994, "learning_rate": 6.488942262029185e-06, "loss": 0.6584, "num_tokens": 8721477799.0, "step": 2855 }, { "epoch": 4.181718263418208, "grad_norm": 0.24563653514466768, "learning_rate": 6.480286386750158e-06, "loss": 0.6677, "num_tokens": 8724416503.0, "step": 2856 }, { "epoch": 4.183183733284484, "grad_norm": 0.2580087824639078, "learning_rate": 6.471644475243472e-06, "loss": 0.6572, "num_tokens": 8727425012.0, "step": 2857 }, { "epoch": 4.1846492031507605, "grad_norm": 0.2637983930041308, "learning_rate": 6.463016535284633e-06, "loss": 0.6717, "num_tokens": 8730651387.0, "step": 2858 }, { "epoch": 4.186114673017036, "grad_norm": 0.27755898259284695, "learning_rate": 6.454402574636577e-06, "loss": 0.6617, "num_tokens": 8733772734.0, "step": 2859 }, { "epoch": 4.187580142883312, "grad_norm": 0.25581723319365895, "learning_rate": 6.44580260104967e-06, "loss": 0.6547, "num_tokens": 8736713599.0, "step": 2860 }, { "epoch": 4.189045612749588, "grad_norm": 0.27881942024675727, "learning_rate": 6.4372166222616835e-06, "loss": 0.6658, "num_tokens": 8739663552.0, "step": 2861 }, { "epoch": 4.190511082615863, "grad_norm": 0.2496936762919609, "learning_rate": 6.4286446459978e-06, "loss": 0.6581, "num_tokens": 8742915201.0, "step": 2862 }, { "epoch": 4.1919765524821395, "grad_norm": 0.23853767658519257, "learning_rate": 6.42008667997062e-06, "loss": 0.659, "num_tokens": 8745808807.0, "step": 2863 }, { "epoch": 4.193442022348416, "grad_norm": 0.2713529171051726, "learning_rate": 6.411542731880104e-06, "loss": 0.6427, "num_tokens": 8748824116.0, "step": 2864 }, { "epoch": 4.194907492214691, "grad_norm": 0.23794906023817944, "learning_rate": 6.403012809413634e-06, "loss": 0.6707, "num_tokens": 8751978825.0, "step": 2865 }, { "epoch": 4.196372962080967, "grad_norm": 0.23760338127296238, "learning_rate": 6.394496920245954e-06, "loss": 0.67, "num_tokens": 8754980546.0, "step": 2866 }, { "epoch": 4.197838431947243, "grad_norm": 0.264154055797665, "learning_rate": 6.385995072039185e-06, "loss": 0.6527, "num_tokens": 8758237295.0, "step": 2867 }, { "epoch": 4.1993039018135185, "grad_norm": 0.25264813842148964, "learning_rate": 6.377507272442817e-06, "loss": 0.6666, "num_tokens": 8761494596.0, "step": 2868 }, { "epoch": 4.200769371679795, "grad_norm": 0.24218431788640982, "learning_rate": 6.3690335290936966e-06, "loss": 0.6654, "num_tokens": 8764659144.0, "step": 2869 }, { "epoch": 4.202234841546071, "grad_norm": 0.2531054971525206, "learning_rate": 6.360573849616022e-06, "loss": 0.6709, "num_tokens": 8767829885.0, "step": 2870 }, { "epoch": 4.203700311412347, "grad_norm": 0.24521970143812893, "learning_rate": 6.3521282416213495e-06, "loss": 0.6725, "num_tokens": 8770847273.0, "step": 2871 }, { "epoch": 4.205165781278622, "grad_norm": 0.2802457881309137, "learning_rate": 6.34369671270856e-06, "loss": 0.6565, "num_tokens": 8773900011.0, "step": 2872 }, { "epoch": 4.206631251144898, "grad_norm": 0.24324892766984155, "learning_rate": 6.3352792704638764e-06, "loss": 0.6477, "num_tokens": 8776905823.0, "step": 2873 }, { "epoch": 4.208096721011175, "grad_norm": 0.2496964377387776, "learning_rate": 6.32687592246084e-06, "loss": 0.6791, "num_tokens": 8780039376.0, "step": 2874 }, { "epoch": 4.20956219087745, "grad_norm": 0.2487931872826802, "learning_rate": 6.318486676260316e-06, "loss": 0.6551, "num_tokens": 8783231643.0, "step": 2875 }, { "epoch": 4.211027660743726, "grad_norm": 0.2721115568072086, "learning_rate": 6.310111539410476e-06, "loss": 0.664, "num_tokens": 8786122538.0, "step": 2876 }, { "epoch": 4.212493130610002, "grad_norm": 0.2417328134338784, "learning_rate": 6.301750519446814e-06, "loss": 0.6642, "num_tokens": 8789043097.0, "step": 2877 }, { "epoch": 4.2139586004762775, "grad_norm": 0.26092837863741114, "learning_rate": 6.293403623892093e-06, "loss": 0.6494, "num_tokens": 8791887076.0, "step": 2878 }, { "epoch": 4.215424070342554, "grad_norm": 0.2506319313665538, "learning_rate": 6.285070860256389e-06, "loss": 0.6612, "num_tokens": 8794950138.0, "step": 2879 }, { "epoch": 4.21688954020883, "grad_norm": 0.2458467697589475, "learning_rate": 6.2767522360370684e-06, "loss": 0.687, "num_tokens": 8797936881.0, "step": 2880 }, { "epoch": 4.218355010075105, "grad_norm": 0.25915370976686813, "learning_rate": 6.268447758718745e-06, "loss": 0.6591, "num_tokens": 8800988506.0, "step": 2881 }, { "epoch": 4.219820479941381, "grad_norm": 0.24791290494883722, "learning_rate": 6.260157435773341e-06, "loss": 0.6694, "num_tokens": 8804042327.0, "step": 2882 }, { "epoch": 4.221285949807657, "grad_norm": 0.25292771854886675, "learning_rate": 6.251881274660021e-06, "loss": 0.6827, "num_tokens": 8806999940.0, "step": 2883 }, { "epoch": 4.222751419673933, "grad_norm": 0.27677073018613024, "learning_rate": 6.243619282825213e-06, "loss": 0.6655, "num_tokens": 8809887968.0, "step": 2884 }, { "epoch": 4.224216889540209, "grad_norm": 0.25024258490495654, "learning_rate": 6.23537146770259e-06, "loss": 0.6723, "num_tokens": 8812734312.0, "step": 2885 }, { "epoch": 4.225682359406485, "grad_norm": 0.2808465310623127, "learning_rate": 6.2271378367130904e-06, "loss": 0.6591, "num_tokens": 8815611205.0, "step": 2886 }, { "epoch": 4.22714782927276, "grad_norm": 0.27012340916364197, "learning_rate": 6.2189183972648625e-06, "loss": 0.6689, "num_tokens": 8818558300.0, "step": 2887 }, { "epoch": 4.228613299139036, "grad_norm": 0.2857031130655367, "learning_rate": 6.210713156753304e-06, "loss": 0.677, "num_tokens": 8821721587.0, "step": 2888 }, { "epoch": 4.2300787690053125, "grad_norm": 0.2617659988500074, "learning_rate": 6.202522122561035e-06, "loss": 0.6761, "num_tokens": 8824729628.0, "step": 2889 }, { "epoch": 4.231544238871588, "grad_norm": 0.25616308577420405, "learning_rate": 6.194345302057889e-06, "loss": 0.6842, "num_tokens": 8827667180.0, "step": 2890 }, { "epoch": 4.233009708737864, "grad_norm": 0.2762435884193145, "learning_rate": 6.186182702600909e-06, "loss": 0.6622, "num_tokens": 8830766132.0, "step": 2891 }, { "epoch": 4.23447517860414, "grad_norm": 0.2904026436138465, "learning_rate": 6.178034331534354e-06, "loss": 0.653, "num_tokens": 8833841554.0, "step": 2892 }, { "epoch": 4.235940648470415, "grad_norm": 0.2582396038046331, "learning_rate": 6.1699001961896645e-06, "loss": 0.6848, "num_tokens": 8836907717.0, "step": 2893 }, { "epoch": 4.237406118336692, "grad_norm": 0.2587652405353099, "learning_rate": 6.161780303885493e-06, "loss": 0.6766, "num_tokens": 8840181205.0, "step": 2894 }, { "epoch": 4.238871588202968, "grad_norm": 0.27095115657505847, "learning_rate": 6.153674661927653e-06, "loss": 0.6636, "num_tokens": 8843717963.0, "step": 2895 }, { "epoch": 4.240337058069244, "grad_norm": 0.2555779841673245, "learning_rate": 6.1455832776091584e-06, "loss": 0.6735, "num_tokens": 8846821275.0, "step": 2896 }, { "epoch": 4.241802527935519, "grad_norm": 0.26732524935651025, "learning_rate": 6.137506158210182e-06, "loss": 0.6672, "num_tokens": 8849770596.0, "step": 2897 }, { "epoch": 4.243267997801795, "grad_norm": 0.26390330405484164, "learning_rate": 6.1294433109980664e-06, "loss": 0.6583, "num_tokens": 8852800120.0, "step": 2898 }, { "epoch": 4.2447334676680715, "grad_norm": 0.24757050148622006, "learning_rate": 6.121394743227304e-06, "loss": 0.6731, "num_tokens": 8855748216.0, "step": 2899 }, { "epoch": 4.246198937534347, "grad_norm": 0.2636225204941994, "learning_rate": 6.113360462139561e-06, "loss": 0.6425, "num_tokens": 8858782991.0, "step": 2900 }, { "epoch": 4.247664407400623, "grad_norm": 0.2586106043606745, "learning_rate": 6.105340474963623e-06, "loss": 0.6713, "num_tokens": 8861839646.0, "step": 2901 }, { "epoch": 4.249129877266899, "grad_norm": 0.2513157899039302, "learning_rate": 6.097334788915435e-06, "loss": 0.6469, "num_tokens": 8865049976.0, "step": 2902 }, { "epoch": 4.250595347133174, "grad_norm": 0.2779915978737713, "learning_rate": 6.089343411198061e-06, "loss": 0.6644, "num_tokens": 8868380907.0, "step": 2903 }, { "epoch": 4.2520608169994505, "grad_norm": 0.275583662234401, "learning_rate": 6.0813663490017e-06, "loss": 0.6834, "num_tokens": 8871348100.0, "step": 2904 }, { "epoch": 4.253526286865727, "grad_norm": 0.24855474410357584, "learning_rate": 6.073403609503665e-06, "loss": 0.6528, "num_tokens": 8874711875.0, "step": 2905 }, { "epoch": 4.254991756732002, "grad_norm": 0.2704408871162648, "learning_rate": 6.065455199868389e-06, "loss": 0.6655, "num_tokens": 8877844541.0, "step": 2906 }, { "epoch": 4.256457226598278, "grad_norm": 0.24880828697263843, "learning_rate": 6.057521127247399e-06, "loss": 0.659, "num_tokens": 8880770597.0, "step": 2907 }, { "epoch": 4.257922696464554, "grad_norm": 0.25273243633482995, "learning_rate": 6.049601398779343e-06, "loss": 0.66, "num_tokens": 8883876628.0, "step": 2908 }, { "epoch": 4.2593881663308295, "grad_norm": 0.2547349628226953, "learning_rate": 6.041696021589938e-06, "loss": 0.6593, "num_tokens": 8886981639.0, "step": 2909 }, { "epoch": 4.260853636197106, "grad_norm": 0.24118291803712041, "learning_rate": 6.03380500279201e-06, "loss": 0.647, "num_tokens": 8890165868.0, "step": 2910 }, { "epoch": 4.262319106063382, "grad_norm": 0.26183331853578995, "learning_rate": 6.025928349485457e-06, "loss": 0.6803, "num_tokens": 8893127415.0, "step": 2911 }, { "epoch": 4.263784575929657, "grad_norm": 0.2507686391805737, "learning_rate": 6.018066068757247e-06, "loss": 0.6772, "num_tokens": 8896204145.0, "step": 2912 }, { "epoch": 4.265250045795933, "grad_norm": 0.25131707587143337, "learning_rate": 6.0102181676814245e-06, "loss": 0.6629, "num_tokens": 8899336560.0, "step": 2913 }, { "epoch": 4.266715515662209, "grad_norm": 0.271370526408866, "learning_rate": 6.002384653319098e-06, "loss": 0.6804, "num_tokens": 8902180196.0, "step": 2914 }, { "epoch": 4.268180985528485, "grad_norm": 0.2529260157711417, "learning_rate": 5.994565532718414e-06, "loss": 0.6565, "num_tokens": 8905191814.0, "step": 2915 }, { "epoch": 4.269646455394761, "grad_norm": 0.27827917896034154, "learning_rate": 5.986760812914593e-06, "loss": 0.677, "num_tokens": 8908129168.0, "step": 2916 }, { "epoch": 4.271111925261037, "grad_norm": 0.256103388878135, "learning_rate": 5.978970500929881e-06, "loss": 0.6797, "num_tokens": 8911150269.0, "step": 2917 }, { "epoch": 4.272577395127312, "grad_norm": 0.2571496760861382, "learning_rate": 5.971194603773567e-06, "loss": 0.6589, "num_tokens": 8914092763.0, "step": 2918 }, { "epoch": 4.274042864993588, "grad_norm": 0.25676599110027704, "learning_rate": 5.963433128441971e-06, "loss": 0.6895, "num_tokens": 8916937345.0, "step": 2919 }, { "epoch": 4.275508334859865, "grad_norm": 0.2874288739296436, "learning_rate": 5.955686081918433e-06, "loss": 0.674, "num_tokens": 8919928325.0, "step": 2920 }, { "epoch": 4.27697380472614, "grad_norm": 0.2557608302007733, "learning_rate": 5.947953471173313e-06, "loss": 0.6718, "num_tokens": 8923009217.0, "step": 2921 }, { "epoch": 4.278439274592416, "grad_norm": 0.25476046662613056, "learning_rate": 5.940235303163986e-06, "loss": 0.6567, "num_tokens": 8925982174.0, "step": 2922 }, { "epoch": 4.279904744458692, "grad_norm": 0.2418554541072236, "learning_rate": 5.932531584834824e-06, "loss": 0.6674, "num_tokens": 8928994112.0, "step": 2923 }, { "epoch": 4.281370214324968, "grad_norm": 0.2636409820372933, "learning_rate": 5.924842323117204e-06, "loss": 0.6656, "num_tokens": 8932099767.0, "step": 2924 }, { "epoch": 4.282835684191244, "grad_norm": 0.23415339047416747, "learning_rate": 5.917167524929503e-06, "loss": 0.647, "num_tokens": 8935453991.0, "step": 2925 }, { "epoch": 4.28430115405752, "grad_norm": 0.24510598513910214, "learning_rate": 5.909507197177059e-06, "loss": 0.6724, "num_tokens": 8938770001.0, "step": 2926 }, { "epoch": 4.285766623923796, "grad_norm": 0.261555535345284, "learning_rate": 5.901861346752225e-06, "loss": 0.6615, "num_tokens": 8941796843.0, "step": 2927 }, { "epoch": 4.287232093790071, "grad_norm": 0.26597895667242, "learning_rate": 5.894229980534301e-06, "loss": 0.6534, "num_tokens": 8944849932.0, "step": 2928 }, { "epoch": 4.288697563656347, "grad_norm": 0.2628441241582817, "learning_rate": 5.886613105389568e-06, "loss": 0.6676, "num_tokens": 8947656855.0, "step": 2929 }, { "epoch": 4.2901630335226235, "grad_norm": 0.23753247674510014, "learning_rate": 5.87901072817126e-06, "loss": 0.6581, "num_tokens": 8950790029.0, "step": 2930 }, { "epoch": 4.291628503388899, "grad_norm": 0.2563824854021634, "learning_rate": 5.871422855719584e-06, "loss": 0.6488, "num_tokens": 8953868789.0, "step": 2931 }, { "epoch": 4.293093973255175, "grad_norm": 0.2718253643156808, "learning_rate": 5.863849494861668e-06, "loss": 0.6627, "num_tokens": 8957115650.0, "step": 2932 }, { "epoch": 4.294559443121451, "grad_norm": 0.26680257667825785, "learning_rate": 5.856290652411613e-06, "loss": 0.6684, "num_tokens": 8960117245.0, "step": 2933 }, { "epoch": 4.296024912987726, "grad_norm": 0.24551325964835713, "learning_rate": 5.848746335170436e-06, "loss": 0.659, "num_tokens": 8963456827.0, "step": 2934 }, { "epoch": 4.2974903828540025, "grad_norm": 0.26038924072771213, "learning_rate": 5.8412165499260966e-06, "loss": 0.6457, "num_tokens": 8966604911.0, "step": 2935 }, { "epoch": 4.298955852720279, "grad_norm": 0.2573026694235447, "learning_rate": 5.8337013034534725e-06, "loss": 0.6648, "num_tokens": 8969538753.0, "step": 2936 }, { "epoch": 4.300421322586554, "grad_norm": 0.24785788184841212, "learning_rate": 5.826200602514365e-06, "loss": 0.6645, "num_tokens": 8972722707.0, "step": 2937 }, { "epoch": 4.30188679245283, "grad_norm": 0.2298122002317854, "learning_rate": 5.818714453857482e-06, "loss": 0.6783, "num_tokens": 8975702522.0, "step": 2938 }, { "epoch": 4.303352262319106, "grad_norm": 0.24378118601884363, "learning_rate": 5.811242864218455e-06, "loss": 0.6688, "num_tokens": 8978819858.0, "step": 2939 }, { "epoch": 4.3048177321853816, "grad_norm": 0.25781975641764454, "learning_rate": 5.803785840319784e-06, "loss": 0.6656, "num_tokens": 8981930141.0, "step": 2940 }, { "epoch": 4.306283202051658, "grad_norm": 0.23521527800797498, "learning_rate": 5.7963433888709e-06, "loss": 0.6677, "num_tokens": 8984918108.0, "step": 2941 }, { "epoch": 4.307748671917934, "grad_norm": 0.24151862352338208, "learning_rate": 5.7889155165681e-06, "loss": 0.6952, "num_tokens": 8987883045.0, "step": 2942 }, { "epoch": 4.30921414178421, "grad_norm": 0.26479391523318335, "learning_rate": 5.7815022300945664e-06, "loss": 0.6368, "num_tokens": 8990861932.0, "step": 2943 }, { "epoch": 4.310679611650485, "grad_norm": 0.24856290082720506, "learning_rate": 5.774103536120359e-06, "loss": 0.6624, "num_tokens": 8993873835.0, "step": 2944 }, { "epoch": 4.3121450815167615, "grad_norm": 0.2650284973756836, "learning_rate": 5.7667194413024195e-06, "loss": 0.6623, "num_tokens": 8996861868.0, "step": 2945 }, { "epoch": 4.313610551383038, "grad_norm": 0.25650339318741877, "learning_rate": 5.759349952284532e-06, "loss": 0.6574, "num_tokens": 8999905691.0, "step": 2946 }, { "epoch": 4.315076021249313, "grad_norm": 0.24879595612203237, "learning_rate": 5.7519950756973616e-06, "loss": 0.6619, "num_tokens": 9002912553.0, "step": 2947 }, { "epoch": 4.316541491115589, "grad_norm": 0.2685118006379776, "learning_rate": 5.744654818158412e-06, "loss": 0.6799, "num_tokens": 9005807209.0, "step": 2948 }, { "epoch": 4.318006960981865, "grad_norm": 0.23683261390694316, "learning_rate": 5.7373291862720355e-06, "loss": 0.6786, "num_tokens": 9008950336.0, "step": 2949 }, { "epoch": 4.3194724308481405, "grad_norm": 0.2513178957167594, "learning_rate": 5.730018186629431e-06, "loss": 0.6576, "num_tokens": 9012097105.0, "step": 2950 }, { "epoch": 4.320937900714417, "grad_norm": 0.2648886196152374, "learning_rate": 5.722721825808626e-06, "loss": 0.656, "num_tokens": 9015199091.0, "step": 2951 }, { "epoch": 4.322403370580693, "grad_norm": 0.24830275113741962, "learning_rate": 5.715440110374474e-06, "loss": 0.655, "num_tokens": 9018232510.0, "step": 2952 }, { "epoch": 4.323868840446968, "grad_norm": 0.2577958467251042, "learning_rate": 5.708173046878668e-06, "loss": 0.6763, "num_tokens": 9021460544.0, "step": 2953 }, { "epoch": 4.325334310313244, "grad_norm": 0.2410976995603928, "learning_rate": 5.700920641859692e-06, "loss": 0.6641, "num_tokens": 9024328676.0, "step": 2954 }, { "epoch": 4.32679978017952, "grad_norm": 0.2573195185907293, "learning_rate": 5.693682901842867e-06, "loss": 0.6627, "num_tokens": 9027538989.0, "step": 2955 }, { "epoch": 4.328265250045796, "grad_norm": 0.24827090498728469, "learning_rate": 5.686459833340302e-06, "loss": 0.6674, "num_tokens": 9030531867.0, "step": 2956 }, { "epoch": 4.329730719912072, "grad_norm": 0.2540006400350655, "learning_rate": 5.679251442850911e-06, "loss": 0.6818, "num_tokens": 9033739154.0, "step": 2957 }, { "epoch": 4.331196189778348, "grad_norm": 0.24283018345125743, "learning_rate": 5.672057736860401e-06, "loss": 0.6749, "num_tokens": 9036634402.0, "step": 2958 }, { "epoch": 4.332661659644623, "grad_norm": 0.25243851792249855, "learning_rate": 5.664878721841276e-06, "loss": 0.6447, "num_tokens": 9039650200.0, "step": 2959 }, { "epoch": 4.334127129510899, "grad_norm": 0.25317559043121124, "learning_rate": 5.657714404252797e-06, "loss": 0.6983, "num_tokens": 9042587643.0, "step": 2960 }, { "epoch": 4.335592599377176, "grad_norm": 0.27123755428168456, "learning_rate": 5.650564790541032e-06, "loss": 0.6526, "num_tokens": 9045569332.0, "step": 2961 }, { "epoch": 4.337058069243451, "grad_norm": 0.24613108412945423, "learning_rate": 5.643429887138801e-06, "loss": 0.6811, "num_tokens": 9048627409.0, "step": 2962 }, { "epoch": 4.338523539109727, "grad_norm": 0.2526093710011062, "learning_rate": 5.636309700465684e-06, "loss": 0.6483, "num_tokens": 9051586770.0, "step": 2963 }, { "epoch": 4.339989008976003, "grad_norm": 0.27446649767461523, "learning_rate": 5.6292042369280355e-06, "loss": 0.6543, "num_tokens": 9054476839.0, "step": 2964 }, { "epoch": 4.341454478842278, "grad_norm": 0.2533795337780764, "learning_rate": 5.622113502918954e-06, "loss": 0.6539, "num_tokens": 9057624054.0, "step": 2965 }, { "epoch": 4.342919948708555, "grad_norm": 0.2558906120866623, "learning_rate": 5.615037504818282e-06, "loss": 0.6695, "num_tokens": 9060600046.0, "step": 2966 }, { "epoch": 4.344385418574831, "grad_norm": 0.2441813103136406, "learning_rate": 5.607976248992615e-06, "loss": 0.6573, "num_tokens": 9063526586.0, "step": 2967 }, { "epoch": 4.345850888441106, "grad_norm": 0.26140083132104486, "learning_rate": 5.600929741795271e-06, "loss": 0.6681, "num_tokens": 9066662266.0, "step": 2968 }, { "epoch": 4.347316358307382, "grad_norm": 0.23781288769666115, "learning_rate": 5.5938979895663035e-06, "loss": 0.6759, "num_tokens": 9069972720.0, "step": 2969 }, { "epoch": 4.348781828173658, "grad_norm": 0.2739594547858151, "learning_rate": 5.5868809986325e-06, "loss": 0.6716, "num_tokens": 9072920280.0, "step": 2970 }, { "epoch": 4.350247298039934, "grad_norm": 0.2663442627153427, "learning_rate": 5.579878775307344e-06, "loss": 0.6446, "num_tokens": 9076009077.0, "step": 2971 }, { "epoch": 4.35171276790621, "grad_norm": 0.25357554724588605, "learning_rate": 5.572891325891057e-06, "loss": 0.6681, "num_tokens": 9079004417.0, "step": 2972 }, { "epoch": 4.353178237772486, "grad_norm": 0.2860076905235898, "learning_rate": 5.565918656670548e-06, "loss": 0.6504, "num_tokens": 9082244154.0, "step": 2973 }, { "epoch": 4.354643707638762, "grad_norm": 0.244471391375885, "learning_rate": 5.558960773919441e-06, "loss": 0.6533, "num_tokens": 9085322485.0, "step": 2974 }, { "epoch": 4.356109177505037, "grad_norm": 0.24498590224750422, "learning_rate": 5.552017683898044e-06, "loss": 0.666, "num_tokens": 9088414128.0, "step": 2975 }, { "epoch": 4.3575746473713135, "grad_norm": 0.2742922929432383, "learning_rate": 5.545089392853375e-06, "loss": 0.6757, "num_tokens": 9091527741.0, "step": 2976 }, { "epoch": 4.35904011723759, "grad_norm": 0.25982029122973377, "learning_rate": 5.538175907019105e-06, "loss": 0.6609, "num_tokens": 9094643094.0, "step": 2977 }, { "epoch": 4.360505587103865, "grad_norm": 0.23976058825854804, "learning_rate": 5.53127723261562e-06, "loss": 0.6665, "num_tokens": 9097677935.0, "step": 2978 }, { "epoch": 4.361971056970141, "grad_norm": 0.24473297605558886, "learning_rate": 5.524393375849952e-06, "loss": 0.6625, "num_tokens": 9100990404.0, "step": 2979 }, { "epoch": 4.363436526836417, "grad_norm": 0.23032098211621221, "learning_rate": 5.517524342915818e-06, "loss": 0.6803, "num_tokens": 9104124426.0, "step": 2980 }, { "epoch": 4.3649019967026925, "grad_norm": 0.23313893483942788, "learning_rate": 5.510670139993584e-06, "loss": 0.6775, "num_tokens": 9107003233.0, "step": 2981 }, { "epoch": 4.366367466568969, "grad_norm": 0.24991895956648105, "learning_rate": 5.503830773250285e-06, "loss": 0.6695, "num_tokens": 9110038268.0, "step": 2982 }, { "epoch": 4.367832936435245, "grad_norm": 0.24693325419310477, "learning_rate": 5.497006248839593e-06, "loss": 0.6778, "num_tokens": 9113097541.0, "step": 2983 }, { "epoch": 4.36929840630152, "grad_norm": 0.24577445698076777, "learning_rate": 5.490196572901851e-06, "loss": 0.6685, "num_tokens": 9116114831.0, "step": 2984 }, { "epoch": 4.370763876167796, "grad_norm": 0.26108320791907447, "learning_rate": 5.4834017515640045e-06, "loss": 0.6839, "num_tokens": 9119248578.0, "step": 2985 }, { "epoch": 4.372229346034072, "grad_norm": 0.2373901653532449, "learning_rate": 5.476621790939669e-06, "loss": 0.6573, "num_tokens": 9122182871.0, "step": 2986 }, { "epoch": 4.373694815900348, "grad_norm": 0.25092928070548365, "learning_rate": 5.469856697129069e-06, "loss": 0.6604, "num_tokens": 9125281733.0, "step": 2987 }, { "epoch": 4.375160285766624, "grad_norm": 0.2615713411172448, "learning_rate": 5.463106476219062e-06, "loss": 0.6673, "num_tokens": 9128129068.0, "step": 2988 }, { "epoch": 4.3766257556329, "grad_norm": 0.26742743717009315, "learning_rate": 5.456371134283115e-06, "loss": 0.6619, "num_tokens": 9131171155.0, "step": 2989 }, { "epoch": 4.378091225499175, "grad_norm": 0.28309691375388185, "learning_rate": 5.449650677381321e-06, "loss": 0.6684, "num_tokens": 9134325295.0, "step": 2990 }, { "epoch": 4.3795566953654514, "grad_norm": 0.23548124503230008, "learning_rate": 5.442945111560363e-06, "loss": 0.6669, "num_tokens": 9137373978.0, "step": 2991 }, { "epoch": 4.381022165231728, "grad_norm": 0.2525629540082861, "learning_rate": 5.4362544428535395e-06, "loss": 0.6514, "num_tokens": 9140732009.0, "step": 2992 }, { "epoch": 4.382487635098004, "grad_norm": 0.24729094648386607, "learning_rate": 5.4295786772807416e-06, "loss": 0.6615, "num_tokens": 9143876369.0, "step": 2993 }, { "epoch": 4.383953104964279, "grad_norm": 0.25021136302442404, "learning_rate": 5.422917820848449e-06, "loss": 0.6482, "num_tokens": 9147034724.0, "step": 2994 }, { "epoch": 4.385418574830555, "grad_norm": 0.2729141084807848, "learning_rate": 5.416271879549731e-06, "loss": 0.6736, "num_tokens": 9149948555.0, "step": 2995 }, { "epoch": 4.386884044696831, "grad_norm": 0.27813257687185355, "learning_rate": 5.409640859364232e-06, "loss": 0.6833, "num_tokens": 9152851620.0, "step": 2996 }, { "epoch": 4.388349514563107, "grad_norm": 0.24031644443037073, "learning_rate": 5.403024766258169e-06, "loss": 0.6525, "num_tokens": 9155798891.0, "step": 2997 }, { "epoch": 4.389814984429383, "grad_norm": 0.2631805434843908, "learning_rate": 5.396423606184349e-06, "loss": 0.6707, "num_tokens": 9158861765.0, "step": 2998 }, { "epoch": 4.391280454295659, "grad_norm": 0.2930265187067786, "learning_rate": 5.389837385082108e-06, "loss": 0.657, "num_tokens": 9162057423.0, "step": 2999 }, { "epoch": 4.392745924161934, "grad_norm": 0.25188901190683516, "learning_rate": 5.383266108877379e-06, "loss": 0.6694, "num_tokens": 9165191595.0, "step": 3000 }, { "epoch": 4.39421139402821, "grad_norm": 0.2958681206092923, "learning_rate": 5.376709783482616e-06, "loss": 0.6656, "num_tokens": 9168313516.0, "step": 3001 }, { "epoch": 4.3956768638944865, "grad_norm": 0.2771785114395411, "learning_rate": 5.370168414796839e-06, "loss": 0.6564, "num_tokens": 9171131040.0, "step": 3002 }, { "epoch": 4.397142333760762, "grad_norm": 0.26956782026445514, "learning_rate": 5.3636420087056095e-06, "loss": 0.6644, "num_tokens": 9174159896.0, "step": 3003 }, { "epoch": 4.398607803627038, "grad_norm": 0.25843294104764697, "learning_rate": 5.3571305710810195e-06, "loss": 0.6704, "num_tokens": 9177117611.0, "step": 3004 }, { "epoch": 4.400073273493314, "grad_norm": 0.27172850750197225, "learning_rate": 5.350634107781699e-06, "loss": 0.6666, "num_tokens": 9179910355.0, "step": 3005 }, { "epoch": 4.401538743359589, "grad_norm": 0.27383478236259534, "learning_rate": 5.344152624652801e-06, "loss": 0.6803, "num_tokens": 9182927015.0, "step": 3006 }, { "epoch": 4.4030042132258655, "grad_norm": 0.2647415176351087, "learning_rate": 5.337686127526009e-06, "loss": 0.6595, "num_tokens": 9186168990.0, "step": 3007 }, { "epoch": 4.404469683092142, "grad_norm": 0.2715294900144207, "learning_rate": 5.3312346222195055e-06, "loss": 0.6633, "num_tokens": 9189138932.0, "step": 3008 }, { "epoch": 4.405935152958417, "grad_norm": 0.23933675334435248, "learning_rate": 5.324798114538006e-06, "loss": 0.6815, "num_tokens": 9192107120.0, "step": 3009 }, { "epoch": 4.407400622824693, "grad_norm": 0.2534735141952917, "learning_rate": 5.318376610272715e-06, "loss": 0.6749, "num_tokens": 9194938935.0, "step": 3010 }, { "epoch": 4.408866092690969, "grad_norm": 0.26419113552766715, "learning_rate": 5.311970115201345e-06, "loss": 0.666, "num_tokens": 9197794276.0, "step": 3011 }, { "epoch": 4.410331562557245, "grad_norm": 0.2561183335323131, "learning_rate": 5.305578635088101e-06, "loss": 0.6536, "num_tokens": 9200669509.0, "step": 3012 }, { "epoch": 4.411797032423521, "grad_norm": 0.26324245265136, "learning_rate": 5.299202175683684e-06, "loss": 0.6779, "num_tokens": 9203601947.0, "step": 3013 }, { "epoch": 4.413262502289797, "grad_norm": 0.25112355329905733, "learning_rate": 5.29284074272527e-06, "loss": 0.6559, "num_tokens": 9206491246.0, "step": 3014 }, { "epoch": 4.414727972156072, "grad_norm": 0.2548767866261667, "learning_rate": 5.286494341936532e-06, "loss": 0.6701, "num_tokens": 9209451921.0, "step": 3015 }, { "epoch": 4.416193442022348, "grad_norm": 0.24301911547249022, "learning_rate": 5.280162979027594e-06, "loss": 0.6591, "num_tokens": 9212603036.0, "step": 3016 }, { "epoch": 4.4176589118886245, "grad_norm": 0.2510433669552997, "learning_rate": 5.2738466596950746e-06, "loss": 0.6601, "num_tokens": 9215474892.0, "step": 3017 }, { "epoch": 4.4191243817549, "grad_norm": 0.24716221088257032, "learning_rate": 5.26754538962204e-06, "loss": 0.6609, "num_tokens": 9218450657.0, "step": 3018 }, { "epoch": 4.420589851621176, "grad_norm": 0.24420092126624518, "learning_rate": 5.261259174478022e-06, "loss": 0.6607, "num_tokens": 9221475868.0, "step": 3019 }, { "epoch": 4.422055321487452, "grad_norm": 0.24019242512303168, "learning_rate": 5.254988019919004e-06, "loss": 0.6747, "num_tokens": 9224630529.0, "step": 3020 }, { "epoch": 4.423520791353727, "grad_norm": 0.24335358092685966, "learning_rate": 5.248731931587428e-06, "loss": 0.6461, "num_tokens": 9227722974.0, "step": 3021 }, { "epoch": 4.4249862612200035, "grad_norm": 0.23644840592204705, "learning_rate": 5.242490915112163e-06, "loss": 0.6592, "num_tokens": 9230740438.0, "step": 3022 }, { "epoch": 4.42645173108628, "grad_norm": 0.24539493526892406, "learning_rate": 5.236264976108537e-06, "loss": 0.6646, "num_tokens": 9233723359.0, "step": 3023 }, { "epoch": 4.427917200952556, "grad_norm": 0.24802410516707743, "learning_rate": 5.2300541201782985e-06, "loss": 0.6511, "num_tokens": 9236712034.0, "step": 3024 }, { "epoch": 4.429382670818831, "grad_norm": 0.23896057537849677, "learning_rate": 5.2238583529096285e-06, "loss": 0.6683, "num_tokens": 9239765774.0, "step": 3025 }, { "epoch": 4.430848140685107, "grad_norm": 0.2590115412654243, "learning_rate": 5.217677679877135e-06, "loss": 0.6654, "num_tokens": 9242854259.0, "step": 3026 }, { "epoch": 4.432313610551383, "grad_norm": 0.26691164948157764, "learning_rate": 5.211512106641838e-06, "loss": 0.663, "num_tokens": 9245876296.0, "step": 3027 }, { "epoch": 4.433779080417659, "grad_norm": 0.24520438050268956, "learning_rate": 5.205361638751179e-06, "loss": 0.6532, "num_tokens": 9248905143.0, "step": 3028 }, { "epoch": 4.435244550283935, "grad_norm": 0.24319863078662501, "learning_rate": 5.19922628173901e-06, "loss": 0.6697, "num_tokens": 9252154869.0, "step": 3029 }, { "epoch": 4.436710020150211, "grad_norm": 0.26749816316740893, "learning_rate": 5.193106041125575e-06, "loss": 0.6832, "num_tokens": 9255207219.0, "step": 3030 }, { "epoch": 4.438175490016486, "grad_norm": 0.24125730426563372, "learning_rate": 5.187000922417531e-06, "loss": 0.6707, "num_tokens": 9258411751.0, "step": 3031 }, { "epoch": 4.439640959882762, "grad_norm": 0.2560535423299746, "learning_rate": 5.180910931107922e-06, "loss": 0.6737, "num_tokens": 9261258869.0, "step": 3032 }, { "epoch": 4.441106429749039, "grad_norm": 0.2568454262845663, "learning_rate": 5.174836072676182e-06, "loss": 0.6601, "num_tokens": 9264370602.0, "step": 3033 }, { "epoch": 4.442571899615314, "grad_norm": 0.23836527014953598, "learning_rate": 5.168776352588123e-06, "loss": 0.6547, "num_tokens": 9267436557.0, "step": 3034 }, { "epoch": 4.44403736948159, "grad_norm": 0.25561704449376954, "learning_rate": 5.162731776295958e-06, "loss": 0.6753, "num_tokens": 9270551120.0, "step": 3035 }, { "epoch": 4.445502839347866, "grad_norm": 0.25393343699997695, "learning_rate": 5.156702349238244e-06, "loss": 0.6702, "num_tokens": 9273512269.0, "step": 3036 }, { "epoch": 4.446968309214141, "grad_norm": 0.25010216771630867, "learning_rate": 5.150688076839934e-06, "loss": 0.6652, "num_tokens": 9276414489.0, "step": 3037 }, { "epoch": 4.448433779080418, "grad_norm": 0.2393702470898058, "learning_rate": 5.144688964512327e-06, "loss": 0.6598, "num_tokens": 9279350541.0, "step": 3038 }, { "epoch": 4.449899248946694, "grad_norm": 0.25037438553208136, "learning_rate": 5.1387050176530975e-06, "loss": 0.6665, "num_tokens": 9282277929.0, "step": 3039 }, { "epoch": 4.451364718812969, "grad_norm": 0.2511182281352977, "learning_rate": 5.132736241646259e-06, "loss": 0.6647, "num_tokens": 9285318838.0, "step": 3040 }, { "epoch": 4.452830188679245, "grad_norm": 0.23912898207650501, "learning_rate": 5.126782641862189e-06, "loss": 0.6697, "num_tokens": 9288501630.0, "step": 3041 }, { "epoch": 4.454295658545521, "grad_norm": 0.24645585039622456, "learning_rate": 5.120844223657598e-06, "loss": 0.6612, "num_tokens": 9291785281.0, "step": 3042 }, { "epoch": 4.4557611284117975, "grad_norm": 0.26553474098149116, "learning_rate": 5.114920992375553e-06, "loss": 0.668, "num_tokens": 9294763946.0, "step": 3043 }, { "epoch": 4.457226598278073, "grad_norm": 0.2406365541096892, "learning_rate": 5.109012953345438e-06, "loss": 0.6668, "num_tokens": 9297871294.0, "step": 3044 }, { "epoch": 4.458692068144349, "grad_norm": 0.25000972574977337, "learning_rate": 5.103120111882978e-06, "loss": 0.6851, "num_tokens": 9300854081.0, "step": 3045 }, { "epoch": 4.460157538010625, "grad_norm": 0.26001995647034953, "learning_rate": 5.097242473290231e-06, "loss": 0.6664, "num_tokens": 9303887882.0, "step": 3046 }, { "epoch": 4.4616230078769, "grad_norm": 0.23301341749670199, "learning_rate": 5.091380042855557e-06, "loss": 0.6524, "num_tokens": 9307084278.0, "step": 3047 }, { "epoch": 4.4630884777431765, "grad_norm": 0.24295331900565087, "learning_rate": 5.085532825853651e-06, "loss": 0.6602, "num_tokens": 9310135501.0, "step": 3048 }, { "epoch": 4.464553947609453, "grad_norm": 0.24379296827155128, "learning_rate": 5.079700827545512e-06, "loss": 0.6549, "num_tokens": 9313160885.0, "step": 3049 }, { "epoch": 4.466019417475728, "grad_norm": 0.22967579232457286, "learning_rate": 5.073884053178443e-06, "loss": 0.6658, "num_tokens": 9316091683.0, "step": 3050 }, { "epoch": 4.467484887342004, "grad_norm": 0.2567254338984546, "learning_rate": 5.068082507986054e-06, "loss": 0.667, "num_tokens": 9319084087.0, "step": 3051 }, { "epoch": 4.46895035720828, "grad_norm": 0.24916683409938942, "learning_rate": 5.06229619718826e-06, "loss": 0.6843, "num_tokens": 9321976414.0, "step": 3052 }, { "epoch": 4.4704158270745555, "grad_norm": 0.2474760451483358, "learning_rate": 5.056525125991247e-06, "loss": 0.6748, "num_tokens": 9324914518.0, "step": 3053 }, { "epoch": 4.471881296940832, "grad_norm": 0.2418707946050199, "learning_rate": 5.050769299587513e-06, "loss": 0.6462, "num_tokens": 9327972664.0, "step": 3054 }, { "epoch": 4.473346766807108, "grad_norm": 0.2501616817508447, "learning_rate": 5.045028723155824e-06, "loss": 0.6683, "num_tokens": 9330942140.0, "step": 3055 }, { "epoch": 4.474812236673383, "grad_norm": 0.25012670122543673, "learning_rate": 5.039303401861233e-06, "loss": 0.6312, "num_tokens": 9334136430.0, "step": 3056 }, { "epoch": 4.476277706539659, "grad_norm": 0.24434469311399956, "learning_rate": 5.033593340855064e-06, "loss": 0.6631, "num_tokens": 9337336690.0, "step": 3057 }, { "epoch": 4.477743176405935, "grad_norm": 0.2404181446960059, "learning_rate": 5.027898545274912e-06, "loss": 0.6853, "num_tokens": 9340374251.0, "step": 3058 }, { "epoch": 4.479208646272211, "grad_norm": 0.24067731470521994, "learning_rate": 5.0222190202446315e-06, "loss": 0.6399, "num_tokens": 9343573786.0, "step": 3059 }, { "epoch": 4.480674116138487, "grad_norm": 0.2358926521056054, "learning_rate": 5.0165547708743536e-06, "loss": 0.644, "num_tokens": 9346542321.0, "step": 3060 }, { "epoch": 4.482139586004763, "grad_norm": 0.23924674341849783, "learning_rate": 5.010905802260438e-06, "loss": 0.6444, "num_tokens": 9349748644.0, "step": 3061 }, { "epoch": 4.483605055871038, "grad_norm": 0.24750978685076047, "learning_rate": 5.005272119485527e-06, "loss": 0.6688, "num_tokens": 9352910658.0, "step": 3062 }, { "epoch": 4.4850705257373145, "grad_norm": 0.26378245773562387, "learning_rate": 4.999653727618482e-06, "loss": 0.6738, "num_tokens": 9356032399.0, "step": 3063 }, { "epoch": 4.486535995603591, "grad_norm": 0.2565058016236164, "learning_rate": 4.994050631714427e-06, "loss": 0.6737, "num_tokens": 9359047876.0, "step": 3064 }, { "epoch": 4.488001465469866, "grad_norm": 0.23923595273059367, "learning_rate": 4.9884628368147085e-06, "loss": 0.6579, "num_tokens": 9362175992.0, "step": 3065 }, { "epoch": 4.489466935336142, "grad_norm": 0.2475377416740997, "learning_rate": 4.98289034794692e-06, "loss": 0.6484, "num_tokens": 9365055642.0, "step": 3066 }, { "epoch": 4.490932405202418, "grad_norm": 0.2651839536777912, "learning_rate": 4.9773331701248675e-06, "loss": 0.6482, "num_tokens": 9368376213.0, "step": 3067 }, { "epoch": 4.4923978750686935, "grad_norm": 0.23336756506910022, "learning_rate": 4.971791308348594e-06, "loss": 0.6404, "num_tokens": 9371480182.0, "step": 3068 }, { "epoch": 4.49386334493497, "grad_norm": 0.24018715616980066, "learning_rate": 4.966264767604359e-06, "loss": 0.6532, "num_tokens": 9374593974.0, "step": 3069 }, { "epoch": 4.495328814801246, "grad_norm": 0.2628949187775641, "learning_rate": 4.960753552864632e-06, "loss": 0.6601, "num_tokens": 9377565471.0, "step": 3070 }, { "epoch": 4.496794284667521, "grad_norm": 0.2448462772266706, "learning_rate": 4.9552576690881e-06, "loss": 0.656, "num_tokens": 9380685284.0, "step": 3071 }, { "epoch": 4.498259754533797, "grad_norm": 0.2614733081968525, "learning_rate": 4.94977712121965e-06, "loss": 0.6616, "num_tokens": 9383789908.0, "step": 3072 }, { "epoch": 4.499725224400073, "grad_norm": 0.2685278770089857, "learning_rate": 4.94431191419037e-06, "loss": 0.6624, "num_tokens": 9386766155.0, "step": 3073 }, { "epoch": 4.501190694266349, "grad_norm": 0.26347025786882816, "learning_rate": 4.938862052917558e-06, "loss": 0.6619, "num_tokens": 9389813110.0, "step": 3074 }, { "epoch": 4.502656164132625, "grad_norm": 0.23478587322355565, "learning_rate": 4.933427542304685e-06, "loss": 0.6632, "num_tokens": 9392894911.0, "step": 3075 }, { "epoch": 4.504121633998901, "grad_norm": 0.2632934216447485, "learning_rate": 4.9280083872414265e-06, "loss": 0.6525, "num_tokens": 9395664696.0, "step": 3076 }, { "epoch": 4.505587103865177, "grad_norm": 0.26502632198467946, "learning_rate": 4.9226045926036345e-06, "loss": 0.6616, "num_tokens": 9398710816.0, "step": 3077 }, { "epoch": 4.507052573731452, "grad_norm": 0.27038893288904303, "learning_rate": 4.917216163253342e-06, "loss": 0.6604, "num_tokens": 9401922463.0, "step": 3078 }, { "epoch": 4.508518043597729, "grad_norm": 0.2492078967135333, "learning_rate": 4.911843104038753e-06, "loss": 0.6628, "num_tokens": 9405214006.0, "step": 3079 }, { "epoch": 4.509983513464005, "grad_norm": 0.2534780473072918, "learning_rate": 4.906485419794255e-06, "loss": 0.6522, "num_tokens": 9408511761.0, "step": 3080 }, { "epoch": 4.51144898333028, "grad_norm": 0.25325552585032607, "learning_rate": 4.901143115340383e-06, "loss": 0.6534, "num_tokens": 9411431706.0, "step": 3081 }, { "epoch": 4.512914453196556, "grad_norm": 0.2633102015608032, "learning_rate": 4.895816195483852e-06, "loss": 0.6496, "num_tokens": 9414867077.0, "step": 3082 }, { "epoch": 4.514379923062832, "grad_norm": 0.25229533764516526, "learning_rate": 4.8905046650175254e-06, "loss": 0.6524, "num_tokens": 9417960529.0, "step": 3083 }, { "epoch": 4.515845392929108, "grad_norm": 0.2521765058159813, "learning_rate": 4.885208528720422e-06, "loss": 0.6541, "num_tokens": 9420911832.0, "step": 3084 }, { "epoch": 4.517310862795384, "grad_norm": 0.25568851271747345, "learning_rate": 4.879927791357709e-06, "loss": 0.6656, "num_tokens": 9423841334.0, "step": 3085 }, { "epoch": 4.51877633266166, "grad_norm": 0.2536007775891062, "learning_rate": 4.874662457680698e-06, "loss": 0.6608, "num_tokens": 9426897131.0, "step": 3086 }, { "epoch": 4.520241802527935, "grad_norm": 0.2468046903956502, "learning_rate": 4.869412532426845e-06, "loss": 0.6558, "num_tokens": 9430209565.0, "step": 3087 }, { "epoch": 4.521707272394211, "grad_norm": 0.2516091020737002, "learning_rate": 4.864178020319738e-06, "loss": 0.6701, "num_tokens": 9433289190.0, "step": 3088 }, { "epoch": 4.5231727422604875, "grad_norm": 0.24168674288701958, "learning_rate": 4.8589589260691e-06, "loss": 0.6548, "num_tokens": 9436457441.0, "step": 3089 }, { "epoch": 4.524638212126763, "grad_norm": 0.24713022018059413, "learning_rate": 4.8537552543707765e-06, "loss": 0.6862, "num_tokens": 9439359876.0, "step": 3090 }, { "epoch": 4.526103681993039, "grad_norm": 0.24869570409656233, "learning_rate": 4.848567009906751e-06, "loss": 0.6771, "num_tokens": 9442317920.0, "step": 3091 }, { "epoch": 4.527569151859315, "grad_norm": 0.2652847871635881, "learning_rate": 4.843394197345106e-06, "loss": 0.6677, "num_tokens": 9445320252.0, "step": 3092 }, { "epoch": 4.529034621725591, "grad_norm": 0.2481197548929934, "learning_rate": 4.838236821340055e-06, "loss": 0.6602, "num_tokens": 9448510319.0, "step": 3093 }, { "epoch": 4.5305000915918665, "grad_norm": 0.24557960080565933, "learning_rate": 4.833094886531918e-06, "loss": 0.6672, "num_tokens": 9451771874.0, "step": 3094 }, { "epoch": 4.531965561458143, "grad_norm": 0.23236007777453788, "learning_rate": 4.827968397547117e-06, "loss": 0.6629, "num_tokens": 9454841248.0, "step": 3095 }, { "epoch": 4.533431031324419, "grad_norm": 0.22788506507911596, "learning_rate": 4.822857358998181e-06, "loss": 0.659, "num_tokens": 9458020466.0, "step": 3096 }, { "epoch": 4.534896501190694, "grad_norm": 0.24243085207560156, "learning_rate": 4.817761775483743e-06, "loss": 0.6582, "num_tokens": 9461068325.0, "step": 3097 }, { "epoch": 4.53636197105697, "grad_norm": 0.25270392872102915, "learning_rate": 4.812681651588517e-06, "loss": 0.6674, "num_tokens": 9464114018.0, "step": 3098 }, { "epoch": 4.537827440923246, "grad_norm": 0.24640723360501193, "learning_rate": 4.80761699188332e-06, "loss": 0.6703, "num_tokens": 9467002820.0, "step": 3099 }, { "epoch": 4.539292910789522, "grad_norm": 0.23476360464931922, "learning_rate": 4.8025678009250486e-06, "loss": 0.6669, "num_tokens": 9469844592.0, "step": 3100 }, { "epoch": 4.540758380655798, "grad_norm": 0.24447956596879658, "learning_rate": 4.797534083256684e-06, "loss": 0.6513, "num_tokens": 9472936593.0, "step": 3101 }, { "epoch": 4.542223850522074, "grad_norm": 0.2578389911532004, "learning_rate": 4.792515843407285e-06, "loss": 0.6748, "num_tokens": 9475959001.0, "step": 3102 }, { "epoch": 4.543689320388349, "grad_norm": 0.24163555739318807, "learning_rate": 4.787513085891982e-06, "loss": 0.6663, "num_tokens": 9479069095.0, "step": 3103 }, { "epoch": 4.545154790254625, "grad_norm": 0.2644419896641208, "learning_rate": 4.782525815211976e-06, "loss": 0.671, "num_tokens": 9482166921.0, "step": 3104 }, { "epoch": 4.546620260120902, "grad_norm": 0.24090272644347177, "learning_rate": 4.777554035854539e-06, "loss": 0.6583, "num_tokens": 9485629827.0, "step": 3105 }, { "epoch": 4.548085729987177, "grad_norm": 0.26245665493207015, "learning_rate": 4.772597752292996e-06, "loss": 0.6793, "num_tokens": 9488609550.0, "step": 3106 }, { "epoch": 4.549551199853453, "grad_norm": 0.2679507306789325, "learning_rate": 4.767656968986737e-06, "loss": 0.6747, "num_tokens": 9491645139.0, "step": 3107 }, { "epoch": 4.551016669719729, "grad_norm": 0.2461525236190638, "learning_rate": 4.762731690381207e-06, "loss": 0.6575, "num_tokens": 9494566343.0, "step": 3108 }, { "epoch": 4.5524821395860045, "grad_norm": 0.2595354303277881, "learning_rate": 4.757821920907888e-06, "loss": 0.6535, "num_tokens": 9497510935.0, "step": 3109 }, { "epoch": 4.553947609452281, "grad_norm": 0.27391688565560646, "learning_rate": 4.752927664984323e-06, "loss": 0.6564, "num_tokens": 9500450051.0, "step": 3110 }, { "epoch": 4.555413079318557, "grad_norm": 0.26349929539635497, "learning_rate": 4.748048927014089e-06, "loss": 0.6591, "num_tokens": 9503191478.0, "step": 3111 }, { "epoch": 4.556878549184832, "grad_norm": 0.24142314397658987, "learning_rate": 4.743185711386801e-06, "loss": 0.6783, "num_tokens": 9506258648.0, "step": 3112 }, { "epoch": 4.558344019051108, "grad_norm": 0.2950006362282494, "learning_rate": 4.7383380224781095e-06, "loss": 0.6708, "num_tokens": 9509195664.0, "step": 3113 }, { "epoch": 4.559809488917384, "grad_norm": 0.25056078725175035, "learning_rate": 4.733505864649698e-06, "loss": 0.6687, "num_tokens": 9512377555.0, "step": 3114 }, { "epoch": 4.56127495878366, "grad_norm": 0.26086783274426306, "learning_rate": 4.7286892422492656e-06, "loss": 0.6749, "num_tokens": 9515507625.0, "step": 3115 }, { "epoch": 4.562740428649936, "grad_norm": 0.2598962516385441, "learning_rate": 4.723888159610548e-06, "loss": 0.6688, "num_tokens": 9518696788.0, "step": 3116 }, { "epoch": 4.564205898516212, "grad_norm": 0.24064701676619346, "learning_rate": 4.7191026210532855e-06, "loss": 0.6458, "num_tokens": 9521805801.0, "step": 3117 }, { "epoch": 4.565671368382487, "grad_norm": 0.24684676347513262, "learning_rate": 4.714332630883239e-06, "loss": 0.6606, "num_tokens": 9524926683.0, "step": 3118 }, { "epoch": 4.567136838248763, "grad_norm": 0.24881284727513214, "learning_rate": 4.709578193392186e-06, "loss": 0.6418, "num_tokens": 9527959245.0, "step": 3119 }, { "epoch": 4.5686023081150395, "grad_norm": 0.26383064184416916, "learning_rate": 4.704839312857894e-06, "loss": 0.6641, "num_tokens": 9531076082.0, "step": 3120 }, { "epoch": 4.570067777981315, "grad_norm": 0.27875352524711977, "learning_rate": 4.7001159935441534e-06, "loss": 0.6534, "num_tokens": 9534184471.0, "step": 3121 }, { "epoch": 4.571533247847591, "grad_norm": 0.24557643302865986, "learning_rate": 4.6954082397007375e-06, "loss": 0.6748, "num_tokens": 9537157366.0, "step": 3122 }, { "epoch": 4.572998717713867, "grad_norm": 0.2286991700305166, "learning_rate": 4.690716055563422e-06, "loss": 0.6537, "num_tokens": 9540044608.0, "step": 3123 }, { "epoch": 4.574464187580142, "grad_norm": 0.2503774430311768, "learning_rate": 4.6860394453539695e-06, "loss": 0.6608, "num_tokens": 9542905355.0, "step": 3124 }, { "epoch": 4.5759296574464186, "grad_norm": 0.2325171985056932, "learning_rate": 4.6813784132801385e-06, "loss": 0.661, "num_tokens": 9546006814.0, "step": 3125 }, { "epoch": 4.577395127312695, "grad_norm": 0.23598743048757523, "learning_rate": 4.676732963535661e-06, "loss": 0.6727, "num_tokens": 9548916228.0, "step": 3126 }, { "epoch": 4.578860597178971, "grad_norm": 0.24626357597344972, "learning_rate": 4.672103100300254e-06, "loss": 0.6544, "num_tokens": 9551770976.0, "step": 3127 }, { "epoch": 4.580326067045246, "grad_norm": 0.24970856684041626, "learning_rate": 4.6674888277396165e-06, "loss": 0.6711, "num_tokens": 9555011020.0, "step": 3128 }, { "epoch": 4.581791536911522, "grad_norm": 0.24254050719442524, "learning_rate": 4.662890150005401e-06, "loss": 0.6658, "num_tokens": 9558195322.0, "step": 3129 }, { "epoch": 4.5832570067777985, "grad_norm": 0.24380745041135088, "learning_rate": 4.658307071235252e-06, "loss": 0.6813, "num_tokens": 9561278195.0, "step": 3130 }, { "epoch": 4.584722476644074, "grad_norm": 0.23652059789333404, "learning_rate": 4.653739595552763e-06, "loss": 0.6555, "num_tokens": 9564168330.0, "step": 3131 }, { "epoch": 4.58618794651035, "grad_norm": 0.2279606035662297, "learning_rate": 4.649187727067497e-06, "loss": 0.6692, "num_tokens": 9566963499.0, "step": 3132 }, { "epoch": 4.587653416376626, "grad_norm": 0.26163928787369495, "learning_rate": 4.644651469874969e-06, "loss": 0.6517, "num_tokens": 9570122172.0, "step": 3133 }, { "epoch": 4.589118886242901, "grad_norm": 0.24931470659508967, "learning_rate": 4.640130828056649e-06, "loss": 0.6644, "num_tokens": 9573113349.0, "step": 3134 }, { "epoch": 4.5905843561091775, "grad_norm": 0.22020684484307335, "learning_rate": 4.635625805679959e-06, "loss": 0.644, "num_tokens": 9576041319.0, "step": 3135 }, { "epoch": 4.592049825975454, "grad_norm": 0.23544641897564023, "learning_rate": 4.631136406798268e-06, "loss": 0.6524, "num_tokens": 9579296816.0, "step": 3136 }, { "epoch": 4.593515295841729, "grad_norm": 0.2408847685002871, "learning_rate": 4.6266626354508885e-06, "loss": 0.6819, "num_tokens": 9582150619.0, "step": 3137 }, { "epoch": 4.594980765708005, "grad_norm": 0.24266778695153038, "learning_rate": 4.622204495663065e-06, "loss": 0.6671, "num_tokens": 9585022466.0, "step": 3138 }, { "epoch": 4.596446235574281, "grad_norm": 0.2517241139222598, "learning_rate": 4.617761991445989e-06, "loss": 0.653, "num_tokens": 9587874297.0, "step": 3139 }, { "epoch": 4.5979117054405565, "grad_norm": 0.25976144370037896, "learning_rate": 4.613335126796773e-06, "loss": 0.6532, "num_tokens": 9591026873.0, "step": 3140 }, { "epoch": 4.599377175306833, "grad_norm": 0.23544791221315528, "learning_rate": 4.6089239056984615e-06, "loss": 0.6591, "num_tokens": 9594017898.0, "step": 3141 }, { "epoch": 4.600842645173109, "grad_norm": 0.25954948049907184, "learning_rate": 4.604528332120034e-06, "loss": 0.6619, "num_tokens": 9596938342.0, "step": 3142 }, { "epoch": 4.602308115039385, "grad_norm": 0.254850963966629, "learning_rate": 4.600148410016373e-06, "loss": 0.6728, "num_tokens": 9599989951.0, "step": 3143 }, { "epoch": 4.60377358490566, "grad_norm": 0.2413374671379886, "learning_rate": 4.595784143328291e-06, "loss": 0.6508, "num_tokens": 9603146795.0, "step": 3144 }, { "epoch": 4.605239054771936, "grad_norm": 0.26059191219883276, "learning_rate": 4.591435535982515e-06, "loss": 0.6567, "num_tokens": 9606213061.0, "step": 3145 }, { "epoch": 4.606704524638213, "grad_norm": 0.2497730198027195, "learning_rate": 4.587102591891675e-06, "loss": 0.6616, "num_tokens": 9609224122.0, "step": 3146 }, { "epoch": 4.608169994504488, "grad_norm": 0.2650162740434224, "learning_rate": 4.5827853149543125e-06, "loss": 0.6747, "num_tokens": 9612123028.0, "step": 3147 }, { "epoch": 4.609635464370764, "grad_norm": 0.26374805930768735, "learning_rate": 4.5784837090548746e-06, "loss": 0.6579, "num_tokens": 9615148939.0, "step": 3148 }, { "epoch": 4.61110093423704, "grad_norm": 0.23495707364358784, "learning_rate": 4.574197778063704e-06, "loss": 0.6436, "num_tokens": 9618503635.0, "step": 3149 }, { "epoch": 4.612566404103315, "grad_norm": 0.25226724596666, "learning_rate": 4.569927525837047e-06, "loss": 0.6582, "num_tokens": 9621480687.0, "step": 3150 }, { "epoch": 4.614031873969592, "grad_norm": 0.27335240446689496, "learning_rate": 4.56567295621703e-06, "loss": 0.6486, "num_tokens": 9624632016.0, "step": 3151 }, { "epoch": 4.615497343835868, "grad_norm": 0.23615235480397487, "learning_rate": 4.561434073031688e-06, "loss": 0.6645, "num_tokens": 9627687517.0, "step": 3152 }, { "epoch": 4.616962813702143, "grad_norm": 0.24285400431194604, "learning_rate": 4.557210880094921e-06, "loss": 0.6588, "num_tokens": 9630798861.0, "step": 3153 }, { "epoch": 4.618428283568419, "grad_norm": 0.2296383424533283, "learning_rate": 4.553003381206529e-06, "loss": 0.6588, "num_tokens": 9633633015.0, "step": 3154 }, { "epoch": 4.619893753434695, "grad_norm": 0.2691977765343534, "learning_rate": 4.548811580152182e-06, "loss": 0.6733, "num_tokens": 9636588431.0, "step": 3155 }, { "epoch": 4.621359223300971, "grad_norm": 0.23105229030853533, "learning_rate": 4.5446354807034325e-06, "loss": 0.6477, "num_tokens": 9639513640.0, "step": 3156 }, { "epoch": 4.622824693167247, "grad_norm": 0.2343453372552833, "learning_rate": 4.540475086617693e-06, "loss": 0.6516, "num_tokens": 9642651878.0, "step": 3157 }, { "epoch": 4.624290163033523, "grad_norm": 0.22182506465411253, "learning_rate": 4.536330401638264e-06, "loss": 0.6644, "num_tokens": 9645863142.0, "step": 3158 }, { "epoch": 4.625755632899798, "grad_norm": 0.23702082489281406, "learning_rate": 4.532201429494297e-06, "loss": 0.6603, "num_tokens": 9649034541.0, "step": 3159 }, { "epoch": 4.627221102766074, "grad_norm": 0.23992925925953218, "learning_rate": 4.528088173900811e-06, "loss": 0.6503, "num_tokens": 9652264814.0, "step": 3160 }, { "epoch": 4.6286865726323505, "grad_norm": 0.2657118247933047, "learning_rate": 4.523990638558684e-06, "loss": 0.6484, "num_tokens": 9655409200.0, "step": 3161 }, { "epoch": 4.630152042498626, "grad_norm": 0.23008309050235026, "learning_rate": 4.51990882715465e-06, "loss": 0.659, "num_tokens": 9658519407.0, "step": 3162 }, { "epoch": 4.631617512364902, "grad_norm": 0.2343287355289672, "learning_rate": 4.515842743361295e-06, "loss": 0.6671, "num_tokens": 9661643116.0, "step": 3163 }, { "epoch": 4.633082982231178, "grad_norm": 0.23154322628892454, "learning_rate": 4.511792390837054e-06, "loss": 0.6538, "num_tokens": 9664799914.0, "step": 3164 }, { "epoch": 4.634548452097453, "grad_norm": 0.2334435353828147, "learning_rate": 4.507757773226208e-06, "loss": 0.6651, "num_tokens": 9667615621.0, "step": 3165 }, { "epoch": 4.6360139219637295, "grad_norm": 0.27244751270346496, "learning_rate": 4.503738894158883e-06, "loss": 0.6557, "num_tokens": 9670738842.0, "step": 3166 }, { "epoch": 4.637479391830006, "grad_norm": 0.2628584793953414, "learning_rate": 4.499735757251039e-06, "loss": 0.6779, "num_tokens": 9673704428.0, "step": 3167 }, { "epoch": 4.638944861696281, "grad_norm": 0.24291804891678928, "learning_rate": 4.4957483661044785e-06, "loss": 0.6797, "num_tokens": 9676774032.0, "step": 3168 }, { "epoch": 4.640410331562557, "grad_norm": 0.2584464891252511, "learning_rate": 4.491776724306828e-06, "loss": 0.6715, "num_tokens": 9679559766.0, "step": 3169 }, { "epoch": 4.641875801428833, "grad_norm": 0.2541079840160581, "learning_rate": 4.4878208354315564e-06, "loss": 0.686, "num_tokens": 9682369239.0, "step": 3170 }, { "epoch": 4.6433412712951085, "grad_norm": 0.2767590535851318, "learning_rate": 4.4838807030379474e-06, "loss": 0.6422, "num_tokens": 9685524608.0, "step": 3171 }, { "epoch": 4.644806741161385, "grad_norm": 0.2491807577442647, "learning_rate": 4.479956330671111e-06, "loss": 0.6629, "num_tokens": 9688858592.0, "step": 3172 }, { "epoch": 4.646272211027661, "grad_norm": 0.24461944638180488, "learning_rate": 4.476047721861984e-06, "loss": 0.6621, "num_tokens": 9692024732.0, "step": 3173 }, { "epoch": 4.647737680893936, "grad_norm": 0.2553529752436928, "learning_rate": 4.4721548801273065e-06, "loss": 0.6698, "num_tokens": 9695101149.0, "step": 3174 }, { "epoch": 4.649203150760212, "grad_norm": 0.2426182586862576, "learning_rate": 4.46827780896965e-06, "loss": 0.6414, "num_tokens": 9698124728.0, "step": 3175 }, { "epoch": 4.6506686206264884, "grad_norm": 0.2492930330322641, "learning_rate": 4.464416511877378e-06, "loss": 0.6682, "num_tokens": 9701273187.0, "step": 3176 }, { "epoch": 4.652134090492765, "grad_norm": 0.23525775373213317, "learning_rate": 4.460570992324676e-06, "loss": 0.6572, "num_tokens": 9704196943.0, "step": 3177 }, { "epoch": 4.65359956035904, "grad_norm": 0.25483413796026966, "learning_rate": 4.456741253771526e-06, "loss": 0.6575, "num_tokens": 9707355315.0, "step": 3178 }, { "epoch": 4.655065030225316, "grad_norm": 0.22539062336536916, "learning_rate": 4.4529272996637115e-06, "loss": 0.6486, "num_tokens": 9710370667.0, "step": 3179 }, { "epoch": 4.656530500091592, "grad_norm": 0.24847912656574958, "learning_rate": 4.449129133432815e-06, "loss": 0.6617, "num_tokens": 9713465128.0, "step": 3180 }, { "epoch": 4.6579959699578675, "grad_norm": 0.24157126326010436, "learning_rate": 4.445346758496219e-06, "loss": 0.645, "num_tokens": 9716616870.0, "step": 3181 }, { "epoch": 4.659461439824144, "grad_norm": 0.2504792748548414, "learning_rate": 4.441580178257091e-06, "loss": 0.6547, "num_tokens": 9719653304.0, "step": 3182 }, { "epoch": 4.66092690969042, "grad_norm": 0.23925281359408035, "learning_rate": 4.43782939610439e-06, "loss": 0.6657, "num_tokens": 9722999263.0, "step": 3183 }, { "epoch": 4.662392379556695, "grad_norm": 0.25769197530750265, "learning_rate": 4.43409441541286e-06, "loss": 0.6696, "num_tokens": 9725959253.0, "step": 3184 }, { "epoch": 4.663857849422971, "grad_norm": 0.24250856154886063, "learning_rate": 4.430375239543027e-06, "loss": 0.6664, "num_tokens": 9728963406.0, "step": 3185 }, { "epoch": 4.665323319289247, "grad_norm": 0.24775972887256018, "learning_rate": 4.4266718718412e-06, "loss": 0.6824, "num_tokens": 9731846198.0, "step": 3186 }, { "epoch": 4.666788789155523, "grad_norm": 0.24830009204926612, "learning_rate": 4.422984315639464e-06, "loss": 0.6648, "num_tokens": 9734920651.0, "step": 3187 }, { "epoch": 4.668254259021799, "grad_norm": 0.2407462029748362, "learning_rate": 4.41931257425567e-06, "loss": 0.6887, "num_tokens": 9737901581.0, "step": 3188 }, { "epoch": 4.669719728888075, "grad_norm": 0.24666499385364385, "learning_rate": 4.415656650993452e-06, "loss": 0.6615, "num_tokens": 9741044442.0, "step": 3189 }, { "epoch": 4.67118519875435, "grad_norm": 0.24874117114856079, "learning_rate": 4.412016549142204e-06, "loss": 0.6537, "num_tokens": 9743947739.0, "step": 3190 }, { "epoch": 4.672650668620626, "grad_norm": 0.24672095360403817, "learning_rate": 4.4083922719770885e-06, "loss": 0.6708, "num_tokens": 9747002684.0, "step": 3191 }, { "epoch": 4.6741161384869025, "grad_norm": 0.24000872768165177, "learning_rate": 4.404783822759024e-06, "loss": 0.6602, "num_tokens": 9749998850.0, "step": 3192 }, { "epoch": 4.675581608353179, "grad_norm": 0.23546314955273942, "learning_rate": 4.401191204734695e-06, "loss": 0.6833, "num_tokens": 9753252556.0, "step": 3193 }, { "epoch": 4.677047078219454, "grad_norm": 0.262598571542368, "learning_rate": 4.397614421136535e-06, "loss": 0.6691, "num_tokens": 9756150874.0, "step": 3194 }, { "epoch": 4.67851254808573, "grad_norm": 0.24214230652999752, "learning_rate": 4.394053475182738e-06, "loss": 0.6582, "num_tokens": 9759145059.0, "step": 3195 }, { "epoch": 4.679978017952006, "grad_norm": 0.2498778674560754, "learning_rate": 4.390508370077242e-06, "loss": 0.6523, "num_tokens": 9762318321.0, "step": 3196 }, { "epoch": 4.681443487818282, "grad_norm": 0.23987640602371504, "learning_rate": 4.386979109009738e-06, "loss": 0.6452, "num_tokens": 9765366857.0, "step": 3197 }, { "epoch": 4.682908957684558, "grad_norm": 0.2490818408157706, "learning_rate": 4.383465695155656e-06, "loss": 0.6613, "num_tokens": 9768426145.0, "step": 3198 }, { "epoch": 4.684374427550834, "grad_norm": 0.23612215684525745, "learning_rate": 4.379968131676168e-06, "loss": 0.6623, "num_tokens": 9771593082.0, "step": 3199 }, { "epoch": 4.685839897417109, "grad_norm": 0.2352445108499555, "learning_rate": 4.376486421718188e-06, "loss": 0.664, "num_tokens": 9774735206.0, "step": 3200 }, { "epoch": 4.687305367283385, "grad_norm": 0.22456593319753507, "learning_rate": 4.373020568414368e-06, "loss": 0.647, "num_tokens": 9777850243.0, "step": 3201 }, { "epoch": 4.6887708371496615, "grad_norm": 0.227333265633815, "learning_rate": 4.369570574883083e-06, "loss": 0.6629, "num_tokens": 9780916718.0, "step": 3202 }, { "epoch": 4.690236307015937, "grad_norm": 0.23555955521248362, "learning_rate": 4.366136444228448e-06, "loss": 0.6641, "num_tokens": 9784002873.0, "step": 3203 }, { "epoch": 4.691701776882213, "grad_norm": 0.2513002389053168, "learning_rate": 4.362718179540304e-06, "loss": 0.6728, "num_tokens": 9787089548.0, "step": 3204 }, { "epoch": 4.693167246748489, "grad_norm": 0.2490840544173277, "learning_rate": 4.35931578389421e-06, "loss": 0.6634, "num_tokens": 9790040721.0, "step": 3205 }, { "epoch": 4.694632716614764, "grad_norm": 0.2580907252187034, "learning_rate": 4.355929260351456e-06, "loss": 0.6533, "num_tokens": 9793175364.0, "step": 3206 }, { "epoch": 4.6960981864810405, "grad_norm": 0.22117147293862097, "learning_rate": 4.352558611959046e-06, "loss": 0.6618, "num_tokens": 9796116806.0, "step": 3207 }, { "epoch": 4.697563656347317, "grad_norm": 0.2560623512179201, "learning_rate": 4.349203841749698e-06, "loss": 0.6698, "num_tokens": 9799122862.0, "step": 3208 }, { "epoch": 4.699029126213592, "grad_norm": 0.2438353414504536, "learning_rate": 4.345864952741853e-06, "loss": 0.6464, "num_tokens": 9802415982.0, "step": 3209 }, { "epoch": 4.700494596079868, "grad_norm": 0.2335379220954658, "learning_rate": 4.3425419479396545e-06, "loss": 0.6607, "num_tokens": 9805355812.0, "step": 3210 }, { "epoch": 4.701960065946144, "grad_norm": 0.24197016303883623, "learning_rate": 4.339234830332952e-06, "loss": 0.6748, "num_tokens": 9808451784.0, "step": 3211 }, { "epoch": 4.7034255358124195, "grad_norm": 0.2522544543083815, "learning_rate": 4.335943602897312e-06, "loss": 0.6624, "num_tokens": 9811598973.0, "step": 3212 }, { "epoch": 4.704891005678696, "grad_norm": 0.243330596236898, "learning_rate": 4.332668268593993e-06, "loss": 0.6761, "num_tokens": 9814707397.0, "step": 3213 }, { "epoch": 4.706356475544972, "grad_norm": 0.2282131411670486, "learning_rate": 4.329408830369962e-06, "loss": 0.6699, "num_tokens": 9817819783.0, "step": 3214 }, { "epoch": 4.707821945411247, "grad_norm": 0.2516872535273225, "learning_rate": 4.3261652911578765e-06, "loss": 0.6629, "num_tokens": 9820973680.0, "step": 3215 }, { "epoch": 4.709287415277523, "grad_norm": 0.24567433453193424, "learning_rate": 4.3229376538760945e-06, "loss": 0.657, "num_tokens": 9823770781.0, "step": 3216 }, { "epoch": 4.710752885143799, "grad_norm": 0.2380790398409783, "learning_rate": 4.31972592142866e-06, "loss": 0.6853, "num_tokens": 9826589526.0, "step": 3217 }, { "epoch": 4.712218355010075, "grad_norm": 0.22916306782888507, "learning_rate": 4.316530096705316e-06, "loss": 0.6571, "num_tokens": 9829657911.0, "step": 3218 }, { "epoch": 4.713683824876351, "grad_norm": 0.2539701942345983, "learning_rate": 4.313350182581481e-06, "loss": 0.6673, "num_tokens": 9832726758.0, "step": 3219 }, { "epoch": 4.715149294742627, "grad_norm": 0.23744850501956752, "learning_rate": 4.310186181918271e-06, "loss": 0.6615, "num_tokens": 9835777267.0, "step": 3220 }, { "epoch": 4.716614764608902, "grad_norm": 0.22385971952389067, "learning_rate": 4.307038097562472e-06, "loss": 0.6584, "num_tokens": 9838865584.0, "step": 3221 }, { "epoch": 4.718080234475178, "grad_norm": 0.253959929711729, "learning_rate": 4.303905932346557e-06, "loss": 0.6853, "num_tokens": 9841738476.0, "step": 3222 }, { "epoch": 4.719545704341455, "grad_norm": 0.2386376119433655, "learning_rate": 4.300789689088674e-06, "loss": 0.6739, "num_tokens": 9844811762.0, "step": 3223 }, { "epoch": 4.72101117420773, "grad_norm": 0.24065581363798533, "learning_rate": 4.297689370592644e-06, "loss": 0.6638, "num_tokens": 9847806862.0, "step": 3224 }, { "epoch": 4.722476644074006, "grad_norm": 0.26066726567547405, "learning_rate": 4.294604979647958e-06, "loss": 0.656, "num_tokens": 9850883640.0, "step": 3225 }, { "epoch": 4.723942113940282, "grad_norm": 0.248012163477068, "learning_rate": 4.2915365190297846e-06, "loss": 0.6345, "num_tokens": 9853979716.0, "step": 3226 }, { "epoch": 4.725407583806558, "grad_norm": 0.22996826068388163, "learning_rate": 4.28848399149895e-06, "loss": 0.6732, "num_tokens": 9856844009.0, "step": 3227 }, { "epoch": 4.726873053672834, "grad_norm": 0.26114838821045794, "learning_rate": 4.2854473998019456e-06, "loss": 0.6672, "num_tokens": 9859847945.0, "step": 3228 }, { "epoch": 4.72833852353911, "grad_norm": 0.25111034253763787, "learning_rate": 4.282426746670932e-06, "loss": 0.662, "num_tokens": 9862953698.0, "step": 3229 }, { "epoch": 4.729803993405386, "grad_norm": 0.23899361319790152, "learning_rate": 4.279422034823722e-06, "loss": 0.6707, "num_tokens": 9865924087.0, "step": 3230 }, { "epoch": 4.731269463271661, "grad_norm": 0.2537988783249213, "learning_rate": 4.276433266963787e-06, "loss": 0.6709, "num_tokens": 9869325655.0, "step": 3231 }, { "epoch": 4.732734933137937, "grad_norm": 0.25474148748692943, "learning_rate": 4.2734604457802565e-06, "loss": 0.66, "num_tokens": 9872281766.0, "step": 3232 }, { "epoch": 4.7342004030042135, "grad_norm": 0.2618608684616351, "learning_rate": 4.2705035739479054e-06, "loss": 0.6709, "num_tokens": 9875178863.0, "step": 3233 }, { "epoch": 4.735665872870489, "grad_norm": 0.24289791381796583, "learning_rate": 4.267562654127165e-06, "loss": 0.6612, "num_tokens": 9878259939.0, "step": 3234 }, { "epoch": 4.737131342736765, "grad_norm": 0.25050776014022175, "learning_rate": 4.2646376889641074e-06, "loss": 0.659, "num_tokens": 9881561522.0, "step": 3235 }, { "epoch": 4.738596812603041, "grad_norm": 0.2330725198540521, "learning_rate": 4.261728681090457e-06, "loss": 0.639, "num_tokens": 9884487591.0, "step": 3236 }, { "epoch": 4.740062282469316, "grad_norm": 0.2321698201883858, "learning_rate": 4.258835633123575e-06, "loss": 0.6593, "num_tokens": 9887475274.0, "step": 3237 }, { "epoch": 4.7415277523355925, "grad_norm": 0.258839333813259, "learning_rate": 4.255958547666466e-06, "loss": 0.6695, "num_tokens": 9890593188.0, "step": 3238 }, { "epoch": 4.742993222201869, "grad_norm": 0.2566708064822181, "learning_rate": 4.253097427307764e-06, "loss": 0.6574, "num_tokens": 9893588275.0, "step": 3239 }, { "epoch": 4.744458692068144, "grad_norm": 0.23361914107706985, "learning_rate": 4.250252274621756e-06, "loss": 0.6729, "num_tokens": 9896661742.0, "step": 3240 }, { "epoch": 4.74592416193442, "grad_norm": 0.21798728714874466, "learning_rate": 4.247423092168344e-06, "loss": 0.6469, "num_tokens": 9899981508.0, "step": 3241 }, { "epoch": 4.747389631800696, "grad_norm": 0.24417615223590142, "learning_rate": 4.244609882493074e-06, "loss": 0.6701, "num_tokens": 9903005838.0, "step": 3242 }, { "epoch": 4.7488551016669724, "grad_norm": 0.2550538565499943, "learning_rate": 4.241812648127109e-06, "loss": 0.7031, "num_tokens": 9905989448.0, "step": 3243 }, { "epoch": 4.750320571533248, "grad_norm": 0.2598865041747553, "learning_rate": 4.239031391587249e-06, "loss": 0.6628, "num_tokens": 9909184250.0, "step": 3244 }, { "epoch": 4.751786041399524, "grad_norm": 0.24269716607687597, "learning_rate": 4.236266115375911e-06, "loss": 0.657, "num_tokens": 9912216143.0, "step": 3245 }, { "epoch": 4.7532515112658, "grad_norm": 0.23367998877364038, "learning_rate": 4.2335168219811396e-06, "loss": 0.6606, "num_tokens": 9915129766.0, "step": 3246 }, { "epoch": 4.754716981132075, "grad_norm": 0.23238924686569898, "learning_rate": 4.230783513876591e-06, "loss": 0.6642, "num_tokens": 9918435461.0, "step": 3247 }, { "epoch": 4.7561824509983515, "grad_norm": 0.242077765294199, "learning_rate": 4.228066193521547e-06, "loss": 0.6616, "num_tokens": 9921380344.0, "step": 3248 }, { "epoch": 4.757647920864628, "grad_norm": 0.2267635806042762, "learning_rate": 4.2253648633609e-06, "loss": 0.6484, "num_tokens": 9924346977.0, "step": 3249 }, { "epoch": 4.759113390730903, "grad_norm": 0.23511091997274222, "learning_rate": 4.222679525825153e-06, "loss": 0.6717, "num_tokens": 9927343320.0, "step": 3250 }, { "epoch": 4.760578860597179, "grad_norm": 0.23299627854707206, "learning_rate": 4.220010183330428e-06, "loss": 0.6684, "num_tokens": 9930286988.0, "step": 3251 }, { "epoch": 4.762044330463455, "grad_norm": 0.24936479769332312, "learning_rate": 4.217356838278447e-06, "loss": 0.658, "num_tokens": 9933424168.0, "step": 3252 }, { "epoch": 4.7635098003297305, "grad_norm": 0.26399514450969874, "learning_rate": 4.214719493056541e-06, "loss": 0.6817, "num_tokens": 9936463892.0, "step": 3253 }, { "epoch": 4.764975270196007, "grad_norm": 0.2535653627566746, "learning_rate": 4.212098150037648e-06, "loss": 0.6461, "num_tokens": 9939448137.0, "step": 3254 }, { "epoch": 4.766440740062283, "grad_norm": 0.29373307329634873, "learning_rate": 4.209492811580309e-06, "loss": 0.6621, "num_tokens": 9942491238.0, "step": 3255 }, { "epoch": 4.767906209928558, "grad_norm": 0.2331822459096511, "learning_rate": 4.206903480028656e-06, "loss": 0.6648, "num_tokens": 9945687387.0, "step": 3256 }, { "epoch": 4.769371679794834, "grad_norm": 0.2691908336547314, "learning_rate": 4.20433015771243e-06, "loss": 0.6681, "num_tokens": 9948732309.0, "step": 3257 }, { "epoch": 4.77083714966111, "grad_norm": 0.26606557941007086, "learning_rate": 4.20177284694696e-06, "loss": 0.6517, "num_tokens": 9952204673.0, "step": 3258 }, { "epoch": 4.772302619527386, "grad_norm": 0.2522575303681543, "learning_rate": 4.199231550033172e-06, "loss": 0.6656, "num_tokens": 9955325545.0, "step": 3259 }, { "epoch": 4.773768089393662, "grad_norm": 0.2671076901908884, "learning_rate": 4.196706269257586e-06, "loss": 0.6842, "num_tokens": 9958199616.0, "step": 3260 }, { "epoch": 4.775233559259938, "grad_norm": 0.2940589621637163, "learning_rate": 4.194197006892304e-06, "loss": 0.6543, "num_tokens": 9961443942.0, "step": 3261 }, { "epoch": 4.776699029126213, "grad_norm": 0.24884280934453326, "learning_rate": 4.191703765195021e-06, "loss": 0.679, "num_tokens": 9964564645.0, "step": 3262 }, { "epoch": 4.778164498992489, "grad_norm": 0.25002426481833495, "learning_rate": 4.189226546409025e-06, "loss": 0.6662, "num_tokens": 9967512058.0, "step": 3263 }, { "epoch": 4.779629968858766, "grad_norm": 0.2696749269858804, "learning_rate": 4.186765352763166e-06, "loss": 0.664, "num_tokens": 9970667893.0, "step": 3264 }, { "epoch": 4.781095438725041, "grad_norm": 0.26987762053956615, "learning_rate": 4.184320186471899e-06, "loss": 0.6502, "num_tokens": 9973651724.0, "step": 3265 }, { "epoch": 4.782560908591317, "grad_norm": 0.23980771208494261, "learning_rate": 4.181891049735241e-06, "loss": 0.6474, "num_tokens": 9977106712.0, "step": 3266 }, { "epoch": 4.784026378457593, "grad_norm": 0.2705563622131156, "learning_rate": 4.1794779447388025e-06, "loss": 0.6399, "num_tokens": 9980350509.0, "step": 3267 }, { "epoch": 4.785491848323868, "grad_norm": 0.31286361667932133, "learning_rate": 4.177080873653751e-06, "loss": 0.6775, "num_tokens": 9983392498.0, "step": 3268 }, { "epoch": 4.786957318190145, "grad_norm": 0.24194233340772495, "learning_rate": 4.174699838636846e-06, "loss": 0.666, "num_tokens": 9986386239.0, "step": 3269 }, { "epoch": 4.788422788056421, "grad_norm": 0.24856860921812712, "learning_rate": 4.172334841830401e-06, "loss": 0.6568, "num_tokens": 9989337588.0, "step": 3270 }, { "epoch": 4.789888257922696, "grad_norm": 0.26337525176478194, "learning_rate": 4.169985885362317e-06, "loss": 0.6726, "num_tokens": 9992534969.0, "step": 3271 }, { "epoch": 4.791353727788972, "grad_norm": 0.269574565835146, "learning_rate": 4.1676529713460465e-06, "loss": 0.6814, "num_tokens": 9995569644.0, "step": 3272 }, { "epoch": 4.792819197655248, "grad_norm": 0.2330907210484345, "learning_rate": 4.165336101880621e-06, "loss": 0.6714, "num_tokens": 9998680944.0, "step": 3273 }, { "epoch": 4.794284667521524, "grad_norm": 0.2661359814443262, "learning_rate": 4.1630352790506275e-06, "loss": 0.6794, "num_tokens": 10001707598.0, "step": 3274 }, { "epoch": 4.7957501373878, "grad_norm": 0.2849957804870165, "learning_rate": 4.16075050492622e-06, "loss": 0.6632, "num_tokens": 10004781083.0, "step": 3275 }, { "epoch": 4.797215607254076, "grad_norm": 0.23669060225122585, "learning_rate": 4.158481781563107e-06, "loss": 0.6778, "num_tokens": 10007867448.0, "step": 3276 }, { "epoch": 4.798681077120352, "grad_norm": 0.23802948200338991, "learning_rate": 4.1562291110025635e-06, "loss": 0.6618, "num_tokens": 10010744972.0, "step": 3277 }, { "epoch": 4.800146546986627, "grad_norm": 0.25897445628236854, "learning_rate": 4.153992495271414e-06, "loss": 0.6648, "num_tokens": 10014041173.0, "step": 3278 }, { "epoch": 4.8016120168529035, "grad_norm": 0.25132548211075517, "learning_rate": 4.15177193638204e-06, "loss": 0.6444, "num_tokens": 10017096006.0, "step": 3279 }, { "epoch": 4.80307748671918, "grad_norm": 0.23136396369003998, "learning_rate": 4.149567436332381e-06, "loss": 0.6832, "num_tokens": 10020370573.0, "step": 3280 }, { "epoch": 4.804542956585455, "grad_norm": 0.22433115759099007, "learning_rate": 4.147378997105917e-06, "loss": 0.6537, "num_tokens": 10023307222.0, "step": 3281 }, { "epoch": 4.806008426451731, "grad_norm": 0.25336194850164406, "learning_rate": 4.14520662067169e-06, "loss": 0.6729, "num_tokens": 10026217663.0, "step": 3282 }, { "epoch": 4.807473896318007, "grad_norm": 0.25699339370988566, "learning_rate": 4.143050308984277e-06, "loss": 0.6496, "num_tokens": 10029394044.0, "step": 3283 }, { "epoch": 4.8089393661842825, "grad_norm": 0.23365787610299754, "learning_rate": 4.140910063983809e-06, "loss": 0.6681, "num_tokens": 10032436500.0, "step": 3284 }, { "epoch": 4.810404836050559, "grad_norm": 0.2319119913896515, "learning_rate": 4.138785887595959e-06, "loss": 0.657, "num_tokens": 10035585114.0, "step": 3285 }, { "epoch": 4.811870305916835, "grad_norm": 0.26755282059722246, "learning_rate": 4.136677781731945e-06, "loss": 0.669, "num_tokens": 10038692924.0, "step": 3286 }, { "epoch": 4.81333577578311, "grad_norm": 0.2546731434610133, "learning_rate": 4.134585748288522e-06, "loss": 0.6595, "num_tokens": 10041806551.0, "step": 3287 }, { "epoch": 4.814801245649386, "grad_norm": 0.24832616424684056, "learning_rate": 4.132509789147981e-06, "loss": 0.6632, "num_tokens": 10044783783.0, "step": 3288 }, { "epoch": 4.816266715515662, "grad_norm": 0.26345538123758616, "learning_rate": 4.13044990617816e-06, "loss": 0.6691, "num_tokens": 10047768359.0, "step": 3289 }, { "epoch": 4.817732185381938, "grad_norm": 0.24014538539821292, "learning_rate": 4.128406101232422e-06, "loss": 0.6575, "num_tokens": 10050773130.0, "step": 3290 }, { "epoch": 4.819197655248214, "grad_norm": 0.24555964909618153, "learning_rate": 4.126378376149673e-06, "loss": 0.6673, "num_tokens": 10054172675.0, "step": 3291 }, { "epoch": 4.82066312511449, "grad_norm": 0.25568723765575085, "learning_rate": 4.124366732754342e-06, "loss": 0.6485, "num_tokens": 10057095320.0, "step": 3292 }, { "epoch": 4.822128594980766, "grad_norm": 0.24059049295252455, "learning_rate": 4.122371172856397e-06, "loss": 0.6451, "num_tokens": 10060204040.0, "step": 3293 }, { "epoch": 4.8235940648470415, "grad_norm": 0.24503549214595582, "learning_rate": 4.12039169825133e-06, "loss": 0.6678, "num_tokens": 10063226174.0, "step": 3294 }, { "epoch": 4.825059534713318, "grad_norm": 0.2399578571878561, "learning_rate": 4.118428310720161e-06, "loss": 0.6563, "num_tokens": 10066457237.0, "step": 3295 }, { "epoch": 4.826525004579594, "grad_norm": 0.25626228724359756, "learning_rate": 4.116481012029441e-06, "loss": 0.675, "num_tokens": 10069541378.0, "step": 3296 }, { "epoch": 4.827990474445869, "grad_norm": 0.23492729610920066, "learning_rate": 4.1145498039312345e-06, "loss": 0.6803, "num_tokens": 10072602529.0, "step": 3297 }, { "epoch": 4.829455944312145, "grad_norm": 0.260984441026835, "learning_rate": 4.112634688163138e-06, "loss": 0.6446, "num_tokens": 10075734680.0, "step": 3298 }, { "epoch": 4.830921414178421, "grad_norm": 0.24818212062971204, "learning_rate": 4.110735666448265e-06, "loss": 0.6556, "num_tokens": 10078753094.0, "step": 3299 }, { "epoch": 4.832386884044697, "grad_norm": 0.2552422087123095, "learning_rate": 4.108852740495252e-06, "loss": 0.6802, "num_tokens": 10081764546.0, "step": 3300 }, { "epoch": 4.833852353910973, "grad_norm": 0.24745719766136143, "learning_rate": 4.106985911998247e-06, "loss": 0.6586, "num_tokens": 10084936144.0, "step": 3301 }, { "epoch": 4.835317823777249, "grad_norm": 0.24337760846373835, "learning_rate": 4.105135182636921e-06, "loss": 0.6811, "num_tokens": 10087830473.0, "step": 3302 }, { "epoch": 4.836783293643524, "grad_norm": 0.2512049470443849, "learning_rate": 4.103300554076455e-06, "loss": 0.6721, "num_tokens": 10090807432.0, "step": 3303 }, { "epoch": 4.8382487635098, "grad_norm": 0.2598909586128265, "learning_rate": 4.101482027967546e-06, "loss": 0.6694, "num_tokens": 10093693314.0, "step": 3304 }, { "epoch": 4.8397142333760765, "grad_norm": 0.2365767800164388, "learning_rate": 4.099679605946402e-06, "loss": 0.6466, "num_tokens": 10096860244.0, "step": 3305 }, { "epoch": 4.841179703242352, "grad_norm": 0.2344009449110385, "learning_rate": 4.097893289634743e-06, "loss": 0.6676, "num_tokens": 10099909930.0, "step": 3306 }, { "epoch": 4.842645173108628, "grad_norm": 0.2568835000319415, "learning_rate": 4.096123080639794e-06, "loss": 0.6666, "num_tokens": 10102919461.0, "step": 3307 }, { "epoch": 4.844110642974904, "grad_norm": 0.24003594968612912, "learning_rate": 4.094368980554294e-06, "loss": 0.6583, "num_tokens": 10105937192.0, "step": 3308 }, { "epoch": 4.845576112841179, "grad_norm": 0.24285154095192701, "learning_rate": 4.092630990956481e-06, "loss": 0.6734, "num_tokens": 10108988646.0, "step": 3309 }, { "epoch": 4.8470415827074556, "grad_norm": 0.23834506721390777, "learning_rate": 4.090909113410105e-06, "loss": 0.6787, "num_tokens": 10112025105.0, "step": 3310 }, { "epoch": 4.848507052573732, "grad_norm": 0.27767507346890546, "learning_rate": 4.089203349464411e-06, "loss": 0.6743, "num_tokens": 10115286424.0, "step": 3311 }, { "epoch": 4.849972522440007, "grad_norm": 0.232072509441997, "learning_rate": 4.087513700654153e-06, "loss": 0.6479, "num_tokens": 10118414853.0, "step": 3312 }, { "epoch": 4.851437992306283, "grad_norm": 0.2553948489461406, "learning_rate": 4.085840168499582e-06, "loss": 0.663, "num_tokens": 10121515221.0, "step": 3313 }, { "epoch": 4.852903462172559, "grad_norm": 0.24334159020693186, "learning_rate": 4.0841827545064475e-06, "loss": 0.668, "num_tokens": 10124236733.0, "step": 3314 }, { "epoch": 4.854368932038835, "grad_norm": 0.24204271783604037, "learning_rate": 4.082541460165999e-06, "loss": 0.6553, "num_tokens": 10127360795.0, "step": 3315 }, { "epoch": 4.855834401905111, "grad_norm": 0.23787273882824225, "learning_rate": 4.080916286954983e-06, "loss": 0.6431, "num_tokens": 10130508047.0, "step": 3316 }, { "epoch": 4.857299871771387, "grad_norm": 0.24359198731560341, "learning_rate": 4.079307236335639e-06, "loss": 0.6676, "num_tokens": 10133771785.0, "step": 3317 }, { "epoch": 4.858765341637662, "grad_norm": 0.24744561468587145, "learning_rate": 4.0777143097557025e-06, "loss": 0.6555, "num_tokens": 10136758561.0, "step": 3318 }, { "epoch": 4.860230811503938, "grad_norm": 0.2525095749707601, "learning_rate": 4.076137508648394e-06, "loss": 0.6765, "num_tokens": 10139934563.0, "step": 3319 }, { "epoch": 4.8616962813702145, "grad_norm": 0.22471336963876876, "learning_rate": 4.074576834432438e-06, "loss": 0.6567, "num_tokens": 10143138174.0, "step": 3320 }, { "epoch": 4.86316175123649, "grad_norm": 0.2593195577416882, "learning_rate": 4.073032288512037e-06, "loss": 0.6572, "num_tokens": 10146076023.0, "step": 3321 }, { "epoch": 4.864627221102766, "grad_norm": 0.24691720531379804, "learning_rate": 4.071503872276892e-06, "loss": 0.6727, "num_tokens": 10149043723.0, "step": 3322 }, { "epoch": 4.866092690969042, "grad_norm": 0.2353475597555255, "learning_rate": 4.069991587102184e-06, "loss": 0.6601, "num_tokens": 10152076693.0, "step": 3323 }, { "epoch": 4.867558160835317, "grad_norm": 0.27606765649864623, "learning_rate": 4.0684954343485806e-06, "loss": 0.6635, "num_tokens": 10155106221.0, "step": 3324 }, { "epoch": 4.8690236307015935, "grad_norm": 0.2725279679408378, "learning_rate": 4.06701541536224e-06, "loss": 0.6644, "num_tokens": 10158118374.0, "step": 3325 }, { "epoch": 4.87048910056787, "grad_norm": 0.259555626526974, "learning_rate": 4.065551531474801e-06, "loss": 0.662, "num_tokens": 10161233612.0, "step": 3326 }, { "epoch": 4.871954570434146, "grad_norm": 0.2401063679414391, "learning_rate": 4.064103784003381e-06, "loss": 0.6685, "num_tokens": 10164218144.0, "step": 3327 }, { "epoch": 4.873420040300421, "grad_norm": 0.24770852013881364, "learning_rate": 4.062672174250587e-06, "loss": 0.6727, "num_tokens": 10167201877.0, "step": 3328 }, { "epoch": 4.874885510166697, "grad_norm": 0.25899712969062416, "learning_rate": 4.061256703504498e-06, "loss": 0.6608, "num_tokens": 10170132568.0, "step": 3329 }, { "epoch": 4.876350980032973, "grad_norm": 0.24362490404046167, "learning_rate": 4.0598573730386765e-06, "loss": 0.6765, "num_tokens": 10173474458.0, "step": 3330 }, { "epoch": 4.877816449899249, "grad_norm": 0.25877921319908337, "learning_rate": 4.058474184112163e-06, "loss": 0.6741, "num_tokens": 10176764191.0, "step": 3331 }, { "epoch": 4.879281919765525, "grad_norm": 0.21890189252752154, "learning_rate": 4.057107137969473e-06, "loss": 0.6576, "num_tokens": 10179996567.0, "step": 3332 }, { "epoch": 4.880747389631801, "grad_norm": 0.23725423603431547, "learning_rate": 4.055756235840598e-06, "loss": 0.6741, "num_tokens": 10183090348.0, "step": 3333 }, { "epoch": 4.882212859498076, "grad_norm": 0.24763788137224907, "learning_rate": 4.054421478941005e-06, "loss": 0.6639, "num_tokens": 10186282764.0, "step": 3334 }, { "epoch": 4.883678329364352, "grad_norm": 0.23911064625330353, "learning_rate": 4.053102868471633e-06, "loss": 0.6587, "num_tokens": 10189444787.0, "step": 3335 }, { "epoch": 4.885143799230629, "grad_norm": 0.22712063537280114, "learning_rate": 4.051800405618894e-06, "loss": 0.6641, "num_tokens": 10192379450.0, "step": 3336 }, { "epoch": 4.886609269096904, "grad_norm": 0.2249114852172756, "learning_rate": 4.0505140915546744e-06, "loss": 0.6566, "num_tokens": 10195729246.0, "step": 3337 }, { "epoch": 4.88807473896318, "grad_norm": 0.2501779899369485, "learning_rate": 4.0492439274363235e-06, "loss": 0.6634, "num_tokens": 10198731706.0, "step": 3338 }, { "epoch": 4.889540208829456, "grad_norm": 0.2570072206332202, "learning_rate": 4.047989914406667e-06, "loss": 0.6671, "num_tokens": 10201638224.0, "step": 3339 }, { "epoch": 4.891005678695731, "grad_norm": 0.23983720583322143, "learning_rate": 4.046752053593994e-06, "loss": 0.6585, "num_tokens": 10204847586.0, "step": 3340 }, { "epoch": 4.892471148562008, "grad_norm": 0.24176996472566084, "learning_rate": 4.045530346112064e-06, "loss": 0.674, "num_tokens": 10207878070.0, "step": 3341 }, { "epoch": 4.893936618428284, "grad_norm": 0.2485242918178423, "learning_rate": 4.0443247930600995e-06, "loss": 0.6808, "num_tokens": 10210940245.0, "step": 3342 }, { "epoch": 4.89540208829456, "grad_norm": 0.22672509204028388, "learning_rate": 4.043135395522794e-06, "loss": 0.6444, "num_tokens": 10214085685.0, "step": 3343 }, { "epoch": 4.896867558160835, "grad_norm": 0.2540631253396701, "learning_rate": 4.041962154570294e-06, "loss": 0.6615, "num_tokens": 10217168715.0, "step": 3344 }, { "epoch": 4.898333028027111, "grad_norm": 0.23038009394376857, "learning_rate": 4.040805071258224e-06, "loss": 0.6561, "num_tokens": 10220309143.0, "step": 3345 }, { "epoch": 4.8997984978933875, "grad_norm": 0.23151992318663714, "learning_rate": 4.039664146627658e-06, "loss": 0.6484, "num_tokens": 10223425070.0, "step": 3346 }, { "epoch": 4.901263967759663, "grad_norm": 0.25249963455037666, "learning_rate": 4.038539381705138e-06, "loss": 0.6574, "num_tokens": 10226574466.0, "step": 3347 }, { "epoch": 4.902729437625939, "grad_norm": 0.24490407916529922, "learning_rate": 4.037430777502665e-06, "loss": 0.6525, "num_tokens": 10229521093.0, "step": 3348 }, { "epoch": 4.904194907492215, "grad_norm": 0.2312993830802392, "learning_rate": 4.036338335017697e-06, "loss": 0.6421, "num_tokens": 10232746150.0, "step": 3349 }, { "epoch": 4.90566037735849, "grad_norm": 0.24054832572824672, "learning_rate": 4.035262055233155e-06, "loss": 0.6808, "num_tokens": 10235799916.0, "step": 3350 }, { "epoch": 4.9071258472247665, "grad_norm": 0.24888798910212104, "learning_rate": 4.0342019391174155e-06, "loss": 0.6465, "num_tokens": 10238900773.0, "step": 3351 }, { "epoch": 4.908591317091043, "grad_norm": 0.21278156288114924, "learning_rate": 4.03315798762431e-06, "loss": 0.6621, "num_tokens": 10241899425.0, "step": 3352 }, { "epoch": 4.910056786957318, "grad_norm": 0.2450722550420986, "learning_rate": 4.032130201693129e-06, "loss": 0.6693, "num_tokens": 10245002655.0, "step": 3353 }, { "epoch": 4.911522256823594, "grad_norm": 0.23348002544815005, "learning_rate": 4.031118582248618e-06, "loss": 0.6618, "num_tokens": 10247980716.0, "step": 3354 }, { "epoch": 4.91298772668987, "grad_norm": 0.2620539362851663, "learning_rate": 4.030123130200975e-06, "loss": 0.6531, "num_tokens": 10251082567.0, "step": 3355 }, { "epoch": 4.9144531965561455, "grad_norm": 0.2397075984098191, "learning_rate": 4.029143846445851e-06, "loss": 0.6549, "num_tokens": 10254010801.0, "step": 3356 }, { "epoch": 4.915918666422422, "grad_norm": 0.23537613664779508, "learning_rate": 4.0281807318643526e-06, "loss": 0.6699, "num_tokens": 10257075458.0, "step": 3357 }, { "epoch": 4.917384136288698, "grad_norm": 0.2491329251044043, "learning_rate": 4.027233787323035e-06, "loss": 0.6715, "num_tokens": 10260128275.0, "step": 3358 }, { "epoch": 4.918849606154973, "grad_norm": 0.26525799562504165, "learning_rate": 4.026303013673908e-06, "loss": 0.6688, "num_tokens": 10263189137.0, "step": 3359 }, { "epoch": 4.920315076021249, "grad_norm": 0.27383866474262725, "learning_rate": 4.025388411754426e-06, "loss": 0.63, "num_tokens": 10266417442.0, "step": 3360 }, { "epoch": 4.9217805458875254, "grad_norm": 0.2794440005469038, "learning_rate": 4.024489982387502e-06, "loss": 0.6729, "num_tokens": 10269519743.0, "step": 3361 }, { "epoch": 4.923246015753801, "grad_norm": 0.2543682162679272, "learning_rate": 4.023607726381488e-06, "loss": 0.6871, "num_tokens": 10272295629.0, "step": 3362 }, { "epoch": 4.924711485620077, "grad_norm": 0.2638752217718156, "learning_rate": 4.022741644530191e-06, "loss": 0.6775, "num_tokens": 10275458029.0, "step": 3363 }, { "epoch": 4.926176955486353, "grad_norm": 0.24821375206446913, "learning_rate": 4.0218917376128635e-06, "loss": 0.6709, "num_tokens": 10278653826.0, "step": 3364 }, { "epoch": 4.927642425352628, "grad_norm": 0.24395056265839496, "learning_rate": 4.021058006394204e-06, "loss": 0.6818, "num_tokens": 10281817798.0, "step": 3365 }, { "epoch": 4.9291078952189045, "grad_norm": 0.22583025127807876, "learning_rate": 4.020240451624357e-06, "loss": 0.6501, "num_tokens": 10284930492.0, "step": 3366 }, { "epoch": 4.930573365085181, "grad_norm": 0.24937615285526646, "learning_rate": 4.019439074038912e-06, "loss": 0.6548, "num_tokens": 10288084622.0, "step": 3367 }, { "epoch": 4.932038834951456, "grad_norm": 0.24806917285337138, "learning_rate": 4.018653874358904e-06, "loss": 0.6703, "num_tokens": 10291226036.0, "step": 3368 }, { "epoch": 4.933504304817732, "grad_norm": 0.240888926967346, "learning_rate": 4.017884853290814e-06, "loss": 0.6559, "num_tokens": 10294172034.0, "step": 3369 }, { "epoch": 4.934969774684008, "grad_norm": 0.24705496568282145, "learning_rate": 4.01713201152656e-06, "loss": 0.6766, "num_tokens": 10297261754.0, "step": 3370 }, { "epoch": 4.9364352445502835, "grad_norm": 0.26204171156197903, "learning_rate": 4.016395349743512e-06, "loss": 0.6918, "num_tokens": 10300268809.0, "step": 3371 }, { "epoch": 4.93790071441656, "grad_norm": 0.25276420533569455, "learning_rate": 4.015674868604473e-06, "loss": 0.654, "num_tokens": 10303377808.0, "step": 3372 }, { "epoch": 4.939366184282836, "grad_norm": 0.24236960717055947, "learning_rate": 4.014970568757696e-06, "loss": 0.6726, "num_tokens": 10306272071.0, "step": 3373 }, { "epoch": 4.940831654149111, "grad_norm": 0.23758048217360095, "learning_rate": 4.0142824508368655e-06, "loss": 0.6681, "num_tokens": 10309415447.0, "step": 3374 }, { "epoch": 4.942297124015387, "grad_norm": 0.2554655586449771, "learning_rate": 4.0136105154611155e-06, "loss": 0.6728, "num_tokens": 10312503781.0, "step": 3375 }, { "epoch": 4.943762593881663, "grad_norm": 0.23146119287102876, "learning_rate": 4.012954763235016e-06, "loss": 0.6628, "num_tokens": 10315571007.0, "step": 3376 }, { "epoch": 4.9452280637479396, "grad_norm": 0.232604878560038, "learning_rate": 4.0123151947485725e-06, "loss": 0.6634, "num_tokens": 10318855057.0, "step": 3377 }, { "epoch": 4.946693533614215, "grad_norm": 0.22980342602434667, "learning_rate": 4.0116918105772375e-06, "loss": 0.6524, "num_tokens": 10322011598.0, "step": 3378 }, { "epoch": 4.948159003480491, "grad_norm": 0.22577381012648853, "learning_rate": 4.011084611281893e-06, "loss": 0.6556, "num_tokens": 10324931296.0, "step": 3379 }, { "epoch": 4.949624473346767, "grad_norm": 0.2199843360816917, "learning_rate": 4.010493597408868e-06, "loss": 0.6521, "num_tokens": 10328508880.0, "step": 3380 }, { "epoch": 4.951089943213042, "grad_norm": 0.2322028332239894, "learning_rate": 4.00991876948992e-06, "loss": 0.6451, "num_tokens": 10331620458.0, "step": 3381 }, { "epoch": 4.952555413079319, "grad_norm": 0.22451829435508014, "learning_rate": 4.009360128042249e-06, "loss": 0.6406, "num_tokens": 10334780085.0, "step": 3382 }, { "epoch": 4.954020882945595, "grad_norm": 0.22413339782695352, "learning_rate": 4.008817673568487e-06, "loss": 0.6777, "num_tokens": 10337749677.0, "step": 3383 }, { "epoch": 4.95548635281187, "grad_norm": 0.22848152385550569, "learning_rate": 4.008291406556708e-06, "loss": 0.6796, "num_tokens": 10340769683.0, "step": 3384 }, { "epoch": 4.956951822678146, "grad_norm": 0.22849980282508697, "learning_rate": 4.007781327480415e-06, "loss": 0.651, "num_tokens": 10343805466.0, "step": 3385 }, { "epoch": 4.958417292544422, "grad_norm": 0.2482882936373451, "learning_rate": 4.007287436798549e-06, "loss": 0.6551, "num_tokens": 10346722074.0, "step": 3386 }, { "epoch": 4.959882762410698, "grad_norm": 0.24365064933390798, "learning_rate": 4.0068097349554865e-06, "loss": 0.6872, "num_tokens": 10349658690.0, "step": 3387 }, { "epoch": 4.961348232276974, "grad_norm": 0.24685306430568574, "learning_rate": 4.006348222381034e-06, "loss": 0.6685, "num_tokens": 10352641667.0, "step": 3388 }, { "epoch": 4.96281370214325, "grad_norm": 0.24142202133098234, "learning_rate": 4.005902899490439e-06, "loss": 0.6687, "num_tokens": 10355791818.0, "step": 3389 }, { "epoch": 4.964279172009525, "grad_norm": 0.2492758251665183, "learning_rate": 4.005473766684375e-06, "loss": 0.6599, "num_tokens": 10358746371.0, "step": 3390 }, { "epoch": 4.965744641875801, "grad_norm": 0.25134199628739456, "learning_rate": 4.005060824348953e-06, "loss": 0.6724, "num_tokens": 10361658311.0, "step": 3391 }, { "epoch": 4.9672101117420775, "grad_norm": 0.22529792143437824, "learning_rate": 4.004664072855716e-06, "loss": 0.6582, "num_tokens": 10364628427.0, "step": 3392 }, { "epoch": 4.968675581608354, "grad_norm": 0.2531806118825763, "learning_rate": 4.0042835125616365e-06, "loss": 0.6645, "num_tokens": 10367891435.0, "step": 3393 }, { "epoch": 4.970141051474629, "grad_norm": 0.22867585661997294, "learning_rate": 4.003919143809124e-06, "loss": 0.6637, "num_tokens": 10371280949.0, "step": 3394 }, { "epoch": 4.971606521340905, "grad_norm": 0.23898093698479422, "learning_rate": 4.003570966926017e-06, "loss": 0.6502, "num_tokens": 10374431732.0, "step": 3395 }, { "epoch": 4.973071991207181, "grad_norm": 0.237246129076489, "learning_rate": 4.003238982225583e-06, "loss": 0.6543, "num_tokens": 10377365233.0, "step": 3396 }, { "epoch": 4.9745374610734565, "grad_norm": 0.24735010189655873, "learning_rate": 4.002923190006527e-06, "loss": 0.6514, "num_tokens": 10380460181.0, "step": 3397 }, { "epoch": 4.976002930939733, "grad_norm": 0.232742465942289, "learning_rate": 4.0026235905529785e-06, "loss": 0.6677, "num_tokens": 10383414225.0, "step": 3398 }, { "epoch": 4.977468400806009, "grad_norm": 0.231619986983961, "learning_rate": 4.002340184134502e-06, "loss": 0.6734, "num_tokens": 10386359973.0, "step": 3399 }, { "epoch": 4.978933870672284, "grad_norm": 0.2400915172151342, "learning_rate": 4.002072971006089e-06, "loss": 0.6763, "num_tokens": 10389357384.0, "step": 3400 }, { "epoch": 4.98039934053856, "grad_norm": 0.22642706479975402, "learning_rate": 4.001821951408163e-06, "loss": 0.6522, "num_tokens": 10392538884.0, "step": 3401 }, { "epoch": 4.981864810404836, "grad_norm": 0.25109936291397833, "learning_rate": 4.001587125566579e-06, "loss": 0.6767, "num_tokens": 10395221171.0, "step": 3402 }, { "epoch": 4.983330280271112, "grad_norm": 0.2621244881922184, "learning_rate": 4.001368493692621e-06, "loss": 0.6607, "num_tokens": 10398234249.0, "step": 3403 }, { "epoch": 4.984795750137388, "grad_norm": 0.2508753499169267, "learning_rate": 4.001166055983001e-06, "loss": 0.6481, "num_tokens": 10401430928.0, "step": 3404 }, { "epoch": 4.986261220003664, "grad_norm": 0.2426986135473989, "learning_rate": 4.000979812619857e-06, "loss": 0.6618, "num_tokens": 10404514616.0, "step": 3405 }, { "epoch": 4.987726689869939, "grad_norm": 0.23795954378718198, "learning_rate": 4.000809763770768e-06, "loss": 0.6779, "num_tokens": 10407453510.0, "step": 3406 }, { "epoch": 4.989192159736215, "grad_norm": 0.27657572023584226, "learning_rate": 4.000655909588727e-06, "loss": 0.6838, "num_tokens": 10410272540.0, "step": 3407 }, { "epoch": 4.990657629602492, "grad_norm": 0.2619521261201692, "learning_rate": 4.000518250212169e-06, "loss": 0.6728, "num_tokens": 10413277091.0, "step": 3408 }, { "epoch": 4.992123099468767, "grad_norm": 0.26956884101097617, "learning_rate": 4.000396785764949e-06, "loss": 0.679, "num_tokens": 10416149006.0, "step": 3409 }, { "epoch": 4.993588569335043, "grad_norm": 0.23539351351832585, "learning_rate": 4.000291516356355e-06, "loss": 0.6781, "num_tokens": 10419121779.0, "step": 3410 }, { "epoch": 4.995054039201319, "grad_norm": 0.23202792635745148, "learning_rate": 4.000202442081103e-06, "loss": 0.642, "num_tokens": 10422250208.0, "step": 3411 }, { "epoch": 4.9965195090675945, "grad_norm": 0.24622217854337639, "learning_rate": 4.000129563019336e-06, "loss": 0.6739, "num_tokens": 10425008750.0, "step": 3412 }, { "epoch": 4.997984978933871, "grad_norm": 0.26039551967189023, "learning_rate": 4.000072879236628e-06, "loss": 0.6734, "num_tokens": 10428097263.0, "step": 3413 }, { "epoch": 4.999450448800147, "grad_norm": 0.2442136306876934, "learning_rate": 4.000032390783978e-06, "loss": 0.657, "num_tokens": 10431157791.0, "step": 3414 }, { "epoch": 5.0, "grad_norm": 0.2442136306876934, "learning_rate": 4.000008097697817e-06, "loss": 0.6751, "num_tokens": 10431803027.0, "step": 3415 }, { "epoch": 5.0, "step": 3415, "total_flos": 1.1159501200687104e+16, "train_loss": 0.7174374641820592, "train_runtime": 243574.8878, "train_samples_per_second": 7.171, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 3415, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1159501200687104e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }