{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 3426, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017513134851138354, "grad_norm": 2.150632381439209, "learning_rate": 9.70873786407767e-08, "loss": 0.9259, "step": 1 }, { "epoch": 0.0035026269702276708, "grad_norm": 2.2049365043640137, "learning_rate": 1.941747572815534e-07, "loss": 0.9347, "step": 2 }, { "epoch": 0.005253940455341506, "grad_norm": 2.2153868675231934, "learning_rate": 2.9126213592233014e-07, "loss": 0.9467, "step": 3 }, { "epoch": 0.0070052539404553416, "grad_norm": 2.1438517570495605, "learning_rate": 3.883495145631068e-07, "loss": 0.9275, "step": 4 }, { "epoch": 0.008756567425569177, "grad_norm": 2.143266201019287, "learning_rate": 4.854368932038835e-07, "loss": 0.9287, "step": 5 }, { "epoch": 0.010507880910683012, "grad_norm": 2.122737169265747, "learning_rate": 5.825242718446603e-07, "loss": 0.9226, "step": 6 }, { "epoch": 0.012259194395796848, "grad_norm": 2.188793897628784, "learning_rate": 6.79611650485437e-07, "loss": 0.9451, "step": 7 }, { "epoch": 0.014010507880910683, "grad_norm": 1.9561679363250732, "learning_rate": 7.766990291262136e-07, "loss": 0.9049, "step": 8 }, { "epoch": 0.01576182136602452, "grad_norm": 2.0417442321777344, "learning_rate": 8.737864077669904e-07, "loss": 0.9306, "step": 9 }, { "epoch": 0.017513134851138354, "grad_norm": 1.9827297925949097, "learning_rate": 9.70873786407767e-07, "loss": 0.9182, "step": 10 }, { "epoch": 0.01926444833625219, "grad_norm": 1.696359634399414, "learning_rate": 1.0679611650485437e-06, "loss": 0.9101, "step": 11 }, { "epoch": 0.021015761821366025, "grad_norm": 1.6967633962631226, "learning_rate": 1.1650485436893206e-06, "loss": 0.9168, "step": 12 }, { "epoch": 0.02276707530647986, "grad_norm": 1.5853500366210938, "learning_rate": 1.2621359223300972e-06, "loss": 0.8995, "step": 13 }, { "epoch": 0.024518388791593695, "grad_norm": 1.5341763496398926, "learning_rate": 1.359223300970874e-06, "loss": 0.9031, "step": 14 }, { "epoch": 0.02626970227670753, "grad_norm": 1.2013392448425293, "learning_rate": 1.4563106796116506e-06, "loss": 0.8805, "step": 15 }, { "epoch": 0.028021015761821366, "grad_norm": 1.1864920854568481, "learning_rate": 1.5533980582524272e-06, "loss": 0.8822, "step": 16 }, { "epoch": 0.0297723292469352, "grad_norm": 1.1450755596160889, "learning_rate": 1.650485436893204e-06, "loss": 0.8874, "step": 17 }, { "epoch": 0.03152364273204904, "grad_norm": 1.098598599433899, "learning_rate": 1.7475728155339808e-06, "loss": 0.8775, "step": 18 }, { "epoch": 0.03327495621716287, "grad_norm": 0.8985109925270081, "learning_rate": 1.8446601941747574e-06, "loss": 0.8638, "step": 19 }, { "epoch": 0.03502626970227671, "grad_norm": 0.9442009925842285, "learning_rate": 1.941747572815534e-06, "loss": 0.8514, "step": 20 }, { "epoch": 0.03677758318739054, "grad_norm": 1.1730965375900269, "learning_rate": 2.0388349514563107e-06, "loss": 0.8599, "step": 21 }, { "epoch": 0.03852889667250438, "grad_norm": 1.1770706176757812, "learning_rate": 2.1359223300970874e-06, "loss": 0.8474, "step": 22 }, { "epoch": 0.040280210157618214, "grad_norm": 1.0898829698562622, "learning_rate": 2.2330097087378645e-06, "loss": 0.8454, "step": 23 }, { "epoch": 0.04203152364273205, "grad_norm": 0.9709469079971313, "learning_rate": 2.330097087378641e-06, "loss": 0.8347, "step": 24 }, { "epoch": 0.043782837127845885, "grad_norm": 0.8363985419273376, "learning_rate": 2.427184466019418e-06, "loss": 0.8251, "step": 25 }, { "epoch": 0.04553415061295972, "grad_norm": 0.6238609552383423, "learning_rate": 2.5242718446601945e-06, "loss": 0.835, "step": 26 }, { "epoch": 0.047285464098073555, "grad_norm": 0.7074145078659058, "learning_rate": 2.621359223300971e-06, "loss": 0.8032, "step": 27 }, { "epoch": 0.04903677758318739, "grad_norm": 0.8333970308303833, "learning_rate": 2.718446601941748e-06, "loss": 0.8246, "step": 28 }, { "epoch": 0.050788091068301226, "grad_norm": 0.7815043926239014, "learning_rate": 2.8155339805825245e-06, "loss": 0.8, "step": 29 }, { "epoch": 0.05253940455341506, "grad_norm": 0.6044496297836304, "learning_rate": 2.912621359223301e-06, "loss": 0.7768, "step": 30 }, { "epoch": 0.0542907180385289, "grad_norm": 0.5156769156455994, "learning_rate": 3.0097087378640778e-06, "loss": 0.77, "step": 31 }, { "epoch": 0.05604203152364273, "grad_norm": 0.4538262188434601, "learning_rate": 3.1067961165048544e-06, "loss": 0.7759, "step": 32 }, { "epoch": 0.05779334500875657, "grad_norm": 0.45355334877967834, "learning_rate": 3.2038834951456315e-06, "loss": 0.761, "step": 33 }, { "epoch": 0.0595446584938704, "grad_norm": 0.5108784437179565, "learning_rate": 3.300970873786408e-06, "loss": 0.7666, "step": 34 }, { "epoch": 0.06129597197898424, "grad_norm": 0.48184970021247864, "learning_rate": 3.398058252427185e-06, "loss": 0.7579, "step": 35 }, { "epoch": 0.06304728546409807, "grad_norm": 0.45717641711235046, "learning_rate": 3.4951456310679615e-06, "loss": 0.7446, "step": 36 }, { "epoch": 0.0647985989492119, "grad_norm": 0.4016164541244507, "learning_rate": 3.592233009708738e-06, "loss": 0.7442, "step": 37 }, { "epoch": 0.06654991243432574, "grad_norm": 0.3465012311935425, "learning_rate": 3.689320388349515e-06, "loss": 0.7462, "step": 38 }, { "epoch": 0.06830122591943957, "grad_norm": 0.35492774844169617, "learning_rate": 3.7864077669902915e-06, "loss": 0.7364, "step": 39 }, { "epoch": 0.07005253940455342, "grad_norm": 0.38788941502571106, "learning_rate": 3.883495145631068e-06, "loss": 0.7514, "step": 40 }, { "epoch": 0.07180385288966724, "grad_norm": 0.38771378993988037, "learning_rate": 3.980582524271845e-06, "loss": 0.7149, "step": 41 }, { "epoch": 0.07355516637478109, "grad_norm": 0.3979766368865967, "learning_rate": 4.0776699029126215e-06, "loss": 0.7142, "step": 42 }, { "epoch": 0.07530647985989491, "grad_norm": 0.33620333671569824, "learning_rate": 4.1747572815533986e-06, "loss": 0.7121, "step": 43 }, { "epoch": 0.07705779334500876, "grad_norm": 0.31666654348373413, "learning_rate": 4.271844660194175e-06, "loss": 0.7045, "step": 44 }, { "epoch": 0.07880910683012259, "grad_norm": 0.3593182861804962, "learning_rate": 4.368932038834952e-06, "loss": 0.7197, "step": 45 }, { "epoch": 0.08056042031523643, "grad_norm": 0.3974195122718811, "learning_rate": 4.466019417475729e-06, "loss": 0.7014, "step": 46 }, { "epoch": 0.08231173380035026, "grad_norm": 0.34400108456611633, "learning_rate": 4.563106796116505e-06, "loss": 0.6834, "step": 47 }, { "epoch": 0.0840630472854641, "grad_norm": 0.3082146644592285, "learning_rate": 4.660194174757282e-06, "loss": 0.7008, "step": 48 }, { "epoch": 0.08581436077057793, "grad_norm": 0.29610538482666016, "learning_rate": 4.7572815533980585e-06, "loss": 0.7127, "step": 49 }, { "epoch": 0.08756567425569177, "grad_norm": 0.3143293559551239, "learning_rate": 4.854368932038836e-06, "loss": 0.6796, "step": 50 }, { "epoch": 0.0893169877408056, "grad_norm": 0.30066683888435364, "learning_rate": 4.951456310679612e-06, "loss": 0.6927, "step": 51 }, { "epoch": 0.09106830122591944, "grad_norm": 0.28887683153152466, "learning_rate": 5.048543689320389e-06, "loss": 0.6915, "step": 52 }, { "epoch": 0.09281961471103327, "grad_norm": 0.2595258057117462, "learning_rate": 5.145631067961165e-06, "loss": 0.6826, "step": 53 }, { "epoch": 0.09457092819614711, "grad_norm": 0.25768303871154785, "learning_rate": 5.242718446601942e-06, "loss": 0.69, "step": 54 }, { "epoch": 0.09632224168126094, "grad_norm": 0.2705790400505066, "learning_rate": 5.3398058252427185e-06, "loss": 0.6677, "step": 55 }, { "epoch": 0.09807355516637478, "grad_norm": 0.27472802996635437, "learning_rate": 5.436893203883496e-06, "loss": 0.6707, "step": 56 }, { "epoch": 0.09982486865148861, "grad_norm": 0.237682044506073, "learning_rate": 5.533980582524272e-06, "loss": 0.6692, "step": 57 }, { "epoch": 0.10157618213660245, "grad_norm": 0.2301778346300125, "learning_rate": 5.631067961165049e-06, "loss": 0.6748, "step": 58 }, { "epoch": 0.10332749562171628, "grad_norm": 0.2231488972902298, "learning_rate": 5.728155339805825e-06, "loss": 0.6643, "step": 59 }, { "epoch": 0.10507880910683012, "grad_norm": 0.24614176154136658, "learning_rate": 5.825242718446602e-06, "loss": 0.6633, "step": 60 }, { "epoch": 0.10683012259194395, "grad_norm": 0.25170809030532837, "learning_rate": 5.9223300970873785e-06, "loss": 0.6606, "step": 61 }, { "epoch": 0.1085814360770578, "grad_norm": 0.2277362048625946, "learning_rate": 6.0194174757281556e-06, "loss": 0.6645, "step": 62 }, { "epoch": 0.11033274956217162, "grad_norm": 0.21989953517913818, "learning_rate": 6.116504854368932e-06, "loss": 0.6524, "step": 63 }, { "epoch": 0.11208406304728546, "grad_norm": 0.22444699704647064, "learning_rate": 6.213592233009709e-06, "loss": 0.659, "step": 64 }, { "epoch": 0.1138353765323993, "grad_norm": 0.2281796634197235, "learning_rate": 6.310679611650487e-06, "loss": 0.6521, "step": 65 }, { "epoch": 0.11558669001751314, "grad_norm": 0.21072547137737274, "learning_rate": 6.407766990291263e-06, "loss": 0.6591, "step": 66 }, { "epoch": 0.11733800350262696, "grad_norm": 0.21704071760177612, "learning_rate": 6.50485436893204e-06, "loss": 0.654, "step": 67 }, { "epoch": 0.1190893169877408, "grad_norm": 0.21867427229881287, "learning_rate": 6.601941747572816e-06, "loss": 0.6551, "step": 68 }, { "epoch": 0.12084063047285463, "grad_norm": 0.20939725637435913, "learning_rate": 6.6990291262135935e-06, "loss": 0.6436, "step": 69 }, { "epoch": 0.12259194395796848, "grad_norm": 0.2344408929347992, "learning_rate": 6.79611650485437e-06, "loss": 0.6551, "step": 70 }, { "epoch": 0.1243432574430823, "grad_norm": 0.1880216747522354, "learning_rate": 6.893203883495147e-06, "loss": 0.6568, "step": 71 }, { "epoch": 0.12609457092819615, "grad_norm": 0.198270782828331, "learning_rate": 6.990291262135923e-06, "loss": 0.636, "step": 72 }, { "epoch": 0.12784588441331, "grad_norm": 0.21382354199886322, "learning_rate": 7.0873786407767e-06, "loss": 0.6458, "step": 73 }, { "epoch": 0.1295971978984238, "grad_norm": 0.22019973397254944, "learning_rate": 7.184466019417476e-06, "loss": 0.6452, "step": 74 }, { "epoch": 0.13134851138353765, "grad_norm": 0.23420555889606476, "learning_rate": 7.2815533980582534e-06, "loss": 0.6392, "step": 75 }, { "epoch": 0.1330998248686515, "grad_norm": 0.21380138397216797, "learning_rate": 7.37864077669903e-06, "loss": 0.6426, "step": 76 }, { "epoch": 0.13485113835376533, "grad_norm": 0.199965700507164, "learning_rate": 7.475728155339807e-06, "loss": 0.644, "step": 77 }, { "epoch": 0.13660245183887915, "grad_norm": 0.20081277191638947, "learning_rate": 7.572815533980583e-06, "loss": 0.6406, "step": 78 }, { "epoch": 0.138353765323993, "grad_norm": 0.18999333679676056, "learning_rate": 7.66990291262136e-06, "loss": 0.6364, "step": 79 }, { "epoch": 0.14010507880910683, "grad_norm": 0.19302135705947876, "learning_rate": 7.766990291262136e-06, "loss": 0.6419, "step": 80 }, { "epoch": 0.14185639229422067, "grad_norm": 0.21235792338848114, "learning_rate": 7.864077669902913e-06, "loss": 0.6509, "step": 81 }, { "epoch": 0.1436077057793345, "grad_norm": 0.19387727975845337, "learning_rate": 7.96116504854369e-06, "loss": 0.65, "step": 82 }, { "epoch": 0.14535901926444833, "grad_norm": 0.1971002072095871, "learning_rate": 8.058252427184466e-06, "loss": 0.6298, "step": 83 }, { "epoch": 0.14711033274956217, "grad_norm": 0.1986115276813507, "learning_rate": 8.155339805825243e-06, "loss": 0.6234, "step": 84 }, { "epoch": 0.14886164623467601, "grad_norm": 0.19286580383777618, "learning_rate": 8.25242718446602e-06, "loss": 0.6374, "step": 85 }, { "epoch": 0.15061295971978983, "grad_norm": 0.20243774354457855, "learning_rate": 8.349514563106797e-06, "loss": 0.6324, "step": 86 }, { "epoch": 0.15236427320490367, "grad_norm": 0.1975741684436798, "learning_rate": 8.446601941747573e-06, "loss": 0.6156, "step": 87 }, { "epoch": 0.15411558669001751, "grad_norm": 0.19458846747875214, "learning_rate": 8.54368932038835e-06, "loss": 0.6243, "step": 88 }, { "epoch": 0.15586690017513136, "grad_norm": 0.19080938398838043, "learning_rate": 8.640776699029127e-06, "loss": 0.6251, "step": 89 }, { "epoch": 0.15761821366024517, "grad_norm": 0.20771424472332, "learning_rate": 8.737864077669904e-06, "loss": 0.6191, "step": 90 }, { "epoch": 0.159369527145359, "grad_norm": 0.20744624733924866, "learning_rate": 8.834951456310681e-06, "loss": 0.6159, "step": 91 }, { "epoch": 0.16112084063047286, "grad_norm": 0.21205607056617737, "learning_rate": 8.932038834951458e-06, "loss": 0.6325, "step": 92 }, { "epoch": 0.1628721541155867, "grad_norm": 0.2003038227558136, "learning_rate": 9.029126213592233e-06, "loss": 0.6052, "step": 93 }, { "epoch": 0.1646234676007005, "grad_norm": 0.2042955607175827, "learning_rate": 9.12621359223301e-06, "loss": 0.6209, "step": 94 }, { "epoch": 0.16637478108581435, "grad_norm": 0.2008165568113327, "learning_rate": 9.223300970873788e-06, "loss": 0.6101, "step": 95 }, { "epoch": 0.1681260945709282, "grad_norm": 0.2303919643163681, "learning_rate": 9.320388349514565e-06, "loss": 0.6127, "step": 96 }, { "epoch": 0.16987740805604204, "grad_norm": 0.20745767652988434, "learning_rate": 9.41747572815534e-06, "loss": 0.6175, "step": 97 }, { "epoch": 0.17162872154115585, "grad_norm": 0.21293964982032776, "learning_rate": 9.514563106796117e-06, "loss": 0.6167, "step": 98 }, { "epoch": 0.1733800350262697, "grad_norm": 0.19930484890937805, "learning_rate": 9.611650485436894e-06, "loss": 0.6113, "step": 99 }, { "epoch": 0.17513134851138354, "grad_norm": 0.22008217871189117, "learning_rate": 9.708737864077671e-06, "loss": 0.6087, "step": 100 }, { "epoch": 0.17688266199649738, "grad_norm": 0.20740891993045807, "learning_rate": 9.805825242718447e-06, "loss": 0.6097, "step": 101 }, { "epoch": 0.1786339754816112, "grad_norm": 0.23654663562774658, "learning_rate": 9.902912621359224e-06, "loss": 0.6077, "step": 102 }, { "epoch": 0.18038528896672504, "grad_norm": 0.23930685222148895, "learning_rate": 1e-05, "loss": 0.6065, "step": 103 }, { "epoch": 0.18213660245183888, "grad_norm": 0.2338990569114685, "learning_rate": 9.999997765506791e-06, "loss": 0.6129, "step": 104 }, { "epoch": 0.18388791593695272, "grad_norm": 0.24303226172924042, "learning_rate": 9.99999106202916e-06, "loss": 0.6184, "step": 105 }, { "epoch": 0.18563922942206654, "grad_norm": 0.2709009647369385, "learning_rate": 9.9999798895731e-06, "loss": 0.5994, "step": 106 }, { "epoch": 0.18739054290718038, "grad_norm": 0.23080910742282867, "learning_rate": 9.999964248148597e-06, "loss": 0.6175, "step": 107 }, { "epoch": 0.18914185639229422, "grad_norm": 0.26710715889930725, "learning_rate": 9.999944137769629e-06, "loss": 0.5936, "step": 108 }, { "epoch": 0.19089316987740806, "grad_norm": 0.2665548324584961, "learning_rate": 9.999919558454171e-06, "loss": 0.6185, "step": 109 }, { "epoch": 0.19264448336252188, "grad_norm": 0.23193393647670746, "learning_rate": 9.999890510224195e-06, "loss": 0.6032, "step": 110 }, { "epoch": 0.19439579684763572, "grad_norm": 0.22096823155879974, "learning_rate": 9.999856993105661e-06, "loss": 0.5997, "step": 111 }, { "epoch": 0.19614711033274956, "grad_norm": 0.22147373855113983, "learning_rate": 9.999819007128527e-06, "loss": 0.5971, "step": 112 }, { "epoch": 0.1978984238178634, "grad_norm": 0.23360490798950195, "learning_rate": 9.999776552326747e-06, "loss": 0.6108, "step": 113 }, { "epoch": 0.19964973730297722, "grad_norm": 0.26431509852409363, "learning_rate": 9.999729628738263e-06, "loss": 0.6029, "step": 114 }, { "epoch": 0.20140105078809106, "grad_norm": 0.22472847998142242, "learning_rate": 9.99967823640502e-06, "loss": 0.5997, "step": 115 }, { "epoch": 0.2031523642732049, "grad_norm": 0.23647689819335938, "learning_rate": 9.99962237537295e-06, "loss": 0.6024, "step": 116 }, { "epoch": 0.20490367775831875, "grad_norm": 0.23935003578662872, "learning_rate": 9.999562045691979e-06, "loss": 0.5919, "step": 117 }, { "epoch": 0.20665499124343256, "grad_norm": 0.24106818437576294, "learning_rate": 9.999497247416035e-06, "loss": 0.5935, "step": 118 }, { "epoch": 0.2084063047285464, "grad_norm": 0.24979829788208008, "learning_rate": 9.99942798060303e-06, "loss": 0.6008, "step": 119 }, { "epoch": 0.21015761821366025, "grad_norm": 0.26320943236351013, "learning_rate": 9.999354245314874e-06, "loss": 0.5883, "step": 120 }, { "epoch": 0.2119089316987741, "grad_norm": 0.2361680567264557, "learning_rate": 9.999276041617477e-06, "loss": 0.5937, "step": 121 }, { "epoch": 0.2136602451838879, "grad_norm": 0.24374207854270935, "learning_rate": 9.999193369580729e-06, "loss": 0.6079, "step": 122 }, { "epoch": 0.21541155866900175, "grad_norm": 0.24115914106369019, "learning_rate": 9.999106229278531e-06, "loss": 0.6055, "step": 123 }, { "epoch": 0.2171628721541156, "grad_norm": 0.25526124238967896, "learning_rate": 9.99901462078876e-06, "loss": 0.6085, "step": 124 }, { "epoch": 0.21891418563922943, "grad_norm": 0.21939681470394135, "learning_rate": 9.998918544193303e-06, "loss": 0.5971, "step": 125 }, { "epoch": 0.22066549912434325, "grad_norm": 0.24003750085830688, "learning_rate": 9.998817999578028e-06, "loss": 0.5781, "step": 126 }, { "epoch": 0.2224168126094571, "grad_norm": 0.23765799403190613, "learning_rate": 9.998712987032803e-06, "loss": 0.5895, "step": 127 }, { "epoch": 0.22416812609457093, "grad_norm": 0.2507164180278778, "learning_rate": 9.99860350665149e-06, "loss": 0.5937, "step": 128 }, { "epoch": 0.22591943957968477, "grad_norm": 0.2620762288570404, "learning_rate": 9.99848955853194e-06, "loss": 0.5811, "step": 129 }, { "epoch": 0.2276707530647986, "grad_norm": 0.24096980690956116, "learning_rate": 9.998371142776e-06, "loss": 0.5947, "step": 130 }, { "epoch": 0.22942206654991243, "grad_norm": 0.26118066906929016, "learning_rate": 9.998248259489509e-06, "loss": 0.6042, "step": 131 }, { "epoch": 0.23117338003502627, "grad_norm": 0.2633346915245056, "learning_rate": 9.998120908782301e-06, "loss": 0.5891, "step": 132 }, { "epoch": 0.2329246935201401, "grad_norm": 0.28693127632141113, "learning_rate": 9.9979890907682e-06, "loss": 0.5918, "step": 133 }, { "epoch": 0.23467600700525393, "grad_norm": 0.2802914083003998, "learning_rate": 9.997852805565026e-06, "loss": 0.5824, "step": 134 }, { "epoch": 0.23642732049036777, "grad_norm": 0.27514660358428955, "learning_rate": 9.99771205329459e-06, "loss": 0.5804, "step": 135 }, { "epoch": 0.2381786339754816, "grad_norm": 0.2574217617511749, "learning_rate": 9.997566834082697e-06, "loss": 0.5856, "step": 136 }, { "epoch": 0.23992994746059546, "grad_norm": 0.26858755946159363, "learning_rate": 9.997417148059142e-06, "loss": 0.5873, "step": 137 }, { "epoch": 0.24168126094570927, "grad_norm": 0.29078754782676697, "learning_rate": 9.997262995357714e-06, "loss": 0.584, "step": 138 }, { "epoch": 0.2434325744308231, "grad_norm": 0.2492205649614334, "learning_rate": 9.997104376116195e-06, "loss": 0.5871, "step": 139 }, { "epoch": 0.24518388791593695, "grad_norm": 0.2627987563610077, "learning_rate": 9.996941290476359e-06, "loss": 0.5921, "step": 140 }, { "epoch": 0.2469352014010508, "grad_norm": 0.24301783740520477, "learning_rate": 9.99677373858397e-06, "loss": 0.5826, "step": 141 }, { "epoch": 0.2486865148861646, "grad_norm": 0.29153090715408325, "learning_rate": 9.996601720588787e-06, "loss": 0.5871, "step": 142 }, { "epoch": 0.2504378283712785, "grad_norm": 0.2720746397972107, "learning_rate": 9.996425236644558e-06, "loss": 0.5915, "step": 143 }, { "epoch": 0.2521891418563923, "grad_norm": 0.2785452902317047, "learning_rate": 9.996244286909022e-06, "loss": 0.5808, "step": 144 }, { "epoch": 0.2539404553415061, "grad_norm": 0.2903323769569397, "learning_rate": 9.996058871543917e-06, "loss": 0.5966, "step": 145 }, { "epoch": 0.25569176882662, "grad_norm": 0.29386812448501587, "learning_rate": 9.995868990714963e-06, "loss": 0.577, "step": 146 }, { "epoch": 0.2574430823117338, "grad_norm": 0.3283627927303314, "learning_rate": 9.995674644591873e-06, "loss": 0.5891, "step": 147 }, { "epoch": 0.2591943957968476, "grad_norm": 0.3268330991268158, "learning_rate": 9.995475833348359e-06, "loss": 0.5847, "step": 148 }, { "epoch": 0.2609457092819615, "grad_norm": 0.2840765416622162, "learning_rate": 9.99527255716211e-06, "loss": 0.5685, "step": 149 }, { "epoch": 0.2626970227670753, "grad_norm": 0.31424835324287415, "learning_rate": 9.99506481621482e-06, "loss": 0.5737, "step": 150 }, { "epoch": 0.26444833625218916, "grad_norm": 0.24279426038265228, "learning_rate": 9.994852610692166e-06, "loss": 0.5754, "step": 151 }, { "epoch": 0.266199649737303, "grad_norm": 0.3418554663658142, "learning_rate": 9.994635940783816e-06, "loss": 0.5716, "step": 152 }, { "epoch": 0.2679509632224168, "grad_norm": 0.24452657997608185, "learning_rate": 9.994414806683429e-06, "loss": 0.5741, "step": 153 }, { "epoch": 0.26970227670753066, "grad_norm": 0.32895058393478394, "learning_rate": 9.994189208588655e-06, "loss": 0.5774, "step": 154 }, { "epoch": 0.2714535901926445, "grad_norm": 0.2512442469596863, "learning_rate": 9.993959146701128e-06, "loss": 0.5921, "step": 155 }, { "epoch": 0.2732049036777583, "grad_norm": 0.31063804030418396, "learning_rate": 9.993724621226484e-06, "loss": 0.5757, "step": 156 }, { "epoch": 0.27495621716287216, "grad_norm": 0.2251133918762207, "learning_rate": 9.993485632374338e-06, "loss": 0.5715, "step": 157 }, { "epoch": 0.276707530647986, "grad_norm": 0.2714596092700958, "learning_rate": 9.993242180358298e-06, "loss": 0.5681, "step": 158 }, { "epoch": 0.27845884413309985, "grad_norm": 0.2522925138473511, "learning_rate": 9.992994265395959e-06, "loss": 0.5829, "step": 159 }, { "epoch": 0.28021015761821366, "grad_norm": 0.23874452710151672, "learning_rate": 9.992741887708908e-06, "loss": 0.5759, "step": 160 }, { "epoch": 0.2819614711033275, "grad_norm": 0.2989838123321533, "learning_rate": 9.99248504752272e-06, "loss": 0.5876, "step": 161 }, { "epoch": 0.28371278458844135, "grad_norm": 0.25998419523239136, "learning_rate": 9.992223745066959e-06, "loss": 0.5803, "step": 162 }, { "epoch": 0.28546409807355516, "grad_norm": 0.2334461659193039, "learning_rate": 9.991957980575172e-06, "loss": 0.5692, "step": 163 }, { "epoch": 0.287215411558669, "grad_norm": 0.2473168522119522, "learning_rate": 9.991687754284904e-06, "loss": 0.5676, "step": 164 }, { "epoch": 0.28896672504378285, "grad_norm": 0.28365078568458557, "learning_rate": 9.99141306643768e-06, "loss": 0.5628, "step": 165 }, { "epoch": 0.29071803852889666, "grad_norm": 0.25091469287872314, "learning_rate": 9.991133917279013e-06, "loss": 0.5789, "step": 166 }, { "epoch": 0.29246935201401053, "grad_norm": 0.2994779348373413, "learning_rate": 9.990850307058411e-06, "loss": 0.576, "step": 167 }, { "epoch": 0.29422066549912435, "grad_norm": 0.24791498482227325, "learning_rate": 9.99056223602936e-06, "loss": 0.5683, "step": 168 }, { "epoch": 0.29597197898423816, "grad_norm": 0.26695767045021057, "learning_rate": 9.990269704449338e-06, "loss": 0.5676, "step": 169 }, { "epoch": 0.29772329246935203, "grad_norm": 0.23262614011764526, "learning_rate": 9.989972712579809e-06, "loss": 0.5678, "step": 170 }, { "epoch": 0.29947460595446584, "grad_norm": 0.23438610136508942, "learning_rate": 9.989671260686225e-06, "loss": 0.5724, "step": 171 }, { "epoch": 0.30122591943957966, "grad_norm": 0.23405811190605164, "learning_rate": 9.989365349038019e-06, "loss": 0.5775, "step": 172 }, { "epoch": 0.30297723292469353, "grad_norm": 0.24892154335975647, "learning_rate": 9.98905497790862e-06, "loss": 0.5769, "step": 173 }, { "epoch": 0.30472854640980734, "grad_norm": 0.2552638351917267, "learning_rate": 9.98874014757543e-06, "loss": 0.5739, "step": 174 }, { "epoch": 0.3064798598949212, "grad_norm": 0.2572234272956848, "learning_rate": 9.98842085831985e-06, "loss": 0.5749, "step": 175 }, { "epoch": 0.30823117338003503, "grad_norm": 0.25847917795181274, "learning_rate": 9.988097110427255e-06, "loss": 0.5686, "step": 176 }, { "epoch": 0.30998248686514884, "grad_norm": 0.25293731689453125, "learning_rate": 9.987768904187014e-06, "loss": 0.5706, "step": 177 }, { "epoch": 0.3117338003502627, "grad_norm": 0.24121694266796112, "learning_rate": 9.987436239892472e-06, "loss": 0.5705, "step": 178 }, { "epoch": 0.3134851138353765, "grad_norm": 0.29280948638916016, "learning_rate": 9.987099117840969e-06, "loss": 0.5865, "step": 179 }, { "epoch": 0.31523642732049034, "grad_norm": 0.28841423988342285, "learning_rate": 9.986757538333817e-06, "loss": 0.5698, "step": 180 }, { "epoch": 0.3169877408056042, "grad_norm": 0.26757946610450745, "learning_rate": 9.986411501676327e-06, "loss": 0.5669, "step": 181 }, { "epoch": 0.318739054290718, "grad_norm": 0.2654379904270172, "learning_rate": 9.986061008177779e-06, "loss": 0.5791, "step": 182 }, { "epoch": 0.3204903677758319, "grad_norm": 0.24858781695365906, "learning_rate": 9.985706058151446e-06, "loss": 0.5614, "step": 183 }, { "epoch": 0.3222416812609457, "grad_norm": 0.25888592004776, "learning_rate": 9.985346651914581e-06, "loss": 0.574, "step": 184 }, { "epoch": 0.3239929947460595, "grad_norm": 0.2842358946800232, "learning_rate": 9.98498278978842e-06, "loss": 0.5633, "step": 185 }, { "epoch": 0.3257443082311734, "grad_norm": 0.2566906809806824, "learning_rate": 9.984614472098181e-06, "loss": 0.5753, "step": 186 }, { "epoch": 0.3274956217162872, "grad_norm": 0.2775246202945709, "learning_rate": 9.984241699173069e-06, "loss": 0.562, "step": 187 }, { "epoch": 0.329246935201401, "grad_norm": 0.267216295003891, "learning_rate": 9.983864471346263e-06, "loss": 0.5598, "step": 188 }, { "epoch": 0.3309982486865149, "grad_norm": 0.24555706977844238, "learning_rate": 9.983482788954931e-06, "loss": 0.5728, "step": 189 }, { "epoch": 0.3327495621716287, "grad_norm": 0.25659212470054626, "learning_rate": 9.983096652340219e-06, "loss": 0.5849, "step": 190 }, { "epoch": 0.3345008756567426, "grad_norm": 0.27766188979148865, "learning_rate": 9.982706061847254e-06, "loss": 0.5753, "step": 191 }, { "epoch": 0.3362521891418564, "grad_norm": 0.31068143248558044, "learning_rate": 9.982311017825145e-06, "loss": 0.5703, "step": 192 }, { "epoch": 0.3380035026269702, "grad_norm": 0.2504432797431946, "learning_rate": 9.981911520626983e-06, "loss": 0.555, "step": 193 }, { "epoch": 0.3397548161120841, "grad_norm": 0.3018089532852173, "learning_rate": 9.981507570609837e-06, "loss": 0.5604, "step": 194 }, { "epoch": 0.3415061295971979, "grad_norm": 0.26155000925064087, "learning_rate": 9.981099168134753e-06, "loss": 0.5725, "step": 195 }, { "epoch": 0.3432574430823117, "grad_norm": 0.26130011677742004, "learning_rate": 9.980686313566765e-06, "loss": 0.5626, "step": 196 }, { "epoch": 0.3450087565674256, "grad_norm": 0.2927478849887848, "learning_rate": 9.980269007274878e-06, "loss": 0.5695, "step": 197 }, { "epoch": 0.3467600700525394, "grad_norm": 0.3368290662765503, "learning_rate": 9.97984724963208e-06, "loss": 0.5832, "step": 198 }, { "epoch": 0.34851138353765326, "grad_norm": 0.28618425130844116, "learning_rate": 9.979421041015336e-06, "loss": 0.5689, "step": 199 }, { "epoch": 0.3502626970227671, "grad_norm": 0.3093123435974121, "learning_rate": 9.978990381805593e-06, "loss": 0.5611, "step": 200 }, { "epoch": 0.3520140105078809, "grad_norm": 0.3169271647930145, "learning_rate": 9.978555272387771e-06, "loss": 0.56, "step": 201 }, { "epoch": 0.35376532399299476, "grad_norm": 0.3340418040752411, "learning_rate": 9.978115713150768e-06, "loss": 0.569, "step": 202 }, { "epoch": 0.3555166374781086, "grad_norm": 0.282405287027359, "learning_rate": 9.977671704487465e-06, "loss": 0.5663, "step": 203 }, { "epoch": 0.3572679509632224, "grad_norm": 0.29856857657432556, "learning_rate": 9.977223246794712e-06, "loss": 0.575, "step": 204 }, { "epoch": 0.35901926444833626, "grad_norm": 0.3150472640991211, "learning_rate": 9.97677034047334e-06, "loss": 0.568, "step": 205 }, { "epoch": 0.3607705779334501, "grad_norm": 0.3057059943675995, "learning_rate": 9.976312985928159e-06, "loss": 0.5698, "step": 206 }, { "epoch": 0.36252189141856395, "grad_norm": 0.2924758791923523, "learning_rate": 9.975851183567945e-06, "loss": 0.5611, "step": 207 }, { "epoch": 0.36427320490367776, "grad_norm": 0.27792227268218994, "learning_rate": 9.975384933805461e-06, "loss": 0.5761, "step": 208 }, { "epoch": 0.3660245183887916, "grad_norm": 0.2885139584541321, "learning_rate": 9.974914237057435e-06, "loss": 0.5784, "step": 209 }, { "epoch": 0.36777583187390545, "grad_norm": 0.25505802035331726, "learning_rate": 9.974439093744581e-06, "loss": 0.5693, "step": 210 }, { "epoch": 0.36952714535901926, "grad_norm": 0.29688501358032227, "learning_rate": 9.973959504291574e-06, "loss": 0.5653, "step": 211 }, { "epoch": 0.3712784588441331, "grad_norm": 0.2540914714336395, "learning_rate": 9.973475469127074e-06, "loss": 0.5601, "step": 212 }, { "epoch": 0.37302977232924694, "grad_norm": 0.304522305727005, "learning_rate": 9.972986988683711e-06, "loss": 0.5495, "step": 213 }, { "epoch": 0.37478108581436076, "grad_norm": 0.24643920361995697, "learning_rate": 9.972494063398083e-06, "loss": 0.5454, "step": 214 }, { "epoch": 0.37653239929947463, "grad_norm": 0.2849683463573456, "learning_rate": 9.971996693710768e-06, "loss": 0.573, "step": 215 }, { "epoch": 0.37828371278458844, "grad_norm": 0.28706809878349304, "learning_rate": 9.971494880066315e-06, "loss": 0.557, "step": 216 }, { "epoch": 0.38003502626970226, "grad_norm": 0.2782764434814453, "learning_rate": 9.97098862291324e-06, "loss": 0.5615, "step": 217 }, { "epoch": 0.38178633975481613, "grad_norm": 0.2530321776866913, "learning_rate": 9.97047792270404e-06, "loss": 0.5748, "step": 218 }, { "epoch": 0.38353765323992994, "grad_norm": 0.2818725109100342, "learning_rate": 9.969962779895172e-06, "loss": 0.5567, "step": 219 }, { "epoch": 0.38528896672504376, "grad_norm": 0.24282576143741608, "learning_rate": 9.96944319494707e-06, "loss": 0.564, "step": 220 }, { "epoch": 0.38704028021015763, "grad_norm": 0.2554701268672943, "learning_rate": 9.96891916832414e-06, "loss": 0.5675, "step": 221 }, { "epoch": 0.38879159369527144, "grad_norm": 0.2486743927001953, "learning_rate": 9.968390700494755e-06, "loss": 0.5581, "step": 222 }, { "epoch": 0.3905429071803853, "grad_norm": 0.26184117794036865, "learning_rate": 9.967857791931257e-06, "loss": 0.5574, "step": 223 }, { "epoch": 0.3922942206654991, "grad_norm": 0.25980809330940247, "learning_rate": 9.967320443109958e-06, "loss": 0.5623, "step": 224 }, { "epoch": 0.39404553415061294, "grad_norm": 0.31790444254875183, "learning_rate": 9.966778654511143e-06, "loss": 0.5727, "step": 225 }, { "epoch": 0.3957968476357268, "grad_norm": 0.28591448068618774, "learning_rate": 9.966232426619054e-06, "loss": 0.5518, "step": 226 }, { "epoch": 0.3975481611208406, "grad_norm": 0.25516653060913086, "learning_rate": 9.965681759921912e-06, "loss": 0.57, "step": 227 }, { "epoch": 0.39929947460595444, "grad_norm": 0.2496168464422226, "learning_rate": 9.965126654911904e-06, "loss": 0.5597, "step": 228 }, { "epoch": 0.4010507880910683, "grad_norm": 0.24498221278190613, "learning_rate": 9.964567112085175e-06, "loss": 0.5552, "step": 229 }, { "epoch": 0.4028021015761821, "grad_norm": 0.26579558849334717, "learning_rate": 9.96400313194185e-06, "loss": 0.5658, "step": 230 }, { "epoch": 0.404553415061296, "grad_norm": 0.24160237610340118, "learning_rate": 9.963434714986006e-06, "loss": 0.5552, "step": 231 }, { "epoch": 0.4063047285464098, "grad_norm": 0.2752656042575836, "learning_rate": 9.962861861725698e-06, "loss": 0.561, "step": 232 }, { "epoch": 0.4080560420315236, "grad_norm": 0.2843784689903259, "learning_rate": 9.96228457267294e-06, "loss": 0.5617, "step": 233 }, { "epoch": 0.4098073555166375, "grad_norm": 0.25715509057044983, "learning_rate": 9.961702848343709e-06, "loss": 0.5627, "step": 234 }, { "epoch": 0.4115586690017513, "grad_norm": 0.2647821307182312, "learning_rate": 9.961116689257949e-06, "loss": 0.5634, "step": 235 }, { "epoch": 0.4133099824868651, "grad_norm": 0.2796805202960968, "learning_rate": 9.960526095939566e-06, "loss": 0.5584, "step": 236 }, { "epoch": 0.415061295971979, "grad_norm": 0.24035733938217163, "learning_rate": 9.959931068916436e-06, "loss": 0.5666, "step": 237 }, { "epoch": 0.4168126094570928, "grad_norm": 0.25458523631095886, "learning_rate": 9.959331608720386e-06, "loss": 0.5542, "step": 238 }, { "epoch": 0.4185639229422067, "grad_norm": 0.2417057454586029, "learning_rate": 9.958727715887218e-06, "loss": 0.5574, "step": 239 }, { "epoch": 0.4203152364273205, "grad_norm": 0.2655900716781616, "learning_rate": 9.958119390956685e-06, "loss": 0.5489, "step": 240 }, { "epoch": 0.4220665499124343, "grad_norm": 0.22494421899318695, "learning_rate": 9.95750663447251e-06, "loss": 0.5515, "step": 241 }, { "epoch": 0.4238178633975482, "grad_norm": 0.26922833919525146, "learning_rate": 9.956889446982369e-06, "loss": 0.5555, "step": 242 }, { "epoch": 0.425569176882662, "grad_norm": 0.22891448438167572, "learning_rate": 9.956267829037906e-06, "loss": 0.556, "step": 243 }, { "epoch": 0.4273204903677758, "grad_norm": 0.314229816198349, "learning_rate": 9.95564178119472e-06, "loss": 0.5606, "step": 244 }, { "epoch": 0.4290718038528897, "grad_norm": 0.24712178111076355, "learning_rate": 9.95501130401237e-06, "loss": 0.5511, "step": 245 }, { "epoch": 0.4308231173380035, "grad_norm": 0.32478204369544983, "learning_rate": 9.954376398054377e-06, "loss": 0.5449, "step": 246 }, { "epoch": 0.43257443082311736, "grad_norm": 0.24862055480480194, "learning_rate": 9.953737063888216e-06, "loss": 0.5358, "step": 247 }, { "epoch": 0.4343257443082312, "grad_norm": 0.28171032667160034, "learning_rate": 9.953093302085324e-06, "loss": 0.5518, "step": 248 }, { "epoch": 0.436077057793345, "grad_norm": 0.27398228645324707, "learning_rate": 9.952445113221093e-06, "loss": 0.5507, "step": 249 }, { "epoch": 0.43782837127845886, "grad_norm": 0.25380590558052063, "learning_rate": 9.95179249787487e-06, "loss": 0.5455, "step": 250 }, { "epoch": 0.4395796847635727, "grad_norm": 0.2990418076515198, "learning_rate": 9.951135456629966e-06, "loss": 0.5651, "step": 251 }, { "epoch": 0.4413309982486865, "grad_norm": 0.25516533851623535, "learning_rate": 9.950473990073637e-06, "loss": 0.5657, "step": 252 }, { "epoch": 0.44308231173380036, "grad_norm": 0.32216688990592957, "learning_rate": 9.949808098797104e-06, "loss": 0.5556, "step": 253 }, { "epoch": 0.4448336252189142, "grad_norm": 0.24693478643894196, "learning_rate": 9.949137783395537e-06, "loss": 0.553, "step": 254 }, { "epoch": 0.44658493870402804, "grad_norm": 0.318928062915802, "learning_rate": 9.948463044468063e-06, "loss": 0.5564, "step": 255 }, { "epoch": 0.44833625218914186, "grad_norm": 0.2623842656612396, "learning_rate": 9.94778388261776e-06, "loss": 0.5532, "step": 256 }, { "epoch": 0.4500875656742557, "grad_norm": 0.33799979090690613, "learning_rate": 9.947100298451663e-06, "loss": 0.5514, "step": 257 }, { "epoch": 0.45183887915936954, "grad_norm": 0.2586771547794342, "learning_rate": 9.946412292580757e-06, "loss": 0.5636, "step": 258 }, { "epoch": 0.45359019264448336, "grad_norm": 0.29948994517326355, "learning_rate": 9.94571986561998e-06, "loss": 0.5498, "step": 259 }, { "epoch": 0.4553415061295972, "grad_norm": 0.26078349351882935, "learning_rate": 9.945023018188222e-06, "loss": 0.5489, "step": 260 }, { "epoch": 0.45709281961471104, "grad_norm": 0.30358684062957764, "learning_rate": 9.944321750908321e-06, "loss": 0.5577, "step": 261 }, { "epoch": 0.45884413309982486, "grad_norm": 0.2756328284740448, "learning_rate": 9.94361606440707e-06, "loss": 0.5473, "step": 262 }, { "epoch": 0.46059544658493873, "grad_norm": 0.3419603407382965, "learning_rate": 9.94290595931521e-06, "loss": 0.5562, "step": 263 }, { "epoch": 0.46234676007005254, "grad_norm": 0.26494550704956055, "learning_rate": 9.942191436267428e-06, "loss": 0.567, "step": 264 }, { "epoch": 0.46409807355516636, "grad_norm": 0.295282781124115, "learning_rate": 9.941472495902366e-06, "loss": 0.5709, "step": 265 }, { "epoch": 0.4658493870402802, "grad_norm": 0.3235480487346649, "learning_rate": 9.940749138862609e-06, "loss": 0.5586, "step": 266 }, { "epoch": 0.46760070052539404, "grad_norm": 0.261745423078537, "learning_rate": 9.940021365794692e-06, "loss": 0.5524, "step": 267 }, { "epoch": 0.46935201401050786, "grad_norm": 0.2950759828090668, "learning_rate": 9.939289177349097e-06, "loss": 0.5522, "step": 268 }, { "epoch": 0.4711033274956217, "grad_norm": 0.2707611620426178, "learning_rate": 9.938552574180252e-06, "loss": 0.5556, "step": 269 }, { "epoch": 0.47285464098073554, "grad_norm": 0.2857029438018799, "learning_rate": 9.937811556946531e-06, "loss": 0.5625, "step": 270 }, { "epoch": 0.4746059544658494, "grad_norm": 0.2616572380065918, "learning_rate": 9.937066126310253e-06, "loss": 0.5373, "step": 271 }, { "epoch": 0.4763572679509632, "grad_norm": 0.34570741653442383, "learning_rate": 9.936316282937682e-06, "loss": 0.5573, "step": 272 }, { "epoch": 0.47810858143607704, "grad_norm": 0.2870980203151703, "learning_rate": 9.935562027499026e-06, "loss": 0.5544, "step": 273 }, { "epoch": 0.4798598949211909, "grad_norm": 0.28473755717277527, "learning_rate": 9.934803360668435e-06, "loss": 0.5413, "step": 274 }, { "epoch": 0.4816112084063047, "grad_norm": 0.35125458240509033, "learning_rate": 9.934040283124006e-06, "loss": 0.5469, "step": 275 }, { "epoch": 0.48336252189141854, "grad_norm": 0.261339396238327, "learning_rate": 9.933272795547773e-06, "loss": 0.5476, "step": 276 }, { "epoch": 0.4851138353765324, "grad_norm": 0.30614277720451355, "learning_rate": 9.932500898625716e-06, "loss": 0.5535, "step": 277 }, { "epoch": 0.4868651488616462, "grad_norm": 0.327487587928772, "learning_rate": 9.931724593047754e-06, "loss": 0.5586, "step": 278 }, { "epoch": 0.4886164623467601, "grad_norm": 0.25929269194602966, "learning_rate": 9.930943879507748e-06, "loss": 0.5509, "step": 279 }, { "epoch": 0.4903677758318739, "grad_norm": 0.3318157494068146, "learning_rate": 9.930158758703495e-06, "loss": 0.563, "step": 280 }, { "epoch": 0.4921190893169877, "grad_norm": 0.29014113545417786, "learning_rate": 9.929369231336735e-06, "loss": 0.5485, "step": 281 }, { "epoch": 0.4938704028021016, "grad_norm": 0.2765292525291443, "learning_rate": 9.928575298113146e-06, "loss": 0.5642, "step": 282 }, { "epoch": 0.4956217162872154, "grad_norm": 0.28836140036582947, "learning_rate": 9.927776959742344e-06, "loss": 0.545, "step": 283 }, { "epoch": 0.4973730297723292, "grad_norm": 0.26135045289993286, "learning_rate": 9.92697421693788e-06, "loss": 0.5539, "step": 284 }, { "epoch": 0.4991243432574431, "grad_norm": 0.2688350975513458, "learning_rate": 9.926167070417243e-06, "loss": 0.5514, "step": 285 }, { "epoch": 0.500875656742557, "grad_norm": 0.2706112861633301, "learning_rate": 9.925355520901861e-06, "loss": 0.5594, "step": 286 }, { "epoch": 0.5026269702276708, "grad_norm": 0.2534312903881073, "learning_rate": 9.924539569117092e-06, "loss": 0.5468, "step": 287 }, { "epoch": 0.5043782837127846, "grad_norm": 0.2595043480396271, "learning_rate": 9.923719215792233e-06, "loss": 0.5563, "step": 288 }, { "epoch": 0.5061295971978984, "grad_norm": 0.2749403417110443, "learning_rate": 9.922894461660513e-06, "loss": 0.5474, "step": 289 }, { "epoch": 0.5078809106830122, "grad_norm": 0.26016855239868164, "learning_rate": 9.922065307459096e-06, "loss": 0.5454, "step": 290 }, { "epoch": 0.5096322241681261, "grad_norm": 0.31344085931777954, "learning_rate": 9.921231753929077e-06, "loss": 0.537, "step": 291 }, { "epoch": 0.51138353765324, "grad_norm": 0.2499876469373703, "learning_rate": 9.920393801815482e-06, "loss": 0.556, "step": 292 }, { "epoch": 0.5131348511383538, "grad_norm": 0.3453201949596405, "learning_rate": 9.919551451867275e-06, "loss": 0.556, "step": 293 }, { "epoch": 0.5148861646234676, "grad_norm": 0.2961561977863312, "learning_rate": 9.91870470483734e-06, "loss": 0.5526, "step": 294 }, { "epoch": 0.5166374781085814, "grad_norm": 0.2629326283931732, "learning_rate": 9.917853561482503e-06, "loss": 0.5462, "step": 295 }, { "epoch": 0.5183887915936952, "grad_norm": 0.29436981678009033, "learning_rate": 9.91699802256351e-06, "loss": 0.5393, "step": 296 }, { "epoch": 0.5201401050788091, "grad_norm": 0.2170306295156479, "learning_rate": 9.916138088845042e-06, "loss": 0.5399, "step": 297 }, { "epoch": 0.521891418563923, "grad_norm": 0.2699151337146759, "learning_rate": 9.915273761095701e-06, "loss": 0.5462, "step": 298 }, { "epoch": 0.5236427320490368, "grad_norm": 0.2514743208885193, "learning_rate": 9.914405040088026e-06, "loss": 0.5442, "step": 299 }, { "epoch": 0.5253940455341506, "grad_norm": 0.26019957661628723, "learning_rate": 9.913531926598475e-06, "loss": 0.5466, "step": 300 }, { "epoch": 0.5271453590192644, "grad_norm": 0.253452867269516, "learning_rate": 9.912654421407434e-06, "loss": 0.5447, "step": 301 }, { "epoch": 0.5288966725043783, "grad_norm": 0.26478469371795654, "learning_rate": 9.911772525299214e-06, "loss": 0.5363, "step": 302 }, { "epoch": 0.5306479859894921, "grad_norm": 0.26917457580566406, "learning_rate": 9.910886239062054e-06, "loss": 0.546, "step": 303 }, { "epoch": 0.532399299474606, "grad_norm": 0.30234187841415405, "learning_rate": 9.909995563488116e-06, "loss": 0.5453, "step": 304 }, { "epoch": 0.5341506129597198, "grad_norm": 0.28828272223472595, "learning_rate": 9.909100499373476e-06, "loss": 0.5585, "step": 305 }, { "epoch": 0.5359019264448336, "grad_norm": 0.3235272169113159, "learning_rate": 9.908201047518146e-06, "loss": 0.5477, "step": 306 }, { "epoch": 0.5376532399299475, "grad_norm": 0.29789993166923523, "learning_rate": 9.907297208726051e-06, "loss": 0.5424, "step": 307 }, { "epoch": 0.5394045534150613, "grad_norm": 0.2761630415916443, "learning_rate": 9.90638898380504e-06, "loss": 0.5492, "step": 308 }, { "epoch": 0.5411558669001751, "grad_norm": 0.24579960107803345, "learning_rate": 9.905476373566884e-06, "loss": 0.5422, "step": 309 }, { "epoch": 0.542907180385289, "grad_norm": 0.2819322347640991, "learning_rate": 9.90455937882727e-06, "loss": 0.5562, "step": 310 }, { "epoch": 0.5446584938704028, "grad_norm": 0.28800955414772034, "learning_rate": 9.903638000405805e-06, "loss": 0.5509, "step": 311 }, { "epoch": 0.5464098073555166, "grad_norm": 0.31411775946617126, "learning_rate": 9.902712239126016e-06, "loss": 0.5302, "step": 312 }, { "epoch": 0.5481611208406305, "grad_norm": 0.27875185012817383, "learning_rate": 9.901782095815343e-06, "loss": 0.559, "step": 313 }, { "epoch": 0.5499124343257443, "grad_norm": 0.2801899015903473, "learning_rate": 9.900847571305148e-06, "loss": 0.5432, "step": 314 }, { "epoch": 0.5516637478108581, "grad_norm": 0.29885923862457275, "learning_rate": 9.899908666430707e-06, "loss": 0.5533, "step": 315 }, { "epoch": 0.553415061295972, "grad_norm": 0.28866827487945557, "learning_rate": 9.898965382031208e-06, "loss": 0.553, "step": 316 }, { "epoch": 0.5551663747810858, "grad_norm": 0.29007112979888916, "learning_rate": 9.898017718949759e-06, "loss": 0.5531, "step": 317 }, { "epoch": 0.5569176882661997, "grad_norm": 0.3157470226287842, "learning_rate": 9.897065678033376e-06, "loss": 0.5459, "step": 318 }, { "epoch": 0.5586690017513135, "grad_norm": 0.28694310784339905, "learning_rate": 9.896109260132993e-06, "loss": 0.5438, "step": 319 }, { "epoch": 0.5604203152364273, "grad_norm": 0.3286214768886566, "learning_rate": 9.895148466103451e-06, "loss": 0.5378, "step": 320 }, { "epoch": 0.5621716287215411, "grad_norm": 0.25030264258384705, "learning_rate": 9.894183296803509e-06, "loss": 0.5403, "step": 321 }, { "epoch": 0.563922942206655, "grad_norm": 0.30412280559539795, "learning_rate": 9.893213753095829e-06, "loss": 0.5636, "step": 322 }, { "epoch": 0.5656742556917689, "grad_norm": 0.29335540533065796, "learning_rate": 9.892239835846988e-06, "loss": 0.5559, "step": 323 }, { "epoch": 0.5674255691768827, "grad_norm": 0.2787269055843353, "learning_rate": 9.891261545927469e-06, "loss": 0.543, "step": 324 }, { "epoch": 0.5691768826619965, "grad_norm": 0.2925263047218323, "learning_rate": 9.890278884211668e-06, "loss": 0.5552, "step": 325 }, { "epoch": 0.5709281961471103, "grad_norm": 0.23951222002506256, "learning_rate": 9.889291851577885e-06, "loss": 0.5377, "step": 326 }, { "epoch": 0.5726795096322241, "grad_norm": 0.3163376450538635, "learning_rate": 9.888300448908324e-06, "loss": 0.5398, "step": 327 }, { "epoch": 0.574430823117338, "grad_norm": 0.23217764496803284, "learning_rate": 9.8873046770891e-06, "loss": 0.5383, "step": 328 }, { "epoch": 0.5761821366024519, "grad_norm": 0.30196481943130493, "learning_rate": 9.88630453701023e-06, "loss": 0.5452, "step": 329 }, { "epoch": 0.5779334500875657, "grad_norm": 0.27252551913261414, "learning_rate": 9.88530002956564e-06, "loss": 0.5435, "step": 330 }, { "epoch": 0.5796847635726795, "grad_norm": 0.28413671255111694, "learning_rate": 9.884291155653151e-06, "loss": 0.5472, "step": 331 }, { "epoch": 0.5814360770577933, "grad_norm": 0.30083227157592773, "learning_rate": 9.883277916174496e-06, "loss": 0.5503, "step": 332 }, { "epoch": 0.5831873905429071, "grad_norm": 0.31956204771995544, "learning_rate": 9.882260312035304e-06, "loss": 0.5517, "step": 333 }, { "epoch": 0.5849387040280211, "grad_norm": 0.3508720099925995, "learning_rate": 9.881238344145106e-06, "loss": 0.5476, "step": 334 }, { "epoch": 0.5866900175131349, "grad_norm": 0.3125290274620056, "learning_rate": 9.880212013417334e-06, "loss": 0.5357, "step": 335 }, { "epoch": 0.5884413309982487, "grad_norm": 0.3368894159793854, "learning_rate": 9.879181320769322e-06, "loss": 0.5511, "step": 336 }, { "epoch": 0.5901926444833625, "grad_norm": 0.3274988532066345, "learning_rate": 9.878146267122298e-06, "loss": 0.5453, "step": 337 }, { "epoch": 0.5919439579684763, "grad_norm": 0.3618800640106201, "learning_rate": 9.877106853401392e-06, "loss": 0.5445, "step": 338 }, { "epoch": 0.5936952714535902, "grad_norm": 0.27040430903434753, "learning_rate": 9.876063080535627e-06, "loss": 0.5425, "step": 339 }, { "epoch": 0.5954465849387041, "grad_norm": 0.36068591475486755, "learning_rate": 9.875014949457926e-06, "loss": 0.5438, "step": 340 }, { "epoch": 0.5971978984238179, "grad_norm": 0.27858391404151917, "learning_rate": 9.873962461105104e-06, "loss": 0.5394, "step": 341 }, { "epoch": 0.5989492119089317, "grad_norm": 0.3052641451358795, "learning_rate": 9.872905616417875e-06, "loss": 0.5468, "step": 342 }, { "epoch": 0.6007005253940455, "grad_norm": 0.2868247628211975, "learning_rate": 9.87184441634084e-06, "loss": 0.5437, "step": 343 }, { "epoch": 0.6024518388791593, "grad_norm": 0.27847471833229065, "learning_rate": 9.870778861822502e-06, "loss": 0.5418, "step": 344 }, { "epoch": 0.6042031523642732, "grad_norm": 0.28269708156585693, "learning_rate": 9.869708953815247e-06, "loss": 0.5446, "step": 345 }, { "epoch": 0.6059544658493871, "grad_norm": 0.298794686794281, "learning_rate": 9.868634693275356e-06, "loss": 0.5365, "step": 346 }, { "epoch": 0.6077057793345009, "grad_norm": 0.29023054242134094, "learning_rate": 9.867556081163002e-06, "loss": 0.5378, "step": 347 }, { "epoch": 0.6094570928196147, "grad_norm": 0.3231043815612793, "learning_rate": 9.86647311844224e-06, "loss": 0.5416, "step": 348 }, { "epoch": 0.6112084063047285, "grad_norm": 0.27856680750846863, "learning_rate": 9.865385806081028e-06, "loss": 0.536, "step": 349 }, { "epoch": 0.6129597197898424, "grad_norm": 0.3531879186630249, "learning_rate": 9.864294145051197e-06, "loss": 0.5503, "step": 350 }, { "epoch": 0.6147110332749562, "grad_norm": 0.29704150557518005, "learning_rate": 9.863198136328474e-06, "loss": 0.5441, "step": 351 }, { "epoch": 0.6164623467600701, "grad_norm": 0.30443838238716125, "learning_rate": 9.862097780892463e-06, "loss": 0.5452, "step": 352 }, { "epoch": 0.6182136602451839, "grad_norm": 0.2533243000507355, "learning_rate": 9.860993079726665e-06, "loss": 0.5482, "step": 353 }, { "epoch": 0.6199649737302977, "grad_norm": 0.28299546241760254, "learning_rate": 9.859884033818454e-06, "loss": 0.56, "step": 354 }, { "epoch": 0.6217162872154116, "grad_norm": 0.2958698570728302, "learning_rate": 9.858770644159098e-06, "loss": 0.5374, "step": 355 }, { "epoch": 0.6234676007005254, "grad_norm": 0.254801869392395, "learning_rate": 9.857652911743734e-06, "loss": 0.5413, "step": 356 }, { "epoch": 0.6252189141856392, "grad_norm": 0.3303804099559784, "learning_rate": 9.856530837571394e-06, "loss": 0.5555, "step": 357 }, { "epoch": 0.626970227670753, "grad_norm": 0.2821517288684845, "learning_rate": 9.855404422644983e-06, "loss": 0.5424, "step": 358 }, { "epoch": 0.6287215411558669, "grad_norm": 0.2973978817462921, "learning_rate": 9.85427366797129e-06, "loss": 0.5432, "step": 359 }, { "epoch": 0.6304728546409807, "grad_norm": 0.26731741428375244, "learning_rate": 9.853138574560976e-06, "loss": 0.5352, "step": 360 }, { "epoch": 0.6322241681260946, "grad_norm": 0.22203773260116577, "learning_rate": 9.851999143428587e-06, "loss": 0.5336, "step": 361 }, { "epoch": 0.6339754816112084, "grad_norm": 0.3159387707710266, "learning_rate": 9.850855375592543e-06, "loss": 0.5512, "step": 362 }, { "epoch": 0.6357267950963222, "grad_norm": 0.24576880037784576, "learning_rate": 9.84970727207514e-06, "loss": 0.5376, "step": 363 }, { "epoch": 0.637478108581436, "grad_norm": 0.28789791464805603, "learning_rate": 9.848554833902551e-06, "loss": 0.5267, "step": 364 }, { "epoch": 0.6392294220665499, "grad_norm": 0.2839234471321106, "learning_rate": 9.847398062104823e-06, "loss": 0.5602, "step": 365 }, { "epoch": 0.6409807355516638, "grad_norm": 0.28113773465156555, "learning_rate": 9.846236957715872e-06, "loss": 0.5312, "step": 366 }, { "epoch": 0.6427320490367776, "grad_norm": 0.3353060781955719, "learning_rate": 9.845071521773492e-06, "loss": 0.5348, "step": 367 }, { "epoch": 0.6444833625218914, "grad_norm": 0.291412889957428, "learning_rate": 9.843901755319347e-06, "loss": 0.5376, "step": 368 }, { "epoch": 0.6462346760070052, "grad_norm": 0.31840208172798157, "learning_rate": 9.842727659398971e-06, "loss": 0.539, "step": 369 }, { "epoch": 0.647985989492119, "grad_norm": 0.30987051129341125, "learning_rate": 9.841549235061766e-06, "loss": 0.5535, "step": 370 }, { "epoch": 0.649737302977233, "grad_norm": 0.28143659234046936, "learning_rate": 9.840366483361008e-06, "loss": 0.5433, "step": 371 }, { "epoch": 0.6514886164623468, "grad_norm": 0.28234633803367615, "learning_rate": 9.839179405353834e-06, "loss": 0.5361, "step": 372 }, { "epoch": 0.6532399299474606, "grad_norm": 0.2790328860282898, "learning_rate": 9.837988002101253e-06, "loss": 0.5389, "step": 373 }, { "epoch": 0.6549912434325744, "grad_norm": 0.27249667048454285, "learning_rate": 9.836792274668135e-06, "loss": 0.5357, "step": 374 }, { "epoch": 0.6567425569176882, "grad_norm": 0.30348479747772217, "learning_rate": 9.835592224123221e-06, "loss": 0.5441, "step": 375 }, { "epoch": 0.658493870402802, "grad_norm": 0.28792864084243774, "learning_rate": 9.834387851539113e-06, "loss": 0.5441, "step": 376 }, { "epoch": 0.660245183887916, "grad_norm": 0.3008653521537781, "learning_rate": 9.833179157992274e-06, "loss": 0.5384, "step": 377 }, { "epoch": 0.6619964973730298, "grad_norm": 0.27208006381988525, "learning_rate": 9.831966144563032e-06, "loss": 0.5448, "step": 378 }, { "epoch": 0.6637478108581436, "grad_norm": 0.3308258056640625, "learning_rate": 9.830748812335576e-06, "loss": 0.5436, "step": 379 }, { "epoch": 0.6654991243432574, "grad_norm": 0.27882903814315796, "learning_rate": 9.829527162397951e-06, "loss": 0.529, "step": 380 }, { "epoch": 0.6672504378283712, "grad_norm": 0.3228173851966858, "learning_rate": 9.828301195842069e-06, "loss": 0.5522, "step": 381 }, { "epoch": 0.6690017513134852, "grad_norm": 0.26001131534576416, "learning_rate": 9.827070913763693e-06, "loss": 0.5418, "step": 382 }, { "epoch": 0.670753064798599, "grad_norm": 0.30818986892700195, "learning_rate": 9.825836317262445e-06, "loss": 0.5427, "step": 383 }, { "epoch": 0.6725043782837128, "grad_norm": 0.2951263189315796, "learning_rate": 9.824597407441805e-06, "loss": 0.5442, "step": 384 }, { "epoch": 0.6742556917688266, "grad_norm": 0.34286317229270935, "learning_rate": 9.823354185409108e-06, "loss": 0.5378, "step": 385 }, { "epoch": 0.6760070052539404, "grad_norm": 0.28890466690063477, "learning_rate": 9.822106652275541e-06, "loss": 0.5481, "step": 386 }, { "epoch": 0.6777583187390543, "grad_norm": 0.3289094567298889, "learning_rate": 9.820854809156146e-06, "loss": 0.5294, "step": 387 }, { "epoch": 0.6795096322241682, "grad_norm": 0.2500302493572235, "learning_rate": 9.819598657169819e-06, "loss": 0.5446, "step": 388 }, { "epoch": 0.681260945709282, "grad_norm": 0.27785786986351013, "learning_rate": 9.818338197439304e-06, "loss": 0.5304, "step": 389 }, { "epoch": 0.6830122591943958, "grad_norm": 0.25425100326538086, "learning_rate": 9.817073431091194e-06, "loss": 0.5341, "step": 390 }, { "epoch": 0.6847635726795096, "grad_norm": 0.28915366530418396, "learning_rate": 9.815804359255937e-06, "loss": 0.5315, "step": 391 }, { "epoch": 0.6865148861646234, "grad_norm": 0.2511161267757416, "learning_rate": 9.814530983067825e-06, "loss": 0.5521, "step": 392 }, { "epoch": 0.6882661996497373, "grad_norm": 0.3435809016227722, "learning_rate": 9.813253303664997e-06, "loss": 0.5481, "step": 393 }, { "epoch": 0.6900175131348512, "grad_norm": 0.2594046890735626, "learning_rate": 9.811971322189442e-06, "loss": 0.5436, "step": 394 }, { "epoch": 0.691768826619965, "grad_norm": 0.26637154817581177, "learning_rate": 9.810685039786989e-06, "loss": 0.5281, "step": 395 }, { "epoch": 0.6935201401050788, "grad_norm": 0.2868831157684326, "learning_rate": 9.809394457607315e-06, "loss": 0.5292, "step": 396 }, { "epoch": 0.6952714535901926, "grad_norm": 0.2707003951072693, "learning_rate": 9.808099576803937e-06, "loss": 0.5499, "step": 397 }, { "epoch": 0.6970227670753065, "grad_norm": 0.28687766194343567, "learning_rate": 9.80680039853422e-06, "loss": 0.548, "step": 398 }, { "epoch": 0.6987740805604203, "grad_norm": 0.3369269073009491, "learning_rate": 9.805496923959363e-06, "loss": 0.5432, "step": 399 }, { "epoch": 0.7005253940455342, "grad_norm": 0.24818706512451172, "learning_rate": 9.804189154244408e-06, "loss": 0.5249, "step": 400 }, { "epoch": 0.702276707530648, "grad_norm": 0.3885733187198639, "learning_rate": 9.802877090558237e-06, "loss": 0.5381, "step": 401 }, { "epoch": 0.7040280210157618, "grad_norm": 0.267222136259079, "learning_rate": 9.801560734073567e-06, "loss": 0.5418, "step": 402 }, { "epoch": 0.7057793345008757, "grad_norm": 0.33261215686798096, "learning_rate": 9.800240085966957e-06, "loss": 0.5435, "step": 403 }, { "epoch": 0.7075306479859895, "grad_norm": 0.28227198123931885, "learning_rate": 9.798915147418795e-06, "loss": 0.5399, "step": 404 }, { "epoch": 0.7092819614711033, "grad_norm": 0.2483345866203308, "learning_rate": 9.797585919613311e-06, "loss": 0.5365, "step": 405 }, { "epoch": 0.7110332749562172, "grad_norm": 0.2918708026409149, "learning_rate": 9.796252403738564e-06, "loss": 0.5334, "step": 406 }, { "epoch": 0.712784588441331, "grad_norm": 0.26654672622680664, "learning_rate": 9.794914600986446e-06, "loss": 0.5487, "step": 407 }, { "epoch": 0.7145359019264448, "grad_norm": 0.2967936098575592, "learning_rate": 9.793572512552683e-06, "loss": 0.5334, "step": 408 }, { "epoch": 0.7162872154115587, "grad_norm": 0.31796395778656006, "learning_rate": 9.792226139636827e-06, "loss": 0.5488, "step": 409 }, { "epoch": 0.7180385288966725, "grad_norm": 0.2653609812259674, "learning_rate": 9.790875483442265e-06, "loss": 0.5251, "step": 410 }, { "epoch": 0.7197898423817863, "grad_norm": 0.35138100385665894, "learning_rate": 9.78952054517621e-06, "loss": 0.5522, "step": 411 }, { "epoch": 0.7215411558669002, "grad_norm": 0.26711902022361755, "learning_rate": 9.788161326049702e-06, "loss": 0.535, "step": 412 }, { "epoch": 0.723292469352014, "grad_norm": 0.3040355443954468, "learning_rate": 9.786797827277607e-06, "loss": 0.5193, "step": 413 }, { "epoch": 0.7250437828371279, "grad_norm": 0.2833821773529053, "learning_rate": 9.785430050078616e-06, "loss": 0.5447, "step": 414 }, { "epoch": 0.7267950963222417, "grad_norm": 0.24764513969421387, "learning_rate": 9.784057995675245e-06, "loss": 0.5465, "step": 415 }, { "epoch": 0.7285464098073555, "grad_norm": 0.2827005684375763, "learning_rate": 9.782681665293831e-06, "loss": 0.5434, "step": 416 }, { "epoch": 0.7302977232924693, "grad_norm": 0.28893953561782837, "learning_rate": 9.781301060164537e-06, "loss": 0.5242, "step": 417 }, { "epoch": 0.7320490367775832, "grad_norm": 0.2670647203922272, "learning_rate": 9.779916181521342e-06, "loss": 0.5275, "step": 418 }, { "epoch": 0.7338003502626971, "grad_norm": 0.3368021249771118, "learning_rate": 9.778527030602049e-06, "loss": 0.5243, "step": 419 }, { "epoch": 0.7355516637478109, "grad_norm": 0.2541683614253998, "learning_rate": 9.777133608648276e-06, "loss": 0.5324, "step": 420 }, { "epoch": 0.7373029772329247, "grad_norm": 0.30804985761642456, "learning_rate": 9.775735916905458e-06, "loss": 0.5311, "step": 421 }, { "epoch": 0.7390542907180385, "grad_norm": 0.25023210048675537, "learning_rate": 9.77433395662285e-06, "loss": 0.5454, "step": 422 }, { "epoch": 0.7408056042031523, "grad_norm": 0.280285120010376, "learning_rate": 9.77292772905352e-06, "loss": 0.524, "step": 423 }, { "epoch": 0.7425569176882661, "grad_norm": 0.27067258954048157, "learning_rate": 9.77151723545435e-06, "loss": 0.5362, "step": 424 }, { "epoch": 0.7443082311733801, "grad_norm": 0.3023117780685425, "learning_rate": 9.77010247708604e-06, "loss": 0.5415, "step": 425 }, { "epoch": 0.7460595446584939, "grad_norm": 0.30357789993286133, "learning_rate": 9.768683455213089e-06, "loss": 0.5172, "step": 426 }, { "epoch": 0.7478108581436077, "grad_norm": 0.2836744487285614, "learning_rate": 9.76726017110382e-06, "loss": 0.5318, "step": 427 }, { "epoch": 0.7495621716287215, "grad_norm": 0.2795674800872803, "learning_rate": 9.765832626030359e-06, "loss": 0.5384, "step": 428 }, { "epoch": 0.7513134851138353, "grad_norm": 0.326028972864151, "learning_rate": 9.764400821268642e-06, "loss": 0.5383, "step": 429 }, { "epoch": 0.7530647985989493, "grad_norm": 0.2751913070678711, "learning_rate": 9.762964758098412e-06, "loss": 0.5314, "step": 430 }, { "epoch": 0.7548161120840631, "grad_norm": 0.3898245394229889, "learning_rate": 9.761524437803221e-06, "loss": 0.5342, "step": 431 }, { "epoch": 0.7565674255691769, "grad_norm": 0.29031047224998474, "learning_rate": 9.76007986167042e-06, "loss": 0.5463, "step": 432 }, { "epoch": 0.7583187390542907, "grad_norm": 0.35554924607276917, "learning_rate": 9.758631030991168e-06, "loss": 0.5316, "step": 433 }, { "epoch": 0.7600700525394045, "grad_norm": 0.30123552680015564, "learning_rate": 9.757177947060426e-06, "loss": 0.517, "step": 434 }, { "epoch": 0.7618213660245184, "grad_norm": 0.2998826205730438, "learning_rate": 9.755720611176957e-06, "loss": 0.5457, "step": 435 }, { "epoch": 0.7635726795096323, "grad_norm": 0.3098287880420685, "learning_rate": 9.754259024643324e-06, "loss": 0.5385, "step": 436 }, { "epoch": 0.7653239929947461, "grad_norm": 0.2692965269088745, "learning_rate": 9.75279318876589e-06, "loss": 0.5351, "step": 437 }, { "epoch": 0.7670753064798599, "grad_norm": 0.2788274586200714, "learning_rate": 9.751323104854812e-06, "loss": 0.5349, "step": 438 }, { "epoch": 0.7688266199649737, "grad_norm": 0.2546873986721039, "learning_rate": 9.74984877422405e-06, "loss": 0.5335, "step": 439 }, { "epoch": 0.7705779334500875, "grad_norm": 0.2893652021884918, "learning_rate": 9.748370198191355e-06, "loss": 0.5346, "step": 440 }, { "epoch": 0.7723292469352014, "grad_norm": 0.24183516204357147, "learning_rate": 9.746887378078276e-06, "loss": 0.5367, "step": 441 }, { "epoch": 0.7740805604203153, "grad_norm": 0.2932787835597992, "learning_rate": 9.74540031521015e-06, "loss": 0.5315, "step": 442 }, { "epoch": 0.7758318739054291, "grad_norm": 0.27858680486679077, "learning_rate": 9.743909010916115e-06, "loss": 0.5361, "step": 443 }, { "epoch": 0.7775831873905429, "grad_norm": 0.27135366201400757, "learning_rate": 9.74241346652909e-06, "loss": 0.5277, "step": 444 }, { "epoch": 0.7793345008756567, "grad_norm": 0.24995167553424835, "learning_rate": 9.740913683385791e-06, "loss": 0.5372, "step": 445 }, { "epoch": 0.7810858143607706, "grad_norm": 0.2605813145637512, "learning_rate": 9.73940966282672e-06, "loss": 0.5343, "step": 446 }, { "epoch": 0.7828371278458844, "grad_norm": 0.2586875259876251, "learning_rate": 9.737901406196164e-06, "loss": 0.5384, "step": 447 }, { "epoch": 0.7845884413309983, "grad_norm": 0.29552969336509705, "learning_rate": 9.736388914842203e-06, "loss": 0.5387, "step": 448 }, { "epoch": 0.7863397548161121, "grad_norm": 0.2626779079437256, "learning_rate": 9.734872190116695e-06, "loss": 0.5374, "step": 449 }, { "epoch": 0.7880910683012259, "grad_norm": 0.2511596083641052, "learning_rate": 9.733351233375283e-06, "loss": 0.528, "step": 450 }, { "epoch": 0.7898423817863398, "grad_norm": 0.27891314029693604, "learning_rate": 9.731826045977397e-06, "loss": 0.5294, "step": 451 }, { "epoch": 0.7915936952714536, "grad_norm": 0.2633238434791565, "learning_rate": 9.730296629286245e-06, "loss": 0.5248, "step": 452 }, { "epoch": 0.7933450087565674, "grad_norm": 0.294650673866272, "learning_rate": 9.728762984668813e-06, "loss": 0.5327, "step": 453 }, { "epoch": 0.7950963222416813, "grad_norm": 0.29112985730171204, "learning_rate": 9.727225113495871e-06, "loss": 0.5396, "step": 454 }, { "epoch": 0.7968476357267951, "grad_norm": 0.2557793855667114, "learning_rate": 9.725683017141964e-06, "loss": 0.5297, "step": 455 }, { "epoch": 0.7985989492119089, "grad_norm": 0.2950380742549896, "learning_rate": 9.724136696985412e-06, "loss": 0.5177, "step": 456 }, { "epoch": 0.8003502626970228, "grad_norm": 0.27464380860328674, "learning_rate": 9.722586154408312e-06, "loss": 0.5445, "step": 457 }, { "epoch": 0.8021015761821366, "grad_norm": 0.23023128509521484, "learning_rate": 9.721031390796535e-06, "loss": 0.5166, "step": 458 }, { "epoch": 0.8038528896672504, "grad_norm": 0.2651790380477905, "learning_rate": 9.719472407539725e-06, "loss": 0.5384, "step": 459 }, { "epoch": 0.8056042031523643, "grad_norm": 0.2580623924732208, "learning_rate": 9.717909206031295e-06, "loss": 0.5221, "step": 460 }, { "epoch": 0.8073555166374781, "grad_norm": 0.25435981154441833, "learning_rate": 9.716341787668434e-06, "loss": 0.5444, "step": 461 }, { "epoch": 0.809106830122592, "grad_norm": 0.2704431116580963, "learning_rate": 9.714770153852093e-06, "loss": 0.5326, "step": 462 }, { "epoch": 0.8108581436077058, "grad_norm": 0.2431914061307907, "learning_rate": 9.713194305986995e-06, "loss": 0.5242, "step": 463 }, { "epoch": 0.8126094570928196, "grad_norm": 0.2812761962413788, "learning_rate": 9.71161424548163e-06, "loss": 0.5333, "step": 464 }, { "epoch": 0.8143607705779334, "grad_norm": 0.26699599623680115, "learning_rate": 9.710029973748249e-06, "loss": 0.5221, "step": 465 }, { "epoch": 0.8161120840630472, "grad_norm": 0.29118746519088745, "learning_rate": 9.708441492202874e-06, "loss": 0.5299, "step": 466 }, { "epoch": 0.8178633975481612, "grad_norm": 0.2801555097103119, "learning_rate": 9.70684880226528e-06, "loss": 0.5333, "step": 467 }, { "epoch": 0.819614711033275, "grad_norm": 0.2447882443666458, "learning_rate": 9.705251905359014e-06, "loss": 0.5327, "step": 468 }, { "epoch": 0.8213660245183888, "grad_norm": 0.30146121978759766, "learning_rate": 9.703650802911374e-06, "loss": 0.5314, "step": 469 }, { "epoch": 0.8231173380035026, "grad_norm": 0.3016609847545624, "learning_rate": 9.702045496353422e-06, "loss": 0.5398, "step": 470 }, { "epoch": 0.8248686514886164, "grad_norm": 0.2665237784385681, "learning_rate": 9.700435987119981e-06, "loss": 0.5328, "step": 471 }, { "epoch": 0.8266199649737302, "grad_norm": 0.29319673776626587, "learning_rate": 9.69882227664962e-06, "loss": 0.533, "step": 472 }, { "epoch": 0.8283712784588442, "grad_norm": 0.2662714123725891, "learning_rate": 9.69720436638467e-06, "loss": 0.5191, "step": 473 }, { "epoch": 0.830122591943958, "grad_norm": 0.26234444975852966, "learning_rate": 9.69558225777122e-06, "loss": 0.5258, "step": 474 }, { "epoch": 0.8318739054290718, "grad_norm": 0.30167844891548157, "learning_rate": 9.693955952259099e-06, "loss": 0.5304, "step": 475 }, { "epoch": 0.8336252189141856, "grad_norm": 0.25661635398864746, "learning_rate": 9.6923254513019e-06, "loss": 0.537, "step": 476 }, { "epoch": 0.8353765323992994, "grad_norm": 0.26465481519699097, "learning_rate": 9.69069075635696e-06, "loss": 0.5243, "step": 477 }, { "epoch": 0.8371278458844134, "grad_norm": 0.28614136576652527, "learning_rate": 9.689051868885362e-06, "loss": 0.5253, "step": 478 }, { "epoch": 0.8388791593695272, "grad_norm": 0.2636883556842804, "learning_rate": 9.68740879035194e-06, "loss": 0.5407, "step": 479 }, { "epoch": 0.840630472854641, "grad_norm": 0.28720590472221375, "learning_rate": 9.685761522225271e-06, "loss": 0.5324, "step": 480 }, { "epoch": 0.8423817863397548, "grad_norm": 0.25268250703811646, "learning_rate": 9.684110065977685e-06, "loss": 0.5304, "step": 481 }, { "epoch": 0.8441330998248686, "grad_norm": 0.2812349796295166, "learning_rate": 9.682454423085244e-06, "loss": 0.533, "step": 482 }, { "epoch": 0.8458844133099825, "grad_norm": 0.2850024104118347, "learning_rate": 9.680794595027761e-06, "loss": 0.5409, "step": 483 }, { "epoch": 0.8476357267950964, "grad_norm": 0.2802037298679352, "learning_rate": 9.67913058328878e-06, "loss": 0.5311, "step": 484 }, { "epoch": 0.8493870402802102, "grad_norm": 0.32752901315689087, "learning_rate": 9.677462389355594e-06, "loss": 0.5204, "step": 485 }, { "epoch": 0.851138353765324, "grad_norm": 0.23605093359947205, "learning_rate": 9.675790014719231e-06, "loss": 0.5237, "step": 486 }, { "epoch": 0.8528896672504378, "grad_norm": 0.31068938970565796, "learning_rate": 9.674113460874453e-06, "loss": 0.5283, "step": 487 }, { "epoch": 0.8546409807355516, "grad_norm": 0.2400563657283783, "learning_rate": 9.67243272931976e-06, "loss": 0.5349, "step": 488 }, { "epoch": 0.8563922942206655, "grad_norm": 0.2998121380805969, "learning_rate": 9.670747821557387e-06, "loss": 0.5372, "step": 489 }, { "epoch": 0.8581436077057794, "grad_norm": 0.2821276783943176, "learning_rate": 9.669058739093296e-06, "loss": 0.5349, "step": 490 }, { "epoch": 0.8598949211908932, "grad_norm": 0.28009453415870667, "learning_rate": 9.667365483437188e-06, "loss": 0.5334, "step": 491 }, { "epoch": 0.861646234676007, "grad_norm": 0.31216636300086975, "learning_rate": 9.665668056102487e-06, "loss": 0.5406, "step": 492 }, { "epoch": 0.8633975481611208, "grad_norm": 0.25741538405418396, "learning_rate": 9.663966458606353e-06, "loss": 0.5374, "step": 493 }, { "epoch": 0.8651488616462347, "grad_norm": 0.33116355538368225, "learning_rate": 9.662260692469664e-06, "loss": 0.5229, "step": 494 }, { "epoch": 0.8669001751313485, "grad_norm": 0.25497379899024963, "learning_rate": 9.660550759217035e-06, "loss": 0.5349, "step": 495 }, { "epoch": 0.8686514886164624, "grad_norm": 0.31184378266334534, "learning_rate": 9.658836660376796e-06, "loss": 0.5254, "step": 496 }, { "epoch": 0.8704028021015762, "grad_norm": 0.2598186135292053, "learning_rate": 9.657118397481004e-06, "loss": 0.5371, "step": 497 }, { "epoch": 0.87215411558669, "grad_norm": 0.30250346660614014, "learning_rate": 9.65539597206544e-06, "loss": 0.5222, "step": 498 }, { "epoch": 0.8739054290718039, "grad_norm": 0.24063172936439514, "learning_rate": 9.6536693856696e-06, "loss": 0.5409, "step": 499 }, { "epoch": 0.8756567425569177, "grad_norm": 0.3076416254043579, "learning_rate": 9.651938639836705e-06, "loss": 0.5313, "step": 500 }, { "epoch": 0.8774080560420315, "grad_norm": 0.2726023495197296, "learning_rate": 9.650203736113689e-06, "loss": 0.5172, "step": 501 }, { "epoch": 0.8791593695271454, "grad_norm": 0.27463313937187195, "learning_rate": 9.648464676051206e-06, "loss": 0.5289, "step": 502 }, { "epoch": 0.8809106830122592, "grad_norm": 0.2762005925178528, "learning_rate": 9.64672146120362e-06, "loss": 0.5179, "step": 503 }, { "epoch": 0.882661996497373, "grad_norm": 0.25673907995224, "learning_rate": 9.644974093129017e-06, "loss": 0.5195, "step": 504 }, { "epoch": 0.8844133099824869, "grad_norm": 0.3035512864589691, "learning_rate": 9.643222573389184e-06, "loss": 0.5239, "step": 505 }, { "epoch": 0.8861646234676007, "grad_norm": 0.3251352310180664, "learning_rate": 9.641466903549628e-06, "loss": 0.5335, "step": 506 }, { "epoch": 0.8879159369527145, "grad_norm": 0.32601791620254517, "learning_rate": 9.639707085179562e-06, "loss": 0.5272, "step": 507 }, { "epoch": 0.8896672504378283, "grad_norm": 0.26642143726348877, "learning_rate": 9.637943119851905e-06, "loss": 0.5348, "step": 508 }, { "epoch": 0.8914185639229422, "grad_norm": 0.3467028737068176, "learning_rate": 9.636175009143286e-06, "loss": 0.5348, "step": 509 }, { "epoch": 0.8931698774080561, "grad_norm": 0.30048230290412903, "learning_rate": 9.634402754634037e-06, "loss": 0.5384, "step": 510 }, { "epoch": 0.8949211908931699, "grad_norm": 0.403886616230011, "learning_rate": 9.632626357908194e-06, "loss": 0.5412, "step": 511 }, { "epoch": 0.8966725043782837, "grad_norm": 0.3361055552959442, "learning_rate": 9.630845820553495e-06, "loss": 0.5191, "step": 512 }, { "epoch": 0.8984238178633975, "grad_norm": 0.34397387504577637, "learning_rate": 9.629061144161383e-06, "loss": 0.5259, "step": 513 }, { "epoch": 0.9001751313485113, "grad_norm": 0.3465234339237213, "learning_rate": 9.627272330326993e-06, "loss": 0.5364, "step": 514 }, { "epoch": 0.9019264448336253, "grad_norm": 0.3488556146621704, "learning_rate": 9.625479380649163e-06, "loss": 0.526, "step": 515 }, { "epoch": 0.9036777583187391, "grad_norm": 0.31670987606048584, "learning_rate": 9.623682296730427e-06, "loss": 0.5404, "step": 516 }, { "epoch": 0.9054290718038529, "grad_norm": 0.33771857619285583, "learning_rate": 9.621881080177013e-06, "loss": 0.5236, "step": 517 }, { "epoch": 0.9071803852889667, "grad_norm": 0.26242950558662415, "learning_rate": 9.620075732598843e-06, "loss": 0.5274, "step": 518 }, { "epoch": 0.9089316987740805, "grad_norm": 0.30802416801452637, "learning_rate": 9.618266255609533e-06, "loss": 0.5202, "step": 519 }, { "epoch": 0.9106830122591943, "grad_norm": 0.29652339220046997, "learning_rate": 9.616452650826392e-06, "loss": 0.5271, "step": 520 }, { "epoch": 0.9124343257443083, "grad_norm": 0.28188326954841614, "learning_rate": 9.614634919870407e-06, "loss": 0.5326, "step": 521 }, { "epoch": 0.9141856392294221, "grad_norm": 0.3271211087703705, "learning_rate": 9.612813064366267e-06, "loss": 0.5355, "step": 522 }, { "epoch": 0.9159369527145359, "grad_norm": 0.26931485533714294, "learning_rate": 9.61098708594234e-06, "loss": 0.523, "step": 523 }, { "epoch": 0.9176882661996497, "grad_norm": 0.3195890486240387, "learning_rate": 9.60915698623068e-06, "loss": 0.5303, "step": 524 }, { "epoch": 0.9194395796847635, "grad_norm": 0.25536176562309265, "learning_rate": 9.607322766867026e-06, "loss": 0.5053, "step": 525 }, { "epoch": 0.9211908931698775, "grad_norm": 0.31145158410072327, "learning_rate": 9.605484429490796e-06, "loss": 0.5322, "step": 526 }, { "epoch": 0.9229422066549913, "grad_norm": 0.2799984812736511, "learning_rate": 9.603641975745095e-06, "loss": 0.5241, "step": 527 }, { "epoch": 0.9246935201401051, "grad_norm": 0.3053185045719147, "learning_rate": 9.601795407276699e-06, "loss": 0.5252, "step": 528 }, { "epoch": 0.9264448336252189, "grad_norm": 0.24108369648456573, "learning_rate": 9.59994472573607e-06, "loss": 0.5074, "step": 529 }, { "epoch": 0.9281961471103327, "grad_norm": 0.2950921356678009, "learning_rate": 9.598089932777339e-06, "loss": 0.524, "step": 530 }, { "epoch": 0.9299474605954466, "grad_norm": 0.25882333517074585, "learning_rate": 9.596231030058315e-06, "loss": 0.5432, "step": 531 }, { "epoch": 0.9316987740805605, "grad_norm": 0.3323661983013153, "learning_rate": 9.594368019240484e-06, "loss": 0.5332, "step": 532 }, { "epoch": 0.9334500875656743, "grad_norm": 0.24082472920417786, "learning_rate": 9.592500901988994e-06, "loss": 0.5257, "step": 533 }, { "epoch": 0.9352014010507881, "grad_norm": 0.2902567982673645, "learning_rate": 9.590629679972673e-06, "loss": 0.5365, "step": 534 }, { "epoch": 0.9369527145359019, "grad_norm": 0.2845010459423065, "learning_rate": 9.588754354864014e-06, "loss": 0.5237, "step": 535 }, { "epoch": 0.9387040280210157, "grad_norm": 0.26561248302459717, "learning_rate": 9.586874928339176e-06, "loss": 0.5111, "step": 536 }, { "epoch": 0.9404553415061296, "grad_norm": 0.3377715051174164, "learning_rate": 9.584991402077988e-06, "loss": 0.5202, "step": 537 }, { "epoch": 0.9422066549912435, "grad_norm": 0.28620895743370056, "learning_rate": 9.583103777763938e-06, "loss": 0.5388, "step": 538 }, { "epoch": 0.9439579684763573, "grad_norm": 0.3053395748138428, "learning_rate": 9.58121205708418e-06, "loss": 0.5232, "step": 539 }, { "epoch": 0.9457092819614711, "grad_norm": 0.2687322497367859, "learning_rate": 9.57931624172953e-06, "loss": 0.5294, "step": 540 }, { "epoch": 0.9474605954465849, "grad_norm": 0.3101924955844879, "learning_rate": 9.577416333394463e-06, "loss": 0.5306, "step": 541 }, { "epoch": 0.9492119089316988, "grad_norm": 0.3063521385192871, "learning_rate": 9.575512333777108e-06, "loss": 0.536, "step": 542 }, { "epoch": 0.9509632224168126, "grad_norm": 0.3156822919845581, "learning_rate": 9.57360424457926e-06, "loss": 0.5226, "step": 543 }, { "epoch": 0.9527145359019265, "grad_norm": 0.2774573266506195, "learning_rate": 9.571692067506363e-06, "loss": 0.5207, "step": 544 }, { "epoch": 0.9544658493870403, "grad_norm": 0.2948058545589447, "learning_rate": 9.56977580426751e-06, "loss": 0.5236, "step": 545 }, { "epoch": 0.9562171628721541, "grad_norm": 0.33326247334480286, "learning_rate": 9.567855456575459e-06, "loss": 0.5299, "step": 546 }, { "epoch": 0.957968476357268, "grad_norm": 0.2924548089504242, "learning_rate": 9.565931026146607e-06, "loss": 0.523, "step": 547 }, { "epoch": 0.9597197898423818, "grad_norm": 0.27914273738861084, "learning_rate": 9.564002514701006e-06, "loss": 0.5218, "step": 548 }, { "epoch": 0.9614711033274956, "grad_norm": 0.35397985577583313, "learning_rate": 9.562069923962355e-06, "loss": 0.523, "step": 549 }, { "epoch": 0.9632224168126094, "grad_norm": 0.31407415866851807, "learning_rate": 9.560133255657997e-06, "loss": 0.5278, "step": 550 }, { "epoch": 0.9649737302977233, "grad_norm": 0.33255934715270996, "learning_rate": 9.558192511518925e-06, "loss": 0.5194, "step": 551 }, { "epoch": 0.9667250437828371, "grad_norm": 0.34817197918891907, "learning_rate": 9.556247693279764e-06, "loss": 0.5285, "step": 552 }, { "epoch": 0.968476357267951, "grad_norm": 0.3032604157924652, "learning_rate": 9.554298802678793e-06, "loss": 0.544, "step": 553 }, { "epoch": 0.9702276707530648, "grad_norm": 0.30100229382514954, "learning_rate": 9.552345841457922e-06, "loss": 0.5165, "step": 554 }, { "epoch": 0.9719789842381786, "grad_norm": 0.34597358107566833, "learning_rate": 9.550388811362704e-06, "loss": 0.529, "step": 555 }, { "epoch": 0.9737302977232924, "grad_norm": 0.261093407869339, "learning_rate": 9.548427714142326e-06, "loss": 0.5194, "step": 556 }, { "epoch": 0.9754816112084063, "grad_norm": 0.33266597986221313, "learning_rate": 9.546462551549612e-06, "loss": 0.5329, "step": 557 }, { "epoch": 0.9772329246935202, "grad_norm": 0.24415044486522675, "learning_rate": 9.54449332534102e-06, "loss": 0.5267, "step": 558 }, { "epoch": 0.978984238178634, "grad_norm": 0.3906889259815216, "learning_rate": 9.542520037276636e-06, "loss": 0.5252, "step": 559 }, { "epoch": 0.9807355516637478, "grad_norm": 0.2591072916984558, "learning_rate": 9.540542689120184e-06, "loss": 0.5216, "step": 560 }, { "epoch": 0.9824868651488616, "grad_norm": 0.3639351427555084, "learning_rate": 9.538561282639008e-06, "loss": 0.5294, "step": 561 }, { "epoch": 0.9842381786339754, "grad_norm": 0.2680988311767578, "learning_rate": 9.536575819604087e-06, "loss": 0.5218, "step": 562 }, { "epoch": 0.9859894921190894, "grad_norm": 0.3374999761581421, "learning_rate": 9.534586301790021e-06, "loss": 0.5211, "step": 563 }, { "epoch": 0.9877408056042032, "grad_norm": 0.30274730920791626, "learning_rate": 9.532592730975035e-06, "loss": 0.5289, "step": 564 }, { "epoch": 0.989492119089317, "grad_norm": 0.3283017575740814, "learning_rate": 9.530595108940978e-06, "loss": 0.5288, "step": 565 }, { "epoch": 0.9912434325744308, "grad_norm": 0.2796541452407837, "learning_rate": 9.52859343747332e-06, "loss": 0.5201, "step": 566 }, { "epoch": 0.9929947460595446, "grad_norm": 0.3567406237125397, "learning_rate": 9.526587718361147e-06, "loss": 0.5389, "step": 567 }, { "epoch": 0.9947460595446584, "grad_norm": 0.2703985869884491, "learning_rate": 9.524577953397167e-06, "loss": 0.5231, "step": 568 }, { "epoch": 0.9964973730297724, "grad_norm": 0.2881230413913727, "learning_rate": 9.522564144377703e-06, "loss": 0.5304, "step": 569 }, { "epoch": 0.9982486865148862, "grad_norm": 0.2968869209289551, "learning_rate": 9.52054629310269e-06, "loss": 0.5118, "step": 570 }, { "epoch": 1.0, "grad_norm": 0.32066547870635986, "learning_rate": 9.518524401375682e-06, "loss": 0.5152, "step": 571 }, { "epoch": 1.001751313485114, "grad_norm": 0.2891395390033722, "learning_rate": 9.516498471003837e-06, "loss": 0.5181, "step": 572 }, { "epoch": 1.0035026269702276, "grad_norm": 0.3206441104412079, "learning_rate": 9.514468503797927e-06, "loss": 0.5083, "step": 573 }, { "epoch": 1.0052539404553416, "grad_norm": 0.293159157037735, "learning_rate": 9.51243450157233e-06, "loss": 0.5239, "step": 574 }, { "epoch": 1.0070052539404553, "grad_norm": 0.30442747473716736, "learning_rate": 9.510396466145032e-06, "loss": 0.5042, "step": 575 }, { "epoch": 1.0087565674255692, "grad_norm": 0.3323088586330414, "learning_rate": 9.508354399337625e-06, "loss": 0.5094, "step": 576 }, { "epoch": 1.010507880910683, "grad_norm": 0.32885992527008057, "learning_rate": 9.5063083029753e-06, "loss": 0.5113, "step": 577 }, { "epoch": 1.0122591943957968, "grad_norm": 0.27729055285453796, "learning_rate": 9.504258178886858e-06, "loss": 0.5079, "step": 578 }, { "epoch": 1.0140105078809107, "grad_norm": 0.35448357462882996, "learning_rate": 9.502204028904687e-06, "loss": 0.4967, "step": 579 }, { "epoch": 1.0157618213660244, "grad_norm": 0.286769837141037, "learning_rate": 9.500145854864785e-06, "loss": 0.5153, "step": 580 }, { "epoch": 1.0175131348511384, "grad_norm": 0.3435061275959015, "learning_rate": 9.498083658606744e-06, "loss": 0.5127, "step": 581 }, { "epoch": 1.0192644483362523, "grad_norm": 0.2716231346130371, "learning_rate": 9.496017441973747e-06, "loss": 0.5066, "step": 582 }, { "epoch": 1.021015761821366, "grad_norm": 0.303275465965271, "learning_rate": 9.49394720681257e-06, "loss": 0.5119, "step": 583 }, { "epoch": 1.02276707530648, "grad_norm": 0.2821592092514038, "learning_rate": 9.49187295497359e-06, "loss": 0.5115, "step": 584 }, { "epoch": 1.0245183887915936, "grad_norm": 0.3063298761844635, "learning_rate": 9.48979468831076e-06, "loss": 0.5098, "step": 585 }, { "epoch": 1.0262697022767076, "grad_norm": 0.2609064280986786, "learning_rate": 9.487712408681635e-06, "loss": 0.5018, "step": 586 }, { "epoch": 1.0280210157618215, "grad_norm": 0.33187058568000793, "learning_rate": 9.485626117947351e-06, "loss": 0.5045, "step": 587 }, { "epoch": 1.0297723292469352, "grad_norm": 0.23559680581092834, "learning_rate": 9.483535817972625e-06, "loss": 0.5153, "step": 588 }, { "epoch": 1.031523642732049, "grad_norm": 0.3870587944984436, "learning_rate": 9.481441510625765e-06, "loss": 0.5178, "step": 589 }, { "epoch": 1.0332749562171628, "grad_norm": 0.2456979751586914, "learning_rate": 9.479343197778653e-06, "loss": 0.5003, "step": 590 }, { "epoch": 1.0350262697022767, "grad_norm": 0.33313217759132385, "learning_rate": 9.477240881306759e-06, "loss": 0.5028, "step": 591 }, { "epoch": 1.0367775831873904, "grad_norm": 0.25221139192581177, "learning_rate": 9.475134563089128e-06, "loss": 0.5065, "step": 592 }, { "epoch": 1.0385288966725044, "grad_norm": 0.31374484300613403, "learning_rate": 9.47302424500838e-06, "loss": 0.4974, "step": 593 }, { "epoch": 1.0402802101576183, "grad_norm": 0.29692599177360535, "learning_rate": 9.47090992895071e-06, "loss": 0.5, "step": 594 }, { "epoch": 1.042031523642732, "grad_norm": 0.3045715391635895, "learning_rate": 9.468791616805893e-06, "loss": 0.5115, "step": 595 }, { "epoch": 1.043782837127846, "grad_norm": 0.24662898480892181, "learning_rate": 9.466669310467266e-06, "loss": 0.4883, "step": 596 }, { "epoch": 1.0455341506129596, "grad_norm": 0.28495752811431885, "learning_rate": 9.464543011831742e-06, "loss": 0.505, "step": 597 }, { "epoch": 1.0472854640980735, "grad_norm": 0.25603187084198, "learning_rate": 9.462412722799801e-06, "loss": 0.5101, "step": 598 }, { "epoch": 1.0490367775831875, "grad_norm": 0.2636597752571106, "learning_rate": 9.46027844527549e-06, "loss": 0.5115, "step": 599 }, { "epoch": 1.0507880910683012, "grad_norm": 0.28547048568725586, "learning_rate": 9.45814018116642e-06, "loss": 0.5111, "step": 600 }, { "epoch": 1.052539404553415, "grad_norm": 0.26494544744491577, "learning_rate": 9.455997932383766e-06, "loss": 0.5106, "step": 601 }, { "epoch": 1.0542907180385288, "grad_norm": 0.2640525996685028, "learning_rate": 9.453851700842262e-06, "loss": 0.4975, "step": 602 }, { "epoch": 1.0560420315236427, "grad_norm": 0.26648426055908203, "learning_rate": 9.451701488460207e-06, "loss": 0.5017, "step": 603 }, { "epoch": 1.0577933450087567, "grad_norm": 0.26182693243026733, "learning_rate": 9.449547297159453e-06, "loss": 0.5064, "step": 604 }, { "epoch": 1.0595446584938704, "grad_norm": 0.24341371655464172, "learning_rate": 9.447389128865413e-06, "loss": 0.505, "step": 605 }, { "epoch": 1.0612959719789843, "grad_norm": 0.28062382340431213, "learning_rate": 9.445226985507047e-06, "loss": 0.5203, "step": 606 }, { "epoch": 1.063047285464098, "grad_norm": 0.2650873363018036, "learning_rate": 9.44306086901688e-06, "loss": 0.5145, "step": 607 }, { "epoch": 1.064798598949212, "grad_norm": 0.2780110836029053, "learning_rate": 9.440890781330974e-06, "loss": 0.5055, "step": 608 }, { "epoch": 1.0665499124343258, "grad_norm": 0.255342572927475, "learning_rate": 9.43871672438895e-06, "loss": 0.5057, "step": 609 }, { "epoch": 1.0683012259194395, "grad_norm": 0.2764577865600586, "learning_rate": 9.436538700133977e-06, "loss": 0.509, "step": 610 }, { "epoch": 1.0700525394045535, "grad_norm": 0.24311257898807526, "learning_rate": 9.434356710512763e-06, "loss": 0.5196, "step": 611 }, { "epoch": 1.0718038528896672, "grad_norm": 0.326602965593338, "learning_rate": 9.432170757475566e-06, "loss": 0.5147, "step": 612 }, { "epoch": 1.073555166374781, "grad_norm": 0.25586217641830444, "learning_rate": 9.429980842976186e-06, "loss": 0.515, "step": 613 }, { "epoch": 1.0753064798598948, "grad_norm": 0.3195502460002899, "learning_rate": 9.427786968971961e-06, "loss": 0.5148, "step": 614 }, { "epoch": 1.0770577933450087, "grad_norm": 0.27243632078170776, "learning_rate": 9.425589137423772e-06, "loss": 0.5045, "step": 615 }, { "epoch": 1.0788091068301227, "grad_norm": 0.2834032475948334, "learning_rate": 9.423387350296032e-06, "loss": 0.5028, "step": 616 }, { "epoch": 1.0805604203152364, "grad_norm": 0.2710925340652466, "learning_rate": 9.421181609556693e-06, "loss": 0.5099, "step": 617 }, { "epoch": 1.0823117338003503, "grad_norm": 0.26944929361343384, "learning_rate": 9.418971917177241e-06, "loss": 0.5103, "step": 618 }, { "epoch": 1.084063047285464, "grad_norm": 0.30453526973724365, "learning_rate": 9.416758275132693e-06, "loss": 0.5072, "step": 619 }, { "epoch": 1.085814360770578, "grad_norm": 0.25348109006881714, "learning_rate": 9.414540685401596e-06, "loss": 0.5081, "step": 620 }, { "epoch": 1.0875656742556918, "grad_norm": 0.28791606426239014, "learning_rate": 9.412319149966025e-06, "loss": 0.5019, "step": 621 }, { "epoch": 1.0893169877408055, "grad_norm": 0.24679851531982422, "learning_rate": 9.410093670811582e-06, "loss": 0.5117, "step": 622 }, { "epoch": 1.0910683012259195, "grad_norm": 0.274676650762558, "learning_rate": 9.407864249927396e-06, "loss": 0.5106, "step": 623 }, { "epoch": 1.0928196147110332, "grad_norm": 0.2711188793182373, "learning_rate": 9.405630889306116e-06, "loss": 0.5088, "step": 624 }, { "epoch": 1.094570928196147, "grad_norm": 0.26072394847869873, "learning_rate": 9.403393590943916e-06, "loss": 0.5128, "step": 625 }, { "epoch": 1.096322241681261, "grad_norm": 0.26186367869377136, "learning_rate": 9.401152356840484e-06, "loss": 0.5173, "step": 626 }, { "epoch": 1.0980735551663747, "grad_norm": 0.26387590169906616, "learning_rate": 9.39890718899903e-06, "loss": 0.5075, "step": 627 }, { "epoch": 1.0998248686514887, "grad_norm": 0.2613793909549713, "learning_rate": 9.39665808942628e-06, "loss": 0.5061, "step": 628 }, { "epoch": 1.1015761821366024, "grad_norm": 0.25638338923454285, "learning_rate": 9.394405060132473e-06, "loss": 0.5043, "step": 629 }, { "epoch": 1.1033274956217163, "grad_norm": 0.2717747688293457, "learning_rate": 9.392148103131358e-06, "loss": 0.5039, "step": 630 }, { "epoch": 1.1050788091068302, "grad_norm": 0.2394360899925232, "learning_rate": 9.3898872204402e-06, "loss": 0.5098, "step": 631 }, { "epoch": 1.106830122591944, "grad_norm": 0.2995234429836273, "learning_rate": 9.38762241407977e-06, "loss": 0.4923, "step": 632 }, { "epoch": 1.1085814360770578, "grad_norm": 0.30655384063720703, "learning_rate": 9.38535368607434e-06, "loss": 0.5094, "step": 633 }, { "epoch": 1.1103327495621715, "grad_norm": 0.27806323766708374, "learning_rate": 9.3830810384517e-06, "loss": 0.505, "step": 634 }, { "epoch": 1.1120840630472855, "grad_norm": 0.3014695644378662, "learning_rate": 9.380804473243133e-06, "loss": 0.5088, "step": 635 }, { "epoch": 1.1138353765323994, "grad_norm": 0.249694362282753, "learning_rate": 9.378523992483429e-06, "loss": 0.5024, "step": 636 }, { "epoch": 1.115586690017513, "grad_norm": 0.2832924425601959, "learning_rate": 9.376239598210873e-06, "loss": 0.513, "step": 637 }, { "epoch": 1.117338003502627, "grad_norm": 0.2772475481033325, "learning_rate": 9.37395129246725e-06, "loss": 0.502, "step": 638 }, { "epoch": 1.1190893169877407, "grad_norm": 0.26696112751960754, "learning_rate": 9.371659077297843e-06, "loss": 0.5153, "step": 639 }, { "epoch": 1.1208406304728546, "grad_norm": 0.282622367143631, "learning_rate": 9.369362954751428e-06, "loss": 0.5091, "step": 640 }, { "epoch": 1.1225919439579686, "grad_norm": 0.2807959020137787, "learning_rate": 9.367062926880273e-06, "loss": 0.5046, "step": 641 }, { "epoch": 1.1243432574430823, "grad_norm": 0.29210928082466125, "learning_rate": 9.364758995740136e-06, "loss": 0.5042, "step": 642 }, { "epoch": 1.1260945709281962, "grad_norm": 0.27993541955947876, "learning_rate": 9.362451163390265e-06, "loss": 0.5039, "step": 643 }, { "epoch": 1.12784588441331, "grad_norm": 0.3007780611515045, "learning_rate": 9.360139431893394e-06, "loss": 0.5118, "step": 644 }, { "epoch": 1.1295971978984238, "grad_norm": 0.30060988664627075, "learning_rate": 9.35782380331574e-06, "loss": 0.5045, "step": 645 }, { "epoch": 1.1313485113835378, "grad_norm": 0.2803099751472473, "learning_rate": 9.35550427972701e-06, "loss": 0.5001, "step": 646 }, { "epoch": 1.1330998248686515, "grad_norm": 0.2857620418071747, "learning_rate": 9.353180863200385e-06, "loss": 0.5088, "step": 647 }, { "epoch": 1.1348511383537654, "grad_norm": 0.28795570135116577, "learning_rate": 9.350853555812529e-06, "loss": 0.5069, "step": 648 }, { "epoch": 1.136602451838879, "grad_norm": 0.32201433181762695, "learning_rate": 9.348522359643583e-06, "loss": 0.5102, "step": 649 }, { "epoch": 1.138353765323993, "grad_norm": 0.257118284702301, "learning_rate": 9.346187276777163e-06, "loss": 0.5003, "step": 650 }, { "epoch": 1.140105078809107, "grad_norm": 0.29452115297317505, "learning_rate": 9.34384830930036e-06, "loss": 0.5088, "step": 651 }, { "epoch": 1.1418563922942206, "grad_norm": 0.34120097756385803, "learning_rate": 9.34150545930374e-06, "loss": 0.5101, "step": 652 }, { "epoch": 1.1436077057793346, "grad_norm": 0.3042868971824646, "learning_rate": 9.339158728881332e-06, "loss": 0.5023, "step": 653 }, { "epoch": 1.1453590192644483, "grad_norm": 0.27089107036590576, "learning_rate": 9.336808120130638e-06, "loss": 0.5178, "step": 654 }, { "epoch": 1.1471103327495622, "grad_norm": 0.2959478199481964, "learning_rate": 9.334453635152628e-06, "loss": 0.5018, "step": 655 }, { "epoch": 1.1488616462346761, "grad_norm": 0.24110515415668488, "learning_rate": 9.332095276051729e-06, "loss": 0.4985, "step": 656 }, { "epoch": 1.1506129597197898, "grad_norm": 0.2865137755870819, "learning_rate": 9.329733044935843e-06, "loss": 0.5038, "step": 657 }, { "epoch": 1.1523642732049038, "grad_norm": 0.24463608860969543, "learning_rate": 9.32736694391632e-06, "loss": 0.5103, "step": 658 }, { "epoch": 1.1541155866900175, "grad_norm": 0.313749760389328, "learning_rate": 9.324996975107978e-06, "loss": 0.5082, "step": 659 }, { "epoch": 1.1558669001751314, "grad_norm": 0.27725738286972046, "learning_rate": 9.322623140629088e-06, "loss": 0.5015, "step": 660 }, { "epoch": 1.157618213660245, "grad_norm": 0.28405508399009705, "learning_rate": 9.320245442601377e-06, "loss": 0.5123, "step": 661 }, { "epoch": 1.159369527145359, "grad_norm": 0.2574841380119324, "learning_rate": 9.317863883150022e-06, "loss": 0.5006, "step": 662 }, { "epoch": 1.161120840630473, "grad_norm": 0.2974395751953125, "learning_rate": 9.31547846440366e-06, "loss": 0.5097, "step": 663 }, { "epoch": 1.1628721541155866, "grad_norm": 0.24906659126281738, "learning_rate": 9.313089188494366e-06, "loss": 0.5164, "step": 664 }, { "epoch": 1.1646234676007006, "grad_norm": 0.29376420378685, "learning_rate": 9.31069605755767e-06, "loss": 0.4987, "step": 665 }, { "epoch": 1.1663747810858143, "grad_norm": 0.25388413667678833, "learning_rate": 9.30829907373255e-06, "loss": 0.5134, "step": 666 }, { "epoch": 1.1681260945709282, "grad_norm": 0.2886774241924286, "learning_rate": 9.30589823916142e-06, "loss": 0.5089, "step": 667 }, { "epoch": 1.1698774080560421, "grad_norm": 0.2778962254524231, "learning_rate": 9.303493555990139e-06, "loss": 0.5016, "step": 668 }, { "epoch": 1.1716287215411558, "grad_norm": 0.3308984041213989, "learning_rate": 9.301085026368006e-06, "loss": 0.5052, "step": 669 }, { "epoch": 1.1733800350262698, "grad_norm": 0.26970142126083374, "learning_rate": 9.298672652447763e-06, "loss": 0.5138, "step": 670 }, { "epoch": 1.1751313485113835, "grad_norm": 0.2756795585155487, "learning_rate": 9.296256436385576e-06, "loss": 0.5044, "step": 671 }, { "epoch": 1.1768826619964974, "grad_norm": 0.2963114380836487, "learning_rate": 9.293836380341059e-06, "loss": 0.5174, "step": 672 }, { "epoch": 1.178633975481611, "grad_norm": 0.2403171956539154, "learning_rate": 9.291412486477247e-06, "loss": 0.4995, "step": 673 }, { "epoch": 1.180385288966725, "grad_norm": 0.26922065019607544, "learning_rate": 9.28898475696061e-06, "loss": 0.5028, "step": 674 }, { "epoch": 1.182136602451839, "grad_norm": 0.2508949637413025, "learning_rate": 9.28655319396105e-06, "loss": 0.5103, "step": 675 }, { "epoch": 1.1838879159369526, "grad_norm": 0.28659486770629883, "learning_rate": 9.284117799651887e-06, "loss": 0.5117, "step": 676 }, { "epoch": 1.1856392294220666, "grad_norm": 0.2516440749168396, "learning_rate": 9.281678576209873e-06, "loss": 0.51, "step": 677 }, { "epoch": 1.1873905429071803, "grad_norm": 0.3334640860557556, "learning_rate": 9.279235525815177e-06, "loss": 0.5044, "step": 678 }, { "epoch": 1.1891418563922942, "grad_norm": 0.28877073526382446, "learning_rate": 9.276788650651392e-06, "loss": 0.4937, "step": 679 }, { "epoch": 1.1908931698774081, "grad_norm": 0.3116786777973175, "learning_rate": 9.274337952905528e-06, "loss": 0.4953, "step": 680 }, { "epoch": 1.1926444833625218, "grad_norm": 0.2561086118221283, "learning_rate": 9.271883434768012e-06, "loss": 0.5021, "step": 681 }, { "epoch": 1.1943957968476357, "grad_norm": 0.32319343090057373, "learning_rate": 9.269425098432686e-06, "loss": 0.5094, "step": 682 }, { "epoch": 1.1961471103327495, "grad_norm": 0.27400627732276917, "learning_rate": 9.266962946096802e-06, "loss": 0.5022, "step": 683 }, { "epoch": 1.1978984238178634, "grad_norm": 0.28904300928115845, "learning_rate": 9.264496979961031e-06, "loss": 0.5101, "step": 684 }, { "epoch": 1.1996497373029773, "grad_norm": 0.29013481736183167, "learning_rate": 9.26202720222944e-06, "loss": 0.4987, "step": 685 }, { "epoch": 1.201401050788091, "grad_norm": 0.25359904766082764, "learning_rate": 9.259553615109514e-06, "loss": 0.5096, "step": 686 }, { "epoch": 1.203152364273205, "grad_norm": 0.2656508982181549, "learning_rate": 9.257076220812136e-06, "loss": 0.5086, "step": 687 }, { "epoch": 1.2049036777583186, "grad_norm": 0.23829378187656403, "learning_rate": 9.254595021551595e-06, "loss": 0.5034, "step": 688 }, { "epoch": 1.2066549912434326, "grad_norm": 0.2792379558086395, "learning_rate": 9.252110019545581e-06, "loss": 0.5027, "step": 689 }, { "epoch": 1.2084063047285465, "grad_norm": 0.2357989251613617, "learning_rate": 9.249621217015182e-06, "loss": 0.5051, "step": 690 }, { "epoch": 1.2101576182136602, "grad_norm": 0.29815125465393066, "learning_rate": 9.247128616184881e-06, "loss": 0.505, "step": 691 }, { "epoch": 1.2119089316987741, "grad_norm": 0.2425329089164734, "learning_rate": 9.244632219282561e-06, "loss": 0.512, "step": 692 }, { "epoch": 1.2136602451838878, "grad_norm": 0.3383273482322693, "learning_rate": 9.242132028539493e-06, "loss": 0.4921, "step": 693 }, { "epoch": 1.2154115586690017, "grad_norm": 0.2572219669818878, "learning_rate": 9.239628046190342e-06, "loss": 0.5111, "step": 694 }, { "epoch": 1.2171628721541157, "grad_norm": 0.34071478247642517, "learning_rate": 9.237120274473157e-06, "loss": 0.5098, "step": 695 }, { "epoch": 1.2189141856392294, "grad_norm": 0.26128917932510376, "learning_rate": 9.23460871562938e-06, "loss": 0.5014, "step": 696 }, { "epoch": 1.2206654991243433, "grad_norm": 0.3015629053115845, "learning_rate": 9.232093371903836e-06, "loss": 0.5063, "step": 697 }, { "epoch": 1.222416812609457, "grad_norm": 0.2479422241449356, "learning_rate": 9.229574245544732e-06, "loss": 0.5092, "step": 698 }, { "epoch": 1.224168126094571, "grad_norm": 0.325838178396225, "learning_rate": 9.227051338803656e-06, "loss": 0.5016, "step": 699 }, { "epoch": 1.2259194395796849, "grad_norm": 0.2658190131187439, "learning_rate": 9.224524653935577e-06, "loss": 0.5058, "step": 700 }, { "epoch": 1.2276707530647986, "grad_norm": 0.32845255732536316, "learning_rate": 9.221994193198835e-06, "loss": 0.5128, "step": 701 }, { "epoch": 1.2294220665499125, "grad_norm": 0.2649884521961212, "learning_rate": 9.219459958855152e-06, "loss": 0.4947, "step": 702 }, { "epoch": 1.2311733800350262, "grad_norm": 0.2916931211948395, "learning_rate": 9.216921953169618e-06, "loss": 0.5049, "step": 703 }, { "epoch": 1.2329246935201401, "grad_norm": 0.251615047454834, "learning_rate": 9.214380178410698e-06, "loss": 0.5041, "step": 704 }, { "epoch": 1.234676007005254, "grad_norm": 0.2568936049938202, "learning_rate": 9.211834636850222e-06, "loss": 0.5079, "step": 705 }, { "epoch": 1.2364273204903677, "grad_norm": 0.29326027631759644, "learning_rate": 9.209285330763388e-06, "loss": 0.5084, "step": 706 }, { "epoch": 1.2381786339754817, "grad_norm": 0.2550510764122009, "learning_rate": 9.20673226242876e-06, "loss": 0.5076, "step": 707 }, { "epoch": 1.2399299474605954, "grad_norm": 0.3320823311805725, "learning_rate": 9.204175434128259e-06, "loss": 0.4952, "step": 708 }, { "epoch": 1.2416812609457093, "grad_norm": 0.2722248435020447, "learning_rate": 9.201614848147179e-06, "loss": 0.5213, "step": 709 }, { "epoch": 1.2434325744308232, "grad_norm": 0.2735079824924469, "learning_rate": 9.199050506774159e-06, "loss": 0.5141, "step": 710 }, { "epoch": 1.245183887915937, "grad_norm": 0.25974908471107483, "learning_rate": 9.196482412301202e-06, "loss": 0.4945, "step": 711 }, { "epoch": 1.2469352014010509, "grad_norm": 0.30489978194236755, "learning_rate": 9.193910567023663e-06, "loss": 0.5094, "step": 712 }, { "epoch": 1.2486865148861646, "grad_norm": 0.260397344827652, "learning_rate": 9.19133497324025e-06, "loss": 0.5015, "step": 713 }, { "epoch": 1.2504378283712785, "grad_norm": 0.31140151619911194, "learning_rate": 9.188755633253025e-06, "loss": 0.5023, "step": 714 }, { "epoch": 1.2521891418563924, "grad_norm": 0.26732125878334045, "learning_rate": 9.186172549367394e-06, "loss": 0.5071, "step": 715 }, { "epoch": 1.253940455341506, "grad_norm": 0.26332908868789673, "learning_rate": 9.183585723892107e-06, "loss": 0.4989, "step": 716 }, { "epoch": 1.25569176882662, "grad_norm": 0.33228617906570435, "learning_rate": 9.180995159139265e-06, "loss": 0.5068, "step": 717 }, { "epoch": 1.2574430823117337, "grad_norm": 0.27336442470550537, "learning_rate": 9.178400857424305e-06, "loss": 0.4971, "step": 718 }, { "epoch": 1.2591943957968477, "grad_norm": 0.32833847403526306, "learning_rate": 9.175802821066009e-06, "loss": 0.5035, "step": 719 }, { "epoch": 1.2609457092819616, "grad_norm": 0.3346620798110962, "learning_rate": 9.173201052386496e-06, "loss": 0.507, "step": 720 }, { "epoch": 1.2626970227670753, "grad_norm": 0.2770747244358063, "learning_rate": 9.170595553711216e-06, "loss": 0.5018, "step": 721 }, { "epoch": 1.2644483362521892, "grad_norm": 0.2657966613769531, "learning_rate": 9.167986327368958e-06, "loss": 0.484, "step": 722 }, { "epoch": 1.266199649737303, "grad_norm": 0.2880741059780121, "learning_rate": 9.165373375691845e-06, "loss": 0.5027, "step": 723 }, { "epoch": 1.2679509632224168, "grad_norm": 0.323265939950943, "learning_rate": 9.16275670101532e-06, "loss": 0.5012, "step": 724 }, { "epoch": 1.2697022767075308, "grad_norm": 0.2913571894168854, "learning_rate": 9.160136305678167e-06, "loss": 0.5188, "step": 725 }, { "epoch": 1.2714535901926445, "grad_norm": 0.25726553797721863, "learning_rate": 9.15751219202248e-06, "loss": 0.5079, "step": 726 }, { "epoch": 1.2732049036777582, "grad_norm": 0.30503085255622864, "learning_rate": 9.15488436239369e-06, "loss": 0.5135, "step": 727 }, { "epoch": 1.274956217162872, "grad_norm": 0.2533815801143646, "learning_rate": 9.152252819140544e-06, "loss": 0.5029, "step": 728 }, { "epoch": 1.276707530647986, "grad_norm": 0.25240036845207214, "learning_rate": 9.149617564615106e-06, "loss": 0.4967, "step": 729 }, { "epoch": 1.2784588441331, "grad_norm": 0.27546778321266174, "learning_rate": 9.146978601172761e-06, "loss": 0.5159, "step": 730 }, { "epoch": 1.2802101576182137, "grad_norm": 0.26249411702156067, "learning_rate": 9.144335931172205e-06, "loss": 0.5211, "step": 731 }, { "epoch": 1.2819614711033274, "grad_norm": 0.27705278992652893, "learning_rate": 9.141689556975453e-06, "loss": 0.5183, "step": 732 }, { "epoch": 1.2837127845884413, "grad_norm": 0.24783894419670105, "learning_rate": 9.139039480947825e-06, "loss": 0.5039, "step": 733 }, { "epoch": 1.2854640980735552, "grad_norm": 0.2909696400165558, "learning_rate": 9.136385705457951e-06, "loss": 0.4906, "step": 734 }, { "epoch": 1.287215411558669, "grad_norm": 0.2718595266342163, "learning_rate": 9.13372823287777e-06, "loss": 0.5024, "step": 735 }, { "epoch": 1.2889667250437828, "grad_norm": 0.2781555652618408, "learning_rate": 9.131067065582521e-06, "loss": 0.5025, "step": 736 }, { "epoch": 1.2907180385288965, "grad_norm": 0.27486342191696167, "learning_rate": 9.128402205950751e-06, "loss": 0.498, "step": 737 }, { "epoch": 1.2924693520140105, "grad_norm": 0.29053154587745667, "learning_rate": 9.125733656364304e-06, "loss": 0.5093, "step": 738 }, { "epoch": 1.2942206654991244, "grad_norm": 0.2722828984260559, "learning_rate": 9.12306141920832e-06, "loss": 0.5117, "step": 739 }, { "epoch": 1.295971978984238, "grad_norm": 0.2933242917060852, "learning_rate": 9.12038549687124e-06, "loss": 0.4969, "step": 740 }, { "epoch": 1.297723292469352, "grad_norm": 0.25183534622192383, "learning_rate": 9.117705891744795e-06, "loss": 0.5124, "step": 741 }, { "epoch": 1.2994746059544657, "grad_norm": 0.25993046164512634, "learning_rate": 9.115022606224008e-06, "loss": 0.4999, "step": 742 }, { "epoch": 1.3012259194395797, "grad_norm": 0.2474258989095688, "learning_rate": 9.112335642707196e-06, "loss": 0.4904, "step": 743 }, { "epoch": 1.3029772329246936, "grad_norm": 0.238336443901062, "learning_rate": 9.109645003595954e-06, "loss": 0.5078, "step": 744 }, { "epoch": 1.3047285464098073, "grad_norm": 0.2545296251773834, "learning_rate": 9.106950691295172e-06, "loss": 0.5081, "step": 745 }, { "epoch": 1.3064798598949212, "grad_norm": 0.24469688534736633, "learning_rate": 9.104252708213018e-06, "loss": 0.4974, "step": 746 }, { "epoch": 1.308231173380035, "grad_norm": 0.35833367705345154, "learning_rate": 9.101551056760942e-06, "loss": 0.5116, "step": 747 }, { "epoch": 1.3099824868651488, "grad_norm": 0.2501485347747803, "learning_rate": 9.098845739353672e-06, "loss": 0.5065, "step": 748 }, { "epoch": 1.3117338003502628, "grad_norm": 0.29684051871299744, "learning_rate": 9.096136758409212e-06, "loss": 0.5039, "step": 749 }, { "epoch": 1.3134851138353765, "grad_norm": 0.27165910601615906, "learning_rate": 9.093424116348846e-06, "loss": 0.5072, "step": 750 }, { "epoch": 1.3152364273204904, "grad_norm": 0.23929235339164734, "learning_rate": 9.090707815597124e-06, "loss": 0.5012, "step": 751 }, { "epoch": 1.316987740805604, "grad_norm": 0.23666945099830627, "learning_rate": 9.087987858581867e-06, "loss": 0.487, "step": 752 }, { "epoch": 1.318739054290718, "grad_norm": 0.2523861229419708, "learning_rate": 9.085264247734165e-06, "loss": 0.5027, "step": 753 }, { "epoch": 1.320490367775832, "grad_norm": 0.31751981377601624, "learning_rate": 9.082536985488377e-06, "loss": 0.503, "step": 754 }, { "epoch": 1.3222416812609457, "grad_norm": 0.27856993675231934, "learning_rate": 9.07980607428212e-06, "loss": 0.5102, "step": 755 }, { "epoch": 1.3239929947460596, "grad_norm": 0.287130206823349, "learning_rate": 9.077071516556276e-06, "loss": 0.5012, "step": 756 }, { "epoch": 1.3257443082311733, "grad_norm": 0.2700994908809662, "learning_rate": 9.074333314754984e-06, "loss": 0.5032, "step": 757 }, { "epoch": 1.3274956217162872, "grad_norm": 0.31179970502853394, "learning_rate": 9.071591471325644e-06, "loss": 0.4925, "step": 758 }, { "epoch": 1.3292469352014011, "grad_norm": 0.2814747989177704, "learning_rate": 9.068845988718906e-06, "loss": 0.5044, "step": 759 }, { "epoch": 1.3309982486865148, "grad_norm": 0.31861022114753723, "learning_rate": 9.066096869388674e-06, "loss": 0.4977, "step": 760 }, { "epoch": 1.3327495621716288, "grad_norm": 0.29276588559150696, "learning_rate": 9.063344115792108e-06, "loss": 0.5094, "step": 761 }, { "epoch": 1.3345008756567425, "grad_norm": 0.26735180616378784, "learning_rate": 9.060587730389606e-06, "loss": 0.5035, "step": 762 }, { "epoch": 1.3362521891418564, "grad_norm": 0.31272873282432556, "learning_rate": 9.05782771564482e-06, "loss": 0.5138, "step": 763 }, { "epoch": 1.3380035026269703, "grad_norm": 0.27689245343208313, "learning_rate": 9.055064074024646e-06, "loss": 0.5076, "step": 764 }, { "epoch": 1.339754816112084, "grad_norm": 0.3294570744037628, "learning_rate": 9.052296807999216e-06, "loss": 0.491, "step": 765 }, { "epoch": 1.341506129597198, "grad_norm": 0.28081896901130676, "learning_rate": 9.049525920041909e-06, "loss": 0.504, "step": 766 }, { "epoch": 1.3432574430823117, "grad_norm": 0.3303787410259247, "learning_rate": 9.046751412629331e-06, "loss": 0.4939, "step": 767 }, { "epoch": 1.3450087565674256, "grad_norm": 0.2892819046974182, "learning_rate": 9.043973288241334e-06, "loss": 0.4956, "step": 768 }, { "epoch": 1.3467600700525395, "grad_norm": 0.31658071279525757, "learning_rate": 9.041191549360995e-06, "loss": 0.5076, "step": 769 }, { "epoch": 1.3485113835376532, "grad_norm": 0.2869291305541992, "learning_rate": 9.038406198474627e-06, "loss": 0.5167, "step": 770 }, { "epoch": 1.3502626970227671, "grad_norm": 0.28896933794021606, "learning_rate": 9.03561723807177e-06, "loss": 0.5007, "step": 771 }, { "epoch": 1.3520140105078808, "grad_norm": 0.27130383253097534, "learning_rate": 9.032824670645187e-06, "loss": 0.4932, "step": 772 }, { "epoch": 1.3537653239929948, "grad_norm": 0.28186798095703125, "learning_rate": 9.030028498690866e-06, "loss": 0.4959, "step": 773 }, { "epoch": 1.3555166374781087, "grad_norm": 0.29956695437431335, "learning_rate": 9.027228724708022e-06, "loss": 0.5011, "step": 774 }, { "epoch": 1.3572679509632224, "grad_norm": 0.28760290145874023, "learning_rate": 9.02442535119908e-06, "loss": 0.5066, "step": 775 }, { "epoch": 1.3590192644483363, "grad_norm": 0.29219913482666016, "learning_rate": 9.021618380669693e-06, "loss": 0.5101, "step": 776 }, { "epoch": 1.36077057793345, "grad_norm": 0.266013503074646, "learning_rate": 9.018807815628721e-06, "loss": 0.5091, "step": 777 }, { "epoch": 1.362521891418564, "grad_norm": 0.28396347165107727, "learning_rate": 9.01599365858824e-06, "loss": 0.5063, "step": 778 }, { "epoch": 1.3642732049036779, "grad_norm": 0.24017423391342163, "learning_rate": 9.013175912063534e-06, "loss": 0.4952, "step": 779 }, { "epoch": 1.3660245183887916, "grad_norm": 0.28447985649108887, "learning_rate": 9.0103545785731e-06, "loss": 0.5, "step": 780 }, { "epoch": 1.3677758318739055, "grad_norm": 0.294630229473114, "learning_rate": 9.007529660638637e-06, "loss": 0.4987, "step": 781 }, { "epoch": 1.3695271453590192, "grad_norm": 0.27203619480133057, "learning_rate": 9.004701160785047e-06, "loss": 0.5035, "step": 782 }, { "epoch": 1.3712784588441331, "grad_norm": 0.2691570520401001, "learning_rate": 9.00186908154044e-06, "loss": 0.5052, "step": 783 }, { "epoch": 1.373029772329247, "grad_norm": 0.2797298729419708, "learning_rate": 8.999033425436116e-06, "loss": 0.4899, "step": 784 }, { "epoch": 1.3747810858143608, "grad_norm": 0.2878737449645996, "learning_rate": 8.996194195006582e-06, "loss": 0.5016, "step": 785 }, { "epoch": 1.3765323992994747, "grad_norm": 0.25854969024658203, "learning_rate": 8.993351392789529e-06, "loss": 0.4991, "step": 786 }, { "epoch": 1.3782837127845884, "grad_norm": 0.27235379815101624, "learning_rate": 8.990505021325849e-06, "loss": 0.5064, "step": 787 }, { "epoch": 1.3800350262697023, "grad_norm": 0.27820509672164917, "learning_rate": 8.987655083159618e-06, "loss": 0.4981, "step": 788 }, { "epoch": 1.3817863397548162, "grad_norm": 0.28524118661880493, "learning_rate": 8.984801580838109e-06, "loss": 0.4963, "step": 789 }, { "epoch": 1.38353765323993, "grad_norm": 0.2646663784980774, "learning_rate": 8.98194451691177e-06, "loss": 0.5016, "step": 790 }, { "epoch": 1.3852889667250436, "grad_norm": 0.2633478045463562, "learning_rate": 8.979083893934236e-06, "loss": 0.5109, "step": 791 }, { "epoch": 1.3870402802101576, "grad_norm": 0.3020489811897278, "learning_rate": 8.976219714462326e-06, "loss": 0.5065, "step": 792 }, { "epoch": 1.3887915936952715, "grad_norm": 0.2408159077167511, "learning_rate": 8.973351981056037e-06, "loss": 0.5059, "step": 793 }, { "epoch": 1.3905429071803854, "grad_norm": 0.32046765089035034, "learning_rate": 8.97048069627854e-06, "loss": 0.4896, "step": 794 }, { "epoch": 1.3922942206654991, "grad_norm": 0.2597024738788605, "learning_rate": 8.967605862696183e-06, "loss": 0.4967, "step": 795 }, { "epoch": 1.3940455341506128, "grad_norm": 0.29363709688186646, "learning_rate": 8.964727482878482e-06, "loss": 0.5044, "step": 796 }, { "epoch": 1.3957968476357268, "grad_norm": 0.2801847755908966, "learning_rate": 8.961845559398125e-06, "loss": 0.5137, "step": 797 }, { "epoch": 1.3975481611208407, "grad_norm": 0.32091495394706726, "learning_rate": 8.95896009483097e-06, "loss": 0.5064, "step": 798 }, { "epoch": 1.3992994746059544, "grad_norm": 0.31299999356269836, "learning_rate": 8.956071091756036e-06, "loss": 0.5053, "step": 799 }, { "epoch": 1.4010507880910683, "grad_norm": 0.3232526481151581, "learning_rate": 8.953178552755506e-06, "loss": 0.5034, "step": 800 }, { "epoch": 1.402802101576182, "grad_norm": 0.27107933163642883, "learning_rate": 8.950282480414723e-06, "loss": 0.514, "step": 801 }, { "epoch": 1.404553415061296, "grad_norm": 0.2760787606239319, "learning_rate": 8.947382877322189e-06, "loss": 0.4957, "step": 802 }, { "epoch": 1.4063047285464099, "grad_norm": 0.3247121572494507, "learning_rate": 8.94447974606956e-06, "loss": 0.4958, "step": 803 }, { "epoch": 1.4080560420315236, "grad_norm": 0.2634688913822174, "learning_rate": 8.941573089251652e-06, "loss": 0.5067, "step": 804 }, { "epoch": 1.4098073555166375, "grad_norm": 0.31248942017555237, "learning_rate": 8.93866290946642e-06, "loss": 0.5131, "step": 805 }, { "epoch": 1.4115586690017512, "grad_norm": 0.2677425444126129, "learning_rate": 8.935749209314981e-06, "loss": 0.4921, "step": 806 }, { "epoch": 1.4133099824868651, "grad_norm": 0.31269389390945435, "learning_rate": 8.932831991401587e-06, "loss": 0.5004, "step": 807 }, { "epoch": 1.415061295971979, "grad_norm": 0.25982677936553955, "learning_rate": 8.929911258333644e-06, "loss": 0.4994, "step": 808 }, { "epoch": 1.4168126094570928, "grad_norm": 0.27495139837265015, "learning_rate": 8.92698701272169e-06, "loss": 0.496, "step": 809 }, { "epoch": 1.4185639229422067, "grad_norm": 0.27856966853141785, "learning_rate": 8.924059257179414e-06, "loss": 0.5116, "step": 810 }, { "epoch": 1.4203152364273204, "grad_norm": 0.3033006191253662, "learning_rate": 8.92112799432363e-06, "loss": 0.504, "step": 811 }, { "epoch": 1.4220665499124343, "grad_norm": 0.35081201791763306, "learning_rate": 8.9181932267743e-06, "loss": 0.5109, "step": 812 }, { "epoch": 1.4238178633975482, "grad_norm": 0.29742851853370667, "learning_rate": 8.915254957154501e-06, "loss": 0.5046, "step": 813 }, { "epoch": 1.425569176882662, "grad_norm": 0.32191410660743713, "learning_rate": 8.91231318809046e-06, "loss": 0.499, "step": 814 }, { "epoch": 1.4273204903677759, "grad_norm": 0.33820077776908875, "learning_rate": 8.909367922211519e-06, "loss": 0.4964, "step": 815 }, { "epoch": 1.4290718038528896, "grad_norm": 0.281932532787323, "learning_rate": 8.906419162150147e-06, "loss": 0.5045, "step": 816 }, { "epoch": 1.4308231173380035, "grad_norm": 0.25566336512565613, "learning_rate": 8.903466910541938e-06, "loss": 0.5025, "step": 817 }, { "epoch": 1.4325744308231174, "grad_norm": 0.2605109214782715, "learning_rate": 8.900511170025609e-06, "loss": 0.5062, "step": 818 }, { "epoch": 1.4343257443082311, "grad_norm": 0.2635284960269928, "learning_rate": 8.89755194324299e-06, "loss": 0.5084, "step": 819 }, { "epoch": 1.436077057793345, "grad_norm": 0.2798272669315338, "learning_rate": 8.89458923283903e-06, "loss": 0.5065, "step": 820 }, { "epoch": 1.4378283712784588, "grad_norm": 0.317976176738739, "learning_rate": 8.891623041461794e-06, "loss": 0.5026, "step": 821 }, { "epoch": 1.4395796847635727, "grad_norm": 0.2653420865535736, "learning_rate": 8.888653371762454e-06, "loss": 0.5075, "step": 822 }, { "epoch": 1.4413309982486866, "grad_norm": 0.34058496356010437, "learning_rate": 8.885680226395295e-06, "loss": 0.5056, "step": 823 }, { "epoch": 1.4430823117338003, "grad_norm": 0.2745915651321411, "learning_rate": 8.882703608017702e-06, "loss": 0.4976, "step": 824 }, { "epoch": 1.4448336252189142, "grad_norm": 0.33895379304885864, "learning_rate": 8.87972351929017e-06, "loss": 0.504, "step": 825 }, { "epoch": 1.446584938704028, "grad_norm": 0.28984346985816956, "learning_rate": 8.876739962876298e-06, "loss": 0.5016, "step": 826 }, { "epoch": 1.4483362521891419, "grad_norm": 0.3084270656108856, "learning_rate": 8.873752941442775e-06, "loss": 0.5175, "step": 827 }, { "epoch": 1.4500875656742558, "grad_norm": 0.30663713812828064, "learning_rate": 8.870762457659397e-06, "loss": 0.5061, "step": 828 }, { "epoch": 1.4518388791593695, "grad_norm": 0.32536306977272034, "learning_rate": 8.867768514199046e-06, "loss": 0.4994, "step": 829 }, { "epoch": 1.4535901926444834, "grad_norm": 0.3010607063770294, "learning_rate": 8.864771113737705e-06, "loss": 0.5061, "step": 830 }, { "epoch": 1.4553415061295971, "grad_norm": 0.35699892044067383, "learning_rate": 8.86177025895444e-06, "loss": 0.5023, "step": 831 }, { "epoch": 1.457092819614711, "grad_norm": 0.32362446188926697, "learning_rate": 8.858765952531407e-06, "loss": 0.4976, "step": 832 }, { "epoch": 1.458844133099825, "grad_norm": 0.38085582852363586, "learning_rate": 8.855758197153848e-06, "loss": 0.492, "step": 833 }, { "epoch": 1.4605954465849387, "grad_norm": 0.30261924862861633, "learning_rate": 8.852746995510084e-06, "loss": 0.507, "step": 834 }, { "epoch": 1.4623467600700526, "grad_norm": 0.31016966700553894, "learning_rate": 8.849732350291522e-06, "loss": 0.4941, "step": 835 }, { "epoch": 1.4640980735551663, "grad_norm": 0.29356691241264343, "learning_rate": 8.846714264192642e-06, "loss": 0.505, "step": 836 }, { "epoch": 1.4658493870402802, "grad_norm": 0.2616441249847412, "learning_rate": 8.843692739911e-06, "loss": 0.5036, "step": 837 }, { "epoch": 1.4676007005253942, "grad_norm": 0.2784304618835449, "learning_rate": 8.84066778014723e-06, "loss": 0.5017, "step": 838 }, { "epoch": 1.4693520140105079, "grad_norm": 0.27035757899284363, "learning_rate": 8.837639387605031e-06, "loss": 0.5007, "step": 839 }, { "epoch": 1.4711033274956218, "grad_norm": 0.23001836240291595, "learning_rate": 8.834607564991168e-06, "loss": 0.5078, "step": 840 }, { "epoch": 1.4728546409807355, "grad_norm": 0.3180595338344574, "learning_rate": 8.831572315015484e-06, "loss": 0.5, "step": 841 }, { "epoch": 1.4746059544658494, "grad_norm": 0.23938365280628204, "learning_rate": 8.828533640390868e-06, "loss": 0.4949, "step": 842 }, { "epoch": 1.4763572679509633, "grad_norm": 0.2774653136730194, "learning_rate": 8.825491543833286e-06, "loss": 0.5056, "step": 843 }, { "epoch": 1.478108581436077, "grad_norm": 0.2744264304637909, "learning_rate": 8.822446028061752e-06, "loss": 0.509, "step": 844 }, { "epoch": 1.479859894921191, "grad_norm": 0.28092241287231445, "learning_rate": 8.819397095798343e-06, "loss": 0.4938, "step": 845 }, { "epoch": 1.4816112084063047, "grad_norm": 0.2804762125015259, "learning_rate": 8.816344749768184e-06, "loss": 0.5023, "step": 846 }, { "epoch": 1.4833625218914186, "grad_norm": 0.2866804897785187, "learning_rate": 8.813288992699454e-06, "loss": 0.5076, "step": 847 }, { "epoch": 1.4851138353765325, "grad_norm": 0.2918645739555359, "learning_rate": 8.810229827323378e-06, "loss": 0.5127, "step": 848 }, { "epoch": 1.4868651488616462, "grad_norm": 0.2568419873714447, "learning_rate": 8.807167256374234e-06, "loss": 0.4935, "step": 849 }, { "epoch": 1.4886164623467601, "grad_norm": 0.2943757176399231, "learning_rate": 8.804101282589338e-06, "loss": 0.5119, "step": 850 }, { "epoch": 1.4903677758318739, "grad_norm": 0.2885313928127289, "learning_rate": 8.801031908709047e-06, "loss": 0.4963, "step": 851 }, { "epoch": 1.4921190893169878, "grad_norm": 0.30723658204078674, "learning_rate": 8.797959137476763e-06, "loss": 0.5028, "step": 852 }, { "epoch": 1.4938704028021017, "grad_norm": 0.32620394229888916, "learning_rate": 8.794882971638916e-06, "loss": 0.4954, "step": 853 }, { "epoch": 1.4956217162872154, "grad_norm": 0.3169829249382019, "learning_rate": 8.791803413944979e-06, "loss": 0.5128, "step": 854 }, { "epoch": 1.4973730297723291, "grad_norm": 0.28345316648483276, "learning_rate": 8.78872046714745e-06, "loss": 0.4925, "step": 855 }, { "epoch": 1.499124343257443, "grad_norm": 0.33128541707992554, "learning_rate": 8.785634134001856e-06, "loss": 0.495, "step": 856 }, { "epoch": 1.500875656742557, "grad_norm": 0.23043100535869598, "learning_rate": 8.782544417266758e-06, "loss": 0.496, "step": 857 }, { "epoch": 1.5026269702276709, "grad_norm": 0.35986557602882385, "learning_rate": 8.779451319703735e-06, "loss": 0.4964, "step": 858 }, { "epoch": 1.5043782837127846, "grad_norm": 0.2690294682979584, "learning_rate": 8.776354844077389e-06, "loss": 0.4997, "step": 859 }, { "epoch": 1.5061295971978983, "grad_norm": 0.279460608959198, "learning_rate": 8.77325499315534e-06, "loss": 0.5087, "step": 860 }, { "epoch": 1.5078809106830122, "grad_norm": 0.2580622732639313, "learning_rate": 8.770151769708226e-06, "loss": 0.4937, "step": 861 }, { "epoch": 1.5096322241681261, "grad_norm": 0.24749192595481873, "learning_rate": 8.767045176509703e-06, "loss": 0.5042, "step": 862 }, { "epoch": 1.51138353765324, "grad_norm": 0.2696499228477478, "learning_rate": 8.763935216336433e-06, "loss": 0.4985, "step": 863 }, { "epoch": 1.5131348511383538, "grad_norm": 0.23225151002407074, "learning_rate": 8.76082189196809e-06, "loss": 0.5013, "step": 864 }, { "epoch": 1.5148861646234675, "grad_norm": 0.25126150250434875, "learning_rate": 8.757705206187357e-06, "loss": 0.5016, "step": 865 }, { "epoch": 1.5166374781085814, "grad_norm": 0.25764787197113037, "learning_rate": 8.754585161779916e-06, "loss": 0.5015, "step": 866 }, { "epoch": 1.5183887915936953, "grad_norm": 0.27548742294311523, "learning_rate": 8.751461761534457e-06, "loss": 0.4969, "step": 867 }, { "epoch": 1.5201401050788093, "grad_norm": 0.24379505217075348, "learning_rate": 8.748335008242667e-06, "loss": 0.4985, "step": 868 }, { "epoch": 1.521891418563923, "grad_norm": 0.2800361216068268, "learning_rate": 8.745204904699226e-06, "loss": 0.4998, "step": 869 }, { "epoch": 1.5236427320490367, "grad_norm": 0.24024802446365356, "learning_rate": 8.742071453701815e-06, "loss": 0.4971, "step": 870 }, { "epoch": 1.5253940455341506, "grad_norm": 0.26992031931877136, "learning_rate": 8.738934658051103e-06, "loss": 0.5007, "step": 871 }, { "epoch": 1.5271453590192645, "grad_norm": 0.23915138840675354, "learning_rate": 8.73579452055075e-06, "loss": 0.4987, "step": 872 }, { "epoch": 1.5288966725043784, "grad_norm": 0.2689554989337921, "learning_rate": 8.732651044007402e-06, "loss": 0.4936, "step": 873 }, { "epoch": 1.5306479859894921, "grad_norm": 0.24867811799049377, "learning_rate": 8.72950423123069e-06, "loss": 0.4938, "step": 874 }, { "epoch": 1.5323992994746058, "grad_norm": 0.28861039876937866, "learning_rate": 8.726354085033227e-06, "loss": 0.4969, "step": 875 }, { "epoch": 1.5341506129597198, "grad_norm": 0.25068604946136475, "learning_rate": 8.723200608230605e-06, "loss": 0.4901, "step": 876 }, { "epoch": 1.5359019264448337, "grad_norm": 0.26353421807289124, "learning_rate": 8.72004380364139e-06, "loss": 0.4942, "step": 877 }, { "epoch": 1.5376532399299476, "grad_norm": 0.2808261215686798, "learning_rate": 8.71688367408713e-06, "loss": 0.5038, "step": 878 }, { "epoch": 1.5394045534150613, "grad_norm": 0.2545369863510132, "learning_rate": 8.713720222392338e-06, "loss": 0.4976, "step": 879 }, { "epoch": 1.541155866900175, "grad_norm": 0.2789628505706787, "learning_rate": 8.710553451384499e-06, "loss": 0.5082, "step": 880 }, { "epoch": 1.542907180385289, "grad_norm": 0.27202168107032776, "learning_rate": 8.707383363894064e-06, "loss": 0.4996, "step": 881 }, { "epoch": 1.5446584938704029, "grad_norm": 0.29009705781936646, "learning_rate": 8.704209962754448e-06, "loss": 0.5042, "step": 882 }, { "epoch": 1.5464098073555166, "grad_norm": 0.33155375719070435, "learning_rate": 8.70103325080203e-06, "loss": 0.5038, "step": 883 }, { "epoch": 1.5481611208406305, "grad_norm": 0.29216158390045166, "learning_rate": 8.697853230876145e-06, "loss": 0.4985, "step": 884 }, { "epoch": 1.5499124343257442, "grad_norm": 0.29765912890434265, "learning_rate": 8.694669905819087e-06, "loss": 0.4882, "step": 885 }, { "epoch": 1.5516637478108581, "grad_norm": 0.3053787648677826, "learning_rate": 8.691483278476103e-06, "loss": 0.4978, "step": 886 }, { "epoch": 1.553415061295972, "grad_norm": 0.2876129150390625, "learning_rate": 8.688293351695392e-06, "loss": 0.5092, "step": 887 }, { "epoch": 1.5551663747810858, "grad_norm": 0.30320921540260315, "learning_rate": 8.685100128328101e-06, "loss": 0.4853, "step": 888 }, { "epoch": 1.5569176882661997, "grad_norm": 0.2568370997905731, "learning_rate": 8.681903611228327e-06, "loss": 0.5016, "step": 889 }, { "epoch": 1.5586690017513134, "grad_norm": 0.2983393371105194, "learning_rate": 8.678703803253103e-06, "loss": 0.5063, "step": 890 }, { "epoch": 1.5604203152364273, "grad_norm": 0.29777783155441284, "learning_rate": 8.675500707262415e-06, "loss": 0.4967, "step": 891 }, { "epoch": 1.5621716287215412, "grad_norm": 0.26510778069496155, "learning_rate": 8.672294326119177e-06, "loss": 0.5072, "step": 892 }, { "epoch": 1.563922942206655, "grad_norm": 0.28842294216156006, "learning_rate": 8.669084662689246e-06, "loss": 0.4982, "step": 893 }, { "epoch": 1.5656742556917689, "grad_norm": 0.25181058049201965, "learning_rate": 8.66587171984141e-06, "loss": 0.5034, "step": 894 }, { "epoch": 1.5674255691768826, "grad_norm": 0.27379193902015686, "learning_rate": 8.662655500447388e-06, "loss": 0.5025, "step": 895 }, { "epoch": 1.5691768826619965, "grad_norm": 0.2673984467983246, "learning_rate": 8.65943600738183e-06, "loss": 0.5034, "step": 896 }, { "epoch": 1.5709281961471104, "grad_norm": 0.2988172173500061, "learning_rate": 8.656213243522307e-06, "loss": 0.5083, "step": 897 }, { "epoch": 1.5726795096322241, "grad_norm": 0.2309127002954483, "learning_rate": 8.65298721174932e-06, "loss": 0.4935, "step": 898 }, { "epoch": 1.5744308231173378, "grad_norm": 0.26038849353790283, "learning_rate": 8.649757914946284e-06, "loss": 0.4878, "step": 899 }, { "epoch": 1.5761821366024518, "grad_norm": 0.2482953667640686, "learning_rate": 8.646525355999538e-06, "loss": 0.4953, "step": 900 }, { "epoch": 1.5779334500875657, "grad_norm": 0.28201067447662354, "learning_rate": 8.643289537798332e-06, "loss": 0.5126, "step": 901 }, { "epoch": 1.5796847635726796, "grad_norm": 0.28524452447891235, "learning_rate": 8.640050463234837e-06, "loss": 0.4976, "step": 902 }, { "epoch": 1.5814360770577933, "grad_norm": 0.2480105608701706, "learning_rate": 8.636808135204123e-06, "loss": 0.4954, "step": 903 }, { "epoch": 1.583187390542907, "grad_norm": 0.25186482071876526, "learning_rate": 8.633562556604177e-06, "loss": 0.4903, "step": 904 }, { "epoch": 1.584938704028021, "grad_norm": 0.24566999077796936, "learning_rate": 8.630313730335888e-06, "loss": 0.4982, "step": 905 }, { "epoch": 1.5866900175131349, "grad_norm": 0.24237769842147827, "learning_rate": 8.627061659303047e-06, "loss": 0.4994, "step": 906 }, { "epoch": 1.5884413309982488, "grad_norm": 0.26949164271354675, "learning_rate": 8.623806346412349e-06, "loss": 0.5018, "step": 907 }, { "epoch": 1.5901926444833625, "grad_norm": 0.27059540152549744, "learning_rate": 8.62054779457338e-06, "loss": 0.5002, "step": 908 }, { "epoch": 1.5919439579684762, "grad_norm": 0.28216081857681274, "learning_rate": 8.617286006698628e-06, "loss": 0.5073, "step": 909 }, { "epoch": 1.5936952714535901, "grad_norm": 0.25720566511154175, "learning_rate": 8.614020985703468e-06, "loss": 0.5032, "step": 910 }, { "epoch": 1.595446584938704, "grad_norm": 0.24057425558567047, "learning_rate": 8.610752734506166e-06, "loss": 0.491, "step": 911 }, { "epoch": 1.597197898423818, "grad_norm": 0.26739776134490967, "learning_rate": 8.60748125602788e-06, "loss": 0.5078, "step": 912 }, { "epoch": 1.5989492119089317, "grad_norm": 0.2662481963634491, "learning_rate": 8.604206553192647e-06, "loss": 0.4965, "step": 913 }, { "epoch": 1.6007005253940454, "grad_norm": 0.2356027215719223, "learning_rate": 8.600928628927385e-06, "loss": 0.4978, "step": 914 }, { "epoch": 1.6024518388791593, "grad_norm": 0.31105688214302063, "learning_rate": 8.597647486161898e-06, "loss": 0.5076, "step": 915 }, { "epoch": 1.6042031523642732, "grad_norm": 0.2700553238391876, "learning_rate": 8.594363127828858e-06, "loss": 0.499, "step": 916 }, { "epoch": 1.6059544658493872, "grad_norm": 0.22000934183597565, "learning_rate": 8.591075556863818e-06, "loss": 0.4936, "step": 917 }, { "epoch": 1.6077057793345009, "grad_norm": 0.2898847460746765, "learning_rate": 8.5877847762052e-06, "loss": 0.5057, "step": 918 }, { "epoch": 1.6094570928196146, "grad_norm": 0.25598645210266113, "learning_rate": 8.584490788794296e-06, "loss": 0.5059, "step": 919 }, { "epoch": 1.6112084063047285, "grad_norm": 0.26784011721611023, "learning_rate": 8.581193597575259e-06, "loss": 0.4983, "step": 920 }, { "epoch": 1.6129597197898424, "grad_norm": 0.24729891121387482, "learning_rate": 8.577893205495113e-06, "loss": 0.4784, "step": 921 }, { "epoch": 1.6147110332749564, "grad_norm": 0.26067620515823364, "learning_rate": 8.574589615503739e-06, "loss": 0.5, "step": 922 }, { "epoch": 1.61646234676007, "grad_norm": 0.26521897315979004, "learning_rate": 8.571282830553875e-06, "loss": 0.4834, "step": 923 }, { "epoch": 1.6182136602451838, "grad_norm": 0.25744500756263733, "learning_rate": 8.567972853601119e-06, "loss": 0.5026, "step": 924 }, { "epoch": 1.6199649737302977, "grad_norm": 0.2629903554916382, "learning_rate": 8.564659687603917e-06, "loss": 0.5049, "step": 925 }, { "epoch": 1.6217162872154116, "grad_norm": 0.2598622143268585, "learning_rate": 8.56134333552357e-06, "loss": 0.5147, "step": 926 }, { "epoch": 1.6234676007005255, "grad_norm": 0.2570825517177582, "learning_rate": 8.558023800324223e-06, "loss": 0.4981, "step": 927 }, { "epoch": 1.6252189141856392, "grad_norm": 0.2902008593082428, "learning_rate": 8.554701084972869e-06, "loss": 0.4977, "step": 928 }, { "epoch": 1.626970227670753, "grad_norm": 0.26709258556365967, "learning_rate": 8.551375192439339e-06, "loss": 0.5046, "step": 929 }, { "epoch": 1.6287215411558669, "grad_norm": 0.25945159792900085, "learning_rate": 8.54804612569631e-06, "loss": 0.4871, "step": 930 }, { "epoch": 1.6304728546409808, "grad_norm": 0.26322823762893677, "learning_rate": 8.54471388771929e-06, "loss": 0.4932, "step": 931 }, { "epoch": 1.6322241681260947, "grad_norm": 0.26510369777679443, "learning_rate": 8.541378481486627e-06, "loss": 0.4892, "step": 932 }, { "epoch": 1.6339754816112084, "grad_norm": 0.26875239610671997, "learning_rate": 8.538039909979496e-06, "loss": 0.5016, "step": 933 }, { "epoch": 1.6357267950963221, "grad_norm": 0.2442116141319275, "learning_rate": 8.534698176181904e-06, "loss": 0.5059, "step": 934 }, { "epoch": 1.637478108581436, "grad_norm": 0.27727556228637695, "learning_rate": 8.531353283080683e-06, "loss": 0.4917, "step": 935 }, { "epoch": 1.63922942206655, "grad_norm": 0.2551816999912262, "learning_rate": 8.528005233665491e-06, "loss": 0.4924, "step": 936 }, { "epoch": 1.640980735551664, "grad_norm": 0.26167088747024536, "learning_rate": 8.524654030928802e-06, "loss": 0.497, "step": 937 }, { "epoch": 1.6427320490367776, "grad_norm": 0.2334335595369339, "learning_rate": 8.521299677865915e-06, "loss": 0.5007, "step": 938 }, { "epoch": 1.6444833625218913, "grad_norm": 0.2525847554206848, "learning_rate": 8.517942177474943e-06, "loss": 0.497, "step": 939 }, { "epoch": 1.6462346760070052, "grad_norm": 0.27443936467170715, "learning_rate": 8.514581532756806e-06, "loss": 0.5002, "step": 940 }, { "epoch": 1.6479859894921192, "grad_norm": 0.2506290376186371, "learning_rate": 8.511217746715243e-06, "loss": 0.4918, "step": 941 }, { "epoch": 1.649737302977233, "grad_norm": 0.29824429750442505, "learning_rate": 8.507850822356796e-06, "loss": 0.5083, "step": 942 }, { "epoch": 1.6514886164623468, "grad_norm": 0.26883766055107117, "learning_rate": 8.50448076269081e-06, "loss": 0.5045, "step": 943 }, { "epoch": 1.6532399299474605, "grad_norm": 0.28716492652893066, "learning_rate": 8.50110757072944e-06, "loss": 0.4902, "step": 944 }, { "epoch": 1.6549912434325744, "grad_norm": 0.29646971821784973, "learning_rate": 8.497731249487632e-06, "loss": 0.4967, "step": 945 }, { "epoch": 1.6567425569176883, "grad_norm": 0.29888349771499634, "learning_rate": 8.494351801983135e-06, "loss": 0.5052, "step": 946 }, { "epoch": 1.658493870402802, "grad_norm": 0.3003864288330078, "learning_rate": 8.490969231236487e-06, "loss": 0.4968, "step": 947 }, { "epoch": 1.660245183887916, "grad_norm": 0.319704532623291, "learning_rate": 8.487583540271024e-06, "loss": 0.5066, "step": 948 }, { "epoch": 1.6619964973730297, "grad_norm": 0.2800961136817932, "learning_rate": 8.484194732112866e-06, "loss": 0.5059, "step": 949 }, { "epoch": 1.6637478108581436, "grad_norm": 0.3415386974811554, "learning_rate": 8.480802809790922e-06, "loss": 0.4989, "step": 950 }, { "epoch": 1.6654991243432575, "grad_norm": 0.3067757189273834, "learning_rate": 8.47740777633688e-06, "loss": 0.4892, "step": 951 }, { "epoch": 1.6672504378283712, "grad_norm": 0.3165009021759033, "learning_rate": 8.474009634785214e-06, "loss": 0.4999, "step": 952 }, { "epoch": 1.6690017513134852, "grad_norm": 0.3313622772693634, "learning_rate": 8.47060838817317e-06, "loss": 0.4857, "step": 953 }, { "epoch": 1.6707530647985989, "grad_norm": 0.2591697573661804, "learning_rate": 8.467204039540779e-06, "loss": 0.5018, "step": 954 }, { "epoch": 1.6725043782837128, "grad_norm": 0.27844011783599854, "learning_rate": 8.463796591930833e-06, "loss": 0.5059, "step": 955 }, { "epoch": 1.6742556917688267, "grad_norm": 0.30326732993125916, "learning_rate": 8.460386048388905e-06, "loss": 0.4951, "step": 956 }, { "epoch": 1.6760070052539404, "grad_norm": 0.2558797597885132, "learning_rate": 8.456972411963323e-06, "loss": 0.4961, "step": 957 }, { "epoch": 1.6777583187390543, "grad_norm": 0.2871032953262329, "learning_rate": 8.453555685705192e-06, "loss": 0.5059, "step": 958 }, { "epoch": 1.679509632224168, "grad_norm": 0.2644273638725281, "learning_rate": 8.450135872668369e-06, "loss": 0.4908, "step": 959 }, { "epoch": 1.681260945709282, "grad_norm": 0.29457271099090576, "learning_rate": 8.446712975909474e-06, "loss": 0.494, "step": 960 }, { "epoch": 1.683012259194396, "grad_norm": 0.2812623977661133, "learning_rate": 8.443286998487884e-06, "loss": 0.4902, "step": 961 }, { "epoch": 1.6847635726795096, "grad_norm": 0.25032851099967957, "learning_rate": 8.439857943465728e-06, "loss": 0.4903, "step": 962 }, { "epoch": 1.6865148861646233, "grad_norm": 0.2708956003189087, "learning_rate": 8.436425813907885e-06, "loss": 0.4951, "step": 963 }, { "epoch": 1.6882661996497372, "grad_norm": 0.28702670335769653, "learning_rate": 8.432990612881985e-06, "loss": 0.4865, "step": 964 }, { "epoch": 1.6900175131348512, "grad_norm": 0.2750205397605896, "learning_rate": 8.4295523434584e-06, "loss": 0.5027, "step": 965 }, { "epoch": 1.691768826619965, "grad_norm": 0.2952778935432434, "learning_rate": 8.426111008710245e-06, "loss": 0.5048, "step": 966 }, { "epoch": 1.6935201401050788, "grad_norm": 0.26920726895332336, "learning_rate": 8.422666611713378e-06, "loss": 0.5044, "step": 967 }, { "epoch": 1.6952714535901925, "grad_norm": 0.31938883662223816, "learning_rate": 8.41921915554639e-06, "loss": 0.5042, "step": 968 }, { "epoch": 1.6970227670753064, "grad_norm": 0.28980016708374023, "learning_rate": 8.415768643290608e-06, "loss": 0.4996, "step": 969 }, { "epoch": 1.6987740805604203, "grad_norm": 0.2822161018848419, "learning_rate": 8.412315078030091e-06, "loss": 0.4958, "step": 970 }, { "epoch": 1.7005253940455343, "grad_norm": 0.2612764239311218, "learning_rate": 8.408858462851625e-06, "loss": 0.501, "step": 971 }, { "epoch": 1.702276707530648, "grad_norm": 0.30062928795814514, "learning_rate": 8.405398800844726e-06, "loss": 0.4989, "step": 972 }, { "epoch": 1.7040280210157617, "grad_norm": 0.27315762639045715, "learning_rate": 8.401936095101629e-06, "loss": 0.5073, "step": 973 }, { "epoch": 1.7057793345008756, "grad_norm": 0.31192299723625183, "learning_rate": 8.398470348717288e-06, "loss": 0.5136, "step": 974 }, { "epoch": 1.7075306479859895, "grad_norm": 0.283441424369812, "learning_rate": 8.395001564789385e-06, "loss": 0.4938, "step": 975 }, { "epoch": 1.7092819614711035, "grad_norm": 0.2891184389591217, "learning_rate": 8.391529746418303e-06, "loss": 0.493, "step": 976 }, { "epoch": 1.7110332749562172, "grad_norm": 0.23242133855819702, "learning_rate": 8.388054896707146e-06, "loss": 0.4911, "step": 977 }, { "epoch": 1.7127845884413309, "grad_norm": 0.345678448677063, "learning_rate": 8.384577018761725e-06, "loss": 0.5005, "step": 978 }, { "epoch": 1.7145359019264448, "grad_norm": 0.2690630555152893, "learning_rate": 8.38109611569056e-06, "loss": 0.5086, "step": 979 }, { "epoch": 1.7162872154115587, "grad_norm": 0.33596232533454895, "learning_rate": 8.377612190604868e-06, "loss": 0.494, "step": 980 }, { "epoch": 1.7180385288966726, "grad_norm": 0.2602155804634094, "learning_rate": 8.374125246618576e-06, "loss": 0.4915, "step": 981 }, { "epoch": 1.7197898423817863, "grad_norm": 0.3973917067050934, "learning_rate": 8.370635286848304e-06, "loss": 0.5066, "step": 982 }, { "epoch": 1.7215411558669, "grad_norm": 0.2582109570503235, "learning_rate": 8.367142314413368e-06, "loss": 0.5003, "step": 983 }, { "epoch": 1.723292469352014, "grad_norm": 0.31483617424964905, "learning_rate": 8.363646332435778e-06, "loss": 0.5002, "step": 984 }, { "epoch": 1.725043782837128, "grad_norm": 0.2442713975906372, "learning_rate": 8.360147344040233e-06, "loss": 0.4847, "step": 985 }, { "epoch": 1.7267950963222418, "grad_norm": 0.32398876547813416, "learning_rate": 8.356645352354117e-06, "loss": 0.5109, "step": 986 }, { "epoch": 1.7285464098073555, "grad_norm": 0.2943169176578522, "learning_rate": 8.353140360507506e-06, "loss": 0.5101, "step": 987 }, { "epoch": 1.7302977232924692, "grad_norm": 0.2699657082557678, "learning_rate": 8.349632371633146e-06, "loss": 0.4984, "step": 988 }, { "epoch": 1.7320490367775832, "grad_norm": 0.2820737659931183, "learning_rate": 8.346121388866472e-06, "loss": 0.4996, "step": 989 }, { "epoch": 1.733800350262697, "grad_norm": 0.2738484740257263, "learning_rate": 8.34260741534559e-06, "loss": 0.5006, "step": 990 }, { "epoch": 1.735551663747811, "grad_norm": 0.3114807605743408, "learning_rate": 8.339090454211278e-06, "loss": 0.4887, "step": 991 }, { "epoch": 1.7373029772329247, "grad_norm": 0.2746036648750305, "learning_rate": 8.33557050860699e-06, "loss": 0.4992, "step": 992 }, { "epoch": 1.7390542907180384, "grad_norm": 0.2798019349575043, "learning_rate": 8.332047581678839e-06, "loss": 0.4839, "step": 993 }, { "epoch": 1.7408056042031523, "grad_norm": 0.23756490647792816, "learning_rate": 8.328521676575612e-06, "loss": 0.4905, "step": 994 }, { "epoch": 1.7425569176882663, "grad_norm": 0.2635986804962158, "learning_rate": 8.324992796448752e-06, "loss": 0.4918, "step": 995 }, { "epoch": 1.7443082311733802, "grad_norm": 0.24415454268455505, "learning_rate": 8.321460944452364e-06, "loss": 0.4929, "step": 996 }, { "epoch": 1.746059544658494, "grad_norm": 0.23732221126556396, "learning_rate": 8.317926123743202e-06, "loss": 0.4993, "step": 997 }, { "epoch": 1.7478108581436076, "grad_norm": 0.28668639063835144, "learning_rate": 8.314388337480686e-06, "loss": 0.5105, "step": 998 }, { "epoch": 1.7495621716287215, "grad_norm": 0.23841291666030884, "learning_rate": 8.310847588826876e-06, "loss": 0.4823, "step": 999 }, { "epoch": 1.7513134851138354, "grad_norm": 0.2779991626739502, "learning_rate": 8.307303880946485e-06, "loss": 0.4983, "step": 1000 }, { "epoch": 1.7530647985989494, "grad_norm": 0.30683937668800354, "learning_rate": 8.303757217006869e-06, "loss": 0.5124, "step": 1001 }, { "epoch": 1.754816112084063, "grad_norm": 0.225300133228302, "learning_rate": 8.300207600178024e-06, "loss": 0.4939, "step": 1002 }, { "epoch": 1.7565674255691768, "grad_norm": 0.36409851908683777, "learning_rate": 8.296655033632592e-06, "loss": 0.4898, "step": 1003 }, { "epoch": 1.7583187390542907, "grad_norm": 0.24094580113887787, "learning_rate": 8.293099520545845e-06, "loss": 0.4966, "step": 1004 }, { "epoch": 1.7600700525394046, "grad_norm": 0.3516525626182556, "learning_rate": 8.289541064095693e-06, "loss": 0.4798, "step": 1005 }, { "epoch": 1.7618213660245186, "grad_norm": 0.26245102286338806, "learning_rate": 8.285979667462671e-06, "loss": 0.5003, "step": 1006 }, { "epoch": 1.7635726795096323, "grad_norm": 0.2718586325645447, "learning_rate": 8.28241533382995e-06, "loss": 0.4975, "step": 1007 }, { "epoch": 1.765323992994746, "grad_norm": 0.2723478376865387, "learning_rate": 8.27884806638332e-06, "loss": 0.5021, "step": 1008 }, { "epoch": 1.7670753064798599, "grad_norm": 0.2253461629152298, "learning_rate": 8.275277868311193e-06, "loss": 0.4994, "step": 1009 }, { "epoch": 1.7688266199649738, "grad_norm": 0.2490890622138977, "learning_rate": 8.271704742804605e-06, "loss": 0.4952, "step": 1010 }, { "epoch": 1.7705779334500875, "grad_norm": 0.24335192143917084, "learning_rate": 8.268128693057202e-06, "loss": 0.5143, "step": 1011 }, { "epoch": 1.7723292469352014, "grad_norm": 0.25545734167099, "learning_rate": 8.264549722265254e-06, "loss": 0.4969, "step": 1012 }, { "epoch": 1.7740805604203151, "grad_norm": 0.23772890865802765, "learning_rate": 8.260967833627628e-06, "loss": 0.4979, "step": 1013 }, { "epoch": 1.775831873905429, "grad_norm": 0.29130104184150696, "learning_rate": 8.257383030345814e-06, "loss": 0.482, "step": 1014 }, { "epoch": 1.777583187390543, "grad_norm": 0.2826871871948242, "learning_rate": 8.253795315623894e-06, "loss": 0.5061, "step": 1015 }, { "epoch": 1.7793345008756567, "grad_norm": 0.25870591402053833, "learning_rate": 8.250204692668559e-06, "loss": 0.4875, "step": 1016 }, { "epoch": 1.7810858143607706, "grad_norm": 0.26248106360435486, "learning_rate": 8.246611164689097e-06, "loss": 0.4966, "step": 1017 }, { "epoch": 1.7828371278458843, "grad_norm": 0.2533188462257385, "learning_rate": 8.243014734897397e-06, "loss": 0.4966, "step": 1018 }, { "epoch": 1.7845884413309983, "grad_norm": 0.2812862694263458, "learning_rate": 8.239415406507934e-06, "loss": 0.5078, "step": 1019 }, { "epoch": 1.7863397548161122, "grad_norm": 0.252164363861084, "learning_rate": 8.235813182737782e-06, "loss": 0.4997, "step": 1020 }, { "epoch": 1.7880910683012259, "grad_norm": 0.24732531607151031, "learning_rate": 8.232208066806596e-06, "loss": 0.4886, "step": 1021 }, { "epoch": 1.7898423817863398, "grad_norm": 0.3046465814113617, "learning_rate": 8.228600061936617e-06, "loss": 0.5109, "step": 1022 }, { "epoch": 1.7915936952714535, "grad_norm": 0.27932676672935486, "learning_rate": 8.224989171352675e-06, "loss": 0.5061, "step": 1023 }, { "epoch": 1.7933450087565674, "grad_norm": 0.3224479556083679, "learning_rate": 8.221375398282172e-06, "loss": 0.5026, "step": 1024 }, { "epoch": 1.7950963222416814, "grad_norm": 0.24726442992687225, "learning_rate": 8.217758745955087e-06, "loss": 0.5031, "step": 1025 }, { "epoch": 1.796847635726795, "grad_norm": 0.3002009689807892, "learning_rate": 8.214139217603975e-06, "loss": 0.4932, "step": 1026 }, { "epoch": 1.7985989492119088, "grad_norm": 0.2555195689201355, "learning_rate": 8.210516816463961e-06, "loss": 0.5007, "step": 1027 }, { "epoch": 1.8003502626970227, "grad_norm": 0.2394469827413559, "learning_rate": 8.206891545772738e-06, "loss": 0.4986, "step": 1028 }, { "epoch": 1.8021015761821366, "grad_norm": 0.2755083739757538, "learning_rate": 8.203263408770562e-06, "loss": 0.494, "step": 1029 }, { "epoch": 1.8038528896672505, "grad_norm": 0.24962982535362244, "learning_rate": 8.19963240870025e-06, "loss": 0.4956, "step": 1030 }, { "epoch": 1.8056042031523643, "grad_norm": 0.25620362162590027, "learning_rate": 8.195998548807185e-06, "loss": 0.5007, "step": 1031 }, { "epoch": 1.807355516637478, "grad_norm": 0.23974083364009857, "learning_rate": 8.192361832339299e-06, "loss": 0.5005, "step": 1032 }, { "epoch": 1.8091068301225919, "grad_norm": 0.25333765149116516, "learning_rate": 8.188722262547078e-06, "loss": 0.5017, "step": 1033 }, { "epoch": 1.8108581436077058, "grad_norm": 0.24216823279857635, "learning_rate": 8.185079842683558e-06, "loss": 0.4889, "step": 1034 }, { "epoch": 1.8126094570928197, "grad_norm": 0.21616069972515106, "learning_rate": 8.181434576004328e-06, "loss": 0.4806, "step": 1035 }, { "epoch": 1.8143607705779334, "grad_norm": 0.27945753931999207, "learning_rate": 8.177786465767517e-06, "loss": 0.4804, "step": 1036 }, { "epoch": 1.8161120840630471, "grad_norm": 0.25529342889785767, "learning_rate": 8.174135515233792e-06, "loss": 0.5026, "step": 1037 }, { "epoch": 1.817863397548161, "grad_norm": 0.2525923550128937, "learning_rate": 8.170481727666366e-06, "loss": 0.4939, "step": 1038 }, { "epoch": 1.819614711033275, "grad_norm": 0.27408623695373535, "learning_rate": 8.166825106330985e-06, "loss": 0.4976, "step": 1039 }, { "epoch": 1.821366024518389, "grad_norm": 0.25060075521469116, "learning_rate": 8.163165654495925e-06, "loss": 0.4875, "step": 1040 }, { "epoch": 1.8231173380035026, "grad_norm": 0.22724473476409912, "learning_rate": 8.159503375431996e-06, "loss": 0.5056, "step": 1041 }, { "epoch": 1.8248686514886163, "grad_norm": 0.24598391354084015, "learning_rate": 8.155838272412531e-06, "loss": 0.4974, "step": 1042 }, { "epoch": 1.8266199649737302, "grad_norm": 0.2717282474040985, "learning_rate": 8.152170348713392e-06, "loss": 0.4992, "step": 1043 }, { "epoch": 1.8283712784588442, "grad_norm": 0.2261698842048645, "learning_rate": 8.148499607612956e-06, "loss": 0.508, "step": 1044 }, { "epoch": 1.830122591943958, "grad_norm": 0.24696145951747894, "learning_rate": 8.144826052392123e-06, "loss": 0.4947, "step": 1045 }, { "epoch": 1.8318739054290718, "grad_norm": 0.23653894662857056, "learning_rate": 8.141149686334307e-06, "loss": 0.5001, "step": 1046 }, { "epoch": 1.8336252189141855, "grad_norm": 0.25998201966285706, "learning_rate": 8.137470512725434e-06, "loss": 0.4816, "step": 1047 }, { "epoch": 1.8353765323992994, "grad_norm": 0.2655760645866394, "learning_rate": 8.133788534853939e-06, "loss": 0.4917, "step": 1048 }, { "epoch": 1.8371278458844134, "grad_norm": 0.2161719799041748, "learning_rate": 8.130103756010763e-06, "loss": 0.4885, "step": 1049 }, { "epoch": 1.8388791593695273, "grad_norm": 0.2432861626148224, "learning_rate": 8.126416179489354e-06, "loss": 0.4966, "step": 1050 }, { "epoch": 1.840630472854641, "grad_norm": 0.25206586718559265, "learning_rate": 8.122725808585654e-06, "loss": 0.493, "step": 1051 }, { "epoch": 1.8423817863397547, "grad_norm": 0.25826793909072876, "learning_rate": 8.11903264659811e-06, "loss": 0.4841, "step": 1052 }, { "epoch": 1.8441330998248686, "grad_norm": 0.23472435772418976, "learning_rate": 8.115336696827658e-06, "loss": 0.4789, "step": 1053 }, { "epoch": 1.8458844133099825, "grad_norm": 0.2774119973182678, "learning_rate": 8.111637962577728e-06, "loss": 0.4961, "step": 1054 }, { "epoch": 1.8476357267950965, "grad_norm": 0.22878648340702057, "learning_rate": 8.10793644715424e-06, "loss": 0.499, "step": 1055 }, { "epoch": 1.8493870402802102, "grad_norm": 0.25276604294776917, "learning_rate": 8.104232153865597e-06, "loss": 0.4815, "step": 1056 }, { "epoch": 1.8511383537653239, "grad_norm": 0.26200565695762634, "learning_rate": 8.100525086022687e-06, "loss": 0.4946, "step": 1057 }, { "epoch": 1.8528896672504378, "grad_norm": 0.24748796224594116, "learning_rate": 8.096815246938879e-06, "loss": 0.4921, "step": 1058 }, { "epoch": 1.8546409807355517, "grad_norm": 0.29953470826148987, "learning_rate": 8.093102639930013e-06, "loss": 0.4983, "step": 1059 }, { "epoch": 1.8563922942206657, "grad_norm": 0.25943639874458313, "learning_rate": 8.089387268314408e-06, "loss": 0.4942, "step": 1060 }, { "epoch": 1.8581436077057794, "grad_norm": 0.2611134350299835, "learning_rate": 8.085669135412856e-06, "loss": 0.5043, "step": 1061 }, { "epoch": 1.859894921190893, "grad_norm": 0.2580453157424927, "learning_rate": 8.081948244548615e-06, "loss": 0.5004, "step": 1062 }, { "epoch": 1.861646234676007, "grad_norm": 0.26615554094314575, "learning_rate": 8.0782245990474e-06, "loss": 0.4991, "step": 1063 }, { "epoch": 1.863397548161121, "grad_norm": 0.2587311863899231, "learning_rate": 8.074498202237402e-06, "loss": 0.5013, "step": 1064 }, { "epoch": 1.8651488616462348, "grad_norm": 0.2527863085269928, "learning_rate": 8.070769057449262e-06, "loss": 0.4874, "step": 1065 }, { "epoch": 1.8669001751313485, "grad_norm": 0.25608697533607483, "learning_rate": 8.067037168016079e-06, "loss": 0.4915, "step": 1066 }, { "epoch": 1.8686514886164622, "grad_norm": 0.3036438822746277, "learning_rate": 8.063302537273407e-06, "loss": 0.5025, "step": 1067 }, { "epoch": 1.8704028021015762, "grad_norm": 0.2668799161911011, "learning_rate": 8.059565168559247e-06, "loss": 0.5081, "step": 1068 }, { "epoch": 1.87215411558669, "grad_norm": 0.3020061254501343, "learning_rate": 8.05582506521405e-06, "loss": 0.4972, "step": 1069 }, { "epoch": 1.873905429071804, "grad_norm": 0.26153817772865295, "learning_rate": 8.052082230580711e-06, "loss": 0.4827, "step": 1070 }, { "epoch": 1.8756567425569177, "grad_norm": 0.2898184359073639, "learning_rate": 8.048336668004564e-06, "loss": 0.4944, "step": 1071 }, { "epoch": 1.8774080560420314, "grad_norm": 0.27005770802497864, "learning_rate": 8.044588380833384e-06, "loss": 0.4945, "step": 1072 }, { "epoch": 1.8791593695271454, "grad_norm": 0.3048478662967682, "learning_rate": 8.040837372417379e-06, "loss": 0.4903, "step": 1073 }, { "epoch": 1.8809106830122593, "grad_norm": 0.24676044285297394, "learning_rate": 8.03708364610919e-06, "loss": 0.4951, "step": 1074 }, { "epoch": 1.882661996497373, "grad_norm": 0.2616218030452728, "learning_rate": 8.033327205263888e-06, "loss": 0.5008, "step": 1075 }, { "epoch": 1.884413309982487, "grad_norm": 0.2605310082435608, "learning_rate": 8.02956805323897e-06, "loss": 0.5016, "step": 1076 }, { "epoch": 1.8861646234676006, "grad_norm": 0.2713586091995239, "learning_rate": 8.025806193394354e-06, "loss": 0.4968, "step": 1077 }, { "epoch": 1.8879159369527145, "grad_norm": 0.2846681773662567, "learning_rate": 8.022041629092382e-06, "loss": 0.4845, "step": 1078 }, { "epoch": 1.8896672504378285, "grad_norm": 0.25122737884521484, "learning_rate": 8.01827436369781e-06, "loss": 0.4954, "step": 1079 }, { "epoch": 1.8914185639229422, "grad_norm": 0.25180283188819885, "learning_rate": 8.01450440057781e-06, "loss": 0.4994, "step": 1080 }, { "epoch": 1.893169877408056, "grad_norm": 0.22931450605392456, "learning_rate": 8.010731743101967e-06, "loss": 0.494, "step": 1081 }, { "epoch": 1.8949211908931698, "grad_norm": 0.2794499397277832, "learning_rate": 8.006956394642268e-06, "loss": 0.5015, "step": 1082 }, { "epoch": 1.8966725043782837, "grad_norm": 0.24207690358161926, "learning_rate": 8.003178358573112e-06, "loss": 0.4845, "step": 1083 }, { "epoch": 1.8984238178633976, "grad_norm": 0.24195319414138794, "learning_rate": 7.999397638271298e-06, "loss": 0.4992, "step": 1084 }, { "epoch": 1.9001751313485113, "grad_norm": 0.2274049073457718, "learning_rate": 7.99561423711602e-06, "loss": 0.4895, "step": 1085 }, { "epoch": 1.9019264448336253, "grad_norm": 0.26628100872039795, "learning_rate": 7.991828158488875e-06, "loss": 0.5104, "step": 1086 }, { "epoch": 1.903677758318739, "grad_norm": 0.2854364216327667, "learning_rate": 7.98803940577385e-06, "loss": 0.5038, "step": 1087 }, { "epoch": 1.905429071803853, "grad_norm": 0.3062647581100464, "learning_rate": 7.984247982357317e-06, "loss": 0.4854, "step": 1088 }, { "epoch": 1.9071803852889668, "grad_norm": 0.2632718086242676, "learning_rate": 7.980453891628047e-06, "loss": 0.4993, "step": 1089 }, { "epoch": 1.9089316987740805, "grad_norm": 0.26267972588539124, "learning_rate": 7.976657136977181e-06, "loss": 0.4896, "step": 1090 }, { "epoch": 1.9106830122591942, "grad_norm": 0.2732277512550354, "learning_rate": 7.972857721798254e-06, "loss": 0.4981, "step": 1091 }, { "epoch": 1.9124343257443082, "grad_norm": 0.2710610628128052, "learning_rate": 7.96905564948717e-06, "loss": 0.5, "step": 1092 }, { "epoch": 1.914185639229422, "grad_norm": 0.29722097516059875, "learning_rate": 7.96525092344221e-06, "loss": 0.4942, "step": 1093 }, { "epoch": 1.915936952714536, "grad_norm": 0.25748172402381897, "learning_rate": 7.96144354706403e-06, "loss": 0.5031, "step": 1094 }, { "epoch": 1.9176882661996497, "grad_norm": 0.28321152925491333, "learning_rate": 7.957633523755652e-06, "loss": 0.5017, "step": 1095 }, { "epoch": 1.9194395796847634, "grad_norm": 0.26676031947135925, "learning_rate": 7.953820856922465e-06, "loss": 0.4994, "step": 1096 }, { "epoch": 1.9211908931698773, "grad_norm": 0.27427613735198975, "learning_rate": 7.95000554997222e-06, "loss": 0.5059, "step": 1097 }, { "epoch": 1.9229422066549913, "grad_norm": 0.2790091633796692, "learning_rate": 7.946187606315026e-06, "loss": 0.4987, "step": 1098 }, { "epoch": 1.9246935201401052, "grad_norm": 0.2733962833881378, "learning_rate": 7.942367029363351e-06, "loss": 0.4991, "step": 1099 }, { "epoch": 1.926444833625219, "grad_norm": 0.27874237298965454, "learning_rate": 7.938543822532019e-06, "loss": 0.4988, "step": 1100 }, { "epoch": 1.9281961471103326, "grad_norm": 0.24094350636005402, "learning_rate": 7.9347179892382e-06, "loss": 0.5034, "step": 1101 }, { "epoch": 1.9299474605954465, "grad_norm": 0.24686165153980255, "learning_rate": 7.930889532901416e-06, "loss": 0.4866, "step": 1102 }, { "epoch": 1.9316987740805605, "grad_norm": 0.24394018948078156, "learning_rate": 7.927058456943527e-06, "loss": 0.4831, "step": 1103 }, { "epoch": 1.9334500875656744, "grad_norm": 0.23612022399902344, "learning_rate": 7.92322476478874e-06, "loss": 0.4886, "step": 1104 }, { "epoch": 1.935201401050788, "grad_norm": 0.26888737082481384, "learning_rate": 7.919388459863597e-06, "loss": 0.5018, "step": 1105 }, { "epoch": 1.9369527145359018, "grad_norm": 0.2854922115802765, "learning_rate": 7.915549545596979e-06, "loss": 0.4954, "step": 1106 }, { "epoch": 1.9387040280210157, "grad_norm": 0.22274835407733917, "learning_rate": 7.911708025420097e-06, "loss": 0.4893, "step": 1107 }, { "epoch": 1.9404553415061296, "grad_norm": 0.26972684264183044, "learning_rate": 7.907863902766494e-06, "loss": 0.5055, "step": 1108 }, { "epoch": 1.9422066549912436, "grad_norm": 0.27521759271621704, "learning_rate": 7.90401718107203e-06, "loss": 0.4967, "step": 1109 }, { "epoch": 1.9439579684763573, "grad_norm": 0.26362916827201843, "learning_rate": 7.900167863774896e-06, "loss": 0.4947, "step": 1110 }, { "epoch": 1.945709281961471, "grad_norm": 0.3170153498649597, "learning_rate": 7.896315954315608e-06, "loss": 0.4974, "step": 1111 }, { "epoch": 1.947460595446585, "grad_norm": 0.2366592139005661, "learning_rate": 7.892461456136984e-06, "loss": 0.4854, "step": 1112 }, { "epoch": 1.9492119089316988, "grad_norm": 0.33102795481681824, "learning_rate": 7.888604372684168e-06, "loss": 0.4995, "step": 1113 }, { "epoch": 1.9509632224168127, "grad_norm": 0.23470748960971832, "learning_rate": 7.88474470740461e-06, "loss": 0.4968, "step": 1114 }, { "epoch": 1.9527145359019265, "grad_norm": 0.2613508999347687, "learning_rate": 7.880882463748067e-06, "loss": 0.4975, "step": 1115 }, { "epoch": 1.9544658493870402, "grad_norm": 0.2975543439388275, "learning_rate": 7.877017645166606e-06, "loss": 0.4802, "step": 1116 }, { "epoch": 1.956217162872154, "grad_norm": 0.2566398084163666, "learning_rate": 7.873150255114588e-06, "loss": 0.5079, "step": 1117 }, { "epoch": 1.957968476357268, "grad_norm": 0.27090561389923096, "learning_rate": 7.869280297048674e-06, "loss": 0.4972, "step": 1118 }, { "epoch": 1.959719789842382, "grad_norm": 0.25802868604660034, "learning_rate": 7.865407774427828e-06, "loss": 0.4901, "step": 1119 }, { "epoch": 1.9614711033274956, "grad_norm": 0.24567188322544098, "learning_rate": 7.861532690713293e-06, "loss": 0.4855, "step": 1120 }, { "epoch": 1.9632224168126093, "grad_norm": 0.32075437903404236, "learning_rate": 7.857655049368614e-06, "loss": 0.4776, "step": 1121 }, { "epoch": 1.9649737302977233, "grad_norm": 0.27623453736305237, "learning_rate": 7.853774853859612e-06, "loss": 0.4919, "step": 1122 }, { "epoch": 1.9667250437828372, "grad_norm": 0.2769637703895569, "learning_rate": 7.8498921076544e-06, "loss": 0.4873, "step": 1123 }, { "epoch": 1.9684763572679511, "grad_norm": 0.3275054097175598, "learning_rate": 7.846006814223362e-06, "loss": 0.4913, "step": 1124 }, { "epoch": 1.9702276707530648, "grad_norm": 0.2284826636314392, "learning_rate": 7.842118977039162e-06, "loss": 0.4936, "step": 1125 }, { "epoch": 1.9719789842381785, "grad_norm": 0.29755324125289917, "learning_rate": 7.838228599576743e-06, "loss": 0.4914, "step": 1126 }, { "epoch": 1.9737302977232924, "grad_norm": 0.283377468585968, "learning_rate": 7.83433568531331e-06, "loss": 0.5098, "step": 1127 }, { "epoch": 1.9754816112084064, "grad_norm": 0.23735524713993073, "learning_rate": 7.830440237728341e-06, "loss": 0.4839, "step": 1128 }, { "epoch": 1.9772329246935203, "grad_norm": 0.29906007647514343, "learning_rate": 7.826542260303576e-06, "loss": 0.4897, "step": 1129 }, { "epoch": 1.978984238178634, "grad_norm": 0.29623350501060486, "learning_rate": 7.822641756523015e-06, "loss": 0.4889, "step": 1130 }, { "epoch": 1.9807355516637477, "grad_norm": 0.2920228838920593, "learning_rate": 7.818738729872921e-06, "loss": 0.4906, "step": 1131 }, { "epoch": 1.9824868651488616, "grad_norm": 0.2949115037918091, "learning_rate": 7.814833183841805e-06, "loss": 0.4961, "step": 1132 }, { "epoch": 1.9842381786339756, "grad_norm": 0.2540757954120636, "learning_rate": 7.810925121920437e-06, "loss": 0.4894, "step": 1133 }, { "epoch": 1.9859894921190895, "grad_norm": 0.30852529406547546, "learning_rate": 7.80701454760183e-06, "loss": 0.5058, "step": 1134 }, { "epoch": 1.9877408056042032, "grad_norm": 0.23210765421390533, "learning_rate": 7.803101464381244e-06, "loss": 0.4914, "step": 1135 }, { "epoch": 1.989492119089317, "grad_norm": 0.2679421901702881, "learning_rate": 7.799185875756184e-06, "loss": 0.4872, "step": 1136 }, { "epoch": 1.9912434325744308, "grad_norm": 0.24284853041172028, "learning_rate": 7.795267785226392e-06, "loss": 0.4967, "step": 1137 }, { "epoch": 1.9929947460595447, "grad_norm": 0.26808610558509827, "learning_rate": 7.791347196293846e-06, "loss": 0.4867, "step": 1138 }, { "epoch": 1.9947460595446584, "grad_norm": 0.25926128029823303, "learning_rate": 7.787424112462758e-06, "loss": 0.4901, "step": 1139 }, { "epoch": 1.9964973730297724, "grad_norm": 0.296563059091568, "learning_rate": 7.783498537239568e-06, "loss": 0.4979, "step": 1140 }, { "epoch": 1.998248686514886, "grad_norm": 0.2278580218553543, "learning_rate": 7.779570474132949e-06, "loss": 0.4929, "step": 1141 }, { "epoch": 2.0, "grad_norm": 0.2370726317167282, "learning_rate": 7.775639926653789e-06, "loss": 0.4936, "step": 1142 }, { "epoch": 2.001751313485114, "grad_norm": 0.25017791986465454, "learning_rate": 7.771706898315204e-06, "loss": 0.4759, "step": 1143 }, { "epoch": 2.003502626970228, "grad_norm": 0.25403401255607605, "learning_rate": 7.767771392632518e-06, "loss": 0.4795, "step": 1144 }, { "epoch": 2.0052539404553413, "grad_norm": 0.27029046416282654, "learning_rate": 7.763833413123281e-06, "loss": 0.4822, "step": 1145 }, { "epoch": 2.0070052539404553, "grad_norm": 0.2553444504737854, "learning_rate": 7.759892963307249e-06, "loss": 0.4759, "step": 1146 }, { "epoch": 2.008756567425569, "grad_norm": 0.2674221396446228, "learning_rate": 7.75595004670638e-06, "loss": 0.4809, "step": 1147 }, { "epoch": 2.010507880910683, "grad_norm": 0.27343425154685974, "learning_rate": 7.752004666844849e-06, "loss": 0.4806, "step": 1148 }, { "epoch": 2.012259194395797, "grad_norm": 0.24847763776779175, "learning_rate": 7.748056827249018e-06, "loss": 0.4741, "step": 1149 }, { "epoch": 2.0140105078809105, "grad_norm": 0.2759471535682678, "learning_rate": 7.744106531447462e-06, "loss": 0.4807, "step": 1150 }, { "epoch": 2.0157618213660244, "grad_norm": 0.24739977717399597, "learning_rate": 7.74015378297094e-06, "loss": 0.4742, "step": 1151 }, { "epoch": 2.0175131348511384, "grad_norm": 0.24282759428024292, "learning_rate": 7.736198585352411e-06, "loss": 0.4749, "step": 1152 }, { "epoch": 2.0192644483362523, "grad_norm": 0.2567790746688843, "learning_rate": 7.732240942127018e-06, "loss": 0.4849, "step": 1153 }, { "epoch": 2.021015761821366, "grad_norm": 0.26015692949295044, "learning_rate": 7.728280856832094e-06, "loss": 0.466, "step": 1154 }, { "epoch": 2.0227670753064797, "grad_norm": 0.23609691858291626, "learning_rate": 7.72431833300715e-06, "loss": 0.4797, "step": 1155 }, { "epoch": 2.0245183887915936, "grad_norm": 0.24707616865634918, "learning_rate": 7.72035337419388e-06, "loss": 0.4732, "step": 1156 }, { "epoch": 2.0262697022767076, "grad_norm": 0.2665928900241852, "learning_rate": 7.716385983936154e-06, "loss": 0.4828, "step": 1157 }, { "epoch": 2.0280210157618215, "grad_norm": 0.23509110510349274, "learning_rate": 7.712416165780014e-06, "loss": 0.4811, "step": 1158 }, { "epoch": 2.0297723292469354, "grad_norm": 0.2667239308357239, "learning_rate": 7.708443923273671e-06, "loss": 0.4727, "step": 1159 }, { "epoch": 2.031523642732049, "grad_norm": 0.2525825798511505, "learning_rate": 7.704469259967509e-06, "loss": 0.4799, "step": 1160 }, { "epoch": 2.033274956217163, "grad_norm": 0.25882866978645325, "learning_rate": 7.700492179414067e-06, "loss": 0.4754, "step": 1161 }, { "epoch": 2.0350262697022767, "grad_norm": 0.2753877639770508, "learning_rate": 7.69651268516805e-06, "loss": 0.4678, "step": 1162 }, { "epoch": 2.0367775831873907, "grad_norm": 0.24893034994602203, "learning_rate": 7.692530780786321e-06, "loss": 0.4607, "step": 1163 }, { "epoch": 2.0385288966725046, "grad_norm": 0.24251002073287964, "learning_rate": 7.688546469827892e-06, "loss": 0.4769, "step": 1164 }, { "epoch": 2.040280210157618, "grad_norm": 0.2298019528388977, "learning_rate": 7.684559755853932e-06, "loss": 0.4885, "step": 1165 }, { "epoch": 2.042031523642732, "grad_norm": 0.2434874176979065, "learning_rate": 7.680570642427755e-06, "loss": 0.4853, "step": 1166 }, { "epoch": 2.043782837127846, "grad_norm": 0.24134932458400726, "learning_rate": 7.676579133114819e-06, "loss": 0.4756, "step": 1167 }, { "epoch": 2.04553415061296, "grad_norm": 0.24601143598556519, "learning_rate": 7.672585231482723e-06, "loss": 0.4698, "step": 1168 }, { "epoch": 2.0472854640980738, "grad_norm": 0.26766082644462585, "learning_rate": 7.668588941101208e-06, "loss": 0.4758, "step": 1169 }, { "epoch": 2.0490367775831873, "grad_norm": 0.27923572063446045, "learning_rate": 7.664590265542144e-06, "loss": 0.4795, "step": 1170 }, { "epoch": 2.050788091068301, "grad_norm": 0.23213371634483337, "learning_rate": 7.66058920837954e-06, "loss": 0.4813, "step": 1171 }, { "epoch": 2.052539404553415, "grad_norm": 0.2721827030181885, "learning_rate": 7.65658577318953e-06, "loss": 0.4768, "step": 1172 }, { "epoch": 2.054290718038529, "grad_norm": 0.24875935912132263, "learning_rate": 7.65257996355037e-06, "loss": 0.4825, "step": 1173 }, { "epoch": 2.056042031523643, "grad_norm": 0.24263955652713776, "learning_rate": 7.648571783042445e-06, "loss": 0.4853, "step": 1174 }, { "epoch": 2.0577933450087564, "grad_norm": 0.22961705923080444, "learning_rate": 7.644561235248254e-06, "loss": 0.4722, "step": 1175 }, { "epoch": 2.0595446584938704, "grad_norm": 0.23489715158939362, "learning_rate": 7.640548323752412e-06, "loss": 0.4698, "step": 1176 }, { "epoch": 2.0612959719789843, "grad_norm": 0.252006471157074, "learning_rate": 7.636533052141653e-06, "loss": 0.4696, "step": 1177 }, { "epoch": 2.063047285464098, "grad_norm": 0.2545519769191742, "learning_rate": 7.632515424004813e-06, "loss": 0.4631, "step": 1178 }, { "epoch": 2.0647985989492117, "grad_norm": 0.2704659104347229, "learning_rate": 7.628495442932838e-06, "loss": 0.4696, "step": 1179 }, { "epoch": 2.0665499124343256, "grad_norm": 0.24293069541454315, "learning_rate": 7.624473112518777e-06, "loss": 0.4805, "step": 1180 }, { "epoch": 2.0683012259194395, "grad_norm": 0.29739639163017273, "learning_rate": 7.6204484363577756e-06, "loss": 0.4802, "step": 1181 }, { "epoch": 2.0700525394045535, "grad_norm": 0.28204089403152466, "learning_rate": 7.61642141804708e-06, "loss": 0.479, "step": 1182 }, { "epoch": 2.0718038528896674, "grad_norm": 0.2698694169521332, "learning_rate": 7.612392061186027e-06, "loss": 0.4716, "step": 1183 }, { "epoch": 2.073555166374781, "grad_norm": 0.3055534064769745, "learning_rate": 7.608360369376047e-06, "loss": 0.4758, "step": 1184 }, { "epoch": 2.075306479859895, "grad_norm": 0.22567550837993622, "learning_rate": 7.604326346220654e-06, "loss": 0.4763, "step": 1185 }, { "epoch": 2.0770577933450087, "grad_norm": 0.2510170638561249, "learning_rate": 7.600289995325446e-06, "loss": 0.4754, "step": 1186 }, { "epoch": 2.0788091068301227, "grad_norm": 0.23751291632652283, "learning_rate": 7.596251320298105e-06, "loss": 0.478, "step": 1187 }, { "epoch": 2.0805604203152366, "grad_norm": 0.25299546122550964, "learning_rate": 7.592210324748384e-06, "loss": 0.473, "step": 1188 }, { "epoch": 2.08231173380035, "grad_norm": 0.2768765091896057, "learning_rate": 7.588167012288117e-06, "loss": 0.4701, "step": 1189 }, { "epoch": 2.084063047285464, "grad_norm": 0.24105657637119293, "learning_rate": 7.584121386531205e-06, "loss": 0.467, "step": 1190 }, { "epoch": 2.085814360770578, "grad_norm": 0.3011622726917267, "learning_rate": 7.580073451093617e-06, "loss": 0.4787, "step": 1191 }, { "epoch": 2.087565674255692, "grad_norm": 0.24904850125312805, "learning_rate": 7.576023209593386e-06, "loss": 0.4745, "step": 1192 }, { "epoch": 2.0893169877408058, "grad_norm": 0.2686646580696106, "learning_rate": 7.571970665650607e-06, "loss": 0.4813, "step": 1193 }, { "epoch": 2.0910683012259192, "grad_norm": 0.2815907597541809, "learning_rate": 7.567915822887434e-06, "loss": 0.469, "step": 1194 }, { "epoch": 2.092819614711033, "grad_norm": 0.24291875958442688, "learning_rate": 7.563858684928074e-06, "loss": 0.4724, "step": 1195 }, { "epoch": 2.094570928196147, "grad_norm": 0.2825978994369507, "learning_rate": 7.559799255398785e-06, "loss": 0.4664, "step": 1196 }, { "epoch": 2.096322241681261, "grad_norm": 0.24558214843273163, "learning_rate": 7.555737537927875e-06, "loss": 0.4715, "step": 1197 }, { "epoch": 2.098073555166375, "grad_norm": 0.31230637431144714, "learning_rate": 7.551673536145695e-06, "loss": 0.4751, "step": 1198 }, { "epoch": 2.0998248686514884, "grad_norm": 0.2480107843875885, "learning_rate": 7.54760725368464e-06, "loss": 0.4771, "step": 1199 }, { "epoch": 2.1015761821366024, "grad_norm": 0.2984243929386139, "learning_rate": 7.543538694179142e-06, "loss": 0.4775, "step": 1200 }, { "epoch": 2.1033274956217163, "grad_norm": 0.2471974641084671, "learning_rate": 7.539467861265668e-06, "loss": 0.4701, "step": 1201 }, { "epoch": 2.10507880910683, "grad_norm": 0.28089043498039246, "learning_rate": 7.535394758582717e-06, "loss": 0.4712, "step": 1202 }, { "epoch": 2.106830122591944, "grad_norm": 0.2497229278087616, "learning_rate": 7.531319389770818e-06, "loss": 0.4672, "step": 1203 }, { "epoch": 2.1085814360770576, "grad_norm": 0.26567015051841736, "learning_rate": 7.527241758472525e-06, "loss": 0.4691, "step": 1204 }, { "epoch": 2.1103327495621715, "grad_norm": 0.3077966272830963, "learning_rate": 7.523161868332412e-06, "loss": 0.4767, "step": 1205 }, { "epoch": 2.1120840630472855, "grad_norm": 0.2657139301300049, "learning_rate": 7.519079722997076e-06, "loss": 0.4694, "step": 1206 }, { "epoch": 2.1138353765323994, "grad_norm": 0.2908776104450226, "learning_rate": 7.514995326115124e-06, "loss": 0.4741, "step": 1207 }, { "epoch": 2.1155866900175133, "grad_norm": 0.30074191093444824, "learning_rate": 7.5109086813371835e-06, "loss": 0.485, "step": 1208 }, { "epoch": 2.117338003502627, "grad_norm": 0.2594562768936157, "learning_rate": 7.506819792315883e-06, "loss": 0.4747, "step": 1209 }, { "epoch": 2.1190893169877407, "grad_norm": 0.296916127204895, "learning_rate": 7.502728662705861e-06, "loss": 0.4738, "step": 1210 }, { "epoch": 2.1208406304728546, "grad_norm": 0.2921127378940582, "learning_rate": 7.498635296163758e-06, "loss": 0.4865, "step": 1211 }, { "epoch": 2.1225919439579686, "grad_norm": 0.25451305508613586, "learning_rate": 7.494539696348215e-06, "loss": 0.4754, "step": 1212 }, { "epoch": 2.1243432574430825, "grad_norm": 0.27056148648262024, "learning_rate": 7.490441866919869e-06, "loss": 0.4746, "step": 1213 }, { "epoch": 2.126094570928196, "grad_norm": 0.2772015631198883, "learning_rate": 7.486341811541345e-06, "loss": 0.483, "step": 1214 }, { "epoch": 2.12784588441331, "grad_norm": 0.2591862380504608, "learning_rate": 7.482239533877264e-06, "loss": 0.4807, "step": 1215 }, { "epoch": 2.129597197898424, "grad_norm": 0.2802082896232605, "learning_rate": 7.4781350375942296e-06, "loss": 0.4654, "step": 1216 }, { "epoch": 2.1313485113835378, "grad_norm": 0.24129603803157806, "learning_rate": 7.4740283263608305e-06, "loss": 0.4813, "step": 1217 }, { "epoch": 2.1330998248686517, "grad_norm": 0.2675319015979767, "learning_rate": 7.469919403847633e-06, "loss": 0.4823, "step": 1218 }, { "epoch": 2.134851138353765, "grad_norm": 0.23089131712913513, "learning_rate": 7.465808273727182e-06, "loss": 0.4757, "step": 1219 }, { "epoch": 2.136602451838879, "grad_norm": 0.22164127230644226, "learning_rate": 7.461694939673992e-06, "loss": 0.4708, "step": 1220 }, { "epoch": 2.138353765323993, "grad_norm": 0.28215470910072327, "learning_rate": 7.457579405364555e-06, "loss": 0.4766, "step": 1221 }, { "epoch": 2.140105078809107, "grad_norm": 0.25820234417915344, "learning_rate": 7.453461674477318e-06, "loss": 0.4882, "step": 1222 }, { "epoch": 2.141856392294221, "grad_norm": 0.22555367648601532, "learning_rate": 7.449341750692703e-06, "loss": 0.4792, "step": 1223 }, { "epoch": 2.1436077057793343, "grad_norm": 0.2563372850418091, "learning_rate": 7.445219637693082e-06, "loss": 0.4718, "step": 1224 }, { "epoch": 2.1453590192644483, "grad_norm": 0.2649422883987427, "learning_rate": 7.441095339162791e-06, "loss": 0.4806, "step": 1225 }, { "epoch": 2.147110332749562, "grad_norm": 0.23169836401939392, "learning_rate": 7.436968858788118e-06, "loss": 0.4742, "step": 1226 }, { "epoch": 2.148861646234676, "grad_norm": 0.2605903744697571, "learning_rate": 7.4328402002572985e-06, "loss": 0.4799, "step": 1227 }, { "epoch": 2.1506129597197896, "grad_norm": 0.26005035638809204, "learning_rate": 7.428709367260516e-06, "loss": 0.481, "step": 1228 }, { "epoch": 2.1523642732049035, "grad_norm": 0.2742314338684082, "learning_rate": 7.424576363489897e-06, "loss": 0.4857, "step": 1229 }, { "epoch": 2.1541155866900175, "grad_norm": 0.2332695573568344, "learning_rate": 7.420441192639511e-06, "loss": 0.4818, "step": 1230 }, { "epoch": 2.1558669001751314, "grad_norm": 0.2665252387523651, "learning_rate": 7.416303858405363e-06, "loss": 0.4752, "step": 1231 }, { "epoch": 2.1576182136602453, "grad_norm": 0.218715101480484, "learning_rate": 7.412164364485388e-06, "loss": 0.4737, "step": 1232 }, { "epoch": 2.1593695271453592, "grad_norm": 0.27262410521507263, "learning_rate": 7.408022714579457e-06, "loss": 0.4752, "step": 1233 }, { "epoch": 2.1611208406304727, "grad_norm": 0.2558940351009369, "learning_rate": 7.403878912389365e-06, "loss": 0.4764, "step": 1234 }, { "epoch": 2.1628721541155866, "grad_norm": 0.24400568008422852, "learning_rate": 7.3997329616188305e-06, "loss": 0.4781, "step": 1235 }, { "epoch": 2.1646234676007006, "grad_norm": 0.2944111227989197, "learning_rate": 7.395584865973493e-06, "loss": 0.47, "step": 1236 }, { "epoch": 2.1663747810858145, "grad_norm": 0.2947983145713806, "learning_rate": 7.391434629160909e-06, "loss": 0.4761, "step": 1237 }, { "epoch": 2.168126094570928, "grad_norm": 0.2724582850933075, "learning_rate": 7.38728225489055e-06, "loss": 0.467, "step": 1238 }, { "epoch": 2.169877408056042, "grad_norm": 0.34981852769851685, "learning_rate": 7.383127746873796e-06, "loss": 0.474, "step": 1239 }, { "epoch": 2.171628721541156, "grad_norm": 0.2432534098625183, "learning_rate": 7.378971108823933e-06, "loss": 0.4772, "step": 1240 }, { "epoch": 2.1733800350262698, "grad_norm": 0.3122000992298126, "learning_rate": 7.374812344456157e-06, "loss": 0.4752, "step": 1241 }, { "epoch": 2.1751313485113837, "grad_norm": 0.23819342255592346, "learning_rate": 7.370651457487559e-06, "loss": 0.4776, "step": 1242 }, { "epoch": 2.1768826619964976, "grad_norm": 0.3048440217971802, "learning_rate": 7.366488451637126e-06, "loss": 0.4767, "step": 1243 }, { "epoch": 2.178633975481611, "grad_norm": 0.2504461705684662, "learning_rate": 7.362323330625744e-06, "loss": 0.4803, "step": 1244 }, { "epoch": 2.180385288966725, "grad_norm": 0.2728504240512848, "learning_rate": 7.358156098176185e-06, "loss": 0.474, "step": 1245 }, { "epoch": 2.182136602451839, "grad_norm": 0.2615172863006592, "learning_rate": 7.353986758013112e-06, "loss": 0.4815, "step": 1246 }, { "epoch": 2.183887915936953, "grad_norm": 0.24069763720035553, "learning_rate": 7.349815313863068e-06, "loss": 0.4819, "step": 1247 }, { "epoch": 2.1856392294220663, "grad_norm": 0.2871399223804474, "learning_rate": 7.3456417694544804e-06, "loss": 0.4812, "step": 1248 }, { "epoch": 2.1873905429071803, "grad_norm": 0.24974587559700012, "learning_rate": 7.341466128517649e-06, "loss": 0.4775, "step": 1249 }, { "epoch": 2.189141856392294, "grad_norm": 0.27028268575668335, "learning_rate": 7.337288394784754e-06, "loss": 0.4868, "step": 1250 }, { "epoch": 2.190893169877408, "grad_norm": 0.27631840109825134, "learning_rate": 7.3331085719898385e-06, "loss": 0.4676, "step": 1251 }, { "epoch": 2.192644483362522, "grad_norm": 0.23873765766620636, "learning_rate": 7.32892666386882e-06, "loss": 0.4762, "step": 1252 }, { "epoch": 2.1943957968476355, "grad_norm": 0.2560415267944336, "learning_rate": 7.324742674159475e-06, "loss": 0.4822, "step": 1253 }, { "epoch": 2.1961471103327495, "grad_norm": 0.234714537858963, "learning_rate": 7.3205566066014436e-06, "loss": 0.4783, "step": 1254 }, { "epoch": 2.1978984238178634, "grad_norm": 0.24144577980041504, "learning_rate": 7.316368464936219e-06, "loss": 0.477, "step": 1255 }, { "epoch": 2.1996497373029773, "grad_norm": 0.2541288435459137, "learning_rate": 7.312178252907154e-06, "loss": 0.4748, "step": 1256 }, { "epoch": 2.2014010507880912, "grad_norm": 0.2339569628238678, "learning_rate": 7.307985974259445e-06, "loss": 0.4798, "step": 1257 }, { "epoch": 2.2031523642732047, "grad_norm": 0.2869264483451843, "learning_rate": 7.303791632740143e-06, "loss": 0.4797, "step": 1258 }, { "epoch": 2.2049036777583186, "grad_norm": 0.23036102950572968, "learning_rate": 7.2995952320981356e-06, "loss": 0.4927, "step": 1259 }, { "epoch": 2.2066549912434326, "grad_norm": 0.34828218817710876, "learning_rate": 7.2953967760841585e-06, "loss": 0.4827, "step": 1260 }, { "epoch": 2.2084063047285465, "grad_norm": 0.2181822806596756, "learning_rate": 7.291196268450777e-06, "loss": 0.4802, "step": 1261 }, { "epoch": 2.2101576182136604, "grad_norm": 0.3188292384147644, "learning_rate": 7.286993712952394e-06, "loss": 0.4878, "step": 1262 }, { "epoch": 2.211908931698774, "grad_norm": 0.22807644307613373, "learning_rate": 7.282789113345242e-06, "loss": 0.472, "step": 1263 }, { "epoch": 2.213660245183888, "grad_norm": 0.2571803331375122, "learning_rate": 7.278582473387382e-06, "loss": 0.4689, "step": 1264 }, { "epoch": 2.2154115586690017, "grad_norm": 0.24550053477287292, "learning_rate": 7.274373796838696e-06, "loss": 0.4828, "step": 1265 }, { "epoch": 2.2171628721541157, "grad_norm": 0.22426451742649078, "learning_rate": 7.27016308746089e-06, "loss": 0.4714, "step": 1266 }, { "epoch": 2.2189141856392296, "grad_norm": 0.24780161678791046, "learning_rate": 7.265950349017481e-06, "loss": 0.4752, "step": 1267 }, { "epoch": 2.220665499124343, "grad_norm": 0.25049155950546265, "learning_rate": 7.261735585273804e-06, "loss": 0.47, "step": 1268 }, { "epoch": 2.222416812609457, "grad_norm": 0.22903437912464142, "learning_rate": 7.257518799997007e-06, "loss": 0.4733, "step": 1269 }, { "epoch": 2.224168126094571, "grad_norm": 0.2537083029747009, "learning_rate": 7.253299996956038e-06, "loss": 0.4753, "step": 1270 }, { "epoch": 2.225919439579685, "grad_norm": 0.22805850207805634, "learning_rate": 7.249079179921652e-06, "loss": 0.4762, "step": 1271 }, { "epoch": 2.227670753064799, "grad_norm": 0.24282661080360413, "learning_rate": 7.244856352666404e-06, "loss": 0.4753, "step": 1272 }, { "epoch": 2.2294220665499123, "grad_norm": 0.26388421654701233, "learning_rate": 7.240631518964646e-06, "loss": 0.4768, "step": 1273 }, { "epoch": 2.231173380035026, "grad_norm": 0.24809370934963226, "learning_rate": 7.236404682592523e-06, "loss": 0.4694, "step": 1274 }, { "epoch": 2.23292469352014, "grad_norm": 0.2520983815193176, "learning_rate": 7.232175847327969e-06, "loss": 0.4813, "step": 1275 }, { "epoch": 2.234676007005254, "grad_norm": 0.2596047520637512, "learning_rate": 7.227945016950706e-06, "loss": 0.4779, "step": 1276 }, { "epoch": 2.236427320490368, "grad_norm": 0.23794572055339813, "learning_rate": 7.223712195242238e-06, "loss": 0.4707, "step": 1277 }, { "epoch": 2.2381786339754814, "grad_norm": 0.24630607664585114, "learning_rate": 7.2194773859858495e-06, "loss": 0.4802, "step": 1278 }, { "epoch": 2.2399299474605954, "grad_norm": 0.263212651014328, "learning_rate": 7.215240592966603e-06, "loss": 0.4685, "step": 1279 }, { "epoch": 2.2416812609457093, "grad_norm": 0.2266635298728943, "learning_rate": 7.2110018199713325e-06, "loss": 0.4791, "step": 1280 }, { "epoch": 2.2434325744308232, "grad_norm": 0.3054019510746002, "learning_rate": 7.206761070788641e-06, "loss": 0.4818, "step": 1281 }, { "epoch": 2.245183887915937, "grad_norm": 0.22553367912769318, "learning_rate": 7.202518349208898e-06, "loss": 0.48, "step": 1282 }, { "epoch": 2.2469352014010506, "grad_norm": 0.269550085067749, "learning_rate": 7.1982736590242375e-06, "loss": 0.4636, "step": 1283 }, { "epoch": 2.2486865148861646, "grad_norm": 0.24003508687019348, "learning_rate": 7.194027004028552e-06, "loss": 0.4771, "step": 1284 }, { "epoch": 2.2504378283712785, "grad_norm": 0.24351759254932404, "learning_rate": 7.189778388017489e-06, "loss": 0.4909, "step": 1285 }, { "epoch": 2.2521891418563924, "grad_norm": 0.22760875523090363, "learning_rate": 7.185527814788451e-06, "loss": 0.4744, "step": 1286 }, { "epoch": 2.253940455341506, "grad_norm": 0.2476336807012558, "learning_rate": 7.18127528814059e-06, "loss": 0.4811, "step": 1287 }, { "epoch": 2.25569176882662, "grad_norm": 0.21746376156806946, "learning_rate": 7.1770208118748e-06, "loss": 0.4713, "step": 1288 }, { "epoch": 2.2574430823117337, "grad_norm": 0.25395968556404114, "learning_rate": 7.172764389793723e-06, "loss": 0.4711, "step": 1289 }, { "epoch": 2.2591943957968477, "grad_norm": 0.23189714550971985, "learning_rate": 7.168506025701734e-06, "loss": 0.4756, "step": 1290 }, { "epoch": 2.2609457092819616, "grad_norm": 0.24515607953071594, "learning_rate": 7.164245723404951e-06, "loss": 0.4701, "step": 1291 }, { "epoch": 2.2626970227670755, "grad_norm": 0.2571549117565155, "learning_rate": 7.159983486711219e-06, "loss": 0.4819, "step": 1292 }, { "epoch": 2.264448336252189, "grad_norm": 0.2631593942642212, "learning_rate": 7.155719319430114e-06, "loss": 0.4833, "step": 1293 }, { "epoch": 2.266199649737303, "grad_norm": 0.23882010579109192, "learning_rate": 7.151453225372938e-06, "loss": 0.4843, "step": 1294 }, { "epoch": 2.267950963222417, "grad_norm": 0.22926534712314606, "learning_rate": 7.147185208352712e-06, "loss": 0.4789, "step": 1295 }, { "epoch": 2.2697022767075308, "grad_norm": 0.23342682421207428, "learning_rate": 7.142915272184179e-06, "loss": 0.4764, "step": 1296 }, { "epoch": 2.2714535901926443, "grad_norm": 0.23448091745376587, "learning_rate": 7.138643420683795e-06, "loss": 0.4802, "step": 1297 }, { "epoch": 2.273204903677758, "grad_norm": 0.24560777842998505, "learning_rate": 7.134369657669733e-06, "loss": 0.4811, "step": 1298 }, { "epoch": 2.274956217162872, "grad_norm": 0.2583111524581909, "learning_rate": 7.130093986961868e-06, "loss": 0.4862, "step": 1299 }, { "epoch": 2.276707530647986, "grad_norm": 0.23379403352737427, "learning_rate": 7.1258164123817834e-06, "loss": 0.4729, "step": 1300 }, { "epoch": 2.2784588441331, "grad_norm": 0.227223739027977, "learning_rate": 7.121536937752764e-06, "loss": 0.4722, "step": 1301 }, { "epoch": 2.280210157618214, "grad_norm": 0.26606008410453796, "learning_rate": 7.1172555668997925e-06, "loss": 0.4851, "step": 1302 }, { "epoch": 2.2819614711033274, "grad_norm": 0.23672211170196533, "learning_rate": 7.112972303649546e-06, "loss": 0.4797, "step": 1303 }, { "epoch": 2.2837127845884413, "grad_norm": 0.2304062396287918, "learning_rate": 7.108687151830394e-06, "loss": 0.4737, "step": 1304 }, { "epoch": 2.285464098073555, "grad_norm": 0.24235469102859497, "learning_rate": 7.104400115272393e-06, "loss": 0.4796, "step": 1305 }, { "epoch": 2.287215411558669, "grad_norm": 0.23263747990131378, "learning_rate": 7.100111197807286e-06, "loss": 0.4708, "step": 1306 }, { "epoch": 2.2889667250437826, "grad_norm": 0.23081403970718384, "learning_rate": 7.095820403268494e-06, "loss": 0.4818, "step": 1307 }, { "epoch": 2.2907180385288965, "grad_norm": 0.24612343311309814, "learning_rate": 7.091527735491119e-06, "loss": 0.481, "step": 1308 }, { "epoch": 2.2924693520140105, "grad_norm": 0.2484482377767563, "learning_rate": 7.087233198311936e-06, "loss": 0.4668, "step": 1309 }, { "epoch": 2.2942206654991244, "grad_norm": 0.2504749298095703, "learning_rate": 7.082936795569388e-06, "loss": 0.4666, "step": 1310 }, { "epoch": 2.2959719789842383, "grad_norm": 0.24885070323944092, "learning_rate": 7.078638531103592e-06, "loss": 0.4688, "step": 1311 }, { "epoch": 2.2977232924693523, "grad_norm": 0.24633708596229553, "learning_rate": 7.074338408756321e-06, "loss": 0.4877, "step": 1312 }, { "epoch": 2.2994746059544657, "grad_norm": 0.21285942196846008, "learning_rate": 7.070036432371017e-06, "loss": 0.4773, "step": 1313 }, { "epoch": 2.3012259194395797, "grad_norm": 0.265489399433136, "learning_rate": 7.065732605792771e-06, "loss": 0.469, "step": 1314 }, { "epoch": 2.3029772329246936, "grad_norm": 0.2179526388645172, "learning_rate": 7.061426932868334e-06, "loss": 0.476, "step": 1315 }, { "epoch": 2.3047285464098075, "grad_norm": 0.25545284152030945, "learning_rate": 7.057119417446102e-06, "loss": 0.4805, "step": 1316 }, { "epoch": 2.306479859894921, "grad_norm": 0.22555258870124817, "learning_rate": 7.052810063376124e-06, "loss": 0.4765, "step": 1317 }, { "epoch": 2.308231173380035, "grad_norm": 0.2280188649892807, "learning_rate": 7.048498874510088e-06, "loss": 0.4707, "step": 1318 }, { "epoch": 2.309982486865149, "grad_norm": 0.2366933971643448, "learning_rate": 7.044185854701321e-06, "loss": 0.4699, "step": 1319 }, { "epoch": 2.3117338003502628, "grad_norm": 0.24220187962055206, "learning_rate": 7.03987100780479e-06, "loss": 0.479, "step": 1320 }, { "epoch": 2.3134851138353767, "grad_norm": 0.30797141790390015, "learning_rate": 7.035554337677094e-06, "loss": 0.468, "step": 1321 }, { "epoch": 2.31523642732049, "grad_norm": 0.2702125310897827, "learning_rate": 7.03123584817646e-06, "loss": 0.4726, "step": 1322 }, { "epoch": 2.316987740805604, "grad_norm": 0.2428278774023056, "learning_rate": 7.026915543162741e-06, "loss": 0.4841, "step": 1323 }, { "epoch": 2.318739054290718, "grad_norm": 0.2757025361061096, "learning_rate": 7.022593426497416e-06, "loss": 0.4789, "step": 1324 }, { "epoch": 2.320490367775832, "grad_norm": 0.22997231781482697, "learning_rate": 7.018269502043581e-06, "loss": 0.4682, "step": 1325 }, { "epoch": 2.322241681260946, "grad_norm": 0.25785231590270996, "learning_rate": 7.013943773665947e-06, "loss": 0.4722, "step": 1326 }, { "epoch": 2.3239929947460594, "grad_norm": 0.22201287746429443, "learning_rate": 7.009616245230838e-06, "loss": 0.4784, "step": 1327 }, { "epoch": 2.3257443082311733, "grad_norm": 0.26438623666763306, "learning_rate": 7.005286920606188e-06, "loss": 0.486, "step": 1328 }, { "epoch": 2.327495621716287, "grad_norm": 0.2390347272157669, "learning_rate": 7.0009558036615355e-06, "loss": 0.4855, "step": 1329 }, { "epoch": 2.329246935201401, "grad_norm": 0.25700390338897705, "learning_rate": 6.996622898268021e-06, "loss": 0.4735, "step": 1330 }, { "epoch": 2.330998248686515, "grad_norm": 0.25492212176322937, "learning_rate": 6.992288208298383e-06, "loss": 0.4761, "step": 1331 }, { "epoch": 2.3327495621716285, "grad_norm": 0.2522454261779785, "learning_rate": 6.987951737626956e-06, "loss": 0.469, "step": 1332 }, { "epoch": 2.3345008756567425, "grad_norm": 0.24122093617916107, "learning_rate": 6.983613490129666e-06, "loss": 0.4693, "step": 1333 }, { "epoch": 2.3362521891418564, "grad_norm": 0.26125288009643555, "learning_rate": 6.979273469684026e-06, "loss": 0.4807, "step": 1334 }, { "epoch": 2.3380035026269703, "grad_norm": 0.23071101307868958, "learning_rate": 6.974931680169136e-06, "loss": 0.4761, "step": 1335 }, { "epoch": 2.3397548161120842, "grad_norm": 0.27627864480018616, "learning_rate": 6.970588125465674e-06, "loss": 0.4753, "step": 1336 }, { "epoch": 2.3415061295971977, "grad_norm": 0.2603510916233063, "learning_rate": 6.966242809455899e-06, "loss": 0.4766, "step": 1337 }, { "epoch": 2.3432574430823117, "grad_norm": 0.2348988652229309, "learning_rate": 6.961895736023641e-06, "loss": 0.4818, "step": 1338 }, { "epoch": 2.3450087565674256, "grad_norm": 0.23671063780784607, "learning_rate": 6.957546909054304e-06, "loss": 0.4871, "step": 1339 }, { "epoch": 2.3467600700525395, "grad_norm": 0.22177307307720184, "learning_rate": 6.953196332434856e-06, "loss": 0.4771, "step": 1340 }, { "epoch": 2.3485113835376534, "grad_norm": 0.27954792976379395, "learning_rate": 6.948844010053832e-06, "loss": 0.48, "step": 1341 }, { "epoch": 2.350262697022767, "grad_norm": 0.20723402500152588, "learning_rate": 6.944489945801326e-06, "loss": 0.4723, "step": 1342 }, { "epoch": 2.352014010507881, "grad_norm": 0.24722208082675934, "learning_rate": 6.940134143568987e-06, "loss": 0.4653, "step": 1343 }, { "epoch": 2.3537653239929948, "grad_norm": 0.21650715172290802, "learning_rate": 6.935776607250022e-06, "loss": 0.4729, "step": 1344 }, { "epoch": 2.3555166374781087, "grad_norm": 0.23649564385414124, "learning_rate": 6.931417340739183e-06, "loss": 0.4703, "step": 1345 }, { "epoch": 2.357267950963222, "grad_norm": 0.2577320635318756, "learning_rate": 6.927056347932772e-06, "loss": 0.4757, "step": 1346 }, { "epoch": 2.359019264448336, "grad_norm": 0.229734867811203, "learning_rate": 6.92269363272863e-06, "loss": 0.4762, "step": 1347 }, { "epoch": 2.36077057793345, "grad_norm": 0.24682272970676422, "learning_rate": 6.918329199026143e-06, "loss": 0.4738, "step": 1348 }, { "epoch": 2.362521891418564, "grad_norm": 0.26345813274383545, "learning_rate": 6.913963050726226e-06, "loss": 0.4751, "step": 1349 }, { "epoch": 2.364273204903678, "grad_norm": 0.21282905340194702, "learning_rate": 6.909595191731337e-06, "loss": 0.4771, "step": 1350 }, { "epoch": 2.366024518388792, "grad_norm": 0.25669440627098083, "learning_rate": 6.905225625945451e-06, "loss": 0.478, "step": 1351 }, { "epoch": 2.3677758318739053, "grad_norm": 0.2389729619026184, "learning_rate": 6.900854357274075e-06, "loss": 0.4818, "step": 1352 }, { "epoch": 2.369527145359019, "grad_norm": 0.256237655878067, "learning_rate": 6.896481389624239e-06, "loss": 0.4817, "step": 1353 }, { "epoch": 2.371278458844133, "grad_norm": 0.26227429509162903, "learning_rate": 6.892106726904486e-06, "loss": 0.4669, "step": 1354 }, { "epoch": 2.373029772329247, "grad_norm": 0.24018187820911407, "learning_rate": 6.887730373024881e-06, "loss": 0.4753, "step": 1355 }, { "epoch": 2.3747810858143605, "grad_norm": 0.28819161653518677, "learning_rate": 6.883352331896998e-06, "loss": 0.4719, "step": 1356 }, { "epoch": 2.3765323992994745, "grad_norm": 0.28188714385032654, "learning_rate": 6.878972607433915e-06, "loss": 0.4681, "step": 1357 }, { "epoch": 2.3782837127845884, "grad_norm": 0.23586764931678772, "learning_rate": 6.8745912035502205e-06, "loss": 0.4826, "step": 1358 }, { "epoch": 2.3800350262697023, "grad_norm": 0.30168452858924866, "learning_rate": 6.870208124161998e-06, "loss": 0.4776, "step": 1359 }, { "epoch": 2.3817863397548162, "grad_norm": 0.22693537175655365, "learning_rate": 6.8658233731868355e-06, "loss": 0.4786, "step": 1360 }, { "epoch": 2.38353765323993, "grad_norm": 0.29306626319885254, "learning_rate": 6.86143695454381e-06, "loss": 0.4752, "step": 1361 }, { "epoch": 2.3852889667250436, "grad_norm": 0.2479327768087387, "learning_rate": 6.857048872153491e-06, "loss": 0.4791, "step": 1362 }, { "epoch": 2.3870402802101576, "grad_norm": 0.254743367433548, "learning_rate": 6.852659129937936e-06, "loss": 0.4715, "step": 1363 }, { "epoch": 2.3887915936952715, "grad_norm": 0.282892107963562, "learning_rate": 6.84826773182068e-06, "loss": 0.4785, "step": 1364 }, { "epoch": 2.3905429071803854, "grad_norm": 0.24271348118782043, "learning_rate": 6.843874681726747e-06, "loss": 0.4692, "step": 1365 }, { "epoch": 2.392294220665499, "grad_norm": 0.3012458384037018, "learning_rate": 6.839479983582632e-06, "loss": 0.473, "step": 1366 }, { "epoch": 2.394045534150613, "grad_norm": 0.24996064603328705, "learning_rate": 6.835083641316304e-06, "loss": 0.4587, "step": 1367 }, { "epoch": 2.3957968476357268, "grad_norm": 0.26152753829956055, "learning_rate": 6.830685658857203e-06, "loss": 0.4816, "step": 1368 }, { "epoch": 2.3975481611208407, "grad_norm": 0.28228166699409485, "learning_rate": 6.826286040136232e-06, "loss": 0.4823, "step": 1369 }, { "epoch": 2.3992994746059546, "grad_norm": 0.28117984533309937, "learning_rate": 6.82188478908576e-06, "loss": 0.4588, "step": 1370 }, { "epoch": 2.4010507880910685, "grad_norm": 0.30529993772506714, "learning_rate": 6.817481909639611e-06, "loss": 0.4806, "step": 1371 }, { "epoch": 2.402802101576182, "grad_norm": 0.28182902932167053, "learning_rate": 6.813077405733068e-06, "loss": 0.4805, "step": 1372 }, { "epoch": 2.404553415061296, "grad_norm": 0.2988859713077545, "learning_rate": 6.808671281302865e-06, "loss": 0.4717, "step": 1373 }, { "epoch": 2.40630472854641, "grad_norm": 0.3090115785598755, "learning_rate": 6.804263540287184e-06, "loss": 0.4802, "step": 1374 }, { "epoch": 2.408056042031524, "grad_norm": 0.2658812403678894, "learning_rate": 6.799854186625651e-06, "loss": 0.4766, "step": 1375 }, { "epoch": 2.4098073555166373, "grad_norm": 0.2784208059310913, "learning_rate": 6.795443224259335e-06, "loss": 0.4746, "step": 1376 }, { "epoch": 2.411558669001751, "grad_norm": 0.28446006774902344, "learning_rate": 6.7910306571307425e-06, "loss": 0.4708, "step": 1377 }, { "epoch": 2.413309982486865, "grad_norm": 0.30972498655319214, "learning_rate": 6.786616489183814e-06, "loss": 0.468, "step": 1378 }, { "epoch": 2.415061295971979, "grad_norm": 0.2512277066707611, "learning_rate": 6.78220072436392e-06, "loss": 0.4711, "step": 1379 }, { "epoch": 2.416812609457093, "grad_norm": 0.2764579653739929, "learning_rate": 6.7777833666178594e-06, "loss": 0.4664, "step": 1380 }, { "epoch": 2.418563922942207, "grad_norm": 0.24445760250091553, "learning_rate": 6.773364419893854e-06, "loss": 0.4656, "step": 1381 }, { "epoch": 2.4203152364273204, "grad_norm": 0.26920217275619507, "learning_rate": 6.768943888141548e-06, "loss": 0.4732, "step": 1382 }, { "epoch": 2.4220665499124343, "grad_norm": 0.24392275512218475, "learning_rate": 6.764521775312e-06, "loss": 0.4696, "step": 1383 }, { "epoch": 2.4238178633975482, "grad_norm": 0.2635992765426636, "learning_rate": 6.760098085357681e-06, "loss": 0.4705, "step": 1384 }, { "epoch": 2.425569176882662, "grad_norm": 0.22139278054237366, "learning_rate": 6.755672822232475e-06, "loss": 0.4683, "step": 1385 }, { "epoch": 2.4273204903677756, "grad_norm": 0.24126608669757843, "learning_rate": 6.7512459898916704e-06, "loss": 0.4738, "step": 1386 }, { "epoch": 2.4290718038528896, "grad_norm": 0.2652655839920044, "learning_rate": 6.746817592291957e-06, "loss": 0.4742, "step": 1387 }, { "epoch": 2.4308231173380035, "grad_norm": 0.24341928958892822, "learning_rate": 6.742387633391423e-06, "loss": 0.4759, "step": 1388 }, { "epoch": 2.4325744308231174, "grad_norm": 0.26422587037086487, "learning_rate": 6.737956117149555e-06, "loss": 0.4764, "step": 1389 }, { "epoch": 2.4343257443082313, "grad_norm": 0.24043823778629303, "learning_rate": 6.733523047527231e-06, "loss": 0.4768, "step": 1390 }, { "epoch": 2.436077057793345, "grad_norm": 0.2581097185611725, "learning_rate": 6.729088428486717e-06, "loss": 0.4822, "step": 1391 }, { "epoch": 2.4378283712784588, "grad_norm": 0.22651852667331696, "learning_rate": 6.72465226399166e-06, "loss": 0.455, "step": 1392 }, { "epoch": 2.4395796847635727, "grad_norm": 0.25386038422584534, "learning_rate": 6.720214558007094e-06, "loss": 0.4725, "step": 1393 }, { "epoch": 2.4413309982486866, "grad_norm": 0.2247757762670517, "learning_rate": 6.715775314499431e-06, "loss": 0.4679, "step": 1394 }, { "epoch": 2.4430823117338005, "grad_norm": 0.2318974733352661, "learning_rate": 6.711334537436451e-06, "loss": 0.4699, "step": 1395 }, { "epoch": 2.444833625218914, "grad_norm": 0.27365902066230774, "learning_rate": 6.7068922307873104e-06, "loss": 0.4769, "step": 1396 }, { "epoch": 2.446584938704028, "grad_norm": 0.23485533893108368, "learning_rate": 6.7024483985225295e-06, "loss": 0.4836, "step": 1397 }, { "epoch": 2.448336252189142, "grad_norm": 0.25492972135543823, "learning_rate": 6.698003044613997e-06, "loss": 0.4675, "step": 1398 }, { "epoch": 2.450087565674256, "grad_norm": 0.24341584742069244, "learning_rate": 6.693556173034953e-06, "loss": 0.4782, "step": 1399 }, { "epoch": 2.4518388791593697, "grad_norm": 0.27167612314224243, "learning_rate": 6.689107787760002e-06, "loss": 0.4853, "step": 1400 }, { "epoch": 2.453590192644483, "grad_norm": 0.2590796947479248, "learning_rate": 6.6846578927650985e-06, "loss": 0.4804, "step": 1401 }, { "epoch": 2.455341506129597, "grad_norm": 0.25044649839401245, "learning_rate": 6.6802064920275475e-06, "loss": 0.4661, "step": 1402 }, { "epoch": 2.457092819614711, "grad_norm": 0.260174036026001, "learning_rate": 6.675753589525996e-06, "loss": 0.4807, "step": 1403 }, { "epoch": 2.458844133099825, "grad_norm": 0.29686006903648376, "learning_rate": 6.671299189240439e-06, "loss": 0.4876, "step": 1404 }, { "epoch": 2.460595446584939, "grad_norm": 0.27899208664894104, "learning_rate": 6.666843295152207e-06, "loss": 0.4731, "step": 1405 }, { "epoch": 2.4623467600700524, "grad_norm": 0.26958832144737244, "learning_rate": 6.662385911243965e-06, "loss": 0.4834, "step": 1406 }, { "epoch": 2.4640980735551663, "grad_norm": 0.2566066086292267, "learning_rate": 6.657927041499711e-06, "loss": 0.4794, "step": 1407 }, { "epoch": 2.4658493870402802, "grad_norm": 0.2711206078529358, "learning_rate": 6.653466689904771e-06, "loss": 0.4761, "step": 1408 }, { "epoch": 2.467600700525394, "grad_norm": 0.24218931794166565, "learning_rate": 6.649004860445795e-06, "loss": 0.474, "step": 1409 }, { "epoch": 2.469352014010508, "grad_norm": 0.2473221868276596, "learning_rate": 6.644541557110753e-06, "loss": 0.4743, "step": 1410 }, { "epoch": 2.4711033274956216, "grad_norm": 0.214772030711174, "learning_rate": 6.640076783888934e-06, "loss": 0.4742, "step": 1411 }, { "epoch": 2.4728546409807355, "grad_norm": 0.24450646340847015, "learning_rate": 6.63561054477094e-06, "loss": 0.4757, "step": 1412 }, { "epoch": 2.4746059544658494, "grad_norm": 0.22629189491271973, "learning_rate": 6.631142843748686e-06, "loss": 0.4882, "step": 1413 }, { "epoch": 2.4763572679509633, "grad_norm": 0.2226397544145584, "learning_rate": 6.626673684815389e-06, "loss": 0.4885, "step": 1414 }, { "epoch": 2.478108581436077, "grad_norm": 0.25883325934410095, "learning_rate": 6.62220307196557e-06, "loss": 0.4716, "step": 1415 }, { "epoch": 2.4798598949211907, "grad_norm": 0.2228289246559143, "learning_rate": 6.617731009195052e-06, "loss": 0.4743, "step": 1416 }, { "epoch": 2.4816112084063047, "grad_norm": 0.2553895115852356, "learning_rate": 6.613257500500952e-06, "loss": 0.471, "step": 1417 }, { "epoch": 2.4833625218914186, "grad_norm": 0.24779090285301208, "learning_rate": 6.60878254988168e-06, "loss": 0.4628, "step": 1418 }, { "epoch": 2.4851138353765325, "grad_norm": 0.23109550774097443, "learning_rate": 6.6043061613369356e-06, "loss": 0.4685, "step": 1419 }, { "epoch": 2.4868651488616464, "grad_norm": 0.23692291975021362, "learning_rate": 6.5998283388677005e-06, "loss": 0.4766, "step": 1420 }, { "epoch": 2.48861646234676, "grad_norm": 0.23908090591430664, "learning_rate": 6.595349086476244e-06, "loss": 0.4678, "step": 1421 }, { "epoch": 2.490367775831874, "grad_norm": 0.24529942870140076, "learning_rate": 6.590868408166105e-06, "loss": 0.4804, "step": 1422 }, { "epoch": 2.492119089316988, "grad_norm": 0.23898500204086304, "learning_rate": 6.586386307942105e-06, "loss": 0.4782, "step": 1423 }, { "epoch": 2.4938704028021017, "grad_norm": 0.26164963841438293, "learning_rate": 6.581902789810333e-06, "loss": 0.4719, "step": 1424 }, { "epoch": 2.495621716287215, "grad_norm": 0.2303069531917572, "learning_rate": 6.577417857778142e-06, "loss": 0.479, "step": 1425 }, { "epoch": 2.497373029772329, "grad_norm": 0.24469813704490662, "learning_rate": 6.572931515854158e-06, "loss": 0.4834, "step": 1426 }, { "epoch": 2.499124343257443, "grad_norm": 0.23378215730190277, "learning_rate": 6.568443768048254e-06, "loss": 0.4705, "step": 1427 }, { "epoch": 2.500875656742557, "grad_norm": 0.2308712750673294, "learning_rate": 6.563954618371573e-06, "loss": 0.4657, "step": 1428 }, { "epoch": 2.502626970227671, "grad_norm": 0.2712639272212982, "learning_rate": 6.559464070836501e-06, "loss": 0.4852, "step": 1429 }, { "epoch": 2.504378283712785, "grad_norm": 0.2120043784379959, "learning_rate": 6.5549721294566794e-06, "loss": 0.4776, "step": 1430 }, { "epoch": 2.5061295971978983, "grad_norm": 0.27986472845077515, "learning_rate": 6.550478798246991e-06, "loss": 0.4658, "step": 1431 }, { "epoch": 2.507880910683012, "grad_norm": 0.23222224414348602, "learning_rate": 6.545984081223564e-06, "loss": 0.4749, "step": 1432 }, { "epoch": 2.509632224168126, "grad_norm": 0.24188610911369324, "learning_rate": 6.541487982403767e-06, "loss": 0.4788, "step": 1433 }, { "epoch": 2.51138353765324, "grad_norm": 0.26985159516334534, "learning_rate": 6.536990505806197e-06, "loss": 0.493, "step": 1434 }, { "epoch": 2.5131348511383536, "grad_norm": 0.21577303111553192, "learning_rate": 6.532491655450688e-06, "loss": 0.4638, "step": 1435 }, { "epoch": 2.5148861646234675, "grad_norm": 0.25388312339782715, "learning_rate": 6.5279914353582996e-06, "loss": 0.4754, "step": 1436 }, { "epoch": 2.5166374781085814, "grad_norm": 0.2599342167377472, "learning_rate": 6.523489849551316e-06, "loss": 0.4599, "step": 1437 }, { "epoch": 2.5183887915936953, "grad_norm": 0.2265194058418274, "learning_rate": 6.518986902053244e-06, "loss": 0.4587, "step": 1438 }, { "epoch": 2.5201401050788093, "grad_norm": 0.25815537571907043, "learning_rate": 6.514482596888807e-06, "loss": 0.4779, "step": 1439 }, { "epoch": 2.521891418563923, "grad_norm": 0.28118544816970825, "learning_rate": 6.509976938083936e-06, "loss": 0.4851, "step": 1440 }, { "epoch": 2.5236427320490367, "grad_norm": 0.2639354169368744, "learning_rate": 6.505469929665781e-06, "loss": 0.4682, "step": 1441 }, { "epoch": 2.5253940455341506, "grad_norm": 0.2556841969490051, "learning_rate": 6.500961575662691e-06, "loss": 0.4766, "step": 1442 }, { "epoch": 2.5271453590192645, "grad_norm": 0.3086076080799103, "learning_rate": 6.496451880104222e-06, "loss": 0.4664, "step": 1443 }, { "epoch": 2.5288966725043784, "grad_norm": 0.2424655258655548, "learning_rate": 6.491940847021128e-06, "loss": 0.4831, "step": 1444 }, { "epoch": 2.530647985989492, "grad_norm": 0.26785361766815186, "learning_rate": 6.487428480445357e-06, "loss": 0.4732, "step": 1445 }, { "epoch": 2.532399299474606, "grad_norm": 0.31000539660453796, "learning_rate": 6.48291478441005e-06, "loss": 0.4754, "step": 1446 }, { "epoch": 2.5341506129597198, "grad_norm": 0.22115488350391388, "learning_rate": 6.478399762949537e-06, "loss": 0.4774, "step": 1447 }, { "epoch": 2.5359019264448337, "grad_norm": 0.23776067793369293, "learning_rate": 6.47388342009933e-06, "loss": 0.4724, "step": 1448 }, { "epoch": 2.5376532399299476, "grad_norm": 0.23321521282196045, "learning_rate": 6.469365759896128e-06, "loss": 0.4807, "step": 1449 }, { "epoch": 2.5394045534150615, "grad_norm": 0.21922698616981506, "learning_rate": 6.464846786377801e-06, "loss": 0.479, "step": 1450 }, { "epoch": 2.541155866900175, "grad_norm": 0.2197449952363968, "learning_rate": 6.460326503583394e-06, "loss": 0.4732, "step": 1451 }, { "epoch": 2.542907180385289, "grad_norm": 0.22063954174518585, "learning_rate": 6.4558049155531255e-06, "loss": 0.4674, "step": 1452 }, { "epoch": 2.544658493870403, "grad_norm": 0.2454167902469635, "learning_rate": 6.451282026328377e-06, "loss": 0.4813, "step": 1453 }, { "epoch": 2.5464098073555164, "grad_norm": 0.2549092769622803, "learning_rate": 6.4467578399516965e-06, "loss": 0.4749, "step": 1454 }, { "epoch": 2.5481611208406303, "grad_norm": 0.25996288657188416, "learning_rate": 6.442232360466789e-06, "loss": 0.4849, "step": 1455 }, { "epoch": 2.549912434325744, "grad_norm": 0.2845767140388489, "learning_rate": 6.437705591918514e-06, "loss": 0.4711, "step": 1456 }, { "epoch": 2.551663747810858, "grad_norm": 0.2263530045747757, "learning_rate": 6.433177538352887e-06, "loss": 0.4668, "step": 1457 }, { "epoch": 2.553415061295972, "grad_norm": 0.3122987747192383, "learning_rate": 6.428648203817069e-06, "loss": 0.4702, "step": 1458 }, { "epoch": 2.555166374781086, "grad_norm": 0.23197712004184723, "learning_rate": 6.424117592359367e-06, "loss": 0.4643, "step": 1459 }, { "epoch": 2.5569176882662, "grad_norm": 0.26558855175971985, "learning_rate": 6.419585708029229e-06, "loss": 0.4748, "step": 1460 }, { "epoch": 2.5586690017513134, "grad_norm": 0.3134874403476715, "learning_rate": 6.415052554877241e-06, "loss": 0.4597, "step": 1461 }, { "epoch": 2.5604203152364273, "grad_norm": 0.2330329269170761, "learning_rate": 6.410518136955124e-06, "loss": 0.4753, "step": 1462 }, { "epoch": 2.5621716287215412, "grad_norm": 0.3184458911418915, "learning_rate": 6.405982458315727e-06, "loss": 0.4751, "step": 1463 }, { "epoch": 2.5639229422066547, "grad_norm": 0.24522699415683746, "learning_rate": 6.401445523013028e-06, "loss": 0.4762, "step": 1464 }, { "epoch": 2.5656742556917687, "grad_norm": 0.2536839544773102, "learning_rate": 6.396907335102127e-06, "loss": 0.4654, "step": 1465 }, { "epoch": 2.5674255691768826, "grad_norm": 0.24792176485061646, "learning_rate": 6.392367898639245e-06, "loss": 0.462, "step": 1466 }, { "epoch": 2.5691768826619965, "grad_norm": 0.25114861130714417, "learning_rate": 6.387827217681717e-06, "loss": 0.4794, "step": 1467 }, { "epoch": 2.5709281961471104, "grad_norm": 0.25055497884750366, "learning_rate": 6.383285296287991e-06, "loss": 0.4698, "step": 1468 }, { "epoch": 2.5726795096322244, "grad_norm": 0.22596575319766998, "learning_rate": 6.378742138517624e-06, "loss": 0.4775, "step": 1469 }, { "epoch": 2.574430823117338, "grad_norm": 0.2789793014526367, "learning_rate": 6.37419774843128e-06, "loss": 0.4882, "step": 1470 }, { "epoch": 2.5761821366024518, "grad_norm": 0.25551262497901917, "learning_rate": 6.3696521300907214e-06, "loss": 0.4738, "step": 1471 }, { "epoch": 2.5779334500875657, "grad_norm": 0.2276565283536911, "learning_rate": 6.3651052875588085e-06, "loss": 0.4815, "step": 1472 }, { "epoch": 2.5796847635726796, "grad_norm": 0.2771836221218109, "learning_rate": 6.360557224899496e-06, "loss": 0.4735, "step": 1473 }, { "epoch": 2.581436077057793, "grad_norm": 0.2607584297657013, "learning_rate": 6.356007946177833e-06, "loss": 0.4685, "step": 1474 }, { "epoch": 2.583187390542907, "grad_norm": 0.23815704882144928, "learning_rate": 6.351457455459953e-06, "loss": 0.4757, "step": 1475 }, { "epoch": 2.584938704028021, "grad_norm": 0.23679518699645996, "learning_rate": 6.346905756813069e-06, "loss": 0.4679, "step": 1476 }, { "epoch": 2.586690017513135, "grad_norm": 0.24209415912628174, "learning_rate": 6.342352854305477e-06, "loss": 0.4843, "step": 1477 }, { "epoch": 2.588441330998249, "grad_norm": 0.27902746200561523, "learning_rate": 6.337798752006551e-06, "loss": 0.4752, "step": 1478 }, { "epoch": 2.5901926444833627, "grad_norm": 0.21801671385765076, "learning_rate": 6.333243453986734e-06, "loss": 0.4742, "step": 1479 }, { "epoch": 2.591943957968476, "grad_norm": 0.27108070254325867, "learning_rate": 6.32868696431754e-06, "loss": 0.4871, "step": 1480 }, { "epoch": 2.59369527145359, "grad_norm": 0.2202731966972351, "learning_rate": 6.324129287071546e-06, "loss": 0.4715, "step": 1481 }, { "epoch": 2.595446584938704, "grad_norm": 0.23348449170589447, "learning_rate": 6.3195704263223914e-06, "loss": 0.4688, "step": 1482 }, { "epoch": 2.597197898423818, "grad_norm": 0.21953924000263214, "learning_rate": 6.315010386144776e-06, "loss": 0.4776, "step": 1483 }, { "epoch": 2.5989492119089315, "grad_norm": 0.23213444650173187, "learning_rate": 6.310449170614446e-06, "loss": 0.4762, "step": 1484 }, { "epoch": 2.6007005253940454, "grad_norm": 0.2585700452327728, "learning_rate": 6.305886783808209e-06, "loss": 0.4719, "step": 1485 }, { "epoch": 2.6024518388791593, "grad_norm": 0.23312188684940338, "learning_rate": 6.301323229803913e-06, "loss": 0.4882, "step": 1486 }, { "epoch": 2.6042031523642732, "grad_norm": 0.23894193768501282, "learning_rate": 6.296758512680448e-06, "loss": 0.4756, "step": 1487 }, { "epoch": 2.605954465849387, "grad_norm": 0.22601532936096191, "learning_rate": 6.292192636517744e-06, "loss": 0.4704, "step": 1488 }, { "epoch": 2.607705779334501, "grad_norm": 0.24558894336223602, "learning_rate": 6.287625605396774e-06, "loss": 0.4768, "step": 1489 }, { "epoch": 2.6094570928196146, "grad_norm": 0.23692543804645538, "learning_rate": 6.2830574233995356e-06, "loss": 0.4699, "step": 1490 }, { "epoch": 2.6112084063047285, "grad_norm": 0.22108256816864014, "learning_rate": 6.278488094609057e-06, "loss": 0.4744, "step": 1491 }, { "epoch": 2.6129597197898424, "grad_norm": 0.2398330569267273, "learning_rate": 6.273917623109392e-06, "loss": 0.4729, "step": 1492 }, { "epoch": 2.6147110332749564, "grad_norm": 0.26067885756492615, "learning_rate": 6.269346012985617e-06, "loss": 0.4648, "step": 1493 }, { "epoch": 2.61646234676007, "grad_norm": 0.24195633828639984, "learning_rate": 6.264773268323821e-06, "loss": 0.484, "step": 1494 }, { "epoch": 2.6182136602451838, "grad_norm": 0.2363269329071045, "learning_rate": 6.260199393211115e-06, "loss": 0.4794, "step": 1495 }, { "epoch": 2.6199649737302977, "grad_norm": 0.23290184140205383, "learning_rate": 6.255624391735614e-06, "loss": 0.4713, "step": 1496 }, { "epoch": 2.6217162872154116, "grad_norm": 0.23180311918258667, "learning_rate": 6.2510482679864425e-06, "loss": 0.4748, "step": 1497 }, { "epoch": 2.6234676007005255, "grad_norm": 0.26696014404296875, "learning_rate": 6.246471026053727e-06, "loss": 0.4686, "step": 1498 }, { "epoch": 2.6252189141856395, "grad_norm": 0.21255359053611755, "learning_rate": 6.241892670028595e-06, "loss": 0.4833, "step": 1499 }, { "epoch": 2.626970227670753, "grad_norm": 0.24414612352848053, "learning_rate": 6.237313204003167e-06, "loss": 0.4692, "step": 1500 }, { "epoch": 2.628721541155867, "grad_norm": 0.22232811152935028, "learning_rate": 6.23273263207056e-06, "loss": 0.4784, "step": 1501 }, { "epoch": 2.630472854640981, "grad_norm": 0.22301869094371796, "learning_rate": 6.228150958324872e-06, "loss": 0.471, "step": 1502 }, { "epoch": 2.6322241681260947, "grad_norm": 0.22775830328464508, "learning_rate": 6.223568186861195e-06, "loss": 0.4723, "step": 1503 }, { "epoch": 2.633975481611208, "grad_norm": 0.23964332044124603, "learning_rate": 6.218984321775596e-06, "loss": 0.4738, "step": 1504 }, { "epoch": 2.635726795096322, "grad_norm": 0.2296011596918106, "learning_rate": 6.214399367165122e-06, "loss": 0.4711, "step": 1505 }, { "epoch": 2.637478108581436, "grad_norm": 0.23158049583435059, "learning_rate": 6.209813327127792e-06, "loss": 0.4811, "step": 1506 }, { "epoch": 2.63922942206655, "grad_norm": 0.20414398610591888, "learning_rate": 6.205226205762595e-06, "loss": 0.4708, "step": 1507 }, { "epoch": 2.640980735551664, "grad_norm": 0.22074143588542938, "learning_rate": 6.20063800716949e-06, "loss": 0.4631, "step": 1508 }, { "epoch": 2.642732049036778, "grad_norm": 0.27125951647758484, "learning_rate": 6.196048735449396e-06, "loss": 0.4799, "step": 1509 }, { "epoch": 2.6444833625218913, "grad_norm": 0.20037025213241577, "learning_rate": 6.1914583947041906e-06, "loss": 0.4542, "step": 1510 }, { "epoch": 2.6462346760070052, "grad_norm": 0.22740329802036285, "learning_rate": 6.186866989036708e-06, "loss": 0.4656, "step": 1511 }, { "epoch": 2.647985989492119, "grad_norm": 0.24340282380580902, "learning_rate": 6.1822745225507355e-06, "loss": 0.4756, "step": 1512 }, { "epoch": 2.649737302977233, "grad_norm": 0.22336256504058838, "learning_rate": 6.177680999351006e-06, "loss": 0.4838, "step": 1513 }, { "epoch": 2.6514886164623466, "grad_norm": 0.22251924872398376, "learning_rate": 6.173086423543198e-06, "loss": 0.4763, "step": 1514 }, { "epoch": 2.6532399299474605, "grad_norm": 0.2707374095916748, "learning_rate": 6.168490799233931e-06, "loss": 0.4797, "step": 1515 }, { "epoch": 2.6549912434325744, "grad_norm": 0.21212726831436157, "learning_rate": 6.163894130530761e-06, "loss": 0.4778, "step": 1516 }, { "epoch": 2.6567425569176883, "grad_norm": 0.2374967634677887, "learning_rate": 6.159296421542178e-06, "loss": 0.4639, "step": 1517 }, { "epoch": 2.6584938704028023, "grad_norm": 0.21518474817276, "learning_rate": 6.154697676377605e-06, "loss": 0.4806, "step": 1518 }, { "epoch": 2.660245183887916, "grad_norm": 0.25490880012512207, "learning_rate": 6.150097899147384e-06, "loss": 0.4811, "step": 1519 }, { "epoch": 2.6619964973730297, "grad_norm": 0.23227356374263763, "learning_rate": 6.1454970939627845e-06, "loss": 0.4691, "step": 1520 }, { "epoch": 2.6637478108581436, "grad_norm": 0.24654003977775574, "learning_rate": 6.140895264935993e-06, "loss": 0.4635, "step": 1521 }, { "epoch": 2.6654991243432575, "grad_norm": 0.24978375434875488, "learning_rate": 6.136292416180114e-06, "loss": 0.4905, "step": 1522 }, { "epoch": 2.667250437828371, "grad_norm": 0.23622217774391174, "learning_rate": 6.131688551809161e-06, "loss": 0.4644, "step": 1523 }, { "epoch": 2.669001751313485, "grad_norm": 0.26276740431785583, "learning_rate": 6.127083675938053e-06, "loss": 0.471, "step": 1524 }, { "epoch": 2.670753064798599, "grad_norm": 0.2503618001937866, "learning_rate": 6.122477792682616e-06, "loss": 0.4718, "step": 1525 }, { "epoch": 2.672504378283713, "grad_norm": 0.2524091303348541, "learning_rate": 6.117870906159577e-06, "loss": 0.4716, "step": 1526 }, { "epoch": 2.6742556917688267, "grad_norm": 0.2266114503145218, "learning_rate": 6.113263020486559e-06, "loss": 0.4721, "step": 1527 }, { "epoch": 2.6760070052539406, "grad_norm": 0.26695168018341064, "learning_rate": 6.1086541397820785e-06, "loss": 0.4746, "step": 1528 }, { "epoch": 2.6777583187390546, "grad_norm": 0.23733223974704742, "learning_rate": 6.104044268165539e-06, "loss": 0.4785, "step": 1529 }, { "epoch": 2.679509632224168, "grad_norm": 0.22073259949684143, "learning_rate": 6.0994334097572305e-06, "loss": 0.4734, "step": 1530 }, { "epoch": 2.681260945709282, "grad_norm": 0.2858473062515259, "learning_rate": 6.094821568678328e-06, "loss": 0.4707, "step": 1531 }, { "epoch": 2.683012259194396, "grad_norm": 0.2292838990688324, "learning_rate": 6.090208749050881e-06, "loss": 0.4773, "step": 1532 }, { "epoch": 2.6847635726795094, "grad_norm": 0.25465014576911926, "learning_rate": 6.085594954997814e-06, "loss": 0.4638, "step": 1533 }, { "epoch": 2.6865148861646233, "grad_norm": 0.28694549202919006, "learning_rate": 6.080980190642927e-06, "loss": 0.4693, "step": 1534 }, { "epoch": 2.6882661996497372, "grad_norm": 0.23009628057479858, "learning_rate": 6.076364460110881e-06, "loss": 0.4706, "step": 1535 }, { "epoch": 2.690017513134851, "grad_norm": 0.2999688982963562, "learning_rate": 6.071747767527204e-06, "loss": 0.471, "step": 1536 }, { "epoch": 2.691768826619965, "grad_norm": 0.22890199720859528, "learning_rate": 6.067130117018284e-06, "loss": 0.4587, "step": 1537 }, { "epoch": 2.693520140105079, "grad_norm": 0.2426457405090332, "learning_rate": 6.062511512711362e-06, "loss": 0.4731, "step": 1538 }, { "epoch": 2.6952714535901925, "grad_norm": 0.28306809067726135, "learning_rate": 6.057891958734538e-06, "loss": 0.4658, "step": 1539 }, { "epoch": 2.6970227670753064, "grad_norm": 0.22130653262138367, "learning_rate": 6.053271459216755e-06, "loss": 0.4679, "step": 1540 }, { "epoch": 2.6987740805604203, "grad_norm": 0.3118220865726471, "learning_rate": 6.048650018287803e-06, "loss": 0.4821, "step": 1541 }, { "epoch": 2.7005253940455343, "grad_norm": 0.2277858704328537, "learning_rate": 6.044027640078312e-06, "loss": 0.4648, "step": 1542 }, { "epoch": 2.7022767075306477, "grad_norm": 0.27158117294311523, "learning_rate": 6.0394043287197524e-06, "loss": 0.4847, "step": 1543 }, { "epoch": 2.7040280210157617, "grad_norm": 0.24457202851772308, "learning_rate": 6.034780088344427e-06, "loss": 0.4742, "step": 1544 }, { "epoch": 2.7057793345008756, "grad_norm": 0.25558730959892273, "learning_rate": 6.030154923085469e-06, "loss": 0.4833, "step": 1545 }, { "epoch": 2.7075306479859895, "grad_norm": 0.26936206221580505, "learning_rate": 6.025528837076841e-06, "loss": 0.4673, "step": 1546 }, { "epoch": 2.7092819614711035, "grad_norm": 0.24131938815116882, "learning_rate": 6.020901834453322e-06, "loss": 0.4709, "step": 1547 }, { "epoch": 2.7110332749562174, "grad_norm": 0.29715803265571594, "learning_rate": 6.016273919350517e-06, "loss": 0.4709, "step": 1548 }, { "epoch": 2.712784588441331, "grad_norm": 0.2827681005001068, "learning_rate": 6.011645095904845e-06, "loss": 0.4825, "step": 1549 }, { "epoch": 2.714535901926445, "grad_norm": 0.2860810160636902, "learning_rate": 6.0070153682535315e-06, "loss": 0.4784, "step": 1550 }, { "epoch": 2.7162872154115587, "grad_norm": 0.33546850085258484, "learning_rate": 6.002384740534619e-06, "loss": 0.472, "step": 1551 }, { "epoch": 2.7180385288966726, "grad_norm": 0.22264711558818817, "learning_rate": 5.997753216886948e-06, "loss": 0.4727, "step": 1552 }, { "epoch": 2.719789842381786, "grad_norm": 0.34993332624435425, "learning_rate": 5.993120801450162e-06, "loss": 0.4822, "step": 1553 }, { "epoch": 2.7215411558669, "grad_norm": 0.2425786256790161, "learning_rate": 5.9884874983647e-06, "loss": 0.4672, "step": 1554 }, { "epoch": 2.723292469352014, "grad_norm": 0.2504480183124542, "learning_rate": 5.983853311771799e-06, "loss": 0.4796, "step": 1555 }, { "epoch": 2.725043782837128, "grad_norm": 0.21305829286575317, "learning_rate": 5.979218245813478e-06, "loss": 0.4681, "step": 1556 }, { "epoch": 2.726795096322242, "grad_norm": 0.23465098440647125, "learning_rate": 5.974582304632551e-06, "loss": 0.4684, "step": 1557 }, { "epoch": 2.7285464098073557, "grad_norm": 0.23325827717781067, "learning_rate": 5.969945492372606e-06, "loss": 0.4846, "step": 1558 }, { "epoch": 2.7302977232924692, "grad_norm": 0.22016417980194092, "learning_rate": 5.965307813178015e-06, "loss": 0.4682, "step": 1559 }, { "epoch": 2.732049036777583, "grad_norm": 0.20822075009346008, "learning_rate": 5.960669271193922e-06, "loss": 0.4753, "step": 1560 }, { "epoch": 2.733800350262697, "grad_norm": 0.24520151317119598, "learning_rate": 5.9560298705662444e-06, "loss": 0.4593, "step": 1561 }, { "epoch": 2.735551663747811, "grad_norm": 0.21916668117046356, "learning_rate": 5.951389615441666e-06, "loss": 0.4651, "step": 1562 }, { "epoch": 2.7373029772329245, "grad_norm": 0.23994038999080658, "learning_rate": 5.9467485099676325e-06, "loss": 0.48, "step": 1563 }, { "epoch": 2.7390542907180384, "grad_norm": 0.23882193863391876, "learning_rate": 5.942106558292352e-06, "loss": 0.4677, "step": 1564 }, { "epoch": 2.7408056042031523, "grad_norm": 0.226144477725029, "learning_rate": 5.93746376456479e-06, "loss": 0.4733, "step": 1565 }, { "epoch": 2.7425569176882663, "grad_norm": 0.21297326683998108, "learning_rate": 5.932820132934661e-06, "loss": 0.4747, "step": 1566 }, { "epoch": 2.74430823117338, "grad_norm": 0.2408396303653717, "learning_rate": 5.928175667552432e-06, "loss": 0.4712, "step": 1567 }, { "epoch": 2.746059544658494, "grad_norm": 0.2605265974998474, "learning_rate": 5.923530372569312e-06, "loss": 0.486, "step": 1568 }, { "epoch": 2.7478108581436076, "grad_norm": 0.22772520780563354, "learning_rate": 5.9188842521372535e-06, "loss": 0.47, "step": 1569 }, { "epoch": 2.7495621716287215, "grad_norm": 0.24437303841114044, "learning_rate": 5.914237310408947e-06, "loss": 0.4654, "step": 1570 }, { "epoch": 2.7513134851138354, "grad_norm": 0.2286171019077301, "learning_rate": 5.909589551537814e-06, "loss": 0.4778, "step": 1571 }, { "epoch": 2.7530647985989494, "grad_norm": 0.21846842765808105, "learning_rate": 5.904940979678012e-06, "loss": 0.4767, "step": 1572 }, { "epoch": 2.754816112084063, "grad_norm": 0.24414116144180298, "learning_rate": 5.90029159898442e-06, "loss": 0.48, "step": 1573 }, { "epoch": 2.7565674255691768, "grad_norm": 0.22952145338058472, "learning_rate": 5.895641413612643e-06, "loss": 0.479, "step": 1574 }, { "epoch": 2.7583187390542907, "grad_norm": 0.2522551417350769, "learning_rate": 5.890990427719003e-06, "loss": 0.4803, "step": 1575 }, { "epoch": 2.7600700525394046, "grad_norm": 0.27528974413871765, "learning_rate": 5.886338645460539e-06, "loss": 0.4677, "step": 1576 }, { "epoch": 2.7618213660245186, "grad_norm": 0.2723996639251709, "learning_rate": 5.881686070995001e-06, "loss": 0.476, "step": 1577 }, { "epoch": 2.7635726795096325, "grad_norm": 0.27024775743484497, "learning_rate": 5.877032708480847e-06, "loss": 0.4673, "step": 1578 }, { "epoch": 2.765323992994746, "grad_norm": 0.2562869191169739, "learning_rate": 5.872378562077241e-06, "loss": 0.4665, "step": 1579 }, { "epoch": 2.76707530647986, "grad_norm": 0.2334377020597458, "learning_rate": 5.867723635944047e-06, "loss": 0.4786, "step": 1580 }, { "epoch": 2.768826619964974, "grad_norm": 0.25704053044319153, "learning_rate": 5.863067934241823e-06, "loss": 0.475, "step": 1581 }, { "epoch": 2.7705779334500873, "grad_norm": 0.22388367354869843, "learning_rate": 5.8584114611318225e-06, "loss": 0.478, "step": 1582 }, { "epoch": 2.772329246935201, "grad_norm": 0.2708052396774292, "learning_rate": 5.853754220775992e-06, "loss": 0.4726, "step": 1583 }, { "epoch": 2.774080560420315, "grad_norm": 0.2374090701341629, "learning_rate": 5.849096217336956e-06, "loss": 0.4643, "step": 1584 }, { "epoch": 2.775831873905429, "grad_norm": 0.30215975642204285, "learning_rate": 5.844437454978029e-06, "loss": 0.4634, "step": 1585 }, { "epoch": 2.777583187390543, "grad_norm": 0.2496325522661209, "learning_rate": 5.8397779378631955e-06, "loss": 0.4774, "step": 1586 }, { "epoch": 2.779334500875657, "grad_norm": 0.28560683131217957, "learning_rate": 5.835117670157126e-06, "loss": 0.4687, "step": 1587 }, { "epoch": 2.781085814360771, "grad_norm": 0.2893136441707611, "learning_rate": 5.83045665602515e-06, "loss": 0.4639, "step": 1588 }, { "epoch": 2.7828371278458843, "grad_norm": 0.2619282901287079, "learning_rate": 5.82579489963327e-06, "loss": 0.4646, "step": 1589 }, { "epoch": 2.7845884413309983, "grad_norm": 0.25850850343704224, "learning_rate": 5.8211324051481534e-06, "loss": 0.4665, "step": 1590 }, { "epoch": 2.786339754816112, "grad_norm": 0.29105764627456665, "learning_rate": 5.816469176737122e-06, "loss": 0.4699, "step": 1591 }, { "epoch": 2.7880910683012257, "grad_norm": 0.2661987543106079, "learning_rate": 5.8118052185681584e-06, "loss": 0.4765, "step": 1592 }, { "epoch": 2.7898423817863396, "grad_norm": 0.2592061161994934, "learning_rate": 5.807140534809896e-06, "loss": 0.4737, "step": 1593 }, { "epoch": 2.7915936952714535, "grad_norm": 0.2567844092845917, "learning_rate": 5.802475129631616e-06, "loss": 0.473, "step": 1594 }, { "epoch": 2.7933450087565674, "grad_norm": 0.2476944625377655, "learning_rate": 5.797809007203245e-06, "loss": 0.476, "step": 1595 }, { "epoch": 2.7950963222416814, "grad_norm": 0.2472182661294937, "learning_rate": 5.79314217169535e-06, "loss": 0.4837, "step": 1596 }, { "epoch": 2.7968476357267953, "grad_norm": 0.21559025347232819, "learning_rate": 5.788474627279137e-06, "loss": 0.4703, "step": 1597 }, { "epoch": 2.7985989492119088, "grad_norm": 0.23731544613838196, "learning_rate": 5.783806378126443e-06, "loss": 0.4811, "step": 1598 }, { "epoch": 2.8003502626970227, "grad_norm": 0.22964346408843994, "learning_rate": 5.779137428409738e-06, "loss": 0.4837, "step": 1599 }, { "epoch": 2.8021015761821366, "grad_norm": 0.2164280265569687, "learning_rate": 5.774467782302115e-06, "loss": 0.4758, "step": 1600 }, { "epoch": 2.8038528896672505, "grad_norm": 0.20975154638290405, "learning_rate": 5.7697974439772934e-06, "loss": 0.4575, "step": 1601 }, { "epoch": 2.805604203152364, "grad_norm": 0.22482150793075562, "learning_rate": 5.765126417609606e-06, "loss": 0.4724, "step": 1602 }, { "epoch": 2.807355516637478, "grad_norm": 0.2320108860731125, "learning_rate": 5.760454707374004e-06, "loss": 0.4764, "step": 1603 }, { "epoch": 2.809106830122592, "grad_norm": 0.2246716469526291, "learning_rate": 5.755782317446051e-06, "loss": 0.4798, "step": 1604 }, { "epoch": 2.810858143607706, "grad_norm": 0.25763237476348877, "learning_rate": 5.751109252001916e-06, "loss": 0.4675, "step": 1605 }, { "epoch": 2.8126094570928197, "grad_norm": 0.2428632527589798, "learning_rate": 5.746435515218373e-06, "loss": 0.4715, "step": 1606 }, { "epoch": 2.8143607705779337, "grad_norm": 0.23453634977340698, "learning_rate": 5.741761111272792e-06, "loss": 0.4729, "step": 1607 }, { "epoch": 2.816112084063047, "grad_norm": 0.24348552525043488, "learning_rate": 5.737086044343145e-06, "loss": 0.4879, "step": 1608 }, { "epoch": 2.817863397548161, "grad_norm": 0.23443646728992462, "learning_rate": 5.732410318607993e-06, "loss": 0.4719, "step": 1609 }, { "epoch": 2.819614711033275, "grad_norm": 0.2584564685821533, "learning_rate": 5.72773393824649e-06, "loss": 0.473, "step": 1610 }, { "epoch": 2.821366024518389, "grad_norm": 0.21733525395393372, "learning_rate": 5.723056907438369e-06, "loss": 0.467, "step": 1611 }, { "epoch": 2.8231173380035024, "grad_norm": 0.23562777042388916, "learning_rate": 5.718379230363948e-06, "loss": 0.4719, "step": 1612 }, { "epoch": 2.8248686514886163, "grad_norm": 0.22895033657550812, "learning_rate": 5.713700911204121e-06, "loss": 0.4628, "step": 1613 }, { "epoch": 2.8266199649737302, "grad_norm": 0.2485402673482895, "learning_rate": 5.709021954140359e-06, "loss": 0.4712, "step": 1614 }, { "epoch": 2.828371278458844, "grad_norm": 0.22453851997852325, "learning_rate": 5.704342363354701e-06, "loss": 0.479, "step": 1615 }, { "epoch": 2.830122591943958, "grad_norm": 0.20618684589862823, "learning_rate": 5.6996621430297514e-06, "loss": 0.4692, "step": 1616 }, { "epoch": 2.831873905429072, "grad_norm": 0.2330033779144287, "learning_rate": 5.6949812973486785e-06, "loss": 0.4736, "step": 1617 }, { "epoch": 2.8336252189141855, "grad_norm": 0.23389120399951935, "learning_rate": 5.690299830495211e-06, "loss": 0.484, "step": 1618 }, { "epoch": 2.8353765323992994, "grad_norm": 0.2313048541545868, "learning_rate": 5.685617746653629e-06, "loss": 0.4727, "step": 1619 }, { "epoch": 2.8371278458844134, "grad_norm": 0.24941956996917725, "learning_rate": 5.680935050008767e-06, "loss": 0.4649, "step": 1620 }, { "epoch": 2.8388791593695273, "grad_norm": 0.22352802753448486, "learning_rate": 5.676251744746008e-06, "loss": 0.4712, "step": 1621 }, { "epoch": 2.8406304728546408, "grad_norm": 0.22139288485050201, "learning_rate": 5.671567835051274e-06, "loss": 0.4726, "step": 1622 }, { "epoch": 2.8423817863397547, "grad_norm": 0.2724957764148712, "learning_rate": 5.666883325111035e-06, "loss": 0.4794, "step": 1623 }, { "epoch": 2.8441330998248686, "grad_norm": 0.23055025935173035, "learning_rate": 5.662198219112291e-06, "loss": 0.4687, "step": 1624 }, { "epoch": 2.8458844133099825, "grad_norm": 0.2264506071805954, "learning_rate": 5.657512521242579e-06, "loss": 0.4721, "step": 1625 }, { "epoch": 2.8476357267950965, "grad_norm": 0.26750507950782776, "learning_rate": 5.652826235689961e-06, "loss": 0.4738, "step": 1626 }, { "epoch": 2.8493870402802104, "grad_norm": 0.240803062915802, "learning_rate": 5.648139366643026e-06, "loss": 0.471, "step": 1627 }, { "epoch": 2.851138353765324, "grad_norm": 0.23084402084350586, "learning_rate": 5.643451918290885e-06, "loss": 0.4672, "step": 1628 }, { "epoch": 2.852889667250438, "grad_norm": 0.24746589362621307, "learning_rate": 5.638763894823169e-06, "loss": 0.4718, "step": 1629 }, { "epoch": 2.8546409807355517, "grad_norm": 0.21790818870067596, "learning_rate": 5.634075300430017e-06, "loss": 0.4656, "step": 1630 }, { "epoch": 2.8563922942206657, "grad_norm": 0.2374950647354126, "learning_rate": 5.6293861393020855e-06, "loss": 0.4796, "step": 1631 }, { "epoch": 2.858143607705779, "grad_norm": 0.22003382444381714, "learning_rate": 5.624696415630532e-06, "loss": 0.4696, "step": 1632 }, { "epoch": 2.859894921190893, "grad_norm": 0.22055236995220184, "learning_rate": 5.620006133607017e-06, "loss": 0.4744, "step": 1633 }, { "epoch": 2.861646234676007, "grad_norm": 0.23345732688903809, "learning_rate": 5.615315297423707e-06, "loss": 0.4747, "step": 1634 }, { "epoch": 2.863397548161121, "grad_norm": 0.21275362372398376, "learning_rate": 5.610623911273254e-06, "loss": 0.4663, "step": 1635 }, { "epoch": 2.865148861646235, "grad_norm": 0.21715585887432098, "learning_rate": 5.605931979348807e-06, "loss": 0.4735, "step": 1636 }, { "epoch": 2.8669001751313488, "grad_norm": 0.2259286642074585, "learning_rate": 5.601239505844005e-06, "loss": 0.461, "step": 1637 }, { "epoch": 2.8686514886164622, "grad_norm": 0.21886242926120758, "learning_rate": 5.596546494952965e-06, "loss": 0.4667, "step": 1638 }, { "epoch": 2.870402802101576, "grad_norm": 0.23742616176605225, "learning_rate": 5.591852950870287e-06, "loss": 0.4661, "step": 1639 }, { "epoch": 2.87215411558669, "grad_norm": 0.24182602763175964, "learning_rate": 5.58715887779105e-06, "loss": 0.4723, "step": 1640 }, { "epoch": 2.873905429071804, "grad_norm": 0.20334392786026, "learning_rate": 5.582464279910802e-06, "loss": 0.4638, "step": 1641 }, { "epoch": 2.8756567425569175, "grad_norm": 0.24879103899002075, "learning_rate": 5.577769161425563e-06, "loss": 0.4639, "step": 1642 }, { "epoch": 2.8774080560420314, "grad_norm": 0.21825525164604187, "learning_rate": 5.573073526531818e-06, "loss": 0.4683, "step": 1643 }, { "epoch": 2.8791593695271454, "grad_norm": 0.2217390537261963, "learning_rate": 5.568377379426511e-06, "loss": 0.4698, "step": 1644 }, { "epoch": 2.8809106830122593, "grad_norm": 0.23091694712638855, "learning_rate": 5.563680724307046e-06, "loss": 0.4763, "step": 1645 }, { "epoch": 2.882661996497373, "grad_norm": 0.2372456192970276, "learning_rate": 5.558983565371281e-06, "loss": 0.4838, "step": 1646 }, { "epoch": 2.884413309982487, "grad_norm": 0.20284809172153473, "learning_rate": 5.554285906817524e-06, "loss": 0.4722, "step": 1647 }, { "epoch": 2.8861646234676006, "grad_norm": 0.2143716961145401, "learning_rate": 5.549587752844529e-06, "loss": 0.473, "step": 1648 }, { "epoch": 2.8879159369527145, "grad_norm": 0.22084815800189972, "learning_rate": 5.5448891076514925e-06, "loss": 0.4652, "step": 1649 }, { "epoch": 2.8896672504378285, "grad_norm": 0.23953060805797577, "learning_rate": 5.540189975438053e-06, "loss": 0.4833, "step": 1650 }, { "epoch": 2.891418563922942, "grad_norm": 0.24518680572509766, "learning_rate": 5.53549036040428e-06, "loss": 0.4869, "step": 1651 }, { "epoch": 2.893169877408056, "grad_norm": 0.22846171259880066, "learning_rate": 5.530790266750675e-06, "loss": 0.4756, "step": 1652 }, { "epoch": 2.89492119089317, "grad_norm": 0.22001749277114868, "learning_rate": 5.526089698678174e-06, "loss": 0.4693, "step": 1653 }, { "epoch": 2.8966725043782837, "grad_norm": 0.22037629783153534, "learning_rate": 5.521388660388129e-06, "loss": 0.4778, "step": 1654 }, { "epoch": 2.8984238178633976, "grad_norm": 0.22476251423358917, "learning_rate": 5.516687156082317e-06, "loss": 0.4739, "step": 1655 }, { "epoch": 2.9001751313485116, "grad_norm": 0.23524627089500427, "learning_rate": 5.511985189962926e-06, "loss": 0.4812, "step": 1656 }, { "epoch": 2.9019264448336255, "grad_norm": 0.2642686665058136, "learning_rate": 5.507282766232565e-06, "loss": 0.4683, "step": 1657 }, { "epoch": 2.903677758318739, "grad_norm": 0.242937371134758, "learning_rate": 5.502579889094244e-06, "loss": 0.4621, "step": 1658 }, { "epoch": 2.905429071803853, "grad_norm": 0.22185936570167542, "learning_rate": 5.497876562751384e-06, "loss": 0.4749, "step": 1659 }, { "epoch": 2.907180385288967, "grad_norm": 0.24833036959171295, "learning_rate": 5.493172791407803e-06, "loss": 0.4744, "step": 1660 }, { "epoch": 2.9089316987740803, "grad_norm": 0.2064930498600006, "learning_rate": 5.488468579267721e-06, "loss": 0.4636, "step": 1661 }, { "epoch": 2.9106830122591942, "grad_norm": 0.2487211376428604, "learning_rate": 5.483763930535752e-06, "loss": 0.4817, "step": 1662 }, { "epoch": 2.912434325744308, "grad_norm": 0.22842663526535034, "learning_rate": 5.479058849416894e-06, "loss": 0.4824, "step": 1663 }, { "epoch": 2.914185639229422, "grad_norm": 0.21831420063972473, "learning_rate": 5.474353340116538e-06, "loss": 0.4718, "step": 1664 }, { "epoch": 2.915936952714536, "grad_norm": 0.25253191590309143, "learning_rate": 5.469647406840456e-06, "loss": 0.4813, "step": 1665 }, { "epoch": 2.91768826619965, "grad_norm": 0.22754544019699097, "learning_rate": 5.464941053794797e-06, "loss": 0.4778, "step": 1666 }, { "epoch": 2.9194395796847634, "grad_norm": 0.2200315147638321, "learning_rate": 5.460234285186087e-06, "loss": 0.4769, "step": 1667 }, { "epoch": 2.9211908931698773, "grad_norm": 0.22814348340034485, "learning_rate": 5.455527105221222e-06, "loss": 0.4725, "step": 1668 }, { "epoch": 2.9229422066549913, "grad_norm": 0.2425338327884674, "learning_rate": 5.450819518107468e-06, "loss": 0.4694, "step": 1669 }, { "epoch": 2.924693520140105, "grad_norm": 0.2026367038488388, "learning_rate": 5.446111528052455e-06, "loss": 0.4595, "step": 1670 }, { "epoch": 2.9264448336252187, "grad_norm": 0.2740146815776825, "learning_rate": 5.441403139264169e-06, "loss": 0.4751, "step": 1671 }, { "epoch": 2.9281961471103326, "grad_norm": 0.23199190199375153, "learning_rate": 5.436694355950955e-06, "loss": 0.4846, "step": 1672 }, { "epoch": 2.9299474605954465, "grad_norm": 0.24909235537052155, "learning_rate": 5.431985182321513e-06, "loss": 0.4623, "step": 1673 }, { "epoch": 2.9316987740805605, "grad_norm": 0.23055759072303772, "learning_rate": 5.427275622584888e-06, "loss": 0.4756, "step": 1674 }, { "epoch": 2.9334500875656744, "grad_norm": 0.22452832758426666, "learning_rate": 5.422565680950472e-06, "loss": 0.465, "step": 1675 }, { "epoch": 2.9352014010507883, "grad_norm": 0.23566685616970062, "learning_rate": 5.417855361627998e-06, "loss": 0.4676, "step": 1676 }, { "epoch": 2.936952714535902, "grad_norm": 0.21268676221370697, "learning_rate": 5.413144668827538e-06, "loss": 0.4666, "step": 1677 }, { "epoch": 2.9387040280210157, "grad_norm": 0.24833792448043823, "learning_rate": 5.4084336067594945e-06, "loss": 0.4972, "step": 1678 }, { "epoch": 2.9404553415061296, "grad_norm": 0.22484271228313446, "learning_rate": 5.403722179634602e-06, "loss": 0.4789, "step": 1679 }, { "epoch": 2.9422066549912436, "grad_norm": 0.21195551753044128, "learning_rate": 5.399010391663923e-06, "loss": 0.4699, "step": 1680 }, { "epoch": 2.943957968476357, "grad_norm": 0.20528341829776764, "learning_rate": 5.3942982470588386e-06, "loss": 0.4731, "step": 1681 }, { "epoch": 2.945709281961471, "grad_norm": 0.2222284972667694, "learning_rate": 5.389585750031053e-06, "loss": 0.4742, "step": 1682 }, { "epoch": 2.947460595446585, "grad_norm": 0.22357113659381866, "learning_rate": 5.384872904792583e-06, "loss": 0.479, "step": 1683 }, { "epoch": 2.949211908931699, "grad_norm": 0.2241937518119812, "learning_rate": 5.380159715555755e-06, "loss": 0.4705, "step": 1684 }, { "epoch": 2.9509632224168127, "grad_norm": 0.21309718489646912, "learning_rate": 5.375446186533206e-06, "loss": 0.4801, "step": 1685 }, { "epoch": 2.9527145359019267, "grad_norm": 0.20784953236579895, "learning_rate": 5.370732321937876e-06, "loss": 0.465, "step": 1686 }, { "epoch": 2.95446584938704, "grad_norm": 0.2273818999528885, "learning_rate": 5.3660181259830024e-06, "loss": 0.477, "step": 1687 }, { "epoch": 2.956217162872154, "grad_norm": 0.22605499625205994, "learning_rate": 5.361303602882123e-06, "loss": 0.4786, "step": 1688 }, { "epoch": 2.957968476357268, "grad_norm": 0.2354632019996643, "learning_rate": 5.356588756849065e-06, "loss": 0.4723, "step": 1689 }, { "epoch": 2.959719789842382, "grad_norm": 0.22129569947719574, "learning_rate": 5.3518735920979436e-06, "loss": 0.4632, "step": 1690 }, { "epoch": 2.9614711033274954, "grad_norm": 0.24113252758979797, "learning_rate": 5.347158112843163e-06, "loss": 0.4719, "step": 1691 }, { "epoch": 2.9632224168126093, "grad_norm": 0.2366192489862442, "learning_rate": 5.342442323299403e-06, "loss": 0.466, "step": 1692 }, { "epoch": 2.9649737302977233, "grad_norm": 0.2286851704120636, "learning_rate": 5.3377262276816245e-06, "loss": 0.4725, "step": 1693 }, { "epoch": 2.966725043782837, "grad_norm": 0.21661265194416046, "learning_rate": 5.3330098302050615e-06, "loss": 0.478, "step": 1694 }, { "epoch": 2.968476357267951, "grad_norm": 0.21314197778701782, "learning_rate": 5.328293135085215e-06, "loss": 0.47, "step": 1695 }, { "epoch": 2.970227670753065, "grad_norm": 0.25960657000541687, "learning_rate": 5.323576146537858e-06, "loss": 0.4756, "step": 1696 }, { "epoch": 2.9719789842381785, "grad_norm": 0.21431352198123932, "learning_rate": 5.31885886877902e-06, "loss": 0.4662, "step": 1697 }, { "epoch": 2.9737302977232924, "grad_norm": 0.23800024390220642, "learning_rate": 5.314141306024991e-06, "loss": 0.4777, "step": 1698 }, { "epoch": 2.9754816112084064, "grad_norm": 0.22554229199886322, "learning_rate": 5.309423462492314e-06, "loss": 0.476, "step": 1699 }, { "epoch": 2.9772329246935203, "grad_norm": 0.2291075885295868, "learning_rate": 5.30470534239779e-06, "loss": 0.4639, "step": 1700 }, { "epoch": 2.978984238178634, "grad_norm": 0.22404474020004272, "learning_rate": 5.299986949958457e-06, "loss": 0.4692, "step": 1701 }, { "epoch": 2.9807355516637477, "grad_norm": 0.27964073419570923, "learning_rate": 5.295268289391603e-06, "loss": 0.4781, "step": 1702 }, { "epoch": 2.9824868651488616, "grad_norm": 0.24851295351982117, "learning_rate": 5.290549364914754e-06, "loss": 0.477, "step": 1703 }, { "epoch": 2.9842381786339756, "grad_norm": 0.23800911009311676, "learning_rate": 5.285830180745673e-06, "loss": 0.4658, "step": 1704 }, { "epoch": 2.9859894921190895, "grad_norm": 0.27632954716682434, "learning_rate": 5.281110741102352e-06, "loss": 0.4728, "step": 1705 }, { "epoch": 2.9877408056042034, "grad_norm": 0.23602963984012604, "learning_rate": 5.276391050203015e-06, "loss": 0.4696, "step": 1706 }, { "epoch": 2.989492119089317, "grad_norm": 0.22848673164844513, "learning_rate": 5.2716711122661065e-06, "loss": 0.4704, "step": 1707 }, { "epoch": 2.991243432574431, "grad_norm": 0.23686154186725616, "learning_rate": 5.266950931510296e-06, "loss": 0.4827, "step": 1708 }, { "epoch": 2.9929947460595447, "grad_norm": 0.22382597625255585, "learning_rate": 5.262230512154469e-06, "loss": 0.4696, "step": 1709 }, { "epoch": 2.9947460595446582, "grad_norm": 0.2254233956336975, "learning_rate": 5.257509858417723e-06, "loss": 0.4661, "step": 1710 }, { "epoch": 2.996497373029772, "grad_norm": 0.21427403390407562, "learning_rate": 5.252788974519364e-06, "loss": 0.4696, "step": 1711 }, { "epoch": 2.998248686514886, "grad_norm": 0.19886791706085205, "learning_rate": 5.248067864678906e-06, "loss": 0.4626, "step": 1712 }, { "epoch": 3.0, "grad_norm": 0.209398090839386, "learning_rate": 5.243346533116065e-06, "loss": 0.4778, "step": 1713 }, { "epoch": 3.001751313485114, "grad_norm": 0.22619199752807617, "learning_rate": 5.238624984050754e-06, "loss": 0.4596, "step": 1714 }, { "epoch": 3.003502626970228, "grad_norm": 0.21871943771839142, "learning_rate": 5.233903221703079e-06, "loss": 0.4598, "step": 1715 }, { "epoch": 3.0052539404553413, "grad_norm": 0.20545242726802826, "learning_rate": 5.229181250293341e-06, "loss": 0.4399, "step": 1716 }, { "epoch": 3.0070052539404553, "grad_norm": 0.23317183554172516, "learning_rate": 5.224459074042023e-06, "loss": 0.442, "step": 1717 }, { "epoch": 3.008756567425569, "grad_norm": 0.2282424122095108, "learning_rate": 5.219736697169795e-06, "loss": 0.4464, "step": 1718 }, { "epoch": 3.010507880910683, "grad_norm": 0.22727178037166595, "learning_rate": 5.215014123897504e-06, "loss": 0.4535, "step": 1719 }, { "epoch": 3.012259194395797, "grad_norm": 0.23303240537643433, "learning_rate": 5.210291358446173e-06, "loss": 0.4511, "step": 1720 }, { "epoch": 3.0140105078809105, "grad_norm": 0.2661000192165375, "learning_rate": 5.205568405036996e-06, "loss": 0.4469, "step": 1721 }, { "epoch": 3.0157618213660244, "grad_norm": 0.23714618384838104, "learning_rate": 5.200845267891338e-06, "loss": 0.4572, "step": 1722 }, { "epoch": 3.0175131348511384, "grad_norm": 0.30523112416267395, "learning_rate": 5.196121951230726e-06, "loss": 0.4614, "step": 1723 }, { "epoch": 3.0192644483362523, "grad_norm": 0.220992773771286, "learning_rate": 5.1913984592768455e-06, "loss": 0.4518, "step": 1724 }, { "epoch": 3.021015761821366, "grad_norm": 0.20687095820903778, "learning_rate": 5.186674796251543e-06, "loss": 0.4477, "step": 1725 }, { "epoch": 3.0227670753064797, "grad_norm": 0.2610771656036377, "learning_rate": 5.181950966376814e-06, "loss": 0.4622, "step": 1726 }, { "epoch": 3.0245183887915936, "grad_norm": 0.22322794795036316, "learning_rate": 5.177226973874805e-06, "loss": 0.4572, "step": 1727 }, { "epoch": 3.0262697022767076, "grad_norm": 0.23303568363189697, "learning_rate": 5.172502822967808e-06, "loss": 0.4634, "step": 1728 }, { "epoch": 3.0280210157618215, "grad_norm": 0.22127562761306763, "learning_rate": 5.167778517878258e-06, "loss": 0.4596, "step": 1729 }, { "epoch": 3.0297723292469354, "grad_norm": 0.24110743403434753, "learning_rate": 5.163054062828724e-06, "loss": 0.4593, "step": 1730 }, { "epoch": 3.031523642732049, "grad_norm": 0.23171627521514893, "learning_rate": 5.158329462041911e-06, "loss": 0.4642, "step": 1731 }, { "epoch": 3.033274956217163, "grad_norm": 0.2725125253200531, "learning_rate": 5.153604719740655e-06, "loss": 0.4639, "step": 1732 }, { "epoch": 3.0350262697022767, "grad_norm": 0.24851952493190765, "learning_rate": 5.148879840147918e-06, "loss": 0.46, "step": 1733 }, { "epoch": 3.0367775831873907, "grad_norm": 0.23115791380405426, "learning_rate": 5.1441548274867834e-06, "loss": 0.4578, "step": 1734 }, { "epoch": 3.0385288966725046, "grad_norm": 0.25328898429870605, "learning_rate": 5.139429685980457e-06, "loss": 0.4597, "step": 1735 }, { "epoch": 3.040280210157618, "grad_norm": 0.23674151301383972, "learning_rate": 5.134704419852253e-06, "loss": 0.4448, "step": 1736 }, { "epoch": 3.042031523642732, "grad_norm": 0.244899719953537, "learning_rate": 5.129979033325608e-06, "loss": 0.4485, "step": 1737 }, { "epoch": 3.043782837127846, "grad_norm": 0.22300291061401367, "learning_rate": 5.125253530624054e-06, "loss": 0.4502, "step": 1738 }, { "epoch": 3.04553415061296, "grad_norm": 0.23049508035182953, "learning_rate": 5.120527915971235e-06, "loss": 0.4585, "step": 1739 }, { "epoch": 3.0472854640980738, "grad_norm": 0.249300017952919, "learning_rate": 5.115802193590893e-06, "loss": 0.45, "step": 1740 }, { "epoch": 3.0490367775831873, "grad_norm": 0.23576366901397705, "learning_rate": 5.111076367706864e-06, "loss": 0.4603, "step": 1741 }, { "epoch": 3.050788091068301, "grad_norm": 0.24602776765823364, "learning_rate": 5.10635044254308e-06, "loss": 0.4537, "step": 1742 }, { "epoch": 3.052539404553415, "grad_norm": 0.2309609353542328, "learning_rate": 5.1016244223235576e-06, "loss": 0.4481, "step": 1743 }, { "epoch": 3.054290718038529, "grad_norm": 0.23748698830604553, "learning_rate": 5.096898311272404e-06, "loss": 0.4505, "step": 1744 }, { "epoch": 3.056042031523643, "grad_norm": 0.24891015887260437, "learning_rate": 5.0921721136138025e-06, "loss": 0.4576, "step": 1745 }, { "epoch": 3.0577933450087564, "grad_norm": 0.2478199154138565, "learning_rate": 5.087445833572017e-06, "loss": 0.4565, "step": 1746 }, { "epoch": 3.0595446584938704, "grad_norm": 0.2714397609233856, "learning_rate": 5.082719475371382e-06, "loss": 0.4584, "step": 1747 }, { "epoch": 3.0612959719789843, "grad_norm": 0.208139106631279, "learning_rate": 5.077993043236306e-06, "loss": 0.4635, "step": 1748 }, { "epoch": 3.063047285464098, "grad_norm": 0.2546875774860382, "learning_rate": 5.073266541391259e-06, "loss": 0.4697, "step": 1749 }, { "epoch": 3.0647985989492117, "grad_norm": 0.2194024622440338, "learning_rate": 5.068539974060776e-06, "loss": 0.4616, "step": 1750 }, { "epoch": 3.0665499124343256, "grad_norm": 0.22760754823684692, "learning_rate": 5.06381334546945e-06, "loss": 0.4683, "step": 1751 }, { "epoch": 3.0683012259194395, "grad_norm": 0.24283096194267273, "learning_rate": 5.05908665984193e-06, "loss": 0.4505, "step": 1752 }, { "epoch": 3.0700525394045535, "grad_norm": 0.21893030405044556, "learning_rate": 5.054359921402914e-06, "loss": 0.4597, "step": 1753 }, { "epoch": 3.0718038528896674, "grad_norm": 0.23510171473026276, "learning_rate": 5.049633134377147e-06, "loss": 0.4504, "step": 1754 }, { "epoch": 3.073555166374781, "grad_norm": 0.2488582879304886, "learning_rate": 5.04490630298942e-06, "loss": 0.4689, "step": 1755 }, { "epoch": 3.075306479859895, "grad_norm": 0.23296350240707397, "learning_rate": 5.04017943146456e-06, "loss": 0.4483, "step": 1756 }, { "epoch": 3.0770577933450087, "grad_norm": 0.22671784460544586, "learning_rate": 5.035452524027436e-06, "loss": 0.4651, "step": 1757 }, { "epoch": 3.0788091068301227, "grad_norm": 0.23635120689868927, "learning_rate": 5.03072558490294e-06, "loss": 0.4518, "step": 1758 }, { "epoch": 3.0805604203152366, "grad_norm": 0.23146425187587738, "learning_rate": 5.0259986183160006e-06, "loss": 0.468, "step": 1759 }, { "epoch": 3.08231173380035, "grad_norm": 0.21827596426010132, "learning_rate": 5.021271628491566e-06, "loss": 0.4629, "step": 1760 }, { "epoch": 3.084063047285464, "grad_norm": 0.24224427342414856, "learning_rate": 5.016544619654609e-06, "loss": 0.4604, "step": 1761 }, { "epoch": 3.085814360770578, "grad_norm": 0.2064700871706009, "learning_rate": 5.011817596030115e-06, "loss": 0.4602, "step": 1762 }, { "epoch": 3.087565674255692, "grad_norm": 0.21807518601417542, "learning_rate": 5.0070905618430865e-06, "loss": 0.4593, "step": 1763 }, { "epoch": 3.0893169877408058, "grad_norm": 0.2171521931886673, "learning_rate": 5.0023635213185325e-06, "loss": 0.4598, "step": 1764 }, { "epoch": 3.0910683012259192, "grad_norm": 0.20919018983840942, "learning_rate": 4.997636478681468e-06, "loss": 0.458, "step": 1765 }, { "epoch": 3.092819614711033, "grad_norm": 0.21738307178020477, "learning_rate": 4.992909438156915e-06, "loss": 0.4644, "step": 1766 }, { "epoch": 3.094570928196147, "grad_norm": 0.19866666197776794, "learning_rate": 4.988182403969886e-06, "loss": 0.4424, "step": 1767 }, { "epoch": 3.096322241681261, "grad_norm": 0.22668208181858063, "learning_rate": 4.983455380345392e-06, "loss": 0.4477, "step": 1768 }, { "epoch": 3.098073555166375, "grad_norm": 0.21874718368053436, "learning_rate": 4.978728371508433e-06, "loss": 0.4578, "step": 1769 }, { "epoch": 3.0998248686514884, "grad_norm": 0.22217831015586853, "learning_rate": 4.974001381684e-06, "loss": 0.4514, "step": 1770 }, { "epoch": 3.1015761821366024, "grad_norm": 0.22468885779380798, "learning_rate": 4.96927441509706e-06, "loss": 0.4641, "step": 1771 }, { "epoch": 3.1033274956217163, "grad_norm": 0.22239579260349274, "learning_rate": 4.964547475972565e-06, "loss": 0.4431, "step": 1772 }, { "epoch": 3.10507880910683, "grad_norm": 0.23708350956439972, "learning_rate": 4.9598205685354404e-06, "loss": 0.4594, "step": 1773 }, { "epoch": 3.106830122591944, "grad_norm": 0.21981069445610046, "learning_rate": 4.955093697010581e-06, "loss": 0.4567, "step": 1774 }, { "epoch": 3.1085814360770576, "grad_norm": 0.2735148072242737, "learning_rate": 4.950366865622854e-06, "loss": 0.4496, "step": 1775 }, { "epoch": 3.1103327495621715, "grad_norm": 0.2173786461353302, "learning_rate": 4.9456400785970875e-06, "loss": 0.463, "step": 1776 }, { "epoch": 3.1120840630472855, "grad_norm": 0.23358671367168427, "learning_rate": 4.940913340158072e-06, "loss": 0.4529, "step": 1777 }, { "epoch": 3.1138353765323994, "grad_norm": 0.22771500051021576, "learning_rate": 4.936186654530551e-06, "loss": 0.4618, "step": 1778 }, { "epoch": 3.1155866900175133, "grad_norm": 0.2186444103717804, "learning_rate": 4.931460025939226e-06, "loss": 0.4519, "step": 1779 }, { "epoch": 3.117338003502627, "grad_norm": 0.22646282613277435, "learning_rate": 4.926733458608744e-06, "loss": 0.4627, "step": 1780 }, { "epoch": 3.1190893169877407, "grad_norm": 0.22751818597316742, "learning_rate": 4.922006956763697e-06, "loss": 0.4484, "step": 1781 }, { "epoch": 3.1208406304728546, "grad_norm": 0.20223505795001984, "learning_rate": 4.917280524628619e-06, "loss": 0.4492, "step": 1782 }, { "epoch": 3.1225919439579686, "grad_norm": 0.23477105796337128, "learning_rate": 4.912554166427985e-06, "loss": 0.459, "step": 1783 }, { "epoch": 3.1243432574430825, "grad_norm": 0.20917773246765137, "learning_rate": 4.907827886386199e-06, "loss": 0.4593, "step": 1784 }, { "epoch": 3.126094570928196, "grad_norm": 0.20871500670909882, "learning_rate": 4.903101688727598e-06, "loss": 0.4565, "step": 1785 }, { "epoch": 3.12784588441331, "grad_norm": 0.22116875648498535, "learning_rate": 4.898375577676444e-06, "loss": 0.4467, "step": 1786 }, { "epoch": 3.129597197898424, "grad_norm": 0.21090276539325714, "learning_rate": 4.893649557456922e-06, "loss": 0.4512, "step": 1787 }, { "epoch": 3.1313485113835378, "grad_norm": 0.23858961462974548, "learning_rate": 4.888923632293137e-06, "loss": 0.4614, "step": 1788 }, { "epoch": 3.1330998248686517, "grad_norm": 0.23359625041484833, "learning_rate": 4.884197806409109e-06, "loss": 0.4606, "step": 1789 }, { "epoch": 3.134851138353765, "grad_norm": 0.20302541553974152, "learning_rate": 4.8794720840287665e-06, "loss": 0.4541, "step": 1790 }, { "epoch": 3.136602451838879, "grad_norm": 0.20285554230213165, "learning_rate": 4.874746469375947e-06, "loss": 0.4594, "step": 1791 }, { "epoch": 3.138353765323993, "grad_norm": 0.20902223885059357, "learning_rate": 4.870020966674394e-06, "loss": 0.4565, "step": 1792 }, { "epoch": 3.140105078809107, "grad_norm": 0.215532124042511, "learning_rate": 4.865295580147747e-06, "loss": 0.4574, "step": 1793 }, { "epoch": 3.141856392294221, "grad_norm": 0.21441063284873962, "learning_rate": 4.860570314019546e-06, "loss": 0.4555, "step": 1794 }, { "epoch": 3.1436077057793343, "grad_norm": 0.19976525008678436, "learning_rate": 4.855845172513219e-06, "loss": 0.4629, "step": 1795 }, { "epoch": 3.1453590192644483, "grad_norm": 0.23325879871845245, "learning_rate": 4.851120159852085e-06, "loss": 0.4543, "step": 1796 }, { "epoch": 3.147110332749562, "grad_norm": 0.21101799607276917, "learning_rate": 4.846395280259348e-06, "loss": 0.4566, "step": 1797 }, { "epoch": 3.148861646234676, "grad_norm": 0.20536737143993378, "learning_rate": 4.841670537958092e-06, "loss": 0.4633, "step": 1798 }, { "epoch": 3.1506129597197896, "grad_norm": 0.2286081463098526, "learning_rate": 4.836945937171279e-06, "loss": 0.4532, "step": 1799 }, { "epoch": 3.1523642732049035, "grad_norm": 0.21703296899795532, "learning_rate": 4.832221482121745e-06, "loss": 0.461, "step": 1800 }, { "epoch": 3.1541155866900175, "grad_norm": 0.23466940224170685, "learning_rate": 4.827497177032193e-06, "loss": 0.4572, "step": 1801 }, { "epoch": 3.1558669001751314, "grad_norm": 0.20751696825027466, "learning_rate": 4.822773026125197e-06, "loss": 0.4538, "step": 1802 }, { "epoch": 3.1576182136602453, "grad_norm": 0.22961769998073578, "learning_rate": 4.8180490336231865e-06, "loss": 0.4505, "step": 1803 }, { "epoch": 3.1593695271453592, "grad_norm": 0.2372453212738037, "learning_rate": 4.813325203748458e-06, "loss": 0.4668, "step": 1804 }, { "epoch": 3.1611208406304727, "grad_norm": 0.22908031940460205, "learning_rate": 4.8086015407231545e-06, "loss": 0.4682, "step": 1805 }, { "epoch": 3.1628721541155866, "grad_norm": 0.20643161237239838, "learning_rate": 4.803878048769275e-06, "loss": 0.4595, "step": 1806 }, { "epoch": 3.1646234676007006, "grad_norm": 0.2086883783340454, "learning_rate": 4.799154732108662e-06, "loss": 0.4592, "step": 1807 }, { "epoch": 3.1663747810858145, "grad_norm": 0.20789404213428497, "learning_rate": 4.794431594963004e-06, "loss": 0.4553, "step": 1808 }, { "epoch": 3.168126094570928, "grad_norm": 0.20146717131137848, "learning_rate": 4.789708641553828e-06, "loss": 0.4417, "step": 1809 }, { "epoch": 3.169877408056042, "grad_norm": 0.2264011800289154, "learning_rate": 4.784985876102498e-06, "loss": 0.462, "step": 1810 }, { "epoch": 3.171628721541156, "grad_norm": 0.21424752473831177, "learning_rate": 4.7802633028302065e-06, "loss": 0.4556, "step": 1811 }, { "epoch": 3.1733800350262698, "grad_norm": 0.2330653816461563, "learning_rate": 4.7755409259579786e-06, "loss": 0.4553, "step": 1812 }, { "epoch": 3.1751313485113837, "grad_norm": 0.23404136300086975, "learning_rate": 4.7708187497066595e-06, "loss": 0.4572, "step": 1813 }, { "epoch": 3.1768826619964976, "grad_norm": 0.2169305682182312, "learning_rate": 4.766096778296922e-06, "loss": 0.458, "step": 1814 }, { "epoch": 3.178633975481611, "grad_norm": 0.22185629606246948, "learning_rate": 4.761375015949248e-06, "loss": 0.4538, "step": 1815 }, { "epoch": 3.180385288966725, "grad_norm": 0.21547670662403107, "learning_rate": 4.756653466883937e-06, "loss": 0.4562, "step": 1816 }, { "epoch": 3.182136602451839, "grad_norm": 0.2335556596517563, "learning_rate": 4.751932135321095e-06, "loss": 0.4648, "step": 1817 }, { "epoch": 3.183887915936953, "grad_norm": 0.21306295692920685, "learning_rate": 4.7472110254806375e-06, "loss": 0.4475, "step": 1818 }, { "epoch": 3.1856392294220663, "grad_norm": 0.23616619408130646, "learning_rate": 4.742490141582279e-06, "loss": 0.4598, "step": 1819 }, { "epoch": 3.1873905429071803, "grad_norm": 0.20938828587532043, "learning_rate": 4.737769487845532e-06, "loss": 0.4574, "step": 1820 }, { "epoch": 3.189141856392294, "grad_norm": 0.20004914700984955, "learning_rate": 4.733049068489704e-06, "loss": 0.4518, "step": 1821 }, { "epoch": 3.190893169877408, "grad_norm": 0.21709266304969788, "learning_rate": 4.728328887733894e-06, "loss": 0.4519, "step": 1822 }, { "epoch": 3.192644483362522, "grad_norm": 0.22165168821811676, "learning_rate": 4.723608949796987e-06, "loss": 0.4637, "step": 1823 }, { "epoch": 3.1943957968476355, "grad_norm": 0.2106124311685562, "learning_rate": 4.7188892588976496e-06, "loss": 0.4577, "step": 1824 }, { "epoch": 3.1961471103327495, "grad_norm": 0.21161848306655884, "learning_rate": 4.714169819254328e-06, "loss": 0.4526, "step": 1825 }, { "epoch": 3.1978984238178634, "grad_norm": 0.21906466782093048, "learning_rate": 4.709450635085246e-06, "loss": 0.4476, "step": 1826 }, { "epoch": 3.1996497373029773, "grad_norm": 0.21796174347400665, "learning_rate": 4.704731710608398e-06, "loss": 0.4651, "step": 1827 }, { "epoch": 3.2014010507880912, "grad_norm": 0.21092528104782104, "learning_rate": 4.700013050041544e-06, "loss": 0.4479, "step": 1828 }, { "epoch": 3.2031523642732047, "grad_norm": 0.21205800771713257, "learning_rate": 4.695294657602212e-06, "loss": 0.4597, "step": 1829 }, { "epoch": 3.2049036777583186, "grad_norm": 0.2363952100276947, "learning_rate": 4.6905765375076865e-06, "loss": 0.462, "step": 1830 }, { "epoch": 3.2066549912434326, "grad_norm": 0.23527564108371735, "learning_rate": 4.685858693975011e-06, "loss": 0.4586, "step": 1831 }, { "epoch": 3.2084063047285465, "grad_norm": 0.20967750251293182, "learning_rate": 4.681141131220982e-06, "loss": 0.4576, "step": 1832 }, { "epoch": 3.2101576182136604, "grad_norm": 0.25220462679862976, "learning_rate": 4.6764238534621436e-06, "loss": 0.4441, "step": 1833 }, { "epoch": 3.211908931698774, "grad_norm": 0.2386987805366516, "learning_rate": 4.671706864914786e-06, "loss": 0.4576, "step": 1834 }, { "epoch": 3.213660245183888, "grad_norm": 0.21119943261146545, "learning_rate": 4.666990169794942e-06, "loss": 0.4491, "step": 1835 }, { "epoch": 3.2154115586690017, "grad_norm": 0.2685398459434509, "learning_rate": 4.662273772318378e-06, "loss": 0.4547, "step": 1836 }, { "epoch": 3.2171628721541157, "grad_norm": 0.2502460479736328, "learning_rate": 4.6575576767006e-06, "loss": 0.4704, "step": 1837 }, { "epoch": 3.2189141856392296, "grad_norm": 0.23327574133872986, "learning_rate": 4.652841887156841e-06, "loss": 0.4601, "step": 1838 }, { "epoch": 3.220665499124343, "grad_norm": 0.21762852370738983, "learning_rate": 4.648126407902058e-06, "loss": 0.4397, "step": 1839 }, { "epoch": 3.222416812609457, "grad_norm": 0.25235557556152344, "learning_rate": 4.6434112431509376e-06, "loss": 0.4604, "step": 1840 }, { "epoch": 3.224168126094571, "grad_norm": 0.22347316145896912, "learning_rate": 4.638696397117877e-06, "loss": 0.4597, "step": 1841 }, { "epoch": 3.225919439579685, "grad_norm": 0.22043895721435547, "learning_rate": 4.6339818740169975e-06, "loss": 0.4701, "step": 1842 }, { "epoch": 3.227670753064799, "grad_norm": 0.2392449676990509, "learning_rate": 4.6292676780621245e-06, "loss": 0.4607, "step": 1843 }, { "epoch": 3.2294220665499123, "grad_norm": 0.23343002796173096, "learning_rate": 4.624553813466794e-06, "loss": 0.4598, "step": 1844 }, { "epoch": 3.231173380035026, "grad_norm": 0.22859469056129456, "learning_rate": 4.619840284444245e-06, "loss": 0.4556, "step": 1845 }, { "epoch": 3.23292469352014, "grad_norm": 0.24265216290950775, "learning_rate": 4.615127095207418e-06, "loss": 0.4593, "step": 1846 }, { "epoch": 3.234676007005254, "grad_norm": 0.21817143261432648, "learning_rate": 4.610414249968947e-06, "loss": 0.4614, "step": 1847 }, { "epoch": 3.236427320490368, "grad_norm": 0.24689942598342896, "learning_rate": 4.605701752941161e-06, "loss": 0.4507, "step": 1848 }, { "epoch": 3.2381786339754814, "grad_norm": 0.22036674618721008, "learning_rate": 4.600989608336078e-06, "loss": 0.4609, "step": 1849 }, { "epoch": 3.2399299474605954, "grad_norm": 0.21569839119911194, "learning_rate": 4.596277820365398e-06, "loss": 0.4584, "step": 1850 }, { "epoch": 3.2416812609457093, "grad_norm": 0.2205539345741272, "learning_rate": 4.591566393240507e-06, "loss": 0.4625, "step": 1851 }, { "epoch": 3.2434325744308232, "grad_norm": 0.2125576138496399, "learning_rate": 4.5868553311724635e-06, "loss": 0.4423, "step": 1852 }, { "epoch": 3.245183887915937, "grad_norm": 0.19952869415283203, "learning_rate": 4.5821446383720025e-06, "loss": 0.4454, "step": 1853 }, { "epoch": 3.2469352014010506, "grad_norm": 0.25014716386795044, "learning_rate": 4.577434319049529e-06, "loss": 0.4584, "step": 1854 }, { "epoch": 3.2486865148861646, "grad_norm": 0.2220078706741333, "learning_rate": 4.572724377415114e-06, "loss": 0.4518, "step": 1855 }, { "epoch": 3.2504378283712785, "grad_norm": 0.22463807463645935, "learning_rate": 4.568014817678488e-06, "loss": 0.4705, "step": 1856 }, { "epoch": 3.2521891418563924, "grad_norm": 0.22111590206623077, "learning_rate": 4.5633056440490455e-06, "loss": 0.4565, "step": 1857 }, { "epoch": 3.253940455341506, "grad_norm": 0.2069559097290039, "learning_rate": 4.558596860735833e-06, "loss": 0.4638, "step": 1858 }, { "epoch": 3.25569176882662, "grad_norm": 0.24707920849323273, "learning_rate": 4.553888471947546e-06, "loss": 0.4591, "step": 1859 }, { "epoch": 3.2574430823117337, "grad_norm": 0.2039840817451477, "learning_rate": 4.549180481892532e-06, "loss": 0.458, "step": 1860 }, { "epoch": 3.2591943957968477, "grad_norm": 0.20928554236888885, "learning_rate": 4.544472894778779e-06, "loss": 0.4596, "step": 1861 }, { "epoch": 3.2609457092819616, "grad_norm": 0.22417663037776947, "learning_rate": 4.539765714813915e-06, "loss": 0.4481, "step": 1862 }, { "epoch": 3.2626970227670755, "grad_norm": 0.2094704806804657, "learning_rate": 4.535058946205205e-06, "loss": 0.4462, "step": 1863 }, { "epoch": 3.264448336252189, "grad_norm": 0.2259502410888672, "learning_rate": 4.530352593159546e-06, "loss": 0.4532, "step": 1864 }, { "epoch": 3.266199649737303, "grad_norm": 0.20844152569770813, "learning_rate": 4.525646659883463e-06, "loss": 0.45, "step": 1865 }, { "epoch": 3.267950963222417, "grad_norm": 0.22336114943027496, "learning_rate": 4.5209411505831066e-06, "loss": 0.4545, "step": 1866 }, { "epoch": 3.2697022767075308, "grad_norm": 0.24756690859794617, "learning_rate": 4.51623606946425e-06, "loss": 0.4529, "step": 1867 }, { "epoch": 3.2714535901926443, "grad_norm": 0.22043733298778534, "learning_rate": 4.5115314207322794e-06, "loss": 0.4589, "step": 1868 }, { "epoch": 3.273204903677758, "grad_norm": 0.2260100245475769, "learning_rate": 4.506827208592198e-06, "loss": 0.4568, "step": 1869 }, { "epoch": 3.274956217162872, "grad_norm": 0.21982935070991516, "learning_rate": 4.502123437248619e-06, "loss": 0.4527, "step": 1870 }, { "epoch": 3.276707530647986, "grad_norm": 0.22962847352027893, "learning_rate": 4.497420110905758e-06, "loss": 0.4587, "step": 1871 }, { "epoch": 3.2784588441331, "grad_norm": 0.21519626677036285, "learning_rate": 4.492717233767438e-06, "loss": 0.4599, "step": 1872 }, { "epoch": 3.280210157618214, "grad_norm": 0.21543879806995392, "learning_rate": 4.4880148100370765e-06, "loss": 0.4587, "step": 1873 }, { "epoch": 3.2819614711033274, "grad_norm": 0.2306310385465622, "learning_rate": 4.483312843917685e-06, "loss": 0.4595, "step": 1874 }, { "epoch": 3.2837127845884413, "grad_norm": 0.23168328404426575, "learning_rate": 4.4786113396118715e-06, "loss": 0.451, "step": 1875 }, { "epoch": 3.285464098073555, "grad_norm": 0.21043945848941803, "learning_rate": 4.473910301321827e-06, "loss": 0.4605, "step": 1876 }, { "epoch": 3.287215411558669, "grad_norm": 0.22302810847759247, "learning_rate": 4.469209733249326e-06, "loss": 0.4506, "step": 1877 }, { "epoch": 3.2889667250437826, "grad_norm": 0.22644802927970886, "learning_rate": 4.4645096395957235e-06, "loss": 0.4571, "step": 1878 }, { "epoch": 3.2907180385288965, "grad_norm": 0.22491557896137238, "learning_rate": 4.4598100245619505e-06, "loss": 0.4479, "step": 1879 }, { "epoch": 3.2924693520140105, "grad_norm": 0.21639658510684967, "learning_rate": 4.455110892348508e-06, "loss": 0.4652, "step": 1880 }, { "epoch": 3.2942206654991244, "grad_norm": 0.23638448119163513, "learning_rate": 4.450412247155473e-06, "loss": 0.456, "step": 1881 }, { "epoch": 3.2959719789842383, "grad_norm": 0.2180439829826355, "learning_rate": 4.445714093182477e-06, "loss": 0.4379, "step": 1882 }, { "epoch": 3.2977232924693523, "grad_norm": 0.222629576921463, "learning_rate": 4.441016434628719e-06, "loss": 0.4622, "step": 1883 }, { "epoch": 3.2994746059544657, "grad_norm": 0.22215494513511658, "learning_rate": 4.436319275692954e-06, "loss": 0.4639, "step": 1884 }, { "epoch": 3.3012259194395797, "grad_norm": 0.20466788113117218, "learning_rate": 4.43162262057349e-06, "loss": 0.4655, "step": 1885 }, { "epoch": 3.3029772329246936, "grad_norm": 0.22291219234466553, "learning_rate": 4.426926473468183e-06, "loss": 0.4579, "step": 1886 }, { "epoch": 3.3047285464098075, "grad_norm": 0.22310172021389008, "learning_rate": 4.4222308385744375e-06, "loss": 0.4639, "step": 1887 }, { "epoch": 3.306479859894921, "grad_norm": 0.22832876443862915, "learning_rate": 4.417535720089199e-06, "loss": 0.4522, "step": 1888 }, { "epoch": 3.308231173380035, "grad_norm": 0.23690758645534515, "learning_rate": 4.412841122208951e-06, "loss": 0.4584, "step": 1889 }, { "epoch": 3.309982486865149, "grad_norm": 0.23834975063800812, "learning_rate": 4.408147049129715e-06, "loss": 0.4429, "step": 1890 }, { "epoch": 3.3117338003502628, "grad_norm": 0.2768918573856354, "learning_rate": 4.403453505047037e-06, "loss": 0.4633, "step": 1891 }, { "epoch": 3.3134851138353767, "grad_norm": 0.26461681723594666, "learning_rate": 4.3987604941559966e-06, "loss": 0.4552, "step": 1892 }, { "epoch": 3.31523642732049, "grad_norm": 0.22121474146842957, "learning_rate": 4.3940680206511935e-06, "loss": 0.4581, "step": 1893 }, { "epoch": 3.316987740805604, "grad_norm": 0.25864046812057495, "learning_rate": 4.389376088726747e-06, "loss": 0.4592, "step": 1894 }, { "epoch": 3.318739054290718, "grad_norm": 0.2809195816516876, "learning_rate": 4.3846847025762944e-06, "loss": 0.4595, "step": 1895 }, { "epoch": 3.320490367775832, "grad_norm": 0.20894934237003326, "learning_rate": 4.3799938663929835e-06, "loss": 0.4492, "step": 1896 }, { "epoch": 3.322241681260946, "grad_norm": 0.30420973896980286, "learning_rate": 4.37530358436947e-06, "loss": 0.4649, "step": 1897 }, { "epoch": 3.3239929947460594, "grad_norm": 0.23627489805221558, "learning_rate": 4.370613860697916e-06, "loss": 0.4567, "step": 1898 }, { "epoch": 3.3257443082311733, "grad_norm": 0.21668481826782227, "learning_rate": 4.3659246995699845e-06, "loss": 0.4442, "step": 1899 }, { "epoch": 3.327495621716287, "grad_norm": 0.2470669448375702, "learning_rate": 4.361236105176832e-06, "loss": 0.462, "step": 1900 }, { "epoch": 3.329246935201401, "grad_norm": 0.22871045768260956, "learning_rate": 4.356548081709116e-06, "loss": 0.451, "step": 1901 }, { "epoch": 3.330998248686515, "grad_norm": 0.23591849207878113, "learning_rate": 4.351860633356976e-06, "loss": 0.4695, "step": 1902 }, { "epoch": 3.3327495621716285, "grad_norm": 0.21596726775169373, "learning_rate": 4.347173764310041e-06, "loss": 0.4518, "step": 1903 }, { "epoch": 3.3345008756567425, "grad_norm": 0.23217318952083588, "learning_rate": 4.342487478757422e-06, "loss": 0.4612, "step": 1904 }, { "epoch": 3.3362521891418564, "grad_norm": 0.24428029358386993, "learning_rate": 4.3378017808877095e-06, "loss": 0.4614, "step": 1905 }, { "epoch": 3.3380035026269703, "grad_norm": 0.21430841088294983, "learning_rate": 4.333116674888966e-06, "loss": 0.4611, "step": 1906 }, { "epoch": 3.3397548161120842, "grad_norm": 0.22758716344833374, "learning_rate": 4.328432164948727e-06, "loss": 0.4609, "step": 1907 }, { "epoch": 3.3415061295971977, "grad_norm": 0.22857460379600525, "learning_rate": 4.323748255253995e-06, "loss": 0.4572, "step": 1908 }, { "epoch": 3.3432574430823117, "grad_norm": 0.21455112099647522, "learning_rate": 4.319064949991236e-06, "loss": 0.4488, "step": 1909 }, { "epoch": 3.3450087565674256, "grad_norm": 0.23477448523044586, "learning_rate": 4.314382253346374e-06, "loss": 0.4558, "step": 1910 }, { "epoch": 3.3467600700525395, "grad_norm": 0.2514097988605499, "learning_rate": 4.309700169504792e-06, "loss": 0.4613, "step": 1911 }, { "epoch": 3.3485113835376534, "grad_norm": 0.2302718460559845, "learning_rate": 4.305018702651323e-06, "loss": 0.4546, "step": 1912 }, { "epoch": 3.350262697022767, "grad_norm": 0.232121542096138, "learning_rate": 4.300337856970251e-06, "loss": 0.4627, "step": 1913 }, { "epoch": 3.352014010507881, "grad_norm": 0.2811051905155182, "learning_rate": 4.295657636645302e-06, "loss": 0.4564, "step": 1914 }, { "epoch": 3.3537653239929948, "grad_norm": 0.23689796030521393, "learning_rate": 4.290978045859643e-06, "loss": 0.4572, "step": 1915 }, { "epoch": 3.3555166374781087, "grad_norm": 0.22757931053638458, "learning_rate": 4.2862990887958815e-06, "loss": 0.4586, "step": 1916 }, { "epoch": 3.357267950963222, "grad_norm": 0.2525290250778198, "learning_rate": 4.281620769636055e-06, "loss": 0.4567, "step": 1917 }, { "epoch": 3.359019264448336, "grad_norm": 0.2344074249267578, "learning_rate": 4.276943092561632e-06, "loss": 0.4489, "step": 1918 }, { "epoch": 3.36077057793345, "grad_norm": 0.22562973201274872, "learning_rate": 4.2722660617535105e-06, "loss": 0.4436, "step": 1919 }, { "epoch": 3.362521891418564, "grad_norm": 0.28819534182548523, "learning_rate": 4.267589681392006e-06, "loss": 0.4573, "step": 1920 }, { "epoch": 3.364273204903678, "grad_norm": 0.25353050231933594, "learning_rate": 4.262913955656856e-06, "loss": 0.4453, "step": 1921 }, { "epoch": 3.366024518388792, "grad_norm": 0.2351161539554596, "learning_rate": 4.258238888727209e-06, "loss": 0.4562, "step": 1922 }, { "epoch": 3.3677758318739053, "grad_norm": 0.2402224987745285, "learning_rate": 4.253564484781629e-06, "loss": 0.4572, "step": 1923 }, { "epoch": 3.369527145359019, "grad_norm": 0.19943936169147491, "learning_rate": 4.248890747998085e-06, "loss": 0.4429, "step": 1924 }, { "epoch": 3.371278458844133, "grad_norm": 0.23058617115020752, "learning_rate": 4.24421768255395e-06, "loss": 0.4553, "step": 1925 }, { "epoch": 3.373029772329247, "grad_norm": 0.1957748830318451, "learning_rate": 4.2395452926259965e-06, "loss": 0.4527, "step": 1926 }, { "epoch": 3.3747810858143605, "grad_norm": 0.23122042417526245, "learning_rate": 4.234873582390395e-06, "loss": 0.4606, "step": 1927 }, { "epoch": 3.3765323992994745, "grad_norm": 0.22064389288425446, "learning_rate": 4.230202556022708e-06, "loss": 0.4525, "step": 1928 }, { "epoch": 3.3782837127845884, "grad_norm": 0.20167340338230133, "learning_rate": 4.225532217697886e-06, "loss": 0.4595, "step": 1929 }, { "epoch": 3.3800350262697023, "grad_norm": 0.23078639805316925, "learning_rate": 4.220862571590264e-06, "loss": 0.4511, "step": 1930 }, { "epoch": 3.3817863397548162, "grad_norm": 0.23031553626060486, "learning_rate": 4.216193621873559e-06, "loss": 0.4665, "step": 1931 }, { "epoch": 3.38353765323993, "grad_norm": 0.20281624794006348, "learning_rate": 4.211525372720865e-06, "loss": 0.4566, "step": 1932 }, { "epoch": 3.3852889667250436, "grad_norm": 0.2242661565542221, "learning_rate": 4.206857828304651e-06, "loss": 0.4552, "step": 1933 }, { "epoch": 3.3870402802101576, "grad_norm": 0.22546860575675964, "learning_rate": 4.202190992796756e-06, "loss": 0.457, "step": 1934 }, { "epoch": 3.3887915936952715, "grad_norm": 0.21249529719352722, "learning_rate": 4.1975248703683855e-06, "loss": 0.4488, "step": 1935 }, { "epoch": 3.3905429071803854, "grad_norm": 0.20555062592029572, "learning_rate": 4.1928594651901055e-06, "loss": 0.4601, "step": 1936 }, { "epoch": 3.392294220665499, "grad_norm": 0.19877231121063232, "learning_rate": 4.188194781431843e-06, "loss": 0.4465, "step": 1937 }, { "epoch": 3.394045534150613, "grad_norm": 0.23534318804740906, "learning_rate": 4.18353082326288e-06, "loss": 0.464, "step": 1938 }, { "epoch": 3.3957968476357268, "grad_norm": 0.20516455173492432, "learning_rate": 4.178867594851849e-06, "loss": 0.4723, "step": 1939 }, { "epoch": 3.3975481611208407, "grad_norm": 0.20903511345386505, "learning_rate": 4.1742051003667315e-06, "loss": 0.4555, "step": 1940 }, { "epoch": 3.3992994746059546, "grad_norm": 0.20506207644939423, "learning_rate": 4.169543343974852e-06, "loss": 0.447, "step": 1941 }, { "epoch": 3.4010507880910685, "grad_norm": 0.20813116431236267, "learning_rate": 4.164882329842876e-06, "loss": 0.4619, "step": 1942 }, { "epoch": 3.402802101576182, "grad_norm": 0.2134970873594284, "learning_rate": 4.160222062136805e-06, "loss": 0.4556, "step": 1943 }, { "epoch": 3.404553415061296, "grad_norm": 0.2240849882364273, "learning_rate": 4.1555625450219725e-06, "loss": 0.4636, "step": 1944 }, { "epoch": 3.40630472854641, "grad_norm": 0.21897949278354645, "learning_rate": 4.150903782663045e-06, "loss": 0.4469, "step": 1945 }, { "epoch": 3.408056042031524, "grad_norm": 0.2231896072626114, "learning_rate": 4.146245779224011e-06, "loss": 0.4657, "step": 1946 }, { "epoch": 3.4098073555166373, "grad_norm": 0.23057235777378082, "learning_rate": 4.141588538868179e-06, "loss": 0.4599, "step": 1947 }, { "epoch": 3.411558669001751, "grad_norm": 0.22166191041469574, "learning_rate": 4.13693206575818e-06, "loss": 0.4528, "step": 1948 }, { "epoch": 3.413309982486865, "grad_norm": 0.2163696587085724, "learning_rate": 4.132276364055956e-06, "loss": 0.4574, "step": 1949 }, { "epoch": 3.415061295971979, "grad_norm": 0.21286427974700928, "learning_rate": 4.127621437922761e-06, "loss": 0.4564, "step": 1950 }, { "epoch": 3.416812609457093, "grad_norm": 0.2648962438106537, "learning_rate": 4.122967291519155e-06, "loss": 0.4549, "step": 1951 }, { "epoch": 3.418563922942207, "grad_norm": 0.22225701808929443, "learning_rate": 4.118313929005002e-06, "loss": 0.4437, "step": 1952 }, { "epoch": 3.4203152364273204, "grad_norm": 0.2018117755651474, "learning_rate": 4.113661354539464e-06, "loss": 0.4587, "step": 1953 }, { "epoch": 3.4220665499124343, "grad_norm": 0.231964111328125, "learning_rate": 4.109009572280999e-06, "loss": 0.4504, "step": 1954 }, { "epoch": 3.4238178633975482, "grad_norm": 0.23498661816120148, "learning_rate": 4.1043585863873595e-06, "loss": 0.45, "step": 1955 }, { "epoch": 3.425569176882662, "grad_norm": 0.20466820895671844, "learning_rate": 4.09970840101558e-06, "loss": 0.4576, "step": 1956 }, { "epoch": 3.4273204903677756, "grad_norm": 0.22887544333934784, "learning_rate": 4.095059020321988e-06, "loss": 0.463, "step": 1957 }, { "epoch": 3.4290718038528896, "grad_norm": 0.23074814677238464, "learning_rate": 4.090410448462186e-06, "loss": 0.4584, "step": 1958 }, { "epoch": 3.4308231173380035, "grad_norm": 0.23982548713684082, "learning_rate": 4.085762689591054e-06, "loss": 0.4658, "step": 1959 }, { "epoch": 3.4325744308231174, "grad_norm": 0.24694116413593292, "learning_rate": 4.081115747862747e-06, "loss": 0.4602, "step": 1960 }, { "epoch": 3.4343257443082313, "grad_norm": 0.21450595557689667, "learning_rate": 4.076469627430689e-06, "loss": 0.4645, "step": 1961 }, { "epoch": 3.436077057793345, "grad_norm": 0.26826155185699463, "learning_rate": 4.071824332447568e-06, "loss": 0.4551, "step": 1962 }, { "epoch": 3.4378283712784588, "grad_norm": 0.22964194416999817, "learning_rate": 4.067179867065339e-06, "loss": 0.4556, "step": 1963 }, { "epoch": 3.4395796847635727, "grad_norm": 0.22160066664218903, "learning_rate": 4.0625362354352105e-06, "loss": 0.457, "step": 1964 }, { "epoch": 3.4413309982486866, "grad_norm": 0.22796642780303955, "learning_rate": 4.0578934417076486e-06, "loss": 0.4528, "step": 1965 }, { "epoch": 3.4430823117338005, "grad_norm": 0.24684715270996094, "learning_rate": 4.053251490032368e-06, "loss": 0.4561, "step": 1966 }, { "epoch": 3.444833625218914, "grad_norm": 0.20523357391357422, "learning_rate": 4.048610384558335e-06, "loss": 0.4462, "step": 1967 }, { "epoch": 3.446584938704028, "grad_norm": 0.2167152762413025, "learning_rate": 4.043970129433757e-06, "loss": 0.454, "step": 1968 }, { "epoch": 3.448336252189142, "grad_norm": 0.27841392159461975, "learning_rate": 4.039330728806079e-06, "loss": 0.4703, "step": 1969 }, { "epoch": 3.450087565674256, "grad_norm": 0.20391839742660522, "learning_rate": 4.034692186821986e-06, "loss": 0.4526, "step": 1970 }, { "epoch": 3.4518388791593697, "grad_norm": 0.21542814373970032, "learning_rate": 4.030054507627395e-06, "loss": 0.467, "step": 1971 }, { "epoch": 3.453590192644483, "grad_norm": 0.2395976185798645, "learning_rate": 4.0254176953674505e-06, "loss": 0.4599, "step": 1972 }, { "epoch": 3.455341506129597, "grad_norm": 0.22489766776561737, "learning_rate": 4.020781754186523e-06, "loss": 0.4511, "step": 1973 }, { "epoch": 3.457092819614711, "grad_norm": 0.2193014919757843, "learning_rate": 4.016146688228203e-06, "loss": 0.4544, "step": 1974 }, { "epoch": 3.458844133099825, "grad_norm": 0.22906002402305603, "learning_rate": 4.011512501635301e-06, "loss": 0.4538, "step": 1975 }, { "epoch": 3.460595446584939, "grad_norm": 0.21076591312885284, "learning_rate": 4.00687919854984e-06, "loss": 0.4563, "step": 1976 }, { "epoch": 3.4623467600700524, "grad_norm": 0.22111371159553528, "learning_rate": 4.002246783113054e-06, "loss": 0.4652, "step": 1977 }, { "epoch": 3.4640980735551663, "grad_norm": 0.2131267935037613, "learning_rate": 3.997615259465382e-06, "loss": 0.4542, "step": 1978 }, { "epoch": 3.4658493870402802, "grad_norm": 0.2304059863090515, "learning_rate": 3.992984631746469e-06, "loss": 0.4583, "step": 1979 }, { "epoch": 3.467600700525394, "grad_norm": 0.21537156403064728, "learning_rate": 3.988354904095157e-06, "loss": 0.4549, "step": 1980 }, { "epoch": 3.469352014010508, "grad_norm": 0.18976251780986786, "learning_rate": 3.983726080649484e-06, "loss": 0.4488, "step": 1981 }, { "epoch": 3.4711033274956216, "grad_norm": 0.2092619389295578, "learning_rate": 3.979098165546679e-06, "loss": 0.4556, "step": 1982 }, { "epoch": 3.4728546409807355, "grad_norm": 0.21620923280715942, "learning_rate": 3.974471162923162e-06, "loss": 0.457, "step": 1983 }, { "epoch": 3.4746059544658494, "grad_norm": 0.1956823617219925, "learning_rate": 3.969845076914532e-06, "loss": 0.4517, "step": 1984 }, { "epoch": 3.4763572679509633, "grad_norm": 0.20896896719932556, "learning_rate": 3.965219911655575e-06, "loss": 0.4573, "step": 1985 }, { "epoch": 3.478108581436077, "grad_norm": 0.21143919229507446, "learning_rate": 3.96059567128025e-06, "loss": 0.4474, "step": 1986 }, { "epoch": 3.4798598949211907, "grad_norm": 0.195909783244133, "learning_rate": 3.9559723599216905e-06, "loss": 0.4534, "step": 1987 }, { "epoch": 3.4816112084063047, "grad_norm": 0.19896525144577026, "learning_rate": 3.951349981712199e-06, "loss": 0.4563, "step": 1988 }, { "epoch": 3.4833625218914186, "grad_norm": 0.21068283915519714, "learning_rate": 3.946728540783247e-06, "loss": 0.4615, "step": 1989 }, { "epoch": 3.4851138353765325, "grad_norm": 0.20493537187576294, "learning_rate": 3.942108041265463e-06, "loss": 0.4472, "step": 1990 }, { "epoch": 3.4868651488616464, "grad_norm": 0.19332073628902435, "learning_rate": 3.9374884872886394e-06, "loss": 0.4581, "step": 1991 }, { "epoch": 3.48861646234676, "grad_norm": 0.22417889535427094, "learning_rate": 3.9328698829817195e-06, "loss": 0.4583, "step": 1992 }, { "epoch": 3.490367775831874, "grad_norm": 0.21460232138633728, "learning_rate": 3.928252232472799e-06, "loss": 0.4683, "step": 1993 }, { "epoch": 3.492119089316988, "grad_norm": 0.20144741237163544, "learning_rate": 3.923635539889121e-06, "loss": 0.4628, "step": 1994 }, { "epoch": 3.4938704028021017, "grad_norm": 0.20537950098514557, "learning_rate": 3.919019809357075e-06, "loss": 0.4613, "step": 1995 }, { "epoch": 3.495621716287215, "grad_norm": 0.22492748498916626, "learning_rate": 3.914405045002186e-06, "loss": 0.4542, "step": 1996 }, { "epoch": 3.497373029772329, "grad_norm": 0.2053975611925125, "learning_rate": 3.9097912509491195e-06, "loss": 0.458, "step": 1997 }, { "epoch": 3.499124343257443, "grad_norm": 0.20657432079315186, "learning_rate": 3.905178431321672e-06, "loss": 0.4596, "step": 1998 }, { "epoch": 3.500875656742557, "grad_norm": 0.23722675442695618, "learning_rate": 3.9005665902427695e-06, "loss": 0.4646, "step": 1999 }, { "epoch": 3.502626970227671, "grad_norm": 0.21591103076934814, "learning_rate": 3.895955731834461e-06, "loss": 0.4565, "step": 2000 }, { "epoch": 3.504378283712785, "grad_norm": 0.20763039588928223, "learning_rate": 3.8913458602179215e-06, "loss": 0.4628, "step": 2001 }, { "epoch": 3.5061295971978983, "grad_norm": 0.19894342124462128, "learning_rate": 3.88673697951344e-06, "loss": 0.4578, "step": 2002 }, { "epoch": 3.507880910683012, "grad_norm": 0.21286582946777344, "learning_rate": 3.882129093840423e-06, "loss": 0.4626, "step": 2003 }, { "epoch": 3.509632224168126, "grad_norm": 0.2373683899641037, "learning_rate": 3.8775222073173844e-06, "loss": 0.4666, "step": 2004 }, { "epoch": 3.51138353765324, "grad_norm": 0.21972715854644775, "learning_rate": 3.872916324061949e-06, "loss": 0.456, "step": 2005 }, { "epoch": 3.5131348511383536, "grad_norm": 0.21272645890712738, "learning_rate": 3.8683114481908415e-06, "loss": 0.4596, "step": 2006 }, { "epoch": 3.5148861646234675, "grad_norm": 0.20684289932250977, "learning_rate": 3.8637075838198864e-06, "loss": 0.4592, "step": 2007 }, { "epoch": 3.5166374781085814, "grad_norm": 0.20367766916751862, "learning_rate": 3.859104735064008e-06, "loss": 0.4636, "step": 2008 }, { "epoch": 3.5183887915936953, "grad_norm": 0.20506712794303894, "learning_rate": 3.854502906037217e-06, "loss": 0.452, "step": 2009 }, { "epoch": 3.5201401050788093, "grad_norm": 0.20287546515464783, "learning_rate": 3.849902100852618e-06, "loss": 0.4567, "step": 2010 }, { "epoch": 3.521891418563923, "grad_norm": 0.20235401391983032, "learning_rate": 3.845302323622398e-06, "loss": 0.4555, "step": 2011 }, { "epoch": 3.5236427320490367, "grad_norm": 0.21425926685333252, "learning_rate": 3.8407035784578226e-06, "loss": 0.4598, "step": 2012 }, { "epoch": 3.5253940455341506, "grad_norm": 0.20745502412319183, "learning_rate": 3.8361058694692414e-06, "loss": 0.4573, "step": 2013 }, { "epoch": 3.5271453590192645, "grad_norm": 0.21161706745624542, "learning_rate": 3.83150920076607e-06, "loss": 0.4609, "step": 2014 }, { "epoch": 3.5288966725043784, "grad_norm": 0.20763900876045227, "learning_rate": 3.826913576456803e-06, "loss": 0.4445, "step": 2015 }, { "epoch": 3.530647985989492, "grad_norm": 0.2056819200515747, "learning_rate": 3.822319000648995e-06, "loss": 0.4569, "step": 2016 }, { "epoch": 3.532399299474606, "grad_norm": 0.2379406988620758, "learning_rate": 3.817725477449265e-06, "loss": 0.4609, "step": 2017 }, { "epoch": 3.5341506129597198, "grad_norm": 0.21110188961029053, "learning_rate": 3.8131330109632925e-06, "loss": 0.4497, "step": 2018 }, { "epoch": 3.5359019264448337, "grad_norm": 0.21274013817310333, "learning_rate": 3.8085416052958107e-06, "loss": 0.4585, "step": 2019 }, { "epoch": 3.5376532399299476, "grad_norm": 0.205294668674469, "learning_rate": 3.8039512645506055e-06, "loss": 0.45, "step": 2020 }, { "epoch": 3.5394045534150615, "grad_norm": 0.20796476304531097, "learning_rate": 3.7993619928305113e-06, "loss": 0.4458, "step": 2021 }, { "epoch": 3.541155866900175, "grad_norm": 0.21082095801830292, "learning_rate": 3.7947737942374065e-06, "loss": 0.4608, "step": 2022 }, { "epoch": 3.542907180385289, "grad_norm": 0.21594072878360748, "learning_rate": 3.7901866728722102e-06, "loss": 0.4463, "step": 2023 }, { "epoch": 3.544658493870403, "grad_norm": 0.23790018260478973, "learning_rate": 3.7856006328348805e-06, "loss": 0.4452, "step": 2024 }, { "epoch": 3.5464098073555164, "grad_norm": 0.20691396296024323, "learning_rate": 3.7810156782244055e-06, "loss": 0.4487, "step": 2025 }, { "epoch": 3.5481611208406303, "grad_norm": 0.24907729029655457, "learning_rate": 3.7764318131388067e-06, "loss": 0.4622, "step": 2026 }, { "epoch": 3.549912434325744, "grad_norm": 0.22943726181983948, "learning_rate": 3.7718490416751297e-06, "loss": 0.4548, "step": 2027 }, { "epoch": 3.551663747810858, "grad_norm": 0.22877225279808044, "learning_rate": 3.7672673679294436e-06, "loss": 0.452, "step": 2028 }, { "epoch": 3.553415061295972, "grad_norm": 0.20296043157577515, "learning_rate": 3.762686795996835e-06, "loss": 0.461, "step": 2029 }, { "epoch": 3.555166374781086, "grad_norm": 0.2145472913980484, "learning_rate": 3.7581073299714077e-06, "loss": 0.4682, "step": 2030 }, { "epoch": 3.5569176882662, "grad_norm": 0.21685798466205597, "learning_rate": 3.7535289739462757e-06, "loss": 0.4508, "step": 2031 }, { "epoch": 3.5586690017513134, "grad_norm": 0.22128385305404663, "learning_rate": 3.7489517320135583e-06, "loss": 0.449, "step": 2032 }, { "epoch": 3.5604203152364273, "grad_norm": 0.24109897017478943, "learning_rate": 3.7443756082643866e-06, "loss": 0.4665, "step": 2033 }, { "epoch": 3.5621716287215412, "grad_norm": 0.21840791404247284, "learning_rate": 3.739800606788886e-06, "loss": 0.4671, "step": 2034 }, { "epoch": 3.5639229422066547, "grad_norm": 0.24935562908649445, "learning_rate": 3.735226731676179e-06, "loss": 0.4656, "step": 2035 }, { "epoch": 3.5656742556917687, "grad_norm": 0.2284000813961029, "learning_rate": 3.7306539870143853e-06, "loss": 0.4547, "step": 2036 }, { "epoch": 3.5674255691768826, "grad_norm": 0.22383010387420654, "learning_rate": 3.7260823768906096e-06, "loss": 0.4677, "step": 2037 }, { "epoch": 3.5691768826619965, "grad_norm": 0.24768376350402832, "learning_rate": 3.7215119053909444e-06, "loss": 0.4501, "step": 2038 }, { "epoch": 3.5709281961471104, "grad_norm": 0.21223124861717224, "learning_rate": 3.7169425766004653e-06, "loss": 0.4416, "step": 2039 }, { "epoch": 3.5726795096322244, "grad_norm": 0.22866210341453552, "learning_rate": 3.7123743946032263e-06, "loss": 0.4499, "step": 2040 }, { "epoch": 3.574430823117338, "grad_norm": 0.24689362943172455, "learning_rate": 3.7078073634822565e-06, "loss": 0.459, "step": 2041 }, { "epoch": 3.5761821366024518, "grad_norm": 0.2467159479856491, "learning_rate": 3.7032414873195544e-06, "loss": 0.4641, "step": 2042 }, { "epoch": 3.5779334500875657, "grad_norm": 0.21250154078006744, "learning_rate": 3.698676770196089e-06, "loss": 0.4622, "step": 2043 }, { "epoch": 3.5796847635726796, "grad_norm": 0.2557922303676605, "learning_rate": 3.6941132161917916e-06, "loss": 0.4486, "step": 2044 }, { "epoch": 3.581436077057793, "grad_norm": 0.20122045278549194, "learning_rate": 3.6895508293855546e-06, "loss": 0.4529, "step": 2045 }, { "epoch": 3.583187390542907, "grad_norm": 0.22113946080207825, "learning_rate": 3.6849896138552265e-06, "loss": 0.4594, "step": 2046 }, { "epoch": 3.584938704028021, "grad_norm": 0.2693116366863251, "learning_rate": 3.68042957367761e-06, "loss": 0.463, "step": 2047 }, { "epoch": 3.586690017513135, "grad_norm": 0.19556470215320587, "learning_rate": 3.6758707129284556e-06, "loss": 0.4633, "step": 2048 }, { "epoch": 3.588441330998249, "grad_norm": 0.23036935925483704, "learning_rate": 3.6713130356824615e-06, "loss": 0.455, "step": 2049 }, { "epoch": 3.5901926444833627, "grad_norm": 0.22654253244400024, "learning_rate": 3.666756546013267e-06, "loss": 0.4534, "step": 2050 }, { "epoch": 3.591943957968476, "grad_norm": 0.19805629551410675, "learning_rate": 3.6622012479934505e-06, "loss": 0.4544, "step": 2051 }, { "epoch": 3.59369527145359, "grad_norm": 0.23916606605052948, "learning_rate": 3.6576471456945244e-06, "loss": 0.4621, "step": 2052 }, { "epoch": 3.595446584938704, "grad_norm": 0.22372816503047943, "learning_rate": 3.653094243186933e-06, "loss": 0.4641, "step": 2053 }, { "epoch": 3.597197898423818, "grad_norm": 0.2112518697977066, "learning_rate": 3.648542544540049e-06, "loss": 0.4444, "step": 2054 }, { "epoch": 3.5989492119089315, "grad_norm": 0.21357852220535278, "learning_rate": 3.6439920538221675e-06, "loss": 0.4484, "step": 2055 }, { "epoch": 3.6007005253940454, "grad_norm": 0.22537311911582947, "learning_rate": 3.639442775100505e-06, "loss": 0.4646, "step": 2056 }, { "epoch": 3.6024518388791593, "grad_norm": 0.2556622326374054, "learning_rate": 3.634894712441193e-06, "loss": 0.4615, "step": 2057 }, { "epoch": 3.6042031523642732, "grad_norm": 0.20044252276420593, "learning_rate": 3.630347869909281e-06, "loss": 0.459, "step": 2058 }, { "epoch": 3.605954465849387, "grad_norm": 0.222383052110672, "learning_rate": 3.6258022515687215e-06, "loss": 0.4479, "step": 2059 }, { "epoch": 3.607705779334501, "grad_norm": 0.22983823716640472, "learning_rate": 3.6212578614823767e-06, "loss": 0.4522, "step": 2060 }, { "epoch": 3.6094570928196146, "grad_norm": 0.20674709975719452, "learning_rate": 3.616714703712011e-06, "loss": 0.4527, "step": 2061 }, { "epoch": 3.6112084063047285, "grad_norm": 0.21983937919139862, "learning_rate": 3.6121727823182854e-06, "loss": 0.4512, "step": 2062 }, { "epoch": 3.6129597197898424, "grad_norm": 0.23016715049743652, "learning_rate": 3.6076321013607575e-06, "loss": 0.4513, "step": 2063 }, { "epoch": 3.6147110332749564, "grad_norm": 0.1907363384962082, "learning_rate": 3.6030926648978748e-06, "loss": 0.4596, "step": 2064 }, { "epoch": 3.61646234676007, "grad_norm": 0.2116628736257553, "learning_rate": 3.598554476986974e-06, "loss": 0.4628, "step": 2065 }, { "epoch": 3.6182136602451838, "grad_norm": 0.21333667635917664, "learning_rate": 3.594017541684275e-06, "loss": 0.4565, "step": 2066 }, { "epoch": 3.6199649737302977, "grad_norm": 0.2004368007183075, "learning_rate": 3.589481863044878e-06, "loss": 0.4494, "step": 2067 }, { "epoch": 3.6217162872154116, "grad_norm": 0.20493796467781067, "learning_rate": 3.5849474451227607e-06, "loss": 0.4576, "step": 2068 }, { "epoch": 3.6234676007005255, "grad_norm": 0.20617583394050598, "learning_rate": 3.5804142919707734e-06, "loss": 0.4689, "step": 2069 }, { "epoch": 3.6252189141856395, "grad_norm": 0.2137409746646881, "learning_rate": 3.5758824076406358e-06, "loss": 0.4496, "step": 2070 }, { "epoch": 3.626970227670753, "grad_norm": 0.2060672640800476, "learning_rate": 3.5713517961829314e-06, "loss": 0.4539, "step": 2071 }, { "epoch": 3.628721541155867, "grad_norm": 0.21934933960437775, "learning_rate": 3.5668224616471136e-06, "loss": 0.4465, "step": 2072 }, { "epoch": 3.630472854640981, "grad_norm": 0.2055792659521103, "learning_rate": 3.562294408081486e-06, "loss": 0.4634, "step": 2073 }, { "epoch": 3.6322241681260947, "grad_norm": 0.21143947541713715, "learning_rate": 3.5577676395332116e-06, "loss": 0.4642, "step": 2074 }, { "epoch": 3.633975481611208, "grad_norm": 0.2053321748971939, "learning_rate": 3.5532421600483035e-06, "loss": 0.4566, "step": 2075 }, { "epoch": 3.635726795096322, "grad_norm": 0.2092052549123764, "learning_rate": 3.5487179736716233e-06, "loss": 0.4529, "step": 2076 }, { "epoch": 3.637478108581436, "grad_norm": 0.2117375135421753, "learning_rate": 3.5441950844468754e-06, "loss": 0.4597, "step": 2077 }, { "epoch": 3.63922942206655, "grad_norm": 0.21172714233398438, "learning_rate": 3.539673496416607e-06, "loss": 0.4536, "step": 2078 }, { "epoch": 3.640980735551664, "grad_norm": 0.18582333624362946, "learning_rate": 3.5351532136222012e-06, "loss": 0.4558, "step": 2079 }, { "epoch": 3.642732049036778, "grad_norm": 0.218819260597229, "learning_rate": 3.5306342401038727e-06, "loss": 0.4614, "step": 2080 }, { "epoch": 3.6444833625218913, "grad_norm": 0.217325359582901, "learning_rate": 3.52611657990067e-06, "loss": 0.4589, "step": 2081 }, { "epoch": 3.6462346760070052, "grad_norm": 0.1977737545967102, "learning_rate": 3.5216002370504656e-06, "loss": 0.4588, "step": 2082 }, { "epoch": 3.647985989492119, "grad_norm": 0.19268754124641418, "learning_rate": 3.5170852155899517e-06, "loss": 0.4584, "step": 2083 }, { "epoch": 3.649737302977233, "grad_norm": 0.21060225367546082, "learning_rate": 3.512571519554645e-06, "loss": 0.454, "step": 2084 }, { "epoch": 3.6514886164623466, "grad_norm": 0.20049543678760529, "learning_rate": 3.5080591529788736e-06, "loss": 0.4616, "step": 2085 }, { "epoch": 3.6532399299474605, "grad_norm": 0.20022451877593994, "learning_rate": 3.5035481198957794e-06, "loss": 0.4723, "step": 2086 }, { "epoch": 3.6549912434325744, "grad_norm": 0.2019282877445221, "learning_rate": 3.4990384243373105e-06, "loss": 0.4655, "step": 2087 }, { "epoch": 3.6567425569176883, "grad_norm": 0.20322802662849426, "learning_rate": 3.494530070334221e-06, "loss": 0.4542, "step": 2088 }, { "epoch": 3.6584938704028023, "grad_norm": 0.19141307473182678, "learning_rate": 3.490023061916065e-06, "loss": 0.4523, "step": 2089 }, { "epoch": 3.660245183887916, "grad_norm": 0.1896030753850937, "learning_rate": 3.4855174031111947e-06, "loss": 0.4467, "step": 2090 }, { "epoch": 3.6619964973730297, "grad_norm": 0.19331222772598267, "learning_rate": 3.481013097946756e-06, "loss": 0.4525, "step": 2091 }, { "epoch": 3.6637478108581436, "grad_norm": 0.1906977742910385, "learning_rate": 3.4765101504486842e-06, "loss": 0.45, "step": 2092 }, { "epoch": 3.6654991243432575, "grad_norm": 0.2065323293209076, "learning_rate": 3.4720085646417025e-06, "loss": 0.456, "step": 2093 }, { "epoch": 3.667250437828371, "grad_norm": 0.2020210176706314, "learning_rate": 3.467508344549314e-06, "loss": 0.4523, "step": 2094 }, { "epoch": 3.669001751313485, "grad_norm": 0.1972569227218628, "learning_rate": 3.4630094941938054e-06, "loss": 0.4587, "step": 2095 }, { "epoch": 3.670753064798599, "grad_norm": 0.1985439956188202, "learning_rate": 3.458512017596235e-06, "loss": 0.4592, "step": 2096 }, { "epoch": 3.672504378283713, "grad_norm": 0.19823411107063293, "learning_rate": 3.454015918776437e-06, "loss": 0.4471, "step": 2097 }, { "epoch": 3.6742556917688267, "grad_norm": 0.19383248686790466, "learning_rate": 3.449521201753011e-06, "loss": 0.4571, "step": 2098 }, { "epoch": 3.6760070052539406, "grad_norm": 0.191032275557518, "learning_rate": 3.445027870543323e-06, "loss": 0.4561, "step": 2099 }, { "epoch": 3.6777583187390546, "grad_norm": 0.21386392414569855, "learning_rate": 3.4405359291635008e-06, "loss": 0.4592, "step": 2100 }, { "epoch": 3.679509632224168, "grad_norm": 0.21193839609622955, "learning_rate": 3.436045381628429e-06, "loss": 0.4561, "step": 2101 }, { "epoch": 3.681260945709282, "grad_norm": 0.20027931034564972, "learning_rate": 3.431556231951747e-06, "loss": 0.459, "step": 2102 }, { "epoch": 3.683012259194396, "grad_norm": 0.22158019244670868, "learning_rate": 3.4270684841458445e-06, "loss": 0.453, "step": 2103 }, { "epoch": 3.6847635726795094, "grad_norm": 0.21377669274806976, "learning_rate": 3.4225821422218585e-06, "loss": 0.4607, "step": 2104 }, { "epoch": 3.6865148861646233, "grad_norm": 0.21499444544315338, "learning_rate": 3.418097210189669e-06, "loss": 0.4579, "step": 2105 }, { "epoch": 3.6882661996497372, "grad_norm": 0.1973264217376709, "learning_rate": 3.4136136920578967e-06, "loss": 0.444, "step": 2106 }, { "epoch": 3.690017513134851, "grad_norm": 0.23088927567005157, "learning_rate": 3.4091315918338968e-06, "loss": 0.4524, "step": 2107 }, { "epoch": 3.691768826619965, "grad_norm": 0.2131594717502594, "learning_rate": 3.4046509135237593e-06, "loss": 0.4545, "step": 2108 }, { "epoch": 3.693520140105079, "grad_norm": 0.1968134343624115, "learning_rate": 3.400171661132299e-06, "loss": 0.4617, "step": 2109 }, { "epoch": 3.6952714535901925, "grad_norm": 0.21182771027088165, "learning_rate": 3.395693838663065e-06, "loss": 0.4462, "step": 2110 }, { "epoch": 3.6970227670753064, "grad_norm": 0.20164285600185394, "learning_rate": 3.3912174501183206e-06, "loss": 0.4595, "step": 2111 }, { "epoch": 3.6987740805604203, "grad_norm": 0.20484952628612518, "learning_rate": 3.3867424994990483e-06, "loss": 0.4586, "step": 2112 }, { "epoch": 3.7005253940455343, "grad_norm": 0.2139595001935959, "learning_rate": 3.3822689908049487e-06, "loss": 0.446, "step": 2113 }, { "epoch": 3.7022767075306477, "grad_norm": 0.20098160207271576, "learning_rate": 3.37779692803443e-06, "loss": 0.4572, "step": 2114 }, { "epoch": 3.7040280210157617, "grad_norm": 0.21869154274463654, "learning_rate": 3.373326315184612e-06, "loss": 0.456, "step": 2115 }, { "epoch": 3.7057793345008756, "grad_norm": 0.20701950788497925, "learning_rate": 3.368857156251314e-06, "loss": 0.4555, "step": 2116 }, { "epoch": 3.7075306479859895, "grad_norm": 0.20150119066238403, "learning_rate": 3.364389455229059e-06, "loss": 0.456, "step": 2117 }, { "epoch": 3.7092819614711035, "grad_norm": 0.21742917597293854, "learning_rate": 3.359923216111067e-06, "loss": 0.4607, "step": 2118 }, { "epoch": 3.7110332749562174, "grad_norm": 0.19985169172286987, "learning_rate": 3.3554584428892488e-06, "loss": 0.4514, "step": 2119 }, { "epoch": 3.712784588441331, "grad_norm": 0.22888252139091492, "learning_rate": 3.3509951395542073e-06, "loss": 0.4524, "step": 2120 }, { "epoch": 3.714535901926445, "grad_norm": 0.22526273131370544, "learning_rate": 3.3465333100952306e-06, "loss": 0.4542, "step": 2121 }, { "epoch": 3.7162872154115587, "grad_norm": 0.1869342178106308, "learning_rate": 3.342072958500291e-06, "loss": 0.4591, "step": 2122 }, { "epoch": 3.7180385288966726, "grad_norm": 0.22480198740959167, "learning_rate": 3.337614088756037e-06, "loss": 0.4667, "step": 2123 }, { "epoch": 3.719789842381786, "grad_norm": 0.24795544147491455, "learning_rate": 3.3331567048477946e-06, "loss": 0.4488, "step": 2124 }, { "epoch": 3.7215411558669, "grad_norm": 0.19785112142562866, "learning_rate": 3.328700810759562e-06, "loss": 0.4724, "step": 2125 }, { "epoch": 3.723292469352014, "grad_norm": 0.20214931666851044, "learning_rate": 3.324246410474006e-06, "loss": 0.4514, "step": 2126 }, { "epoch": 3.725043782837128, "grad_norm": 0.2156546711921692, "learning_rate": 3.319793507972454e-06, "loss": 0.4683, "step": 2127 }, { "epoch": 3.726795096322242, "grad_norm": 0.21608425676822662, "learning_rate": 3.3153421072349024e-06, "loss": 0.4695, "step": 2128 }, { "epoch": 3.7285464098073557, "grad_norm": 0.1947939246892929, "learning_rate": 3.310892212239999e-06, "loss": 0.4587, "step": 2129 }, { "epoch": 3.7302977232924692, "grad_norm": 0.20343255996704102, "learning_rate": 3.306443826965049e-06, "loss": 0.4588, "step": 2130 }, { "epoch": 3.732049036777583, "grad_norm": 0.2094586044549942, "learning_rate": 3.3019969553860056e-06, "loss": 0.4604, "step": 2131 }, { "epoch": 3.733800350262697, "grad_norm": 0.21866390109062195, "learning_rate": 3.2975516014774713e-06, "loss": 0.4509, "step": 2132 }, { "epoch": 3.735551663747811, "grad_norm": 0.19964271783828735, "learning_rate": 3.2931077692126912e-06, "loss": 0.4661, "step": 2133 }, { "epoch": 3.7373029772329245, "grad_norm": 0.18892401456832886, "learning_rate": 3.2886654625635507e-06, "loss": 0.4592, "step": 2134 }, { "epoch": 3.7390542907180384, "grad_norm": 0.21166585385799408, "learning_rate": 3.284224685500571e-06, "loss": 0.4608, "step": 2135 }, { "epoch": 3.7408056042031523, "grad_norm": 0.1995638608932495, "learning_rate": 3.2797854419929064e-06, "loss": 0.4514, "step": 2136 }, { "epoch": 3.7425569176882663, "grad_norm": 0.2005520761013031, "learning_rate": 3.2753477360083417e-06, "loss": 0.4436, "step": 2137 }, { "epoch": 3.74430823117338, "grad_norm": 0.20193207263946533, "learning_rate": 3.2709115715132868e-06, "loss": 0.4547, "step": 2138 }, { "epoch": 3.746059544658494, "grad_norm": 0.22023898363113403, "learning_rate": 3.2664769524727712e-06, "loss": 0.4617, "step": 2139 }, { "epoch": 3.7478108581436076, "grad_norm": 0.2206220179796219, "learning_rate": 3.2620438828504473e-06, "loss": 0.4547, "step": 2140 }, { "epoch": 3.7495621716287215, "grad_norm": 0.20926430821418762, "learning_rate": 3.25761236660858e-06, "loss": 0.4479, "step": 2141 }, { "epoch": 3.7513134851138354, "grad_norm": 0.1920793503522873, "learning_rate": 3.2531824077080466e-06, "loss": 0.4636, "step": 2142 }, { "epoch": 3.7530647985989494, "grad_norm": 0.20208372175693512, "learning_rate": 3.2487540101083325e-06, "loss": 0.4489, "step": 2143 }, { "epoch": 3.754816112084063, "grad_norm": 0.24990582466125488, "learning_rate": 3.244327177767527e-06, "loss": 0.4714, "step": 2144 }, { "epoch": 3.7565674255691768, "grad_norm": 0.20851914584636688, "learning_rate": 3.23990191464232e-06, "loss": 0.4606, "step": 2145 }, { "epoch": 3.7583187390542907, "grad_norm": 0.2022835612297058, "learning_rate": 3.2354782246880016e-06, "loss": 0.4518, "step": 2146 }, { "epoch": 3.7600700525394046, "grad_norm": 0.19553935527801514, "learning_rate": 3.2310561118584526e-06, "loss": 0.4483, "step": 2147 }, { "epoch": 3.7618213660245186, "grad_norm": 0.22163023054599762, "learning_rate": 3.226635580106147e-06, "loss": 0.4515, "step": 2148 }, { "epoch": 3.7635726795096325, "grad_norm": 0.19627784192562103, "learning_rate": 3.222216633382142e-06, "loss": 0.4474, "step": 2149 }, { "epoch": 3.765323992994746, "grad_norm": 0.1879592090845108, "learning_rate": 3.2177992756360814e-06, "loss": 0.4472, "step": 2150 }, { "epoch": 3.76707530647986, "grad_norm": 0.2029295712709427, "learning_rate": 3.2133835108161875e-06, "loss": 0.4481, "step": 2151 }, { "epoch": 3.768826619964974, "grad_norm": 0.19930410385131836, "learning_rate": 3.2089693428692587e-06, "loss": 0.4568, "step": 2152 }, { "epoch": 3.7705779334500873, "grad_norm": 0.20696066319942474, "learning_rate": 3.204556775740665e-06, "loss": 0.4627, "step": 2153 }, { "epoch": 3.772329246935201, "grad_norm": 0.1923670619726181, "learning_rate": 3.200145813374349e-06, "loss": 0.4584, "step": 2154 }, { "epoch": 3.774080560420315, "grad_norm": 0.21735291182994843, "learning_rate": 3.195736459712817e-06, "loss": 0.4544, "step": 2155 }, { "epoch": 3.775831873905429, "grad_norm": 0.19754131138324738, "learning_rate": 3.1913287186971353e-06, "loss": 0.448, "step": 2156 }, { "epoch": 3.777583187390543, "grad_norm": 0.20246216654777527, "learning_rate": 3.186922594266932e-06, "loss": 0.4542, "step": 2157 }, { "epoch": 3.779334500875657, "grad_norm": 0.2049606591463089, "learning_rate": 3.1825180903603903e-06, "loss": 0.4563, "step": 2158 }, { "epoch": 3.781085814360771, "grad_norm": 0.19626542925834656, "learning_rate": 3.178115210914242e-06, "loss": 0.4403, "step": 2159 }, { "epoch": 3.7828371278458843, "grad_norm": 0.21816235780715942, "learning_rate": 3.1737139598637688e-06, "loss": 0.4617, "step": 2160 }, { "epoch": 3.7845884413309983, "grad_norm": 0.2130071371793747, "learning_rate": 3.1693143411427984e-06, "loss": 0.4634, "step": 2161 }, { "epoch": 3.786339754816112, "grad_norm": 0.18968476355075836, "learning_rate": 3.164916358683697e-06, "loss": 0.4655, "step": 2162 }, { "epoch": 3.7880910683012257, "grad_norm": 0.20599573850631714, "learning_rate": 3.160520016417369e-06, "loss": 0.4491, "step": 2163 }, { "epoch": 3.7898423817863396, "grad_norm": 0.19829101860523224, "learning_rate": 3.1561253182732544e-06, "loss": 0.4588, "step": 2164 }, { "epoch": 3.7915936952714535, "grad_norm": 0.20909522473812103, "learning_rate": 3.1517322681793215e-06, "loss": 0.457, "step": 2165 }, { "epoch": 3.7933450087565674, "grad_norm": 0.19272498786449432, "learning_rate": 3.1473408700620665e-06, "loss": 0.4549, "step": 2166 }, { "epoch": 3.7950963222416814, "grad_norm": 0.1917303055524826, "learning_rate": 3.1429511278465104e-06, "loss": 0.4512, "step": 2167 }, { "epoch": 3.7968476357267953, "grad_norm": 0.2078457474708557, "learning_rate": 3.138563045456191e-06, "loss": 0.4579, "step": 2168 }, { "epoch": 3.7985989492119088, "grad_norm": 0.21398450434207916, "learning_rate": 3.1341766268131657e-06, "loss": 0.456, "step": 2169 }, { "epoch": 3.8003502626970227, "grad_norm": 0.19938796758651733, "learning_rate": 3.1297918758380035e-06, "loss": 0.4568, "step": 2170 }, { "epoch": 3.8021015761821366, "grad_norm": 0.1939896196126938, "learning_rate": 3.1254087964497816e-06, "loss": 0.4581, "step": 2171 }, { "epoch": 3.8038528896672505, "grad_norm": 0.19545885920524597, "learning_rate": 3.121027392566086e-06, "loss": 0.4533, "step": 2172 }, { "epoch": 3.805604203152364, "grad_norm": 0.18870501220226288, "learning_rate": 3.1166476681030035e-06, "loss": 0.4611, "step": 2173 }, { "epoch": 3.807355516637478, "grad_norm": 0.19867683947086334, "learning_rate": 3.1122696269751196e-06, "loss": 0.4565, "step": 2174 }, { "epoch": 3.809106830122592, "grad_norm": 0.19597354531288147, "learning_rate": 3.1078932730955152e-06, "loss": 0.4502, "step": 2175 }, { "epoch": 3.810858143607706, "grad_norm": 0.20202913880348206, "learning_rate": 3.103518610375764e-06, "loss": 0.4555, "step": 2176 }, { "epoch": 3.8126094570928197, "grad_norm": 0.2176588922739029, "learning_rate": 3.099145642725927e-06, "loss": 0.4521, "step": 2177 }, { "epoch": 3.8143607705779337, "grad_norm": 0.19942069053649902, "learning_rate": 3.0947743740545523e-06, "loss": 0.4497, "step": 2178 }, { "epoch": 3.816112084063047, "grad_norm": 0.1925959438085556, "learning_rate": 3.0904048082686655e-06, "loss": 0.4586, "step": 2179 }, { "epoch": 3.817863397548161, "grad_norm": 0.2238992601633072, "learning_rate": 3.0860369492737753e-06, "loss": 0.4525, "step": 2180 }, { "epoch": 3.819614711033275, "grad_norm": 0.19954635202884674, "learning_rate": 3.0816708009738605e-06, "loss": 0.4568, "step": 2181 }, { "epoch": 3.821366024518389, "grad_norm": 0.21864308416843414, "learning_rate": 3.0773063672713725e-06, "loss": 0.4613, "step": 2182 }, { "epoch": 3.8231173380035024, "grad_norm": 0.2176475077867508, "learning_rate": 3.0729436520672318e-06, "loss": 0.4722, "step": 2183 }, { "epoch": 3.8248686514886163, "grad_norm": 0.20186229050159454, "learning_rate": 3.0685826592608193e-06, "loss": 0.4582, "step": 2184 }, { "epoch": 3.8266199649737302, "grad_norm": 0.20876501500606537, "learning_rate": 3.0642233927499786e-06, "loss": 0.4637, "step": 2185 }, { "epoch": 3.828371278458844, "grad_norm": 0.22609052062034607, "learning_rate": 3.0598658564310122e-06, "loss": 0.4559, "step": 2186 }, { "epoch": 3.830122591943958, "grad_norm": 0.20986875891685486, "learning_rate": 3.055510054198675e-06, "loss": 0.4454, "step": 2187 }, { "epoch": 3.831873905429072, "grad_norm": 0.2159881889820099, "learning_rate": 3.0511559899461684e-06, "loss": 0.4438, "step": 2188 }, { "epoch": 3.8336252189141855, "grad_norm": 0.21394000947475433, "learning_rate": 3.0468036675651447e-06, "loss": 0.4598, "step": 2189 }, { "epoch": 3.8353765323992994, "grad_norm": 0.21933840215206146, "learning_rate": 3.0424530909456974e-06, "loss": 0.4652, "step": 2190 }, { "epoch": 3.8371278458844134, "grad_norm": 0.21888591349124908, "learning_rate": 3.03810426397636e-06, "loss": 0.4516, "step": 2191 }, { "epoch": 3.8388791593695273, "grad_norm": 0.22307850420475006, "learning_rate": 3.033757190544102e-06, "loss": 0.4596, "step": 2192 }, { "epoch": 3.8406304728546408, "grad_norm": 0.20145606994628906, "learning_rate": 3.029411874534327e-06, "loss": 0.4566, "step": 2193 }, { "epoch": 3.8423817863397547, "grad_norm": 0.21064123511314392, "learning_rate": 3.0250683198308656e-06, "loss": 0.4612, "step": 2194 }, { "epoch": 3.8441330998248686, "grad_norm": 0.2083745002746582, "learning_rate": 3.020726530315975e-06, "loss": 0.4581, "step": 2195 }, { "epoch": 3.8458844133099825, "grad_norm": 0.20300911366939545, "learning_rate": 3.016386509870335e-06, "loss": 0.4655, "step": 2196 }, { "epoch": 3.8476357267950965, "grad_norm": 0.2155468910932541, "learning_rate": 3.0120482623730452e-06, "loss": 0.4607, "step": 2197 }, { "epoch": 3.8493870402802104, "grad_norm": 0.22203272581100464, "learning_rate": 3.0077117917016187e-06, "loss": 0.4656, "step": 2198 }, { "epoch": 3.851138353765324, "grad_norm": 0.19978104531764984, "learning_rate": 3.00337710173198e-06, "loss": 0.4528, "step": 2199 }, { "epoch": 3.852889667250438, "grad_norm": 0.20566125214099884, "learning_rate": 2.9990441963384657e-06, "loss": 0.4602, "step": 2200 }, { "epoch": 3.8546409807355517, "grad_norm": 0.22553515434265137, "learning_rate": 2.9947130793938127e-06, "loss": 0.4562, "step": 2201 }, { "epoch": 3.8563922942206657, "grad_norm": 0.22077244520187378, "learning_rate": 2.9903837547691634e-06, "loss": 0.4643, "step": 2202 }, { "epoch": 3.858143607705779, "grad_norm": 0.2296818494796753, "learning_rate": 2.986056226334054e-06, "loss": 0.4515, "step": 2203 }, { "epoch": 3.859894921190893, "grad_norm": 0.20015932619571686, "learning_rate": 2.9817304979564206e-06, "loss": 0.4614, "step": 2204 }, { "epoch": 3.861646234676007, "grad_norm": 0.2311914563179016, "learning_rate": 2.977406573502585e-06, "loss": 0.4666, "step": 2205 }, { "epoch": 3.863397548161121, "grad_norm": 0.20306460559368134, "learning_rate": 2.9730844568372598e-06, "loss": 0.4625, "step": 2206 }, { "epoch": 3.865148861646235, "grad_norm": 0.21536439657211304, "learning_rate": 2.968764151823542e-06, "loss": 0.4547, "step": 2207 }, { "epoch": 3.8669001751313488, "grad_norm": 0.22801481187343597, "learning_rate": 2.964445662322908e-06, "loss": 0.4623, "step": 2208 }, { "epoch": 3.8686514886164622, "grad_norm": 0.21323882043361664, "learning_rate": 2.960128992195211e-06, "loss": 0.4687, "step": 2209 }, { "epoch": 3.870402802101576, "grad_norm": 0.21159495413303375, "learning_rate": 2.95581414529868e-06, "loss": 0.4643, "step": 2210 }, { "epoch": 3.87215411558669, "grad_norm": 0.2122330665588379, "learning_rate": 2.951501125489914e-06, "loss": 0.4597, "step": 2211 }, { "epoch": 3.873905429071804, "grad_norm": 0.23887906968593597, "learning_rate": 2.9471899366238777e-06, "loss": 0.4634, "step": 2212 }, { "epoch": 3.8756567425569175, "grad_norm": 0.21312737464904785, "learning_rate": 2.9428805825538993e-06, "loss": 0.4693, "step": 2213 }, { "epoch": 3.8774080560420314, "grad_norm": 0.21791113913059235, "learning_rate": 2.9385730671316687e-06, "loss": 0.4622, "step": 2214 }, { "epoch": 3.8791593695271454, "grad_norm": 0.23701231181621552, "learning_rate": 2.934267394207231e-06, "loss": 0.4551, "step": 2215 }, { "epoch": 3.8809106830122593, "grad_norm": 0.21469324827194214, "learning_rate": 2.929963567628985e-06, "loss": 0.4425, "step": 2216 }, { "epoch": 3.882661996497373, "grad_norm": 0.2244265228509903, "learning_rate": 2.9256615912436803e-06, "loss": 0.4489, "step": 2217 }, { "epoch": 3.884413309982487, "grad_norm": 0.23417745530605316, "learning_rate": 2.9213614688964104e-06, "loss": 0.4577, "step": 2218 }, { "epoch": 3.8861646234676006, "grad_norm": 0.21046794950962067, "learning_rate": 2.9170632044306137e-06, "loss": 0.4525, "step": 2219 }, { "epoch": 3.8879159369527145, "grad_norm": 0.2472815066576004, "learning_rate": 2.9127668016880674e-06, "loss": 0.4516, "step": 2220 }, { "epoch": 3.8896672504378285, "grad_norm": 0.23605385422706604, "learning_rate": 2.9084722645088825e-06, "loss": 0.4595, "step": 2221 }, { "epoch": 3.891418563922942, "grad_norm": 0.20141780376434326, "learning_rate": 2.904179596731509e-06, "loss": 0.4694, "step": 2222 }, { "epoch": 3.893169877408056, "grad_norm": 0.21499311923980713, "learning_rate": 2.8998888021927146e-06, "loss": 0.4566, "step": 2223 }, { "epoch": 3.89492119089317, "grad_norm": 0.2323315441608429, "learning_rate": 2.895599884727608e-06, "loss": 0.4582, "step": 2224 }, { "epoch": 3.8966725043782837, "grad_norm": 0.22175763547420502, "learning_rate": 2.8913128481696067e-06, "loss": 0.452, "step": 2225 }, { "epoch": 3.8984238178633976, "grad_norm": 0.21138714253902435, "learning_rate": 2.8870276963504562e-06, "loss": 0.4588, "step": 2226 }, { "epoch": 3.9001751313485116, "grad_norm": 0.18901801109313965, "learning_rate": 2.8827444331002087e-06, "loss": 0.4435, "step": 2227 }, { "epoch": 3.9019264448336255, "grad_norm": 0.20129668712615967, "learning_rate": 2.8784630622472376e-06, "loss": 0.4557, "step": 2228 }, { "epoch": 3.903677758318739, "grad_norm": 0.2363346964120865, "learning_rate": 2.8741835876182165e-06, "loss": 0.4597, "step": 2229 }, { "epoch": 3.905429071803853, "grad_norm": 0.19962775707244873, "learning_rate": 2.8699060130381334e-06, "loss": 0.4347, "step": 2230 }, { "epoch": 3.907180385288967, "grad_norm": 0.20244371891021729, "learning_rate": 2.8656303423302678e-06, "loss": 0.4538, "step": 2231 }, { "epoch": 3.9089316987740803, "grad_norm": 0.22740694880485535, "learning_rate": 2.861356579316204e-06, "loss": 0.4556, "step": 2232 }, { "epoch": 3.9106830122591942, "grad_norm": 0.23896324634552002, "learning_rate": 2.857084727815823e-06, "loss": 0.4723, "step": 2233 }, { "epoch": 3.912434325744308, "grad_norm": 0.1869499385356903, "learning_rate": 2.8528147916472895e-06, "loss": 0.4617, "step": 2234 }, { "epoch": 3.914185639229422, "grad_norm": 0.2007441520690918, "learning_rate": 2.848546774627064e-06, "loss": 0.4648, "step": 2235 }, { "epoch": 3.915936952714536, "grad_norm": 0.20187421143054962, "learning_rate": 2.8442806805698857e-06, "loss": 0.4603, "step": 2236 }, { "epoch": 3.91768826619965, "grad_norm": 0.23164695501327515, "learning_rate": 2.840016513288782e-06, "loss": 0.4523, "step": 2237 }, { "epoch": 3.9194395796847634, "grad_norm": 0.2386346161365509, "learning_rate": 2.8357542765950493e-06, "loss": 0.4572, "step": 2238 }, { "epoch": 3.9211908931698773, "grad_norm": 0.18550102412700653, "learning_rate": 2.8314939742982673e-06, "loss": 0.4549, "step": 2239 }, { "epoch": 3.9229422066549913, "grad_norm": 0.2051028460264206, "learning_rate": 2.8272356102062792e-06, "loss": 0.4515, "step": 2240 }, { "epoch": 3.924693520140105, "grad_norm": 0.21694958209991455, "learning_rate": 2.8229791881252023e-06, "loss": 0.4539, "step": 2241 }, { "epoch": 3.9264448336252187, "grad_norm": 0.19975368678569794, "learning_rate": 2.818724711859412e-06, "loss": 0.4531, "step": 2242 }, { "epoch": 3.9281961471103326, "grad_norm": 0.18969646096229553, "learning_rate": 2.814472185211551e-06, "loss": 0.4526, "step": 2243 }, { "epoch": 3.9299474605954465, "grad_norm": 0.20731142163276672, "learning_rate": 2.810221611982512e-06, "loss": 0.459, "step": 2244 }, { "epoch": 3.9316987740805605, "grad_norm": 0.192525714635849, "learning_rate": 2.805972995971451e-06, "loss": 0.4467, "step": 2245 }, { "epoch": 3.9334500875656744, "grad_norm": 0.20006868243217468, "learning_rate": 2.8017263409757646e-06, "loss": 0.4545, "step": 2246 }, { "epoch": 3.9352014010507883, "grad_norm": 0.19467806816101074, "learning_rate": 2.7974816507911047e-06, "loss": 0.4414, "step": 2247 }, { "epoch": 3.936952714535902, "grad_norm": 0.19643349945545197, "learning_rate": 2.7932389292113604e-06, "loss": 0.4414, "step": 2248 }, { "epoch": 3.9387040280210157, "grad_norm": 0.1990184634923935, "learning_rate": 2.788998180028668e-06, "loss": 0.4502, "step": 2249 }, { "epoch": 3.9404553415061296, "grad_norm": 0.19805185496807098, "learning_rate": 2.7847594070333983e-06, "loss": 0.45, "step": 2250 }, { "epoch": 3.9422066549912436, "grad_norm": 0.20095215737819672, "learning_rate": 2.7805226140141505e-06, "loss": 0.4593, "step": 2251 }, { "epoch": 3.943957968476357, "grad_norm": 0.2098454385995865, "learning_rate": 2.776287804757765e-06, "loss": 0.4454, "step": 2252 }, { "epoch": 3.945709281961471, "grad_norm": 0.2033102810382843, "learning_rate": 2.772054983049296e-06, "loss": 0.4532, "step": 2253 }, { "epoch": 3.947460595446585, "grad_norm": 0.1891983151435852, "learning_rate": 2.7678241526720338e-06, "loss": 0.4583, "step": 2254 }, { "epoch": 3.949211908931699, "grad_norm": 0.23819297552108765, "learning_rate": 2.7635953174074787e-06, "loss": 0.449, "step": 2255 }, { "epoch": 3.9509632224168127, "grad_norm": 0.20526765286922455, "learning_rate": 2.759368481035356e-06, "loss": 0.458, "step": 2256 }, { "epoch": 3.9527145359019267, "grad_norm": 0.18887849152088165, "learning_rate": 2.755143647333597e-06, "loss": 0.4622, "step": 2257 }, { "epoch": 3.95446584938704, "grad_norm": 0.20422644913196564, "learning_rate": 2.7509208200783505e-06, "loss": 0.46, "step": 2258 }, { "epoch": 3.956217162872154, "grad_norm": 0.22939211130142212, "learning_rate": 2.746700003043964e-06, "loss": 0.4559, "step": 2259 }, { "epoch": 3.957968476357268, "grad_norm": 0.2014060765504837, "learning_rate": 2.7424812000029955e-06, "loss": 0.4556, "step": 2260 }, { "epoch": 3.959719789842382, "grad_norm": 0.21326269209384918, "learning_rate": 2.738264414726196e-06, "loss": 0.4725, "step": 2261 }, { "epoch": 3.9614711033274954, "grad_norm": 0.24347561597824097, "learning_rate": 2.73404965098252e-06, "loss": 0.4508, "step": 2262 }, { "epoch": 3.9632224168126093, "grad_norm": 0.20914268493652344, "learning_rate": 2.7298369125391112e-06, "loss": 0.448, "step": 2263 }, { "epoch": 3.9649737302977233, "grad_norm": 0.2181508094072342, "learning_rate": 2.7256262031613044e-06, "loss": 0.4509, "step": 2264 }, { "epoch": 3.966725043782837, "grad_norm": 0.2128145843744278, "learning_rate": 2.7214175266126184e-06, "loss": 0.4567, "step": 2265 }, { "epoch": 3.968476357267951, "grad_norm": 0.23032024502754211, "learning_rate": 2.7172108866547593e-06, "loss": 0.4544, "step": 2266 }, { "epoch": 3.970227670753065, "grad_norm": 0.20230281352996826, "learning_rate": 2.7130062870476072e-06, "loss": 0.4605, "step": 2267 }, { "epoch": 3.9719789842381785, "grad_norm": 0.19681450724601746, "learning_rate": 2.7088037315492254e-06, "loss": 0.4519, "step": 2268 }, { "epoch": 3.9737302977232924, "grad_norm": 0.23665401339530945, "learning_rate": 2.7046032239158428e-06, "loss": 0.4505, "step": 2269 }, { "epoch": 3.9754816112084064, "grad_norm": 0.2096850872039795, "learning_rate": 2.7004047679018652e-06, "loss": 0.4631, "step": 2270 }, { "epoch": 3.9772329246935203, "grad_norm": 0.195414736866951, "learning_rate": 2.6962083672598584e-06, "loss": 0.4568, "step": 2271 }, { "epoch": 3.978984238178634, "grad_norm": 0.20313428342342377, "learning_rate": 2.692014025740557e-06, "loss": 0.4521, "step": 2272 }, { "epoch": 3.9807355516637477, "grad_norm": 0.20755040645599365, "learning_rate": 2.687821747092848e-06, "loss": 0.4534, "step": 2273 }, { "epoch": 3.9824868651488616, "grad_norm": 0.2052566111087799, "learning_rate": 2.683631535063783e-06, "loss": 0.4437, "step": 2274 }, { "epoch": 3.9842381786339756, "grad_norm": 0.2111840695142746, "learning_rate": 2.679443393398558e-06, "loss": 0.4578, "step": 2275 }, { "epoch": 3.9859894921190895, "grad_norm": 0.19180183112621307, "learning_rate": 2.6752573258405245e-06, "loss": 0.4488, "step": 2276 }, { "epoch": 3.9877408056042034, "grad_norm": 0.1930307149887085, "learning_rate": 2.6710733361311812e-06, "loss": 0.4593, "step": 2277 }, { "epoch": 3.989492119089317, "grad_norm": 0.19929374754428864, "learning_rate": 2.6668914280101615e-06, "loss": 0.4623, "step": 2278 }, { "epoch": 3.991243432574431, "grad_norm": 0.18213118612766266, "learning_rate": 2.662711605215248e-06, "loss": 0.4569, "step": 2279 }, { "epoch": 3.9929947460595447, "grad_norm": 0.19420553743839264, "learning_rate": 2.658533871482351e-06, "loss": 0.4494, "step": 2280 }, { "epoch": 3.9947460595446582, "grad_norm": 0.19503778219223022, "learning_rate": 2.6543582305455217e-06, "loss": 0.4471, "step": 2281 }, { "epoch": 3.996497373029772, "grad_norm": 0.19423207640647888, "learning_rate": 2.650184686136932e-06, "loss": 0.4593, "step": 2282 }, { "epoch": 3.998248686514886, "grad_norm": 0.19494888186454773, "learning_rate": 2.6460132419868893e-06, "loss": 0.4568, "step": 2283 }, { "epoch": 4.0, "grad_norm": 0.1878107786178589, "learning_rate": 2.641843901823815e-06, "loss": 0.4618, "step": 2284 }, { "epoch": 4.001751313485114, "grad_norm": 0.23779134452342987, "learning_rate": 2.637676669374258e-06, "loss": 0.4548, "step": 2285 }, { "epoch": 4.003502626970228, "grad_norm": 0.1970338523387909, "learning_rate": 2.6335115483628747e-06, "loss": 0.4412, "step": 2286 }, { "epoch": 4.005253940455342, "grad_norm": 0.18821589648723602, "learning_rate": 2.629348542512443e-06, "loss": 0.4547, "step": 2287 }, { "epoch": 4.007005253940456, "grad_norm": 0.19570598006248474, "learning_rate": 2.625187655543844e-06, "loss": 0.4557, "step": 2288 }, { "epoch": 4.008756567425569, "grad_norm": 0.20279647409915924, "learning_rate": 2.6210288911760684e-06, "loss": 0.441, "step": 2289 }, { "epoch": 4.010507880910683, "grad_norm": 0.20559468865394592, "learning_rate": 2.616872253126207e-06, "loss": 0.4347, "step": 2290 }, { "epoch": 4.012259194395797, "grad_norm": 0.2105250507593155, "learning_rate": 2.612717745109453e-06, "loss": 0.4434, "step": 2291 }, { "epoch": 4.0140105078809105, "grad_norm": 0.205309197306633, "learning_rate": 2.6085653708390936e-06, "loss": 0.447, "step": 2292 }, { "epoch": 4.015761821366024, "grad_norm": 0.20453983545303345, "learning_rate": 2.604415134026509e-06, "loss": 0.4605, "step": 2293 }, { "epoch": 4.017513134851138, "grad_norm": 0.20557504892349243, "learning_rate": 2.6002670383811724e-06, "loss": 0.4358, "step": 2294 }, { "epoch": 4.019264448336252, "grad_norm": 0.22608932852745056, "learning_rate": 2.596121087610636e-06, "loss": 0.4495, "step": 2295 }, { "epoch": 4.021015761821366, "grad_norm": 0.19931770861148834, "learning_rate": 2.591977285420545e-06, "loss": 0.4414, "step": 2296 }, { "epoch": 4.02276707530648, "grad_norm": 0.2097368985414505, "learning_rate": 2.587835635514613e-06, "loss": 0.4474, "step": 2297 }, { "epoch": 4.024518388791594, "grad_norm": 0.20020104944705963, "learning_rate": 2.5836961415946394e-06, "loss": 0.4543, "step": 2298 }, { "epoch": 4.026269702276707, "grad_norm": 0.1907244473695755, "learning_rate": 2.579558807360489e-06, "loss": 0.4404, "step": 2299 }, { "epoch": 4.028021015761821, "grad_norm": 0.20167069137096405, "learning_rate": 2.5754236365101024e-06, "loss": 0.4488, "step": 2300 }, { "epoch": 4.029772329246935, "grad_norm": 0.19978542625904083, "learning_rate": 2.5712906327394836e-06, "loss": 0.4549, "step": 2301 }, { "epoch": 4.031523642732049, "grad_norm": 0.2129971981048584, "learning_rate": 2.5671597997427023e-06, "loss": 0.4478, "step": 2302 }, { "epoch": 4.033274956217163, "grad_norm": 0.20118184387683868, "learning_rate": 2.5630311412118815e-06, "loss": 0.4395, "step": 2303 }, { "epoch": 4.035026269702277, "grad_norm": 0.20418570935726166, "learning_rate": 2.558904660837209e-06, "loss": 0.4418, "step": 2304 }, { "epoch": 4.036777583187391, "grad_norm": 0.20190295577049255, "learning_rate": 2.554780362306918e-06, "loss": 0.4353, "step": 2305 }, { "epoch": 4.038528896672505, "grad_norm": 0.2176922857761383, "learning_rate": 2.5506582493072994e-06, "loss": 0.4425, "step": 2306 }, { "epoch": 4.0402802101576185, "grad_norm": 0.19553542137145996, "learning_rate": 2.546538325522682e-06, "loss": 0.4443, "step": 2307 }, { "epoch": 4.042031523642732, "grad_norm": 0.213442862033844, "learning_rate": 2.5424205946354475e-06, "loss": 0.4484, "step": 2308 }, { "epoch": 4.0437828371278455, "grad_norm": 0.1984376311302185, "learning_rate": 2.5383050603260074e-06, "loss": 0.4434, "step": 2309 }, { "epoch": 4.045534150612959, "grad_norm": 0.18011341989040375, "learning_rate": 2.53419172627282e-06, "loss": 0.4352, "step": 2310 }, { "epoch": 4.047285464098073, "grad_norm": 0.2093537300825119, "learning_rate": 2.5300805961523677e-06, "loss": 0.446, "step": 2311 }, { "epoch": 4.049036777583187, "grad_norm": 0.200219988822937, "learning_rate": 2.525971673639172e-06, "loss": 0.453, "step": 2312 }, { "epoch": 4.050788091068301, "grad_norm": 0.20340774953365326, "learning_rate": 2.5218649624057717e-06, "loss": 0.4486, "step": 2313 }, { "epoch": 4.052539404553415, "grad_norm": 0.19928807020187378, "learning_rate": 2.517760466122739e-06, "loss": 0.4562, "step": 2314 }, { "epoch": 4.054290718038529, "grad_norm": 0.2088070809841156, "learning_rate": 2.513658188458657e-06, "loss": 0.446, "step": 2315 }, { "epoch": 4.056042031523643, "grad_norm": 0.19059933722019196, "learning_rate": 2.5095581330801345e-06, "loss": 0.4381, "step": 2316 }, { "epoch": 4.057793345008757, "grad_norm": 0.1996975541114807, "learning_rate": 2.5054603036517856e-06, "loss": 0.4477, "step": 2317 }, { "epoch": 4.059544658493871, "grad_norm": 0.2126842737197876, "learning_rate": 2.5013647038362437e-06, "loss": 0.4392, "step": 2318 }, { "epoch": 4.061295971978984, "grad_norm": 0.19933517277240753, "learning_rate": 2.4972713372941406e-06, "loss": 0.4386, "step": 2319 }, { "epoch": 4.063047285464098, "grad_norm": 0.2043513059616089, "learning_rate": 2.4931802076841176e-06, "loss": 0.4444, "step": 2320 }, { "epoch": 4.064798598949212, "grad_norm": 0.2009945809841156, "learning_rate": 2.4890913186628178e-06, "loss": 0.4545, "step": 2321 }, { "epoch": 4.066549912434326, "grad_norm": 0.2049172818660736, "learning_rate": 2.4850046738848752e-06, "loss": 0.4378, "step": 2322 }, { "epoch": 4.0683012259194395, "grad_norm": 0.19202595949172974, "learning_rate": 2.480920277002926e-06, "loss": 0.4371, "step": 2323 }, { "epoch": 4.0700525394045535, "grad_norm": 0.20455335080623627, "learning_rate": 2.4768381316675883e-06, "loss": 0.4306, "step": 2324 }, { "epoch": 4.071803852889667, "grad_norm": 0.19036826491355896, "learning_rate": 2.472758241527477e-06, "loss": 0.4492, "step": 2325 }, { "epoch": 4.073555166374781, "grad_norm": 0.20109494030475616, "learning_rate": 2.468680610229182e-06, "loss": 0.4486, "step": 2326 }, { "epoch": 4.075306479859895, "grad_norm": 0.20436914265155792, "learning_rate": 2.4646052414172844e-06, "loss": 0.4407, "step": 2327 }, { "epoch": 4.077057793345009, "grad_norm": 0.19247280061244965, "learning_rate": 2.4605321387343334e-06, "loss": 0.4493, "step": 2328 }, { "epoch": 4.078809106830122, "grad_norm": 0.18818528950214386, "learning_rate": 2.4564613058208603e-06, "loss": 0.444, "step": 2329 }, { "epoch": 4.080560420315236, "grad_norm": 0.2115478366613388, "learning_rate": 2.452392746315361e-06, "loss": 0.4467, "step": 2330 }, { "epoch": 4.08231173380035, "grad_norm": 0.18842513859272003, "learning_rate": 2.4483264638543076e-06, "loss": 0.4311, "step": 2331 }, { "epoch": 4.084063047285464, "grad_norm": 0.18694466352462769, "learning_rate": 2.4442624620721274e-06, "loss": 0.4376, "step": 2332 }, { "epoch": 4.085814360770578, "grad_norm": 0.18634238839149475, "learning_rate": 2.440200744601218e-06, "loss": 0.4371, "step": 2333 }, { "epoch": 4.087565674255692, "grad_norm": 0.1977769434452057, "learning_rate": 2.4361413150719287e-06, "loss": 0.4458, "step": 2334 }, { "epoch": 4.089316987740806, "grad_norm": 0.20030644536018372, "learning_rate": 2.432084177112569e-06, "loss": 0.4477, "step": 2335 }, { "epoch": 4.09106830122592, "grad_norm": 0.194778174161911, "learning_rate": 2.4280293343493954e-06, "loss": 0.4411, "step": 2336 }, { "epoch": 4.092819614711034, "grad_norm": 0.1981002539396286, "learning_rate": 2.423976790406616e-06, "loss": 0.4469, "step": 2337 }, { "epoch": 4.0945709281961475, "grad_norm": 0.19796708226203918, "learning_rate": 2.4199265489063846e-06, "loss": 0.4312, "step": 2338 }, { "epoch": 4.096322241681261, "grad_norm": 0.19003282487392426, "learning_rate": 2.4158786134687966e-06, "loss": 0.4497, "step": 2339 }, { "epoch": 4.0980735551663745, "grad_norm": 0.19754332304000854, "learning_rate": 2.4118329877118836e-06, "loss": 0.4389, "step": 2340 }, { "epoch": 4.099824868651488, "grad_norm": 0.18681828677654266, "learning_rate": 2.407789675251617e-06, "loss": 0.445, "step": 2341 }, { "epoch": 4.101576182136602, "grad_norm": 0.1962013691663742, "learning_rate": 2.403748679701896e-06, "loss": 0.443, "step": 2342 }, { "epoch": 4.103327495621716, "grad_norm": 0.21758933365345, "learning_rate": 2.3997100046745554e-06, "loss": 0.4414, "step": 2343 }, { "epoch": 4.10507880910683, "grad_norm": 0.19161191582679749, "learning_rate": 2.3956736537793468e-06, "loss": 0.4396, "step": 2344 }, { "epoch": 4.106830122591944, "grad_norm": 0.19337643682956696, "learning_rate": 2.3916396306239525e-06, "loss": 0.4417, "step": 2345 }, { "epoch": 4.108581436077058, "grad_norm": 0.1950804442167282, "learning_rate": 2.3876079388139733e-06, "loss": 0.4419, "step": 2346 }, { "epoch": 4.110332749562172, "grad_norm": 0.20351946353912354, "learning_rate": 2.38357858195292e-06, "loss": 0.4383, "step": 2347 }, { "epoch": 4.112084063047286, "grad_norm": 0.1942957490682602, "learning_rate": 2.3795515636422257e-06, "loss": 0.4508, "step": 2348 }, { "epoch": 4.113835376532399, "grad_norm": 0.19797411561012268, "learning_rate": 2.3755268874812237e-06, "loss": 0.4461, "step": 2349 }, { "epoch": 4.115586690017513, "grad_norm": 0.20845496654510498, "learning_rate": 2.371504557067163e-06, "loss": 0.4384, "step": 2350 }, { "epoch": 4.117338003502627, "grad_norm": 0.19268839061260223, "learning_rate": 2.367484575995187e-06, "loss": 0.4428, "step": 2351 }, { "epoch": 4.119089316987741, "grad_norm": 0.19510866701602936, "learning_rate": 2.3634669478583484e-06, "loss": 0.4509, "step": 2352 }, { "epoch": 4.120840630472855, "grad_norm": 0.21246735751628876, "learning_rate": 2.3594516762475882e-06, "loss": 0.4531, "step": 2353 }, { "epoch": 4.122591943957969, "grad_norm": 0.19666586816310883, "learning_rate": 2.3554387647517497e-06, "loss": 0.4379, "step": 2354 }, { "epoch": 4.1243432574430825, "grad_norm": 0.19851261377334595, "learning_rate": 2.3514282169575575e-06, "loss": 0.4507, "step": 2355 }, { "epoch": 4.126094570928196, "grad_norm": 0.2009512335062027, "learning_rate": 2.3474200364496326e-06, "loss": 0.4368, "step": 2356 }, { "epoch": 4.12784588441331, "grad_norm": 0.19704148173332214, "learning_rate": 2.343414226810472e-06, "loss": 0.4322, "step": 2357 }, { "epoch": 4.129597197898423, "grad_norm": 0.19236518442630768, "learning_rate": 2.3394107916204617e-06, "loss": 0.4451, "step": 2358 }, { "epoch": 4.131348511383537, "grad_norm": 0.1995588093996048, "learning_rate": 2.3354097344578565e-06, "loss": 0.4365, "step": 2359 }, { "epoch": 4.133099824868651, "grad_norm": 0.2041490077972412, "learning_rate": 2.3314110588987946e-06, "loss": 0.4433, "step": 2360 }, { "epoch": 4.134851138353765, "grad_norm": 0.19718019664287567, "learning_rate": 2.327414768517278e-06, "loss": 0.4364, "step": 2361 }, { "epoch": 4.136602451838879, "grad_norm": 0.20429936051368713, "learning_rate": 2.3234208668851843e-06, "loss": 0.4421, "step": 2362 }, { "epoch": 4.138353765323993, "grad_norm": 0.21696601808071136, "learning_rate": 2.3194293575722464e-06, "loss": 0.4405, "step": 2363 }, { "epoch": 4.140105078809107, "grad_norm": 0.20063844323158264, "learning_rate": 2.3154402441460685e-06, "loss": 0.4381, "step": 2364 }, { "epoch": 4.141856392294221, "grad_norm": 0.18610960245132446, "learning_rate": 2.311453530172109e-06, "loss": 0.4436, "step": 2365 }, { "epoch": 4.143607705779335, "grad_norm": 0.205152228474617, "learning_rate": 2.3074692192136803e-06, "loss": 0.4456, "step": 2366 }, { "epoch": 4.145359019264449, "grad_norm": 0.18401828408241272, "learning_rate": 2.303487314831952e-06, "loss": 0.4336, "step": 2367 }, { "epoch": 4.147110332749562, "grad_norm": 0.19570638239383698, "learning_rate": 2.2995078205859344e-06, "loss": 0.4404, "step": 2368 }, { "epoch": 4.148861646234676, "grad_norm": 0.2045496255159378, "learning_rate": 2.295530740032494e-06, "loss": 0.4411, "step": 2369 }, { "epoch": 4.15061295971979, "grad_norm": 0.2025604397058487, "learning_rate": 2.2915560767263296e-06, "loss": 0.4464, "step": 2370 }, { "epoch": 4.1523642732049035, "grad_norm": 0.22357074916362762, "learning_rate": 2.287583834219989e-06, "loss": 0.4499, "step": 2371 }, { "epoch": 4.1541155866900175, "grad_norm": 0.21349874138832092, "learning_rate": 2.2836140160638477e-06, "loss": 0.4543, "step": 2372 }, { "epoch": 4.155866900175131, "grad_norm": 0.18732772767543793, "learning_rate": 2.2796466258061223e-06, "loss": 0.4433, "step": 2373 }, { "epoch": 4.157618213660245, "grad_norm": 0.23758696019649506, "learning_rate": 2.2756816669928516e-06, "loss": 0.4357, "step": 2374 }, { "epoch": 4.159369527145359, "grad_norm": 0.1971878856420517, "learning_rate": 2.2717191431679085e-06, "loss": 0.4423, "step": 2375 }, { "epoch": 4.161120840630473, "grad_norm": 0.18399740755558014, "learning_rate": 2.2677590578729815e-06, "loss": 0.4412, "step": 2376 }, { "epoch": 4.162872154115587, "grad_norm": 0.2037816196680069, "learning_rate": 2.2638014146475894e-06, "loss": 0.4482, "step": 2377 }, { "epoch": 4.1646234676007, "grad_norm": 0.2149297297000885, "learning_rate": 2.2598462170290598e-06, "loss": 0.4436, "step": 2378 }, { "epoch": 4.166374781085814, "grad_norm": 0.18959595263004303, "learning_rate": 2.25589346855254e-06, "loss": 0.4404, "step": 2379 }, { "epoch": 4.168126094570928, "grad_norm": 0.18842807412147522, "learning_rate": 2.2519431727509823e-06, "loss": 0.4376, "step": 2380 }, { "epoch": 4.169877408056042, "grad_norm": 0.18514296412467957, "learning_rate": 2.2479953331551534e-06, "loss": 0.4523, "step": 2381 }, { "epoch": 4.171628721541156, "grad_norm": 0.19816093146800995, "learning_rate": 2.2440499532936196e-06, "loss": 0.4495, "step": 2382 }, { "epoch": 4.17338003502627, "grad_norm": 0.18251432478427887, "learning_rate": 2.240107036692753e-06, "loss": 0.4423, "step": 2383 }, { "epoch": 4.175131348511384, "grad_norm": 0.1843310296535492, "learning_rate": 2.2361665868767185e-06, "loss": 0.4496, "step": 2384 }, { "epoch": 4.176882661996498, "grad_norm": 0.20009943842887878, "learning_rate": 2.2322286073674832e-06, "loss": 0.4435, "step": 2385 }, { "epoch": 4.1786339754816115, "grad_norm": 0.1882532238960266, "learning_rate": 2.228293101684799e-06, "loss": 0.4481, "step": 2386 }, { "epoch": 4.1803852889667255, "grad_norm": 0.18110540509223938, "learning_rate": 2.224360073346213e-06, "loss": 0.4363, "step": 2387 }, { "epoch": 4.1821366024518385, "grad_norm": 0.1995220184326172, "learning_rate": 2.2204295258670524e-06, "loss": 0.4475, "step": 2388 }, { "epoch": 4.183887915936952, "grad_norm": 0.2037324160337448, "learning_rate": 2.2165014627604313e-06, "loss": 0.4475, "step": 2389 }, { "epoch": 4.185639229422066, "grad_norm": 0.190536230802536, "learning_rate": 2.212575887537244e-06, "loss": 0.4342, "step": 2390 }, { "epoch": 4.18739054290718, "grad_norm": 0.2015506774187088, "learning_rate": 2.208652803706155e-06, "loss": 0.4568, "step": 2391 }, { "epoch": 4.189141856392294, "grad_norm": 0.1903296560049057, "learning_rate": 2.2047322147736096e-06, "loss": 0.4422, "step": 2392 }, { "epoch": 4.190893169877408, "grad_norm": 0.20880772173404694, "learning_rate": 2.2008141242438165e-06, "loss": 0.4491, "step": 2393 }, { "epoch": 4.192644483362522, "grad_norm": 0.2130403220653534, "learning_rate": 2.196898535618757e-06, "loss": 0.4434, "step": 2394 }, { "epoch": 4.194395796847636, "grad_norm": 0.221250519156456, "learning_rate": 2.192985452398171e-06, "loss": 0.4494, "step": 2395 }, { "epoch": 4.19614711033275, "grad_norm": 0.20668071508407593, "learning_rate": 2.1890748780795644e-06, "loss": 0.4527, "step": 2396 }, { "epoch": 4.197898423817864, "grad_norm": 0.18673354387283325, "learning_rate": 2.1851668161581946e-06, "loss": 0.4497, "step": 2397 }, { "epoch": 4.199649737302977, "grad_norm": 0.1916963905096054, "learning_rate": 2.181261270127081e-06, "loss": 0.4465, "step": 2398 }, { "epoch": 4.201401050788091, "grad_norm": 0.20407924056053162, "learning_rate": 2.1773582434769854e-06, "loss": 0.4442, "step": 2399 }, { "epoch": 4.203152364273205, "grad_norm": 0.2020706981420517, "learning_rate": 2.1734577396964264e-06, "loss": 0.4444, "step": 2400 }, { "epoch": 4.204903677758319, "grad_norm": 0.20074722170829773, "learning_rate": 2.1695597622716603e-06, "loss": 0.443, "step": 2401 }, { "epoch": 4.206654991243433, "grad_norm": 0.19509154558181763, "learning_rate": 2.165664314686692e-06, "loss": 0.4402, "step": 2402 }, { "epoch": 4.2084063047285465, "grad_norm": 0.2059420347213745, "learning_rate": 2.1617714004232588e-06, "loss": 0.4484, "step": 2403 }, { "epoch": 4.21015761821366, "grad_norm": 0.18641522526741028, "learning_rate": 2.15788102296084e-06, "loss": 0.4457, "step": 2404 }, { "epoch": 4.211908931698774, "grad_norm": 0.20743213593959808, "learning_rate": 2.153993185776641e-06, "loss": 0.442, "step": 2405 }, { "epoch": 4.213660245183888, "grad_norm": 0.2014147788286209, "learning_rate": 2.150107892345602e-06, "loss": 0.4393, "step": 2406 }, { "epoch": 4.215411558669002, "grad_norm": 0.1996014565229416, "learning_rate": 2.14622514614039e-06, "loss": 0.4557, "step": 2407 }, { "epoch": 4.217162872154115, "grad_norm": 0.18913216888904572, "learning_rate": 2.142344950631388e-06, "loss": 0.4426, "step": 2408 }, { "epoch": 4.218914185639229, "grad_norm": 0.1955144703388214, "learning_rate": 2.138467309286709e-06, "loss": 0.4443, "step": 2409 }, { "epoch": 4.220665499124343, "grad_norm": 0.18979205191135406, "learning_rate": 2.1345922255721745e-06, "loss": 0.4362, "step": 2410 }, { "epoch": 4.222416812609457, "grad_norm": 0.19166506826877594, "learning_rate": 2.1307197029513276e-06, "loss": 0.4407, "step": 2411 }, { "epoch": 4.224168126094571, "grad_norm": 0.19717451930046082, "learning_rate": 2.1268497448854143e-06, "loss": 0.445, "step": 2412 }, { "epoch": 4.225919439579685, "grad_norm": 0.1878136694431305, "learning_rate": 2.1229823548333965e-06, "loss": 0.4335, "step": 2413 }, { "epoch": 4.227670753064799, "grad_norm": 0.18415404856204987, "learning_rate": 2.1191175362519316e-06, "loss": 0.4371, "step": 2414 }, { "epoch": 4.229422066549913, "grad_norm": 0.20404976606369019, "learning_rate": 2.115255292595391e-06, "loss": 0.4389, "step": 2415 }, { "epoch": 4.231173380035027, "grad_norm": 0.19489754736423492, "learning_rate": 2.111395627315832e-06, "loss": 0.447, "step": 2416 }, { "epoch": 4.23292469352014, "grad_norm": 0.1939016878604889, "learning_rate": 2.1075385438630168e-06, "loss": 0.4499, "step": 2417 }, { "epoch": 4.234676007005254, "grad_norm": 0.2002265304327011, "learning_rate": 2.103684045684393e-06, "loss": 0.4424, "step": 2418 }, { "epoch": 4.2364273204903675, "grad_norm": 0.19343554973602295, "learning_rate": 2.0998321362251036e-06, "loss": 0.4398, "step": 2419 }, { "epoch": 4.2381786339754814, "grad_norm": 0.20907309651374817, "learning_rate": 2.0959828189279712e-06, "loss": 0.446, "step": 2420 }, { "epoch": 4.239929947460595, "grad_norm": 0.1997525840997696, "learning_rate": 2.0921360972335086e-06, "loss": 0.4443, "step": 2421 }, { "epoch": 4.241681260945709, "grad_norm": 0.19612444937229156, "learning_rate": 2.0882919745799023e-06, "loss": 0.4476, "step": 2422 }, { "epoch": 4.243432574430823, "grad_norm": 0.21656812727451324, "learning_rate": 2.084450454403022e-06, "loss": 0.4394, "step": 2423 }, { "epoch": 4.245183887915937, "grad_norm": 0.20946522057056427, "learning_rate": 2.0806115401364036e-06, "loss": 0.4431, "step": 2424 }, { "epoch": 4.246935201401051, "grad_norm": 0.21186482906341553, "learning_rate": 2.076775235211263e-06, "loss": 0.4468, "step": 2425 }, { "epoch": 4.248686514886165, "grad_norm": 0.2050619274377823, "learning_rate": 2.0729415430564748e-06, "loss": 0.4497, "step": 2426 }, { "epoch": 4.250437828371279, "grad_norm": 0.20558154582977295, "learning_rate": 2.0691104670985868e-06, "loss": 0.4537, "step": 2427 }, { "epoch": 4.252189141856392, "grad_norm": 0.21000975370407104, "learning_rate": 2.0652820107618e-06, "loss": 0.4488, "step": 2428 }, { "epoch": 4.253940455341506, "grad_norm": 0.20923325419425964, "learning_rate": 2.0614561774679826e-06, "loss": 0.4523, "step": 2429 }, { "epoch": 4.25569176882662, "grad_norm": 0.1945674866437912, "learning_rate": 2.0576329706366494e-06, "loss": 0.4452, "step": 2430 }, { "epoch": 4.257443082311734, "grad_norm": 0.20008769631385803, "learning_rate": 2.053812393684977e-06, "loss": 0.4466, "step": 2431 }, { "epoch": 4.259194395796848, "grad_norm": 0.19468821585178375, "learning_rate": 2.0499944500277825e-06, "loss": 0.4521, "step": 2432 }, { "epoch": 4.260945709281962, "grad_norm": 0.21135465800762177, "learning_rate": 2.046179143077535e-06, "loss": 0.4406, "step": 2433 }, { "epoch": 4.2626970227670755, "grad_norm": 0.19755582511425018, "learning_rate": 2.0423664762443483e-06, "loss": 0.4514, "step": 2434 }, { "epoch": 4.264448336252189, "grad_norm": 0.18632306158542633, "learning_rate": 2.0385564529359696e-06, "loss": 0.4461, "step": 2435 }, { "epoch": 4.266199649737303, "grad_norm": 0.19761861860752106, "learning_rate": 2.034749076557791e-06, "loss": 0.4349, "step": 2436 }, { "epoch": 4.267950963222416, "grad_norm": 0.20615805685520172, "learning_rate": 2.030944350512831e-06, "loss": 0.4443, "step": 2437 }, { "epoch": 4.26970227670753, "grad_norm": 0.18164633214473724, "learning_rate": 2.027142278201748e-06, "loss": 0.4325, "step": 2438 }, { "epoch": 4.271453590192644, "grad_norm": 0.19090235233306885, "learning_rate": 2.023342863022819e-06, "loss": 0.4314, "step": 2439 }, { "epoch": 4.273204903677758, "grad_norm": 0.19961769878864288, "learning_rate": 2.0195461083719557e-06, "loss": 0.4438, "step": 2440 }, { "epoch": 4.274956217162872, "grad_norm": 0.2046177089214325, "learning_rate": 2.015752017642684e-06, "loss": 0.4533, "step": 2441 }, { "epoch": 4.276707530647986, "grad_norm": 0.19360627233982086, "learning_rate": 2.0119605942261535e-06, "loss": 0.4452, "step": 2442 }, { "epoch": 4.2784588441331, "grad_norm": 0.1950562298297882, "learning_rate": 2.0081718415111263e-06, "loss": 0.4462, "step": 2443 }, { "epoch": 4.280210157618214, "grad_norm": 0.1982090324163437, "learning_rate": 2.004385762883982e-06, "loss": 0.4448, "step": 2444 }, { "epoch": 4.281961471103328, "grad_norm": 0.19751675426959991, "learning_rate": 2.000602361728704e-06, "loss": 0.4427, "step": 2445 }, { "epoch": 4.283712784588442, "grad_norm": 0.20399275422096252, "learning_rate": 1.99682164142689e-06, "loss": 0.44, "step": 2446 }, { "epoch": 4.285464098073555, "grad_norm": 0.19284959137439728, "learning_rate": 1.993043605357733e-06, "loss": 0.4523, "step": 2447 }, { "epoch": 4.287215411558669, "grad_norm": 0.20713481307029724, "learning_rate": 1.989268256898036e-06, "loss": 0.4428, "step": 2448 }, { "epoch": 4.288966725043783, "grad_norm": 0.20101502537727356, "learning_rate": 1.985495599422191e-06, "loss": 0.4465, "step": 2449 }, { "epoch": 4.2907180385288965, "grad_norm": 0.1958715170621872, "learning_rate": 1.9817256363021907e-06, "loss": 0.4404, "step": 2450 }, { "epoch": 4.2924693520140105, "grad_norm": 0.18420349061489105, "learning_rate": 1.9779583709076207e-06, "loss": 0.4521, "step": 2451 }, { "epoch": 4.294220665499124, "grad_norm": 0.2039525955915451, "learning_rate": 1.9741938066056475e-06, "loss": 0.4384, "step": 2452 }, { "epoch": 4.295971978984238, "grad_norm": 0.21783152222633362, "learning_rate": 1.970431946761031e-06, "loss": 0.436, "step": 2453 }, { "epoch": 4.297723292469352, "grad_norm": 0.2019842118024826, "learning_rate": 1.9666727947361135e-06, "loss": 0.4509, "step": 2454 }, { "epoch": 4.299474605954466, "grad_norm": 0.19562357664108276, "learning_rate": 1.962916353890811e-06, "loss": 0.4507, "step": 2455 }, { "epoch": 4.301225919439579, "grad_norm": 0.20720234513282776, "learning_rate": 1.959162627582623e-06, "loss": 0.4444, "step": 2456 }, { "epoch": 4.302977232924693, "grad_norm": 0.21145720779895782, "learning_rate": 1.955411619166617e-06, "loss": 0.4473, "step": 2457 }, { "epoch": 4.304728546409807, "grad_norm": 0.18154136836528778, "learning_rate": 1.951663331995436e-06, "loss": 0.4463, "step": 2458 }, { "epoch": 4.306479859894921, "grad_norm": 0.1826966106891632, "learning_rate": 1.94791776941929e-06, "loss": 0.439, "step": 2459 }, { "epoch": 4.308231173380035, "grad_norm": 0.19608238339424133, "learning_rate": 1.9441749347859503e-06, "loss": 0.4419, "step": 2460 }, { "epoch": 4.309982486865149, "grad_norm": 0.18760983645915985, "learning_rate": 1.9404348314407544e-06, "loss": 0.4345, "step": 2461 }, { "epoch": 4.311733800350263, "grad_norm": 0.18777252733707428, "learning_rate": 1.9366974627265945e-06, "loss": 0.4486, "step": 2462 }, { "epoch": 4.313485113835377, "grad_norm": 0.19393132627010345, "learning_rate": 1.9329628319839227e-06, "loss": 0.4456, "step": 2463 }, { "epoch": 4.315236427320491, "grad_norm": 0.19096344709396362, "learning_rate": 1.929230942550739e-06, "loss": 0.4572, "step": 2464 }, { "epoch": 4.3169877408056045, "grad_norm": 0.1891128420829773, "learning_rate": 1.9255017977625995e-06, "loss": 0.4498, "step": 2465 }, { "epoch": 4.3187390542907185, "grad_norm": 0.1848282814025879, "learning_rate": 1.9217754009526e-06, "loss": 0.4539, "step": 2466 }, { "epoch": 4.3204903677758315, "grad_norm": 0.18754084408283234, "learning_rate": 1.9180517554513884e-06, "loss": 0.4448, "step": 2467 }, { "epoch": 4.322241681260945, "grad_norm": 0.18674413859844208, "learning_rate": 1.914330864587144e-06, "loss": 0.4415, "step": 2468 }, { "epoch": 4.323992994746059, "grad_norm": 0.18801620602607727, "learning_rate": 1.910612731685593e-06, "loss": 0.4417, "step": 2469 }, { "epoch": 4.325744308231173, "grad_norm": 0.18046508729457855, "learning_rate": 1.9068973600699887e-06, "loss": 0.4338, "step": 2470 }, { "epoch": 4.327495621716287, "grad_norm": 0.1856769174337387, "learning_rate": 1.9031847530611237e-06, "loss": 0.4372, "step": 2471 }, { "epoch": 4.329246935201401, "grad_norm": 0.18567900359630585, "learning_rate": 1.8994749139773133e-06, "loss": 0.4478, "step": 2472 }, { "epoch": 4.330998248686515, "grad_norm": 0.18434803187847137, "learning_rate": 1.8957678461344048e-06, "loss": 0.4552, "step": 2473 }, { "epoch": 4.332749562171629, "grad_norm": 0.1914127618074417, "learning_rate": 1.892063552845761e-06, "loss": 0.442, "step": 2474 }, { "epoch": 4.334500875656743, "grad_norm": 0.1958920955657959, "learning_rate": 1.888362037422274e-06, "loss": 0.4476, "step": 2475 }, { "epoch": 4.336252189141856, "grad_norm": 0.18116575479507446, "learning_rate": 1.8846633031723438e-06, "loss": 0.4577, "step": 2476 }, { "epoch": 4.33800350262697, "grad_norm": 0.19646622240543365, "learning_rate": 1.8809673534018908e-06, "loss": 0.4276, "step": 2477 }, { "epoch": 4.339754816112084, "grad_norm": 0.18913552165031433, "learning_rate": 1.8772741914143472e-06, "loss": 0.453, "step": 2478 }, { "epoch": 4.341506129597198, "grad_norm": 0.19566214084625244, "learning_rate": 1.873583820510647e-06, "loss": 0.4325, "step": 2479 }, { "epoch": 4.343257443082312, "grad_norm": 0.19312849640846252, "learning_rate": 1.8698962439892383e-06, "loss": 0.4409, "step": 2480 }, { "epoch": 4.345008756567426, "grad_norm": 0.1927546113729477, "learning_rate": 1.8662114651460617e-06, "loss": 0.445, "step": 2481 }, { "epoch": 4.3467600700525395, "grad_norm": 0.19328242540359497, "learning_rate": 1.8625294872745675e-06, "loss": 0.4431, "step": 2482 }, { "epoch": 4.348511383537653, "grad_norm": 0.19125834107398987, "learning_rate": 1.8588503136656938e-06, "loss": 0.437, "step": 2483 }, { "epoch": 4.350262697022767, "grad_norm": 0.20308637619018555, "learning_rate": 1.8551739476078784e-06, "loss": 0.4473, "step": 2484 }, { "epoch": 4.352014010507881, "grad_norm": 0.19868417084217072, "learning_rate": 1.851500392387045e-06, "loss": 0.4396, "step": 2485 }, { "epoch": 4.353765323992995, "grad_norm": 0.1941150426864624, "learning_rate": 1.8478296512866105e-06, "loss": 0.4414, "step": 2486 }, { "epoch": 4.355516637478108, "grad_norm": 0.17802336812019348, "learning_rate": 1.8441617275874701e-06, "loss": 0.4405, "step": 2487 }, { "epoch": 4.357267950963222, "grad_norm": 0.19954873621463776, "learning_rate": 1.8404966245680068e-06, "loss": 0.4583, "step": 2488 }, { "epoch": 4.359019264448336, "grad_norm": 0.19692030549049377, "learning_rate": 1.836834345504076e-06, "loss": 0.4405, "step": 2489 }, { "epoch": 4.36077057793345, "grad_norm": 0.20099438726902008, "learning_rate": 1.8331748936690175e-06, "loss": 0.4309, "step": 2490 }, { "epoch": 4.362521891418564, "grad_norm": 0.18351592123508453, "learning_rate": 1.829518272333633e-06, "loss": 0.4463, "step": 2491 }, { "epoch": 4.364273204903678, "grad_norm": 0.20933455228805542, "learning_rate": 1.8258644847662087e-06, "loss": 0.4495, "step": 2492 }, { "epoch": 4.366024518388792, "grad_norm": 0.18967418372631073, "learning_rate": 1.8222135342324844e-06, "loss": 0.4387, "step": 2493 }, { "epoch": 4.367775831873906, "grad_norm": 0.2035159319639206, "learning_rate": 1.8185654239956723e-06, "loss": 0.4447, "step": 2494 }, { "epoch": 4.36952714535902, "grad_norm": 0.19757504761219025, "learning_rate": 1.8149201573164415e-06, "loss": 0.4426, "step": 2495 }, { "epoch": 4.371278458844133, "grad_norm": 0.19712916016578674, "learning_rate": 1.811277737452924e-06, "loss": 0.4423, "step": 2496 }, { "epoch": 4.373029772329247, "grad_norm": 0.1992250233888626, "learning_rate": 1.807638167660702e-06, "loss": 0.4475, "step": 2497 }, { "epoch": 4.3747810858143605, "grad_norm": 0.19208073616027832, "learning_rate": 1.8040014511928155e-06, "loss": 0.4374, "step": 2498 }, { "epoch": 4.3765323992994745, "grad_norm": 0.19424568116664886, "learning_rate": 1.8003675912997487e-06, "loss": 0.4433, "step": 2499 }, { "epoch": 4.378283712784588, "grad_norm": 0.19591772556304932, "learning_rate": 1.7967365912294398e-06, "loss": 0.4541, "step": 2500 }, { "epoch": 4.380035026269702, "grad_norm": 0.18505723774433136, "learning_rate": 1.7931084542272632e-06, "loss": 0.4441, "step": 2501 }, { "epoch": 4.381786339754816, "grad_norm": 0.19415037333965302, "learning_rate": 1.7894831835360387e-06, "loss": 0.4433, "step": 2502 }, { "epoch": 4.38353765323993, "grad_norm": 0.190581277012825, "learning_rate": 1.7858607823960262e-06, "loss": 0.4329, "step": 2503 }, { "epoch": 4.385288966725044, "grad_norm": 0.1884370893239975, "learning_rate": 1.7822412540449141e-06, "loss": 0.4448, "step": 2504 }, { "epoch": 4.387040280210158, "grad_norm": 0.19805777072906494, "learning_rate": 1.77862460171783e-06, "loss": 0.4458, "step": 2505 }, { "epoch": 4.388791593695271, "grad_norm": 0.18981699645519257, "learning_rate": 1.7750108286473255e-06, "loss": 0.4411, "step": 2506 }, { "epoch": 4.390542907180385, "grad_norm": 0.17939285933971405, "learning_rate": 1.771399938063384e-06, "loss": 0.4428, "step": 2507 }, { "epoch": 4.392294220665499, "grad_norm": 0.18342307209968567, "learning_rate": 1.7677919331934057e-06, "loss": 0.4412, "step": 2508 }, { "epoch": 4.394045534150613, "grad_norm": 0.1887931376695633, "learning_rate": 1.7641868172622201e-06, "loss": 0.4531, "step": 2509 }, { "epoch": 4.395796847635727, "grad_norm": 0.2024248242378235, "learning_rate": 1.7605845934920663e-06, "loss": 0.4456, "step": 2510 }, { "epoch": 4.397548161120841, "grad_norm": 0.18996797502040863, "learning_rate": 1.7569852651026048e-06, "loss": 0.445, "step": 2511 }, { "epoch": 4.399299474605955, "grad_norm": 0.19526568055152893, "learning_rate": 1.7533888353109029e-06, "loss": 0.4437, "step": 2512 }, { "epoch": 4.4010507880910685, "grad_norm": 0.18529021739959717, "learning_rate": 1.7497953073314433e-06, "loss": 0.4415, "step": 2513 }, { "epoch": 4.4028021015761825, "grad_norm": 0.18522337079048157, "learning_rate": 1.7462046843761078e-06, "loss": 0.4482, "step": 2514 }, { "epoch": 4.404553415061296, "grad_norm": 0.19596901535987854, "learning_rate": 1.7426169696541884e-06, "loss": 0.4476, "step": 2515 }, { "epoch": 4.406304728546409, "grad_norm": 0.20372462272644043, "learning_rate": 1.739032166372372e-06, "loss": 0.4547, "step": 2516 }, { "epoch": 4.408056042031523, "grad_norm": 0.1738111972808838, "learning_rate": 1.7354502777347487e-06, "loss": 0.4428, "step": 2517 }, { "epoch": 4.409807355516637, "grad_norm": 0.18926116824150085, "learning_rate": 1.731871306942799e-06, "loss": 0.4443, "step": 2518 }, { "epoch": 4.411558669001751, "grad_norm": 0.2018405795097351, "learning_rate": 1.7282952571953987e-06, "loss": 0.4577, "step": 2519 }, { "epoch": 4.413309982486865, "grad_norm": 0.19830143451690674, "learning_rate": 1.724722131688809e-06, "loss": 0.4502, "step": 2520 }, { "epoch": 4.415061295971979, "grad_norm": 0.18817836046218872, "learning_rate": 1.721151933616682e-06, "loss": 0.4443, "step": 2521 }, { "epoch": 4.416812609457093, "grad_norm": 0.18891680240631104, "learning_rate": 1.7175846661700518e-06, "loss": 0.4371, "step": 2522 }, { "epoch": 4.418563922942207, "grad_norm": 0.19447599351406097, "learning_rate": 1.7140203325373294e-06, "loss": 0.4422, "step": 2523 }, { "epoch": 4.420315236427321, "grad_norm": 0.18898914754390717, "learning_rate": 1.7104589359043095e-06, "loss": 0.4414, "step": 2524 }, { "epoch": 4.422066549912435, "grad_norm": 0.18041899800300598, "learning_rate": 1.706900479454156e-06, "loss": 0.4322, "step": 2525 }, { "epoch": 4.423817863397548, "grad_norm": 0.18399524688720703, "learning_rate": 1.7033449663674102e-06, "loss": 0.4361, "step": 2526 }, { "epoch": 4.425569176882662, "grad_norm": 0.18145202100276947, "learning_rate": 1.6997923998219774e-06, "loss": 0.4448, "step": 2527 }, { "epoch": 4.427320490367776, "grad_norm": 0.19068633019924164, "learning_rate": 1.6962427829931348e-06, "loss": 0.439, "step": 2528 }, { "epoch": 4.42907180385289, "grad_norm": 0.19482937455177307, "learning_rate": 1.6926961190535152e-06, "loss": 0.4553, "step": 2529 }, { "epoch": 4.4308231173380035, "grad_norm": 0.19733022153377533, "learning_rate": 1.6891524111731246e-06, "loss": 0.4527, "step": 2530 }, { "epoch": 4.432574430823117, "grad_norm": 0.19219112396240234, "learning_rate": 1.6856116625193135e-06, "loss": 0.4419, "step": 2531 }, { "epoch": 4.434325744308231, "grad_norm": 0.1978471279144287, "learning_rate": 1.6820738762567978e-06, "loss": 0.4502, "step": 2532 }, { "epoch": 4.436077057793345, "grad_norm": 0.19881929457187653, "learning_rate": 1.678539055547637e-06, "loss": 0.4423, "step": 2533 }, { "epoch": 4.437828371278459, "grad_norm": 0.1960039585828781, "learning_rate": 1.6750072035512482e-06, "loss": 0.4358, "step": 2534 }, { "epoch": 4.439579684763572, "grad_norm": 0.1829202026128769, "learning_rate": 1.6714783234243875e-06, "loss": 0.4432, "step": 2535 }, { "epoch": 4.441330998248686, "grad_norm": 0.192953422665596, "learning_rate": 1.6679524183211616e-06, "loss": 0.4341, "step": 2536 }, { "epoch": 4.4430823117338, "grad_norm": 0.1849454939365387, "learning_rate": 1.6644294913930115e-06, "loss": 0.4413, "step": 2537 }, { "epoch": 4.444833625218914, "grad_norm": 0.18339887261390686, "learning_rate": 1.6609095457887237e-06, "loss": 0.4378, "step": 2538 }, { "epoch": 4.446584938704028, "grad_norm": 0.18450048565864563, "learning_rate": 1.657392584654412e-06, "loss": 0.4465, "step": 2539 }, { "epoch": 4.448336252189142, "grad_norm": 0.19121824204921722, "learning_rate": 1.6538786111335303e-06, "loss": 0.4533, "step": 2540 }, { "epoch": 4.450087565674256, "grad_norm": 0.19863902032375336, "learning_rate": 1.6503676283668553e-06, "loss": 0.4511, "step": 2541 }, { "epoch": 4.45183887915937, "grad_norm": 0.19948391616344452, "learning_rate": 1.6468596394924974e-06, "loss": 0.4555, "step": 2542 }, { "epoch": 4.453590192644484, "grad_norm": 0.18150630593299866, "learning_rate": 1.6433546476458833e-06, "loss": 0.4389, "step": 2543 }, { "epoch": 4.455341506129598, "grad_norm": 0.19861501455307007, "learning_rate": 1.63985265595977e-06, "loss": 0.4392, "step": 2544 }, { "epoch": 4.4570928196147115, "grad_norm": 0.1849460005760193, "learning_rate": 1.6363536675642233e-06, "loss": 0.4322, "step": 2545 }, { "epoch": 4.4588441330998245, "grad_norm": 0.1787780523300171, "learning_rate": 1.632857685586632e-06, "loss": 0.438, "step": 2546 }, { "epoch": 4.4605954465849385, "grad_norm": 0.18933455646038055, "learning_rate": 1.629364713151697e-06, "loss": 0.4278, "step": 2547 }, { "epoch": 4.462346760070052, "grad_norm": 0.19214482605457306, "learning_rate": 1.625874753381424e-06, "loss": 0.4355, "step": 2548 }, { "epoch": 4.464098073555166, "grad_norm": 0.19586370885372162, "learning_rate": 1.6223878093951329e-06, "loss": 0.4346, "step": 2549 }, { "epoch": 4.46584938704028, "grad_norm": 0.1873989701271057, "learning_rate": 1.6189038843094419e-06, "loss": 0.4352, "step": 2550 }, { "epoch": 4.467600700525394, "grad_norm": 0.2062930166721344, "learning_rate": 1.615422981238276e-06, "loss": 0.4399, "step": 2551 }, { "epoch": 4.469352014010508, "grad_norm": 0.18334275484085083, "learning_rate": 1.6119451032928547e-06, "loss": 0.4371, "step": 2552 }, { "epoch": 4.471103327495622, "grad_norm": 0.190707728266716, "learning_rate": 1.6084702535816987e-06, "loss": 0.4449, "step": 2553 }, { "epoch": 4.472854640980736, "grad_norm": 0.20049847662448883, "learning_rate": 1.6049984352106163e-06, "loss": 0.4502, "step": 2554 }, { "epoch": 4.474605954465849, "grad_norm": 0.18532223999500275, "learning_rate": 1.6015296512827122e-06, "loss": 0.4485, "step": 2555 }, { "epoch": 4.476357267950963, "grad_norm": 0.17863832414150238, "learning_rate": 1.5980639048983726e-06, "loss": 0.4436, "step": 2556 }, { "epoch": 4.478108581436077, "grad_norm": 0.18495942652225494, "learning_rate": 1.5946011991552757e-06, "loss": 0.454, "step": 2557 }, { "epoch": 4.479859894921191, "grad_norm": 0.19324585795402527, "learning_rate": 1.5911415371483757e-06, "loss": 0.4509, "step": 2558 }, { "epoch": 4.481611208406305, "grad_norm": 0.18256796896457672, "learning_rate": 1.587684921969912e-06, "loss": 0.4406, "step": 2559 }, { "epoch": 4.483362521891419, "grad_norm": 0.19789396226406097, "learning_rate": 1.5842313567093942e-06, "loss": 0.4441, "step": 2560 }, { "epoch": 4.4851138353765325, "grad_norm": 0.1917116492986679, "learning_rate": 1.5807808444536132e-06, "loss": 0.4576, "step": 2561 }, { "epoch": 4.4868651488616464, "grad_norm": 0.19028547406196594, "learning_rate": 1.5773333882866242e-06, "loss": 0.4305, "step": 2562 }, { "epoch": 4.48861646234676, "grad_norm": 0.18520963191986084, "learning_rate": 1.5738889912897575e-06, "loss": 0.4441, "step": 2563 }, { "epoch": 4.490367775831874, "grad_norm": 0.18838217854499817, "learning_rate": 1.5704476565416027e-06, "loss": 0.4376, "step": 2564 }, { "epoch": 4.492119089316987, "grad_norm": 0.20003461837768555, "learning_rate": 1.5670093871180165e-06, "loss": 0.4377, "step": 2565 }, { "epoch": 4.493870402802101, "grad_norm": 0.20302923023700714, "learning_rate": 1.5635741860921166e-06, "loss": 0.4495, "step": 2566 }, { "epoch": 4.495621716287215, "grad_norm": 0.20121584832668304, "learning_rate": 1.5601420565342734e-06, "loss": 0.4521, "step": 2567 }, { "epoch": 4.497373029772329, "grad_norm": 0.1920022964477539, "learning_rate": 1.5567130015121169e-06, "loss": 0.4378, "step": 2568 }, { "epoch": 4.499124343257443, "grad_norm": 0.1795172095298767, "learning_rate": 1.5532870240905272e-06, "loss": 0.4427, "step": 2569 }, { "epoch": 4.500875656742557, "grad_norm": 0.1969147026538849, "learning_rate": 1.5498641273316322e-06, "loss": 0.4504, "step": 2570 }, { "epoch": 4.502626970227671, "grad_norm": 0.19097661972045898, "learning_rate": 1.5464443142948083e-06, "loss": 0.4392, "step": 2571 }, { "epoch": 4.504378283712785, "grad_norm": 0.1796099990606308, "learning_rate": 1.5430275880366774e-06, "loss": 0.4377, "step": 2572 }, { "epoch": 4.506129597197899, "grad_norm": 0.1815309226512909, "learning_rate": 1.5396139516110959e-06, "loss": 0.4474, "step": 2573 }, { "epoch": 4.507880910683012, "grad_norm": 0.1815909594297409, "learning_rate": 1.5362034080691673e-06, "loss": 0.4423, "step": 2574 }, { "epoch": 4.509632224168126, "grad_norm": 0.19353191554546356, "learning_rate": 1.5327959604592218e-06, "loss": 0.4407, "step": 2575 }, { "epoch": 4.51138353765324, "grad_norm": 0.1992959976196289, "learning_rate": 1.5293916118268304e-06, "loss": 0.4502, "step": 2576 }, { "epoch": 4.5131348511383536, "grad_norm": 0.1838209331035614, "learning_rate": 1.5259903652147878e-06, "loss": 0.4437, "step": 2577 }, { "epoch": 4.5148861646234675, "grad_norm": 0.18150800466537476, "learning_rate": 1.5225922236631225e-06, "loss": 0.4428, "step": 2578 }, { "epoch": 4.516637478108581, "grad_norm": 0.19816996157169342, "learning_rate": 1.5191971902090797e-06, "loss": 0.4424, "step": 2579 }, { "epoch": 4.518388791593695, "grad_norm": 0.19969409704208374, "learning_rate": 1.515805267887135e-06, "loss": 0.4395, "step": 2580 }, { "epoch": 4.520140105078809, "grad_norm": 0.18648476898670197, "learning_rate": 1.5124164597289765e-06, "loss": 0.4493, "step": 2581 }, { "epoch": 4.521891418563923, "grad_norm": 0.1898619532585144, "learning_rate": 1.5090307687635141e-06, "loss": 0.436, "step": 2582 }, { "epoch": 4.523642732049037, "grad_norm": 0.1961052268743515, "learning_rate": 1.505648198016867e-06, "loss": 0.4425, "step": 2583 }, { "epoch": 4.525394045534151, "grad_norm": 0.19113856554031372, "learning_rate": 1.5022687505123702e-06, "loss": 0.4403, "step": 2584 }, { "epoch": 4.527145359019264, "grad_norm": 0.18959607183933258, "learning_rate": 1.4988924292705615e-06, "loss": 0.4426, "step": 2585 }, { "epoch": 4.528896672504378, "grad_norm": 0.19084583222866058, "learning_rate": 1.4955192373091915e-06, "loss": 0.447, "step": 2586 }, { "epoch": 4.530647985989492, "grad_norm": 0.18493342399597168, "learning_rate": 1.4921491776432062e-06, "loss": 0.4374, "step": 2587 }, { "epoch": 4.532399299474606, "grad_norm": 0.19164182245731354, "learning_rate": 1.488782253284759e-06, "loss": 0.4399, "step": 2588 }, { "epoch": 4.53415061295972, "grad_norm": 0.18819572031497955, "learning_rate": 1.4854184672431948e-06, "loss": 0.4448, "step": 2589 }, { "epoch": 4.535901926444834, "grad_norm": 0.18494167923927307, "learning_rate": 1.4820578225250582e-06, "loss": 0.445, "step": 2590 }, { "epoch": 4.537653239929948, "grad_norm": 0.18865573406219482, "learning_rate": 1.4787003221340851e-06, "loss": 0.4401, "step": 2591 }, { "epoch": 4.5394045534150615, "grad_norm": 0.2230251133441925, "learning_rate": 1.475345969071198e-06, "loss": 0.4477, "step": 2592 }, { "epoch": 4.5411558669001755, "grad_norm": 0.2076566219329834, "learning_rate": 1.4719947663345112e-06, "loss": 0.4467, "step": 2593 }, { "epoch": 4.5429071803852885, "grad_norm": 0.1936158984899521, "learning_rate": 1.468646716919318e-06, "loss": 0.4436, "step": 2594 }, { "epoch": 4.544658493870402, "grad_norm": 0.1790137141942978, "learning_rate": 1.4653018238180982e-06, "loss": 0.4518, "step": 2595 }, { "epoch": 4.546409807355516, "grad_norm": 0.18924160301685333, "learning_rate": 1.461960090020505e-06, "loss": 0.4473, "step": 2596 }, { "epoch": 4.54816112084063, "grad_norm": 0.19302435219287872, "learning_rate": 1.458621518513375e-06, "loss": 0.4473, "step": 2597 }, { "epoch": 4.549912434325744, "grad_norm": 0.18501585721969604, "learning_rate": 1.455286112280711e-06, "loss": 0.446, "step": 2598 }, { "epoch": 4.551663747810858, "grad_norm": 0.20047691464424133, "learning_rate": 1.4519538743036927e-06, "loss": 0.4447, "step": 2599 }, { "epoch": 4.553415061295972, "grad_norm": 0.19338788092136383, "learning_rate": 1.4486248075606624e-06, "loss": 0.4473, "step": 2600 }, { "epoch": 4.555166374781086, "grad_norm": 0.17569667100906372, "learning_rate": 1.445298915027134e-06, "loss": 0.437, "step": 2601 }, { "epoch": 4.5569176882662, "grad_norm": 0.20210270583629608, "learning_rate": 1.441976199675778e-06, "loss": 0.4487, "step": 2602 }, { "epoch": 4.558669001751314, "grad_norm": 0.20176537334918976, "learning_rate": 1.4386566644764321e-06, "loss": 0.4473, "step": 2603 }, { "epoch": 4.560420315236428, "grad_norm": 0.19576801359653473, "learning_rate": 1.435340312396084e-06, "loss": 0.4457, "step": 2604 }, { "epoch": 4.562171628721541, "grad_norm": 0.1927487850189209, "learning_rate": 1.4320271463988822e-06, "loss": 0.4438, "step": 2605 }, { "epoch": 4.563922942206655, "grad_norm": 0.18469730019569397, "learning_rate": 1.428717169446125e-06, "loss": 0.4448, "step": 2606 }, { "epoch": 4.565674255691769, "grad_norm": 0.19538114964962006, "learning_rate": 1.425410384496263e-06, "loss": 0.451, "step": 2607 }, { "epoch": 4.567425569176883, "grad_norm": 0.18643233180046082, "learning_rate": 1.4221067945048877e-06, "loss": 0.4441, "step": 2608 }, { "epoch": 4.5691768826619965, "grad_norm": 0.19173777103424072, "learning_rate": 1.4188064024247428e-06, "loss": 0.4465, "step": 2609 }, { "epoch": 4.57092819614711, "grad_norm": 0.1966649293899536, "learning_rate": 1.415509211205706e-06, "loss": 0.4364, "step": 2610 }, { "epoch": 4.572679509632224, "grad_norm": 0.18393643200397491, "learning_rate": 1.4122152237948018e-06, "loss": 0.449, "step": 2611 }, { "epoch": 4.574430823117338, "grad_norm": 0.17964550852775574, "learning_rate": 1.4089244431361826e-06, "loss": 0.4372, "step": 2612 }, { "epoch": 4.576182136602452, "grad_norm": 0.1901542991399765, "learning_rate": 1.405636872171144e-06, "loss": 0.4542, "step": 2613 }, { "epoch": 4.577933450087565, "grad_norm": 0.18971438705921173, "learning_rate": 1.4023525138381039e-06, "loss": 0.4474, "step": 2614 }, { "epoch": 4.579684763572679, "grad_norm": 0.1905529946088791, "learning_rate": 1.3990713710726145e-06, "loss": 0.4416, "step": 2615 }, { "epoch": 4.581436077057793, "grad_norm": 0.17938245832920074, "learning_rate": 1.395793446807354e-06, "loss": 0.4434, "step": 2616 }, { "epoch": 4.583187390542907, "grad_norm": 0.18562990427017212, "learning_rate": 1.3925187439721193e-06, "loss": 0.4431, "step": 2617 }, { "epoch": 4.584938704028021, "grad_norm": 0.17602430284023285, "learning_rate": 1.3892472654938338e-06, "loss": 0.438, "step": 2618 }, { "epoch": 4.586690017513135, "grad_norm": 0.1879691630601883, "learning_rate": 1.385979014296533e-06, "loss": 0.4437, "step": 2619 }, { "epoch": 4.588441330998249, "grad_norm": 0.20229563117027283, "learning_rate": 1.3827139933013739e-06, "loss": 0.4444, "step": 2620 }, { "epoch": 4.590192644483363, "grad_norm": 0.18487417697906494, "learning_rate": 1.3794522054266207e-06, "loss": 0.437, "step": 2621 }, { "epoch": 4.591943957968477, "grad_norm": 0.18017494678497314, "learning_rate": 1.376193653587653e-06, "loss": 0.4544, "step": 2622 }, { "epoch": 4.593695271453591, "grad_norm": 0.18770964443683624, "learning_rate": 1.3729383406969533e-06, "loss": 0.4486, "step": 2623 }, { "epoch": 4.5954465849387045, "grad_norm": 0.19591930508613586, "learning_rate": 1.3696862696641133e-06, "loss": 0.4485, "step": 2624 }, { "epoch": 4.5971978984238175, "grad_norm": 0.18719133734703064, "learning_rate": 1.366437443395824e-06, "loss": 0.4428, "step": 2625 }, { "epoch": 4.5989492119089315, "grad_norm": 0.18984559178352356, "learning_rate": 1.3631918647958786e-06, "loss": 0.4484, "step": 2626 }, { "epoch": 4.600700525394045, "grad_norm": 0.1765299141407013, "learning_rate": 1.3599495367651645e-06, "loss": 0.4386, "step": 2627 }, { "epoch": 4.602451838879159, "grad_norm": 0.18665264546871185, "learning_rate": 1.3567104622016685e-06, "loss": 0.4477, "step": 2628 }, { "epoch": 4.604203152364273, "grad_norm": 0.18592776358127594, "learning_rate": 1.3534746440004637e-06, "loss": 0.4408, "step": 2629 }, { "epoch": 4.605954465849387, "grad_norm": 0.18887758255004883, "learning_rate": 1.3502420850537185e-06, "loss": 0.4405, "step": 2630 }, { "epoch": 4.607705779334501, "grad_norm": 0.17549128830432892, "learning_rate": 1.3470127882506824e-06, "loss": 0.4456, "step": 2631 }, { "epoch": 4.609457092819615, "grad_norm": 0.181070938706398, "learning_rate": 1.3437867564776952e-06, "loss": 0.4423, "step": 2632 }, { "epoch": 4.611208406304728, "grad_norm": 0.18252477049827576, "learning_rate": 1.3405639926181718e-06, "loss": 0.4467, "step": 2633 }, { "epoch": 4.612959719789842, "grad_norm": 0.17651928961277008, "learning_rate": 1.337344499552612e-06, "loss": 0.4313, "step": 2634 }, { "epoch": 4.614711033274956, "grad_norm": 0.18648593127727509, "learning_rate": 1.3341282801585914e-06, "loss": 0.4512, "step": 2635 }, { "epoch": 4.61646234676007, "grad_norm": 0.18897487223148346, "learning_rate": 1.3309153373107547e-06, "loss": 0.4449, "step": 2636 }, { "epoch": 4.618213660245184, "grad_norm": 0.18745891749858856, "learning_rate": 1.3277056738808242e-06, "loss": 0.4397, "step": 2637 }, { "epoch": 4.619964973730298, "grad_norm": 0.18823988735675812, "learning_rate": 1.3244992927375865e-06, "loss": 0.4387, "step": 2638 }, { "epoch": 4.621716287215412, "grad_norm": 0.18586160242557526, "learning_rate": 1.3212961967468985e-06, "loss": 0.4474, "step": 2639 }, { "epoch": 4.6234676007005255, "grad_norm": 0.17959251999855042, "learning_rate": 1.3180963887716758e-06, "loss": 0.4419, "step": 2640 }, { "epoch": 4.6252189141856395, "grad_norm": 0.18626120686531067, "learning_rate": 1.3148998716719014e-06, "loss": 0.448, "step": 2641 }, { "epoch": 4.626970227670753, "grad_norm": 0.18442517518997192, "learning_rate": 1.3117066483046098e-06, "loss": 0.4541, "step": 2642 }, { "epoch": 4.628721541155867, "grad_norm": 0.17779774963855743, "learning_rate": 1.3085167215238982e-06, "loss": 0.4462, "step": 2643 }, { "epoch": 4.63047285464098, "grad_norm": 0.18184028565883636, "learning_rate": 1.3053300941809132e-06, "loss": 0.4471, "step": 2644 }, { "epoch": 4.632224168126094, "grad_norm": 0.1906604766845703, "learning_rate": 1.302146769123856e-06, "loss": 0.438, "step": 2645 }, { "epoch": 4.633975481611208, "grad_norm": 0.1833563596010208, "learning_rate": 1.2989667491979702e-06, "loss": 0.4409, "step": 2646 }, { "epoch": 4.635726795096322, "grad_norm": 0.1786334365606308, "learning_rate": 1.2957900372455529e-06, "loss": 0.4414, "step": 2647 }, { "epoch": 4.637478108581436, "grad_norm": 0.18159659206867218, "learning_rate": 1.2926166361059366e-06, "loss": 0.4411, "step": 2648 }, { "epoch": 4.63922942206655, "grad_norm": 0.18656983971595764, "learning_rate": 1.2894465486155028e-06, "loss": 0.4569, "step": 2649 }, { "epoch": 4.640980735551664, "grad_norm": 0.17904484272003174, "learning_rate": 1.2862797776076624e-06, "loss": 0.4487, "step": 2650 }, { "epoch": 4.642732049036778, "grad_norm": 0.18346834182739258, "learning_rate": 1.2831163259128714e-06, "loss": 0.4389, "step": 2651 }, { "epoch": 4.644483362521892, "grad_norm": 0.18092474341392517, "learning_rate": 1.2799561963586104e-06, "loss": 0.4484, "step": 2652 }, { "epoch": 4.646234676007005, "grad_norm": 0.17897264659404755, "learning_rate": 1.2767993917693984e-06, "loss": 0.4442, "step": 2653 }, { "epoch": 4.647985989492119, "grad_norm": 0.1822444498538971, "learning_rate": 1.2736459149667747e-06, "loss": 0.4368, "step": 2654 }, { "epoch": 4.649737302977233, "grad_norm": 0.18774814903736115, "learning_rate": 1.2704957687693114e-06, "loss": 0.456, "step": 2655 }, { "epoch": 4.651488616462347, "grad_norm": 0.1987684816122055, "learning_rate": 1.2673489559925989e-06, "loss": 0.4506, "step": 2656 }, { "epoch": 4.6532399299474605, "grad_norm": 0.183443084359169, "learning_rate": 1.2642054794492514e-06, "loss": 0.4444, "step": 2657 }, { "epoch": 4.654991243432574, "grad_norm": 0.1995455026626587, "learning_rate": 1.2610653419488978e-06, "loss": 0.4434, "step": 2658 }, { "epoch": 4.656742556917688, "grad_norm": 0.1812172532081604, "learning_rate": 1.2579285462981855e-06, "loss": 0.4409, "step": 2659 }, { "epoch": 4.658493870402802, "grad_norm": 0.18441878259181976, "learning_rate": 1.2547950953007754e-06, "loss": 0.4408, "step": 2660 }, { "epoch": 4.660245183887916, "grad_norm": 0.18995966017246246, "learning_rate": 1.2516649917573342e-06, "loss": 0.4463, "step": 2661 }, { "epoch": 4.66199649737303, "grad_norm": 0.17487666010856628, "learning_rate": 1.2485382384655437e-06, "loss": 0.4492, "step": 2662 }, { "epoch": 4.663747810858144, "grad_norm": 0.18983764946460724, "learning_rate": 1.2454148382200837e-06, "loss": 0.4451, "step": 2663 }, { "epoch": 4.665499124343257, "grad_norm": 0.19136656820774078, "learning_rate": 1.2422947938126445e-06, "loss": 0.4449, "step": 2664 }, { "epoch": 4.667250437828371, "grad_norm": 0.1932392567396164, "learning_rate": 1.2391781080319098e-06, "loss": 0.4444, "step": 2665 }, { "epoch": 4.669001751313485, "grad_norm": 0.1930726319551468, "learning_rate": 1.236064783663568e-06, "loss": 0.4493, "step": 2666 }, { "epoch": 4.670753064798599, "grad_norm": 0.18424487113952637, "learning_rate": 1.2329548234902976e-06, "loss": 0.4532, "step": 2667 }, { "epoch": 4.672504378283713, "grad_norm": 0.19204920530319214, "learning_rate": 1.2298482302917752e-06, "loss": 0.4358, "step": 2668 }, { "epoch": 4.674255691768827, "grad_norm": 0.18657632172107697, "learning_rate": 1.2267450068446617e-06, "loss": 0.4458, "step": 2669 }, { "epoch": 4.676007005253941, "grad_norm": 0.190129816532135, "learning_rate": 1.2236451559226137e-06, "loss": 0.4529, "step": 2670 }, { "epoch": 4.677758318739055, "grad_norm": 0.1779433786869049, "learning_rate": 1.2205486802962656e-06, "loss": 0.4451, "step": 2671 }, { "epoch": 4.6795096322241685, "grad_norm": 0.1836177110671997, "learning_rate": 1.217455582733243e-06, "loss": 0.4339, "step": 2672 }, { "epoch": 4.6812609457092815, "grad_norm": 0.17593680322170258, "learning_rate": 1.2143658659981444e-06, "loss": 0.4414, "step": 2673 }, { "epoch": 4.6830122591943955, "grad_norm": 0.17930686473846436, "learning_rate": 1.2112795328525529e-06, "loss": 0.4487, "step": 2674 }, { "epoch": 4.684763572679509, "grad_norm": 0.17523658275604248, "learning_rate": 1.2081965860550227e-06, "loss": 0.4466, "step": 2675 }, { "epoch": 4.686514886164623, "grad_norm": 0.18831877410411835, "learning_rate": 1.2051170283610858e-06, "loss": 0.4438, "step": 2676 }, { "epoch": 4.688266199649737, "grad_norm": 0.19641190767288208, "learning_rate": 1.2020408625232388e-06, "loss": 0.4552, "step": 2677 }, { "epoch": 4.690017513134851, "grad_norm": 0.17786258459091187, "learning_rate": 1.198968091290953e-06, "loss": 0.4455, "step": 2678 }, { "epoch": 4.691768826619965, "grad_norm": 0.18103504180908203, "learning_rate": 1.195898717410664e-06, "loss": 0.4458, "step": 2679 }, { "epoch": 4.693520140105079, "grad_norm": 0.17848777770996094, "learning_rate": 1.1928327436257669e-06, "loss": 0.4523, "step": 2680 }, { "epoch": 4.695271453590193, "grad_norm": 0.18372154235839844, "learning_rate": 1.189770172676623e-06, "loss": 0.4498, "step": 2681 }, { "epoch": 4.697022767075307, "grad_norm": 0.18716579675674438, "learning_rate": 1.1867110073005483e-06, "loss": 0.4411, "step": 2682 }, { "epoch": 4.698774080560421, "grad_norm": 0.17984992265701294, "learning_rate": 1.1836552502318171e-06, "loss": 0.452, "step": 2683 }, { "epoch": 4.700525394045534, "grad_norm": 0.18642757833003998, "learning_rate": 1.1806029042016581e-06, "loss": 0.4443, "step": 2684 }, { "epoch": 4.702276707530648, "grad_norm": 0.18899250030517578, "learning_rate": 1.1775539719382473e-06, "loss": 0.4452, "step": 2685 }, { "epoch": 4.704028021015762, "grad_norm": 0.19022449851036072, "learning_rate": 1.1745084561667136e-06, "loss": 0.4564, "step": 2686 }, { "epoch": 4.705779334500876, "grad_norm": 0.18530772626399994, "learning_rate": 1.1714663596091319e-06, "loss": 0.4506, "step": 2687 }, { "epoch": 4.7075306479859895, "grad_norm": 0.1738152951002121, "learning_rate": 1.1684276849845171e-06, "loss": 0.4388, "step": 2688 }, { "epoch": 4.7092819614711035, "grad_norm": 0.19069068133831024, "learning_rate": 1.1653924350088319e-06, "loss": 0.4465, "step": 2689 }, { "epoch": 4.711033274956217, "grad_norm": 0.18595032393932343, "learning_rate": 1.1623606123949705e-06, "loss": 0.4595, "step": 2690 }, { "epoch": 4.712784588441331, "grad_norm": 0.17912279069423676, "learning_rate": 1.1593322198527707e-06, "loss": 0.4394, "step": 2691 }, { "epoch": 4.714535901926444, "grad_norm": 0.18257759511470795, "learning_rate": 1.1563072600889992e-06, "loss": 0.4342, "step": 2692 }, { "epoch": 4.716287215411558, "grad_norm": 0.2004423588514328, "learning_rate": 1.1532857358073597e-06, "loss": 0.4516, "step": 2693 }, { "epoch": 4.718038528896672, "grad_norm": 0.1834355741739273, "learning_rate": 1.1502676497084786e-06, "loss": 0.4407, "step": 2694 }, { "epoch": 4.719789842381786, "grad_norm": 0.19116994738578796, "learning_rate": 1.147253004489917e-06, "loss": 0.4433, "step": 2695 }, { "epoch": 4.7215411558669, "grad_norm": 0.17753487825393677, "learning_rate": 1.1442418028461537e-06, "loss": 0.4385, "step": 2696 }, { "epoch": 4.723292469352014, "grad_norm": 0.1882096529006958, "learning_rate": 1.1412340474685946e-06, "loss": 0.4503, "step": 2697 }, { "epoch": 4.725043782837128, "grad_norm": 0.19259989261627197, "learning_rate": 1.1382297410455612e-06, "loss": 0.4409, "step": 2698 }, { "epoch": 4.726795096322242, "grad_norm": 0.17659862339496613, "learning_rate": 1.1352288862622968e-06, "loss": 0.4442, "step": 2699 }, { "epoch": 4.728546409807356, "grad_norm": 0.1819458156824112, "learning_rate": 1.1322314858009547e-06, "loss": 0.46, "step": 2700 }, { "epoch": 4.73029772329247, "grad_norm": 0.1774996668100357, "learning_rate": 1.129237542340606e-06, "loss": 0.4426, "step": 2701 }, { "epoch": 4.732049036777584, "grad_norm": 0.18341293931007385, "learning_rate": 1.126247058557226e-06, "loss": 0.4462, "step": 2702 }, { "epoch": 4.7338003502626975, "grad_norm": 0.17591115832328796, "learning_rate": 1.1232600371237035e-06, "loss": 0.4469, "step": 2703 }, { "epoch": 4.735551663747811, "grad_norm": 0.17263774573802948, "learning_rate": 1.1202764807098305e-06, "loss": 0.4392, "step": 2704 }, { "epoch": 4.7373029772329245, "grad_norm": 0.18362440168857574, "learning_rate": 1.1172963919822988e-06, "loss": 0.436, "step": 2705 }, { "epoch": 4.739054290718038, "grad_norm": 0.1859900951385498, "learning_rate": 1.1143197736047073e-06, "loss": 0.4334, "step": 2706 }, { "epoch": 4.740805604203152, "grad_norm": 0.18099337816238403, "learning_rate": 1.1113466282375457e-06, "loss": 0.4504, "step": 2707 }, { "epoch": 4.742556917688266, "grad_norm": 0.1772105097770691, "learning_rate": 1.108376958538207e-06, "loss": 0.4542, "step": 2708 }, { "epoch": 4.74430823117338, "grad_norm": 0.17780350148677826, "learning_rate": 1.1054107671609697e-06, "loss": 0.4326, "step": 2709 }, { "epoch": 4.746059544658494, "grad_norm": 0.17927221953868866, "learning_rate": 1.1024480567570123e-06, "loss": 0.4523, "step": 2710 }, { "epoch": 4.747810858143608, "grad_norm": 0.17971307039260864, "learning_rate": 1.0994888299743929e-06, "loss": 0.4381, "step": 2711 }, { "epoch": 4.749562171628721, "grad_norm": 0.17238891124725342, "learning_rate": 1.0965330894580638e-06, "loss": 0.4402, "step": 2712 }, { "epoch": 4.751313485113835, "grad_norm": 0.18534216284751892, "learning_rate": 1.0935808378498547e-06, "loss": 0.4432, "step": 2713 }, { "epoch": 4.753064798598949, "grad_norm": 0.17654868960380554, "learning_rate": 1.0906320777884833e-06, "loss": 0.4489, "step": 2714 }, { "epoch": 4.754816112084063, "grad_norm": 0.1843319982290268, "learning_rate": 1.0876868119095407e-06, "loss": 0.4388, "step": 2715 }, { "epoch": 4.756567425569177, "grad_norm": 0.18933045864105225, "learning_rate": 1.0847450428454997e-06, "loss": 0.4517, "step": 2716 }, { "epoch": 4.758318739054291, "grad_norm": 0.18268585205078125, "learning_rate": 1.0818067732257032e-06, "loss": 0.4449, "step": 2717 }, { "epoch": 4.760070052539405, "grad_norm": 0.17378591001033783, "learning_rate": 1.0788720056763712e-06, "loss": 0.4464, "step": 2718 }, { "epoch": 4.7618213660245186, "grad_norm": 0.18795107305049896, "learning_rate": 1.075940742820588e-06, "loss": 0.4422, "step": 2719 }, { "epoch": 4.7635726795096325, "grad_norm": 0.18555808067321777, "learning_rate": 1.0730129872783102e-06, "loss": 0.4545, "step": 2720 }, { "epoch": 4.765323992994746, "grad_norm": 0.18039703369140625, "learning_rate": 1.0700887416663575e-06, "loss": 0.4392, "step": 2721 }, { "epoch": 4.76707530647986, "grad_norm": 0.1784902960062027, "learning_rate": 1.0671680085984137e-06, "loss": 0.4407, "step": 2722 }, { "epoch": 4.768826619964973, "grad_norm": 0.1770930141210556, "learning_rate": 1.0642507906850202e-06, "loss": 0.4424, "step": 2723 }, { "epoch": 4.770577933450087, "grad_norm": 0.17746441066265106, "learning_rate": 1.0613370905335801e-06, "loss": 0.4472, "step": 2724 }, { "epoch": 4.772329246935201, "grad_norm": 0.1747593730688095, "learning_rate": 1.058426910748349e-06, "loss": 0.4439, "step": 2725 }, { "epoch": 4.774080560420315, "grad_norm": 0.19094917178153992, "learning_rate": 1.0555202539304398e-06, "loss": 0.438, "step": 2726 }, { "epoch": 4.775831873905429, "grad_norm": 0.1818997710943222, "learning_rate": 1.052617122677812e-06, "loss": 0.4373, "step": 2727 }, { "epoch": 4.777583187390543, "grad_norm": 0.18351078033447266, "learning_rate": 1.0497175195852777e-06, "loss": 0.4396, "step": 2728 }, { "epoch": 4.779334500875657, "grad_norm": 0.18195785582065582, "learning_rate": 1.0468214472444955e-06, "loss": 0.4524, "step": 2729 }, { "epoch": 4.781085814360771, "grad_norm": 0.1878281682729721, "learning_rate": 1.0439289082439647e-06, "loss": 0.4424, "step": 2730 }, { "epoch": 4.782837127845885, "grad_norm": 0.19376899302005768, "learning_rate": 1.0410399051690306e-06, "loss": 0.4474, "step": 2731 }, { "epoch": 4.784588441330998, "grad_norm": 0.19767536222934723, "learning_rate": 1.0381544406018751e-06, "loss": 0.4351, "step": 2732 }, { "epoch": 4.786339754816112, "grad_norm": 0.18478864431381226, "learning_rate": 1.0352725171215194e-06, "loss": 0.4458, "step": 2733 }, { "epoch": 4.788091068301226, "grad_norm": 0.1849551945924759, "learning_rate": 1.0323941373038177e-06, "loss": 0.4352, "step": 2734 }, { "epoch": 4.78984238178634, "grad_norm": 0.17249716818332672, "learning_rate": 1.0295193037214606e-06, "loss": 0.4377, "step": 2735 }, { "epoch": 4.7915936952714535, "grad_norm": 0.18404757976531982, "learning_rate": 1.0266480189439631e-06, "loss": 0.4552, "step": 2736 }, { "epoch": 4.793345008756567, "grad_norm": 0.18083223700523376, "learning_rate": 1.0237802855376745e-06, "loss": 0.4458, "step": 2737 }, { "epoch": 4.795096322241681, "grad_norm": 0.1814054697751999, "learning_rate": 1.0209161060657651e-06, "loss": 0.4405, "step": 2738 }, { "epoch": 4.796847635726795, "grad_norm": 0.18524421751499176, "learning_rate": 1.0180554830882333e-06, "loss": 0.4584, "step": 2739 }, { "epoch": 4.798598949211909, "grad_norm": 0.17441292107105255, "learning_rate": 1.015198419161893e-06, "loss": 0.4484, "step": 2740 }, { "epoch": 4.800350262697023, "grad_norm": 0.1791742891073227, "learning_rate": 1.0123449168403826e-06, "loss": 0.4401, "step": 2741 }, { "epoch": 4.802101576182137, "grad_norm": 0.18196730315685272, "learning_rate": 1.0094949786741531e-06, "loss": 0.4317, "step": 2742 }, { "epoch": 4.80385288966725, "grad_norm": 0.1784345656633377, "learning_rate": 1.0066486072104736e-06, "loss": 0.4404, "step": 2743 }, { "epoch": 4.805604203152364, "grad_norm": 0.1842123568058014, "learning_rate": 1.0038058049934203e-06, "loss": 0.451, "step": 2744 }, { "epoch": 4.807355516637478, "grad_norm": 0.1813959777355194, "learning_rate": 1.0009665745638852e-06, "loss": 0.4455, "step": 2745 }, { "epoch": 4.809106830122592, "grad_norm": 0.19377174973487854, "learning_rate": 9.981309184595617e-07, "loss": 0.4499, "step": 2746 }, { "epoch": 4.810858143607706, "grad_norm": 0.1745368391275406, "learning_rate": 9.952988392149532e-07, "loss": 0.4399, "step": 2747 }, { "epoch": 4.81260945709282, "grad_norm": 0.17667347192764282, "learning_rate": 9.924703393613655e-07, "loss": 0.4436, "step": 2748 }, { "epoch": 4.814360770577934, "grad_norm": 0.18014727532863617, "learning_rate": 9.896454214269014e-07, "loss": 0.4506, "step": 2749 }, { "epoch": 4.816112084063048, "grad_norm": 0.1864462047815323, "learning_rate": 9.868240879364676e-07, "loss": 0.4469, "step": 2750 }, { "epoch": 4.8178633975481615, "grad_norm": 0.2008577436208725, "learning_rate": 9.840063414117618e-07, "loss": 0.4553, "step": 2751 }, { "epoch": 4.8196147110332745, "grad_norm": 0.17952631413936615, "learning_rate": 9.811921843712813e-07, "loss": 0.4437, "step": 2752 }, { "epoch": 4.8213660245183885, "grad_norm": 0.17594963312149048, "learning_rate": 9.78381619330308e-07, "loss": 0.4518, "step": 2753 }, { "epoch": 4.823117338003502, "grad_norm": 0.182461217045784, "learning_rate": 9.75574648800921e-07, "loss": 0.438, "step": 2754 }, { "epoch": 4.824868651488616, "grad_norm": 0.17418032884597778, "learning_rate": 9.727712752919804e-07, "loss": 0.4376, "step": 2755 }, { "epoch": 4.82661996497373, "grad_norm": 0.18112415075302124, "learning_rate": 9.699715013091354e-07, "loss": 0.4418, "step": 2756 }, { "epoch": 4.828371278458844, "grad_norm": 0.191219761967659, "learning_rate": 9.671753293548153e-07, "loss": 0.4587, "step": 2757 }, { "epoch": 4.830122591943958, "grad_norm": 0.188163161277771, "learning_rate": 9.643827619282309e-07, "loss": 0.4426, "step": 2758 }, { "epoch": 4.831873905429072, "grad_norm": 0.18481263518333435, "learning_rate": 9.615938015253723e-07, "loss": 0.4524, "step": 2759 }, { "epoch": 4.833625218914186, "grad_norm": 0.1795421987771988, "learning_rate": 9.588084506390055e-07, "loss": 0.437, "step": 2760 }, { "epoch": 4.8353765323993, "grad_norm": 0.1919804811477661, "learning_rate": 9.56026711758667e-07, "loss": 0.4439, "step": 2761 }, { "epoch": 4.837127845884414, "grad_norm": 0.18518638610839844, "learning_rate": 9.532485873706704e-07, "loss": 0.4504, "step": 2762 }, { "epoch": 4.838879159369527, "grad_norm": 0.19572201371192932, "learning_rate": 9.504740799580931e-07, "loss": 0.4518, "step": 2763 }, { "epoch": 4.840630472854641, "grad_norm": 0.2189120352268219, "learning_rate": 9.477031920007845e-07, "loss": 0.444, "step": 2764 }, { "epoch": 4.842381786339755, "grad_norm": 0.1820036917924881, "learning_rate": 9.449359259753543e-07, "loss": 0.4375, "step": 2765 }, { "epoch": 4.844133099824869, "grad_norm": 0.1881389319896698, "learning_rate": 9.421722843551801e-07, "loss": 0.4526, "step": 2766 }, { "epoch": 4.8458844133099825, "grad_norm": 0.17670923471450806, "learning_rate": 9.394122696103947e-07, "loss": 0.4348, "step": 2767 }, { "epoch": 4.8476357267950965, "grad_norm": 0.18662436306476593, "learning_rate": 9.366558842078943e-07, "loss": 0.4467, "step": 2768 }, { "epoch": 4.84938704028021, "grad_norm": 0.2006414532661438, "learning_rate": 9.339031306113256e-07, "loss": 0.4359, "step": 2769 }, { "epoch": 4.851138353765324, "grad_norm": 0.17964285612106323, "learning_rate": 9.311540112810957e-07, "loss": 0.4344, "step": 2770 }, { "epoch": 4.852889667250437, "grad_norm": 0.1795053482055664, "learning_rate": 9.284085286743571e-07, "loss": 0.4505, "step": 2771 }, { "epoch": 4.854640980735551, "grad_norm": 0.188054621219635, "learning_rate": 9.256666852450158e-07, "loss": 0.451, "step": 2772 }, { "epoch": 4.856392294220665, "grad_norm": 0.19963699579238892, "learning_rate": 9.229284834437252e-07, "loss": 0.4468, "step": 2773 }, { "epoch": 4.858143607705779, "grad_norm": 0.1797536164522171, "learning_rate": 9.20193925717881e-07, "loss": 0.4317, "step": 2774 }, { "epoch": 4.859894921190893, "grad_norm": 0.1746804267168045, "learning_rate": 9.174630145116248e-07, "loss": 0.4395, "step": 2775 }, { "epoch": 4.861646234676007, "grad_norm": 0.17362630367279053, "learning_rate": 9.147357522658351e-07, "loss": 0.4454, "step": 2776 }, { "epoch": 4.863397548161121, "grad_norm": 0.1927916556596756, "learning_rate": 9.120121414181354e-07, "loss": 0.4541, "step": 2777 }, { "epoch": 4.865148861646235, "grad_norm": 0.20407606661319733, "learning_rate": 9.092921844028779e-07, "loss": 0.4478, "step": 2778 }, { "epoch": 4.866900175131349, "grad_norm": 0.1863381564617157, "learning_rate": 9.065758836511556e-07, "loss": 0.4484, "step": 2779 }, { "epoch": 4.868651488616463, "grad_norm": 0.1776265949010849, "learning_rate": 9.038632415907883e-07, "loss": 0.4504, "step": 2780 }, { "epoch": 4.870402802101577, "grad_norm": 0.18439516425132751, "learning_rate": 9.011542606463309e-07, "loss": 0.4431, "step": 2781 }, { "epoch": 4.87215411558669, "grad_norm": 0.2002282738685608, "learning_rate": 8.984489432390597e-07, "loss": 0.4478, "step": 2782 }, { "epoch": 4.873905429071804, "grad_norm": 0.18337716162204742, "learning_rate": 8.957472917869836e-07, "loss": 0.4482, "step": 2783 }, { "epoch": 4.8756567425569175, "grad_norm": 0.18510794639587402, "learning_rate": 8.930493087048286e-07, "loss": 0.45, "step": 2784 }, { "epoch": 4.877408056042031, "grad_norm": 0.18403007090091705, "learning_rate": 8.90354996404047e-07, "loss": 0.4573, "step": 2785 }, { "epoch": 4.879159369527145, "grad_norm": 0.17671333253383636, "learning_rate": 8.876643572928056e-07, "loss": 0.4456, "step": 2786 }, { "epoch": 4.880910683012259, "grad_norm": 0.18077312409877777, "learning_rate": 8.84977393775992e-07, "loss": 0.4394, "step": 2787 }, { "epoch": 4.882661996497373, "grad_norm": 0.18023362755775452, "learning_rate": 8.822941082552055e-07, "loss": 0.4411, "step": 2788 }, { "epoch": 4.884413309982487, "grad_norm": 0.18992935121059418, "learning_rate": 8.796145031287612e-07, "loss": 0.4529, "step": 2789 }, { "epoch": 4.886164623467601, "grad_norm": 0.17336557805538177, "learning_rate": 8.769385807916808e-07, "loss": 0.4319, "step": 2790 }, { "epoch": 4.887915936952714, "grad_norm": 0.17982949316501617, "learning_rate": 8.742663436356969e-07, "loss": 0.4513, "step": 2791 }, { "epoch": 4.889667250437828, "grad_norm": 0.1921672821044922, "learning_rate": 8.715977940492504e-07, "loss": 0.4416, "step": 2792 }, { "epoch": 4.891418563922942, "grad_norm": 0.17339016497135162, "learning_rate": 8.689329344174797e-07, "loss": 0.4319, "step": 2793 }, { "epoch": 4.893169877408056, "grad_norm": 0.181754007935524, "learning_rate": 8.662717671222326e-07, "loss": 0.4516, "step": 2794 }, { "epoch": 4.89492119089317, "grad_norm": 0.18195189535617828, "learning_rate": 8.6361429454205e-07, "loss": 0.4416, "step": 2795 }, { "epoch": 4.896672504378284, "grad_norm": 0.17904551327228546, "learning_rate": 8.609605190521752e-07, "loss": 0.441, "step": 2796 }, { "epoch": 4.898423817863398, "grad_norm": 0.18798615038394928, "learning_rate": 8.583104430245476e-07, "loss": 0.446, "step": 2797 }, { "epoch": 4.900175131348512, "grad_norm": 0.1834365427494049, "learning_rate": 8.55664068827794e-07, "loss": 0.448, "step": 2798 }, { "epoch": 4.9019264448336255, "grad_norm": 0.18231186270713806, "learning_rate": 8.53021398827239e-07, "loss": 0.4279, "step": 2799 }, { "epoch": 4.903677758318739, "grad_norm": 0.17930921912193298, "learning_rate": 8.503824353848944e-07, "loss": 0.4535, "step": 2800 }, { "epoch": 4.905429071803853, "grad_norm": 0.18244701623916626, "learning_rate": 8.477471808594556e-07, "loss": 0.456, "step": 2801 }, { "epoch": 4.907180385288966, "grad_norm": 0.17636866867542267, "learning_rate": 8.451156376063096e-07, "loss": 0.4475, "step": 2802 }, { "epoch": 4.90893169877408, "grad_norm": 0.17803026735782623, "learning_rate": 8.424878079775195e-07, "loss": 0.4411, "step": 2803 }, { "epoch": 4.910683012259194, "grad_norm": 0.1776597499847412, "learning_rate": 8.398636943218353e-07, "loss": 0.4412, "step": 2804 }, { "epoch": 4.912434325744308, "grad_norm": 0.17400692403316498, "learning_rate": 8.37243298984679e-07, "loss": 0.4372, "step": 2805 }, { "epoch": 4.914185639229422, "grad_norm": 0.1891811043024063, "learning_rate": 8.346266243081563e-07, "loss": 0.4394, "step": 2806 }, { "epoch": 4.915936952714536, "grad_norm": 0.17312295734882355, "learning_rate": 8.320136726310413e-07, "loss": 0.4389, "step": 2807 }, { "epoch": 4.91768826619965, "grad_norm": 0.18208345770835876, "learning_rate": 8.294044462887852e-07, "loss": 0.4471, "step": 2808 }, { "epoch": 4.919439579684764, "grad_norm": 0.1935824304819107, "learning_rate": 8.267989476135052e-07, "loss": 0.4332, "step": 2809 }, { "epoch": 4.921190893169878, "grad_norm": 0.1813029795885086, "learning_rate": 8.24197178933992e-07, "loss": 0.443, "step": 2810 }, { "epoch": 4.922942206654991, "grad_norm": 0.17628316581249237, "learning_rate": 8.215991425756959e-07, "loss": 0.4438, "step": 2811 }, { "epoch": 4.924693520140105, "grad_norm": 0.17758998274803162, "learning_rate": 8.190048408607376e-07, "loss": 0.4449, "step": 2812 }, { "epoch": 4.926444833625219, "grad_norm": 0.18030907213687897, "learning_rate": 8.164142761078941e-07, "loss": 0.438, "step": 2813 }, { "epoch": 4.928196147110333, "grad_norm": 0.18978147208690643, "learning_rate": 8.138274506326088e-07, "loss": 0.4575, "step": 2814 }, { "epoch": 4.9299474605954465, "grad_norm": 0.19209273159503937, "learning_rate": 8.112443667469754e-07, "loss": 0.4459, "step": 2815 }, { "epoch": 4.9316987740805605, "grad_norm": 0.17907345294952393, "learning_rate": 8.08665026759749e-07, "loss": 0.4421, "step": 2816 }, { "epoch": 4.933450087565674, "grad_norm": 0.18975917994976044, "learning_rate": 8.060894329763386e-07, "loss": 0.4465, "step": 2817 }, { "epoch": 4.935201401050788, "grad_norm": 0.180275097489357, "learning_rate": 8.035175876987994e-07, "loss": 0.4588, "step": 2818 }, { "epoch": 4.936952714535902, "grad_norm": 0.18448902666568756, "learning_rate": 8.009494932258427e-07, "loss": 0.4506, "step": 2819 }, { "epoch": 4.938704028021016, "grad_norm": 0.18001598119735718, "learning_rate": 7.983851518528224e-07, "loss": 0.4376, "step": 2820 }, { "epoch": 4.94045534150613, "grad_norm": 0.18156622350215912, "learning_rate": 7.958245658717411e-07, "loss": 0.4418, "step": 2821 }, { "epoch": 4.942206654991243, "grad_norm": 0.18459512293338776, "learning_rate": 7.932677375712423e-07, "loss": 0.4385, "step": 2822 }, { "epoch": 4.943957968476357, "grad_norm": 0.177482008934021, "learning_rate": 7.907146692366135e-07, "loss": 0.4487, "step": 2823 }, { "epoch": 4.945709281961471, "grad_norm": 0.17846524715423584, "learning_rate": 7.881653631497793e-07, "loss": 0.457, "step": 2824 }, { "epoch": 4.947460595446585, "grad_norm": 0.18181680142879486, "learning_rate": 7.856198215893035e-07, "loss": 0.445, "step": 2825 }, { "epoch": 4.949211908931699, "grad_norm": 0.17908291518688202, "learning_rate": 7.830780468303828e-07, "loss": 0.4413, "step": 2826 }, { "epoch": 4.950963222416813, "grad_norm": 0.18289996683597565, "learning_rate": 7.805400411448505e-07, "loss": 0.4433, "step": 2827 }, { "epoch": 4.952714535901927, "grad_norm": 0.17495928704738617, "learning_rate": 7.78005806801167e-07, "loss": 0.4459, "step": 2828 }, { "epoch": 4.954465849387041, "grad_norm": 0.18119587004184723, "learning_rate": 7.754753460644266e-07, "loss": 0.4507, "step": 2829 }, { "epoch": 4.956217162872154, "grad_norm": 0.1804075390100479, "learning_rate": 7.729486611963449e-07, "loss": 0.447, "step": 2830 }, { "epoch": 4.957968476357268, "grad_norm": 0.18040263652801514, "learning_rate": 7.704257544552696e-07, "loss": 0.453, "step": 2831 }, { "epoch": 4.9597197898423815, "grad_norm": 0.18001849949359894, "learning_rate": 7.67906628096165e-07, "loss": 0.4512, "step": 2832 }, { "epoch": 4.961471103327495, "grad_norm": 0.19176733493804932, "learning_rate": 7.653912843706219e-07, "loss": 0.4516, "step": 2833 }, { "epoch": 4.963222416812609, "grad_norm": 0.18264059722423553, "learning_rate": 7.628797255268439e-07, "loss": 0.446, "step": 2834 }, { "epoch": 4.964973730297723, "grad_norm": 0.1808706372976303, "learning_rate": 7.603719538096604e-07, "loss": 0.4466, "step": 2835 }, { "epoch": 4.966725043782837, "grad_norm": 0.1750388741493225, "learning_rate": 7.57867971460507e-07, "loss": 0.4358, "step": 2836 }, { "epoch": 4.968476357267951, "grad_norm": 0.1866728663444519, "learning_rate": 7.553677807174392e-07, "loss": 0.4403, "step": 2837 }, { "epoch": 4.970227670753065, "grad_norm": 0.1912350356578827, "learning_rate": 7.52871383815118e-07, "loss": 0.4412, "step": 2838 }, { "epoch": 4.971978984238179, "grad_norm": 0.17515753209590912, "learning_rate": 7.503787829848191e-07, "loss": 0.4415, "step": 2839 }, { "epoch": 4.973730297723293, "grad_norm": 0.1762426644563675, "learning_rate": 7.478899804544193e-07, "loss": 0.4378, "step": 2840 }, { "epoch": 4.975481611208406, "grad_norm": 0.17561806738376617, "learning_rate": 7.454049784484063e-07, "loss": 0.4357, "step": 2841 }, { "epoch": 4.97723292469352, "grad_norm": 0.1797967106103897, "learning_rate": 7.429237791878652e-07, "loss": 0.4416, "step": 2842 }, { "epoch": 4.978984238178634, "grad_norm": 0.18268421292304993, "learning_rate": 7.40446384890487e-07, "loss": 0.4472, "step": 2843 }, { "epoch": 4.980735551663748, "grad_norm": 0.17508912086486816, "learning_rate": 7.379727977705614e-07, "loss": 0.4375, "step": 2844 }, { "epoch": 4.982486865148862, "grad_norm": 0.17771507799625397, "learning_rate": 7.355030200389706e-07, "loss": 0.4514, "step": 2845 }, { "epoch": 4.984238178633976, "grad_norm": 0.18374666571617126, "learning_rate": 7.330370539031978e-07, "loss": 0.4454, "step": 2846 }, { "epoch": 4.9859894921190895, "grad_norm": 0.18590903282165527, "learning_rate": 7.305749015673153e-07, "loss": 0.4435, "step": 2847 }, { "epoch": 4.987740805604203, "grad_norm": 0.18491898477077484, "learning_rate": 7.281165652319893e-07, "loss": 0.4361, "step": 2848 }, { "epoch": 4.989492119089317, "grad_norm": 0.19144216179847717, "learning_rate": 7.25662047094473e-07, "loss": 0.4434, "step": 2849 }, { "epoch": 4.99124343257443, "grad_norm": 0.17835365235805511, "learning_rate": 7.232113493486099e-07, "loss": 0.4459, "step": 2850 }, { "epoch": 4.992994746059544, "grad_norm": 0.18610155582427979, "learning_rate": 7.207644741848241e-07, "loss": 0.4499, "step": 2851 }, { "epoch": 4.994746059544658, "grad_norm": 0.1796485334634781, "learning_rate": 7.183214237901287e-07, "loss": 0.4555, "step": 2852 }, { "epoch": 4.996497373029772, "grad_norm": 0.18881872296333313, "learning_rate": 7.158822003481136e-07, "loss": 0.4482, "step": 2853 }, { "epoch": 4.998248686514886, "grad_norm": 0.17956650257110596, "learning_rate": 7.134468060389521e-07, "loss": 0.4474, "step": 2854 }, { "epoch": 5.0, "grad_norm": 0.1892032027244568, "learning_rate": 7.110152430393902e-07, "loss": 0.4353, "step": 2855 }, { "epoch": 5.001751313485114, "grad_norm": 0.18339687585830688, "learning_rate": 7.085875135227555e-07, "loss": 0.4349, "step": 2856 }, { "epoch": 5.003502626970228, "grad_norm": 0.17908015847206116, "learning_rate": 7.061636196589433e-07, "loss": 0.4378, "step": 2857 }, { "epoch": 5.005253940455342, "grad_norm": 0.17627288401126862, "learning_rate": 7.037435636144252e-07, "loss": 0.4313, "step": 2858 }, { "epoch": 5.007005253940456, "grad_norm": 0.17925016582012177, "learning_rate": 7.013273475522392e-07, "loss": 0.4318, "step": 2859 }, { "epoch": 5.008756567425569, "grad_norm": 0.18576040863990784, "learning_rate": 6.989149736319939e-07, "loss": 0.4328, "step": 2860 }, { "epoch": 5.010507880910683, "grad_norm": 0.1929345428943634, "learning_rate": 6.965064440098629e-07, "loss": 0.4267, "step": 2861 }, { "epoch": 5.012259194395797, "grad_norm": 0.18095698952674866, "learning_rate": 6.941017608385813e-07, "loss": 0.4296, "step": 2862 }, { "epoch": 5.0140105078809105, "grad_norm": 0.18371140956878662, "learning_rate": 6.917009262674512e-07, "loss": 0.4319, "step": 2863 }, { "epoch": 5.015761821366024, "grad_norm": 0.19179238379001617, "learning_rate": 6.893039424423298e-07, "loss": 0.4375, "step": 2864 }, { "epoch": 5.017513134851138, "grad_norm": 0.18362492322921753, "learning_rate": 6.869108115056361e-07, "loss": 0.4333, "step": 2865 }, { "epoch": 5.019264448336252, "grad_norm": 0.1919475942850113, "learning_rate": 6.845215355963425e-07, "loss": 0.4328, "step": 2866 }, { "epoch": 5.021015761821366, "grad_norm": 0.19067642092704773, "learning_rate": 6.821361168499796e-07, "loss": 0.4443, "step": 2867 }, { "epoch": 5.02276707530648, "grad_norm": 0.18898649513721466, "learning_rate": 6.797545573986253e-07, "loss": 0.4327, "step": 2868 }, { "epoch": 5.024518388791594, "grad_norm": 0.17824044823646545, "learning_rate": 6.77376859370914e-07, "loss": 0.4407, "step": 2869 }, { "epoch": 5.026269702276707, "grad_norm": 0.19573338329792023, "learning_rate": 6.750030248920231e-07, "loss": 0.4455, "step": 2870 }, { "epoch": 5.028021015761821, "grad_norm": 0.18074728548526764, "learning_rate": 6.726330560836813e-07, "loss": 0.4338, "step": 2871 }, { "epoch": 5.029772329246935, "grad_norm": 0.1899799406528473, "learning_rate": 6.702669550641589e-07, "loss": 0.4477, "step": 2872 }, { "epoch": 5.031523642732049, "grad_norm": 0.18903033435344696, "learning_rate": 6.679047239482711e-07, "loss": 0.4444, "step": 2873 }, { "epoch": 5.033274956217163, "grad_norm": 0.1772882640361786, "learning_rate": 6.655463648473736e-07, "loss": 0.4353, "step": 2874 }, { "epoch": 5.035026269702277, "grad_norm": 0.17777280509471893, "learning_rate": 6.631918798693632e-07, "loss": 0.4374, "step": 2875 }, { "epoch": 5.036777583187391, "grad_norm": 0.17449764907360077, "learning_rate": 6.60841271118669e-07, "loss": 0.4417, "step": 2876 }, { "epoch": 5.038528896672505, "grad_norm": 0.19205345213413239, "learning_rate": 6.584945406962617e-07, "loss": 0.4417, "step": 2877 }, { "epoch": 5.0402802101576185, "grad_norm": 0.18151569366455078, "learning_rate": 6.561516906996395e-07, "loss": 0.4334, "step": 2878 }, { "epoch": 5.042031523642732, "grad_norm": 0.17573758959770203, "learning_rate": 6.53812723222838e-07, "loss": 0.443, "step": 2879 }, { "epoch": 5.0437828371278455, "grad_norm": 0.1771005243062973, "learning_rate": 6.514776403564183e-07, "loss": 0.4388, "step": 2880 }, { "epoch": 5.045534150612959, "grad_norm": 0.1785949021577835, "learning_rate": 6.491464441874728e-07, "loss": 0.4322, "step": 2881 }, { "epoch": 5.047285464098073, "grad_norm": 0.17945988476276398, "learning_rate": 6.468191367996163e-07, "loss": 0.4326, "step": 2882 }, { "epoch": 5.049036777583187, "grad_norm": 0.18183410167694092, "learning_rate": 6.444957202729918e-07, "loss": 0.4396, "step": 2883 }, { "epoch": 5.050788091068301, "grad_norm": 0.1787223517894745, "learning_rate": 6.421761966842604e-07, "loss": 0.4354, "step": 2884 }, { "epoch": 5.052539404553415, "grad_norm": 0.17645907402038574, "learning_rate": 6.398605681066083e-07, "loss": 0.4369, "step": 2885 }, { "epoch": 5.054290718038529, "grad_norm": 0.18226055800914764, "learning_rate": 6.37548836609736e-07, "loss": 0.4418, "step": 2886 }, { "epoch": 5.056042031523643, "grad_norm": 0.17362649738788605, "learning_rate": 6.352410042598639e-07, "loss": 0.4366, "step": 2887 }, { "epoch": 5.057793345008757, "grad_norm": 0.178994283080101, "learning_rate": 6.329370731197276e-07, "loss": 0.4372, "step": 2888 }, { "epoch": 5.059544658493871, "grad_norm": 0.17684637010097504, "learning_rate": 6.306370452485716e-07, "loss": 0.4547, "step": 2889 }, { "epoch": 5.061295971978984, "grad_norm": 0.17915549874305725, "learning_rate": 6.283409227021575e-07, "loss": 0.4274, "step": 2890 }, { "epoch": 5.063047285464098, "grad_norm": 0.1724509596824646, "learning_rate": 6.260487075327504e-07, "loss": 0.4487, "step": 2891 }, { "epoch": 5.064798598949212, "grad_norm": 0.1730109453201294, "learning_rate": 6.237604017891291e-07, "loss": 0.4426, "step": 2892 }, { "epoch": 5.066549912434326, "grad_norm": 0.1754777431488037, "learning_rate": 6.214760075165721e-07, "loss": 0.4421, "step": 2893 }, { "epoch": 5.0683012259194395, "grad_norm": 0.1734795868396759, "learning_rate": 6.191955267568678e-07, "loss": 0.4238, "step": 2894 }, { "epoch": 5.0700525394045535, "grad_norm": 0.1744484007358551, "learning_rate": 6.169189615483007e-07, "loss": 0.4397, "step": 2895 }, { "epoch": 5.071803852889667, "grad_norm": 0.1815190613269806, "learning_rate": 6.146463139256609e-07, "loss": 0.4321, "step": 2896 }, { "epoch": 5.073555166374781, "grad_norm": 0.17381595075130463, "learning_rate": 6.12377585920233e-07, "loss": 0.4288, "step": 2897 }, { "epoch": 5.075306479859895, "grad_norm": 0.1837455928325653, "learning_rate": 6.101127795598016e-07, "loss": 0.4468, "step": 2898 }, { "epoch": 5.077057793345009, "grad_norm": 0.1676664501428604, "learning_rate": 6.078518968686426e-07, "loss": 0.4262, "step": 2899 }, { "epoch": 5.078809106830122, "grad_norm": 0.17756840586662292, "learning_rate": 6.055949398675293e-07, "loss": 0.4247, "step": 2900 }, { "epoch": 5.080560420315236, "grad_norm": 0.17770889401435852, "learning_rate": 6.033419105737204e-07, "loss": 0.4334, "step": 2901 }, { "epoch": 5.08231173380035, "grad_norm": 0.17292848229408264, "learning_rate": 6.010928110009711e-07, "loss": 0.4445, "step": 2902 }, { "epoch": 5.084063047285464, "grad_norm": 0.18139012157917023, "learning_rate": 5.98847643159517e-07, "loss": 0.442, "step": 2903 }, { "epoch": 5.085814360770578, "grad_norm": 0.17989981174468994, "learning_rate": 5.966064090560847e-07, "loss": 0.4322, "step": 2904 }, { "epoch": 5.087565674255692, "grad_norm": 0.17316873371601105, "learning_rate": 5.943691106938843e-07, "loss": 0.4498, "step": 2905 }, { "epoch": 5.089316987740806, "grad_norm": 0.1756884902715683, "learning_rate": 5.921357500726044e-07, "loss": 0.4419, "step": 2906 }, { "epoch": 5.09106830122592, "grad_norm": 0.17377357184886932, "learning_rate": 5.899063291884193e-07, "loss": 0.4397, "step": 2907 }, { "epoch": 5.092819614711034, "grad_norm": 0.16876980662345886, "learning_rate": 5.87680850033977e-07, "loss": 0.4396, "step": 2908 }, { "epoch": 5.0945709281961475, "grad_norm": 0.17192958295345306, "learning_rate": 5.854593145984067e-07, "loss": 0.4315, "step": 2909 }, { "epoch": 5.096322241681261, "grad_norm": 0.17521262168884277, "learning_rate": 5.832417248673089e-07, "loss": 0.446, "step": 2910 }, { "epoch": 5.0980735551663745, "grad_norm": 0.17210577428340912, "learning_rate": 5.810280828227599e-07, "loss": 0.4482, "step": 2911 }, { "epoch": 5.099824868651488, "grad_norm": 0.17754951119422913, "learning_rate": 5.788183904433076e-07, "loss": 0.4453, "step": 2912 }, { "epoch": 5.101576182136602, "grad_norm": 0.17957009375095367, "learning_rate": 5.766126497039698e-07, "loss": 0.4266, "step": 2913 }, { "epoch": 5.103327495621716, "grad_norm": 0.18651577830314636, "learning_rate": 5.744108625762291e-07, "loss": 0.4392, "step": 2914 }, { "epoch": 5.10507880910683, "grad_norm": 0.17926912009716034, "learning_rate": 5.722130310280394e-07, "loss": 0.449, "step": 2915 }, { "epoch": 5.106830122591944, "grad_norm": 0.1788308471441269, "learning_rate": 5.70019157023814e-07, "loss": 0.4366, "step": 2916 }, { "epoch": 5.108581436077058, "grad_norm": 0.17543089389801025, "learning_rate": 5.678292425244347e-07, "loss": 0.4431, "step": 2917 }, { "epoch": 5.110332749562172, "grad_norm": 0.1761842668056488, "learning_rate": 5.656432894872376e-07, "loss": 0.4395, "step": 2918 }, { "epoch": 5.112084063047286, "grad_norm": 0.17552781105041504, "learning_rate": 5.634612998660249e-07, "loss": 0.4305, "step": 2919 }, { "epoch": 5.113835376532399, "grad_norm": 0.1756703406572342, "learning_rate": 5.612832756110498e-07, "loss": 0.4383, "step": 2920 }, { "epoch": 5.115586690017513, "grad_norm": 0.18030181527137756, "learning_rate": 5.591092186690278e-07, "loss": 0.4338, "step": 2921 }, { "epoch": 5.117338003502627, "grad_norm": 0.178649440407753, "learning_rate": 5.569391309831218e-07, "loss": 0.4328, "step": 2922 }, { "epoch": 5.119089316987741, "grad_norm": 0.18557512760162354, "learning_rate": 5.547730144929531e-07, "loss": 0.4323, "step": 2923 }, { "epoch": 5.120840630472855, "grad_norm": 0.17473773658275604, "learning_rate": 5.526108711345884e-07, "loss": 0.4373, "step": 2924 }, { "epoch": 5.122591943957969, "grad_norm": 0.1826568990945816, "learning_rate": 5.504527028405482e-07, "loss": 0.4355, "step": 2925 }, { "epoch": 5.1243432574430825, "grad_norm": 0.17969191074371338, "learning_rate": 5.482985115397943e-07, "loss": 0.4365, "step": 2926 }, { "epoch": 5.126094570928196, "grad_norm": 0.17167864739894867, "learning_rate": 5.461482991577399e-07, "loss": 0.4504, "step": 2927 }, { "epoch": 5.12784588441331, "grad_norm": 0.17751820385456085, "learning_rate": 5.440020676162367e-07, "loss": 0.4381, "step": 2928 }, { "epoch": 5.129597197898423, "grad_norm": 0.1972580999135971, "learning_rate": 5.418598188335816e-07, "loss": 0.4348, "step": 2929 }, { "epoch": 5.131348511383537, "grad_norm": 0.17985142767429352, "learning_rate": 5.397215547245116e-07, "loss": 0.4405, "step": 2930 }, { "epoch": 5.133099824868651, "grad_norm": 0.17987579107284546, "learning_rate": 5.375872772002e-07, "loss": 0.4484, "step": 2931 }, { "epoch": 5.134851138353765, "grad_norm": 0.1774570196866989, "learning_rate": 5.354569881682592e-07, "loss": 0.4279, "step": 2932 }, { "epoch": 5.136602451838879, "grad_norm": 0.18406032025814056, "learning_rate": 5.33330689532735e-07, "loss": 0.4497, "step": 2933 }, { "epoch": 5.138353765323993, "grad_norm": 0.1757873147726059, "learning_rate": 5.312083831941084e-07, "loss": 0.4325, "step": 2934 }, { "epoch": 5.140105078809107, "grad_norm": 0.1738872230052948, "learning_rate": 5.290900710492891e-07, "loss": 0.4409, "step": 2935 }, { "epoch": 5.141856392294221, "grad_norm": 0.17504049837589264, "learning_rate": 5.269757549916216e-07, "loss": 0.4314, "step": 2936 }, { "epoch": 5.143607705779335, "grad_norm": 0.17290334403514862, "learning_rate": 5.24865436910873e-07, "loss": 0.435, "step": 2937 }, { "epoch": 5.145359019264449, "grad_norm": 0.18320316076278687, "learning_rate": 5.227591186932418e-07, "loss": 0.437, "step": 2938 }, { "epoch": 5.147110332749562, "grad_norm": 0.1780366450548172, "learning_rate": 5.206568022213482e-07, "loss": 0.446, "step": 2939 }, { "epoch": 5.148861646234676, "grad_norm": 0.18348141014575958, "learning_rate": 5.185584893742379e-07, "loss": 0.4529, "step": 2940 }, { "epoch": 5.15061295971979, "grad_norm": 0.17625033855438232, "learning_rate": 5.164641820273764e-07, "loss": 0.4293, "step": 2941 }, { "epoch": 5.1523642732049035, "grad_norm": 0.18292494118213654, "learning_rate": 5.143738820526517e-07, "loss": 0.4398, "step": 2942 }, { "epoch": 5.1541155866900175, "grad_norm": 0.18028804659843445, "learning_rate": 5.122875913183656e-07, "loss": 0.4495, "step": 2943 }, { "epoch": 5.155866900175131, "grad_norm": 0.1681380569934845, "learning_rate": 5.102053116892414e-07, "loss": 0.4377, "step": 2944 }, { "epoch": 5.157618213660245, "grad_norm": 0.1816532462835312, "learning_rate": 5.081270450264125e-07, "loss": 0.4574, "step": 2945 }, { "epoch": 5.159369527145359, "grad_norm": 0.19164025783538818, "learning_rate": 5.060527931874309e-07, "loss": 0.4373, "step": 2946 }, { "epoch": 5.161120840630473, "grad_norm": 0.1752689778804779, "learning_rate": 5.039825580262553e-07, "loss": 0.4361, "step": 2947 }, { "epoch": 5.162872154115587, "grad_norm": 0.17528560757637024, "learning_rate": 5.019163413932565e-07, "loss": 0.4433, "step": 2948 }, { "epoch": 5.1646234676007, "grad_norm": 0.17381566762924194, "learning_rate": 4.998541451352134e-07, "loss": 0.4347, "step": 2949 }, { "epoch": 5.166374781085814, "grad_norm": 0.1746198534965515, "learning_rate": 4.977959710953128e-07, "loss": 0.44, "step": 2950 }, { "epoch": 5.168126094570928, "grad_norm": 0.16969835758209229, "learning_rate": 4.957418211131432e-07, "loss": 0.437, "step": 2951 }, { "epoch": 5.169877408056042, "grad_norm": 0.17223408818244934, "learning_rate": 4.936916970246997e-07, "loss": 0.4433, "step": 2952 }, { "epoch": 5.171628721541156, "grad_norm": 0.17285758256912231, "learning_rate": 4.91645600662376e-07, "loss": 0.4449, "step": 2953 }, { "epoch": 5.17338003502627, "grad_norm": 0.1761043816804886, "learning_rate": 4.896035338549693e-07, "loss": 0.4444, "step": 2954 }, { "epoch": 5.175131348511384, "grad_norm": 0.17877714335918427, "learning_rate": 4.875654984276718e-07, "loss": 0.445, "step": 2955 }, { "epoch": 5.176882661996498, "grad_norm": 0.18180438876152039, "learning_rate": 4.855314962020746e-07, "loss": 0.4426, "step": 2956 }, { "epoch": 5.1786339754816115, "grad_norm": 0.17703212797641754, "learning_rate": 4.835015289961642e-07, "loss": 0.4387, "step": 2957 }, { "epoch": 5.1803852889667255, "grad_norm": 0.1773122400045395, "learning_rate": 4.81475598624318e-07, "loss": 0.4305, "step": 2958 }, { "epoch": 5.1821366024518385, "grad_norm": 0.17686717212200165, "learning_rate": 4.794537068973093e-07, "loss": 0.4394, "step": 2959 }, { "epoch": 5.183887915936952, "grad_norm": 0.1874701827764511, "learning_rate": 4.774358556222974e-07, "loss": 0.447, "step": 2960 }, { "epoch": 5.185639229422066, "grad_norm": 0.16814056038856506, "learning_rate": 4.7542204660283363e-07, "loss": 0.4385, "step": 2961 }, { "epoch": 5.18739054290718, "grad_norm": 0.18193727731704712, "learning_rate": 4.7341228163885424e-07, "loss": 0.4451, "step": 2962 }, { "epoch": 5.189141856392294, "grad_norm": 0.174592062830925, "learning_rate": 4.7140656252668226e-07, "loss": 0.4409, "step": 2963 }, { "epoch": 5.190893169877408, "grad_norm": 0.17424607276916504, "learning_rate": 4.6940489105902323e-07, "loss": 0.4367, "step": 2964 }, { "epoch": 5.192644483362522, "grad_norm": 0.18354471027851105, "learning_rate": 4.67407269024967e-07, "loss": 0.4364, "step": 2965 }, { "epoch": 5.194395796847636, "grad_norm": 0.1829138845205307, "learning_rate": 4.654136982099805e-07, "loss": 0.4215, "step": 2966 }, { "epoch": 5.19614711033275, "grad_norm": 0.18040652573108673, "learning_rate": 4.634241803959144e-07, "loss": 0.4348, "step": 2967 }, { "epoch": 5.197898423817864, "grad_norm": 0.1744736284017563, "learning_rate": 4.6143871736099246e-07, "loss": 0.4398, "step": 2968 }, { "epoch": 5.199649737302977, "grad_norm": 0.17995105683803558, "learning_rate": 4.5945731087981783e-07, "loss": 0.4485, "step": 2969 }, { "epoch": 5.201401050788091, "grad_norm": 0.1758815497159958, "learning_rate": 4.574799627233639e-07, "loss": 0.4312, "step": 2970 }, { "epoch": 5.203152364273205, "grad_norm": 0.17820559442043304, "learning_rate": 4.5550667465898204e-07, "loss": 0.4429, "step": 2971 }, { "epoch": 5.204903677758319, "grad_norm": 0.17667333781719208, "learning_rate": 4.535374484503885e-07, "loss": 0.441, "step": 2972 }, { "epoch": 5.206654991243433, "grad_norm": 0.17378152906894684, "learning_rate": 4.515722858576743e-07, "loss": 0.4304, "step": 2973 }, { "epoch": 5.2084063047285465, "grad_norm": 0.17736554145812988, "learning_rate": 4.4961118863729725e-07, "loss": 0.4266, "step": 2974 }, { "epoch": 5.21015761821366, "grad_norm": 0.17789612710475922, "learning_rate": 4.476541585420785e-07, "loss": 0.4361, "step": 2975 }, { "epoch": 5.211908931698774, "grad_norm": 0.17982910573482513, "learning_rate": 4.457011973212083e-07, "loss": 0.4254, "step": 2976 }, { "epoch": 5.213660245183888, "grad_norm": 0.18474817276000977, "learning_rate": 4.437523067202365e-07, "loss": 0.4376, "step": 2977 }, { "epoch": 5.215411558669002, "grad_norm": 0.18042786419391632, "learning_rate": 4.4180748848107745e-07, "loss": 0.4399, "step": 2978 }, { "epoch": 5.217162872154115, "grad_norm": 0.18407054245471954, "learning_rate": 4.398667443420029e-07, "loss": 0.4527, "step": 2979 }, { "epoch": 5.218914185639229, "grad_norm": 0.1707213670015335, "learning_rate": 4.379300760376465e-07, "loss": 0.4362, "step": 2980 }, { "epoch": 5.220665499124343, "grad_norm": 0.1706569343805313, "learning_rate": 4.359974852989951e-07, "loss": 0.4263, "step": 2981 }, { "epoch": 5.222416812609457, "grad_norm": 0.16915754973888397, "learning_rate": 4.3406897385339544e-07, "loss": 0.436, "step": 2982 }, { "epoch": 5.224168126094571, "grad_norm": 0.17703983187675476, "learning_rate": 4.3214454342454357e-07, "loss": 0.445, "step": 2983 }, { "epoch": 5.225919439579685, "grad_norm": 0.18190419673919678, "learning_rate": 4.302241957324915e-07, "loss": 0.453, "step": 2984 }, { "epoch": 5.227670753064799, "grad_norm": 0.18114261329174042, "learning_rate": 4.283079324936401e-07, "loss": 0.4269, "step": 2985 }, { "epoch": 5.229422066549913, "grad_norm": 0.17388176918029785, "learning_rate": 4.263957554207415e-07, "loss": 0.4423, "step": 2986 }, { "epoch": 5.231173380035027, "grad_norm": 0.17318624258041382, "learning_rate": 4.244876662228914e-07, "loss": 0.4256, "step": 2987 }, { "epoch": 5.23292469352014, "grad_norm": 0.17531174421310425, "learning_rate": 4.2258366660553885e-07, "loss": 0.4375, "step": 2988 }, { "epoch": 5.234676007005254, "grad_norm": 0.17232327163219452, "learning_rate": 4.206837582704704e-07, "loss": 0.4338, "step": 2989 }, { "epoch": 5.2364273204903675, "grad_norm": 0.17247456312179565, "learning_rate": 4.187879429158204e-07, "loss": 0.4437, "step": 2990 }, { "epoch": 5.2381786339754814, "grad_norm": 0.17375794053077698, "learning_rate": 4.1689622223606283e-07, "loss": 0.4432, "step": 2991 }, { "epoch": 5.239929947460595, "grad_norm": 0.16695353388786316, "learning_rate": 4.1500859792201363e-07, "loss": 0.4416, "step": 2992 }, { "epoch": 5.241681260945709, "grad_norm": 0.1731974184513092, "learning_rate": 4.1312507166082425e-07, "loss": 0.4455, "step": 2993 }, { "epoch": 5.243432574430823, "grad_norm": 0.17687860131263733, "learning_rate": 4.1124564513598755e-07, "loss": 0.4374, "step": 2994 }, { "epoch": 5.245183887915937, "grad_norm": 0.17320647835731506, "learning_rate": 4.09370320027328e-07, "loss": 0.4352, "step": 2995 }, { "epoch": 5.246935201401051, "grad_norm": 0.17620357871055603, "learning_rate": 4.0749909801100775e-07, "loss": 0.4412, "step": 2996 }, { "epoch": 5.248686514886165, "grad_norm": 0.17705442011356354, "learning_rate": 4.0563198075951824e-07, "loss": 0.4466, "step": 2997 }, { "epoch": 5.250437828371279, "grad_norm": 0.1791222244501114, "learning_rate": 4.037689699416858e-07, "loss": 0.4403, "step": 2998 }, { "epoch": 5.252189141856392, "grad_norm": 0.17971085011959076, "learning_rate": 4.019100672226617e-07, "loss": 0.4368, "step": 2999 }, { "epoch": 5.253940455341506, "grad_norm": 0.18135520815849304, "learning_rate": 4.000552742639302e-07, "loss": 0.4372, "step": 3000 }, { "epoch": 5.25569176882662, "grad_norm": 0.1739327311515808, "learning_rate": 3.982045927233008e-07, "loss": 0.4279, "step": 3001 }, { "epoch": 5.257443082311734, "grad_norm": 0.18161709606647491, "learning_rate": 3.963580242549053e-07, "loss": 0.4426, "step": 3002 }, { "epoch": 5.259194395796848, "grad_norm": 0.17195238173007965, "learning_rate": 3.9451557050920406e-07, "loss": 0.4408, "step": 3003 }, { "epoch": 5.260945709281962, "grad_norm": 0.17461644113063812, "learning_rate": 3.9267723313297535e-07, "loss": 0.424, "step": 3004 }, { "epoch": 5.2626970227670755, "grad_norm": 0.18905742466449738, "learning_rate": 3.9084301376932143e-07, "loss": 0.4391, "step": 3005 }, { "epoch": 5.264448336252189, "grad_norm": 0.1782294660806656, "learning_rate": 3.8901291405766097e-07, "loss": 0.4364, "step": 3006 }, { "epoch": 5.266199649737303, "grad_norm": 0.17899881303310394, "learning_rate": 3.8718693563373376e-07, "loss": 0.4502, "step": 3007 }, { "epoch": 5.267950963222416, "grad_norm": 0.18046674132347107, "learning_rate": 3.853650801295933e-07, "loss": 0.4464, "step": 3008 }, { "epoch": 5.26970227670753, "grad_norm": 0.17832240462303162, "learning_rate": 3.8354734917361024e-07, "loss": 0.4439, "step": 3009 }, { "epoch": 5.271453590192644, "grad_norm": 0.1728062629699707, "learning_rate": 3.817337443904662e-07, "loss": 0.4408, "step": 3010 }, { "epoch": 5.273204903677758, "grad_norm": 0.18667152523994446, "learning_rate": 3.799242674011572e-07, "loss": 0.4383, "step": 3011 }, { "epoch": 5.274956217162872, "grad_norm": 0.17318326234817505, "learning_rate": 3.7811891982298843e-07, "loss": 0.4282, "step": 3012 }, { "epoch": 5.276707530647986, "grad_norm": 0.1824670135974884, "learning_rate": 3.7631770326957475e-07, "loss": 0.4479, "step": 3013 }, { "epoch": 5.2784588441331, "grad_norm": 0.17614661157131195, "learning_rate": 3.7452061935083826e-07, "loss": 0.4508, "step": 3014 }, { "epoch": 5.280210157618214, "grad_norm": 0.17678509652614594, "learning_rate": 3.7272766967300864e-07, "loss": 0.4373, "step": 3015 }, { "epoch": 5.281961471103328, "grad_norm": 0.1749906837940216, "learning_rate": 3.709388558386179e-07, "loss": 0.4445, "step": 3016 }, { "epoch": 5.283712784588442, "grad_norm": 0.17544318735599518, "learning_rate": 3.691541794465042e-07, "loss": 0.4397, "step": 3017 }, { "epoch": 5.285464098073555, "grad_norm": 0.16987966001033783, "learning_rate": 3.673736420918067e-07, "loss": 0.4412, "step": 3018 }, { "epoch": 5.287215411558669, "grad_norm": 0.18016661703586578, "learning_rate": 3.65597245365964e-07, "loss": 0.4388, "step": 3019 }, { "epoch": 5.288966725043783, "grad_norm": 0.18016588687896729, "learning_rate": 3.638249908567154e-07, "loss": 0.4511, "step": 3020 }, { "epoch": 5.2907180385288965, "grad_norm": 0.18470831215381622, "learning_rate": 3.6205688014809615e-07, "loss": 0.4475, "step": 3021 }, { "epoch": 5.2924693520140105, "grad_norm": 0.17220953106880188, "learning_rate": 3.602929148204398e-07, "loss": 0.443, "step": 3022 }, { "epoch": 5.294220665499124, "grad_norm": 0.18002602458000183, "learning_rate": 3.5853309645037306e-07, "loss": 0.4396, "step": 3023 }, { "epoch": 5.295971978984238, "grad_norm": 0.17509552836418152, "learning_rate": 3.567774266108176e-07, "loss": 0.4389, "step": 3024 }, { "epoch": 5.297723292469352, "grad_norm": 0.1748841106891632, "learning_rate": 3.550259068709844e-07, "loss": 0.4389, "step": 3025 }, { "epoch": 5.299474605954466, "grad_norm": 0.16974937915802002, "learning_rate": 3.5327853879637994e-07, "loss": 0.4226, "step": 3026 }, { "epoch": 5.301225919439579, "grad_norm": 0.1723744422197342, "learning_rate": 3.5153532394879507e-07, "loss": 0.4356, "step": 3027 }, { "epoch": 5.302977232924693, "grad_norm": 0.18084216117858887, "learning_rate": 3.4979626388631174e-07, "loss": 0.4412, "step": 3028 }, { "epoch": 5.304728546409807, "grad_norm": 0.17594529688358307, "learning_rate": 3.4806136016329537e-07, "loss": 0.4377, "step": 3029 }, { "epoch": 5.306479859894921, "grad_norm": 0.17454294860363007, "learning_rate": 3.4633061433040014e-07, "loss": 0.4394, "step": 3030 }, { "epoch": 5.308231173380035, "grad_norm": 0.16935960948467255, "learning_rate": 3.4460402793456083e-07, "loss": 0.4245, "step": 3031 }, { "epoch": 5.309982486865149, "grad_norm": 0.16735833883285522, "learning_rate": 3.4288160251899625e-07, "loss": 0.4301, "step": 3032 }, { "epoch": 5.311733800350263, "grad_norm": 0.18331657350063324, "learning_rate": 3.4116333962320445e-07, "loss": 0.4497, "step": 3033 }, { "epoch": 5.313485113835377, "grad_norm": 0.1767379641532898, "learning_rate": 3.394492407829658e-07, "loss": 0.4391, "step": 3034 }, { "epoch": 5.315236427320491, "grad_norm": 0.17600907385349274, "learning_rate": 3.3773930753033535e-07, "loss": 0.443, "step": 3035 }, { "epoch": 5.3169877408056045, "grad_norm": 0.17997175455093384, "learning_rate": 3.360335413936489e-07, "loss": 0.4402, "step": 3036 }, { "epoch": 5.3187390542907185, "grad_norm": 0.17211440205574036, "learning_rate": 3.343319438975134e-07, "loss": 0.4484, "step": 3037 }, { "epoch": 5.3204903677758315, "grad_norm": 0.17477986216545105, "learning_rate": 3.3263451656281364e-07, "loss": 0.4389, "step": 3038 }, { "epoch": 5.322241681260945, "grad_norm": 0.1720334142446518, "learning_rate": 3.3094126090670477e-07, "loss": 0.4357, "step": 3039 }, { "epoch": 5.323992994746059, "grad_norm": 0.17040905356407166, "learning_rate": 3.2925217844261503e-07, "loss": 0.4367, "step": 3040 }, { "epoch": 5.325744308231173, "grad_norm": 0.1774139553308487, "learning_rate": 3.275672706802402e-07, "loss": 0.4392, "step": 3041 }, { "epoch": 5.327495621716287, "grad_norm": 0.17319455742835999, "learning_rate": 3.258865391255478e-07, "loss": 0.4429, "step": 3042 }, { "epoch": 5.329246935201401, "grad_norm": 0.18072032928466797, "learning_rate": 3.242099852807695e-07, "loss": 0.4342, "step": 3043 }, { "epoch": 5.330998248686515, "grad_norm": 0.1762019842863083, "learning_rate": 3.225376106444056e-07, "loss": 0.4462, "step": 3044 }, { "epoch": 5.332749562171629, "grad_norm": 0.1737089902162552, "learning_rate": 3.2086941671122084e-07, "loss": 0.4381, "step": 3045 }, { "epoch": 5.334500875656743, "grad_norm": 0.17595107853412628, "learning_rate": 3.1920540497224095e-07, "loss": 0.4423, "step": 3046 }, { "epoch": 5.336252189141856, "grad_norm": 0.1763865202665329, "learning_rate": 3.1754557691475617e-07, "loss": 0.4475, "step": 3047 }, { "epoch": 5.33800350262697, "grad_norm": 0.1819225400686264, "learning_rate": 3.158899340223154e-07, "loss": 0.4359, "step": 3048 }, { "epoch": 5.339754816112084, "grad_norm": 0.17320585250854492, "learning_rate": 3.142384777747287e-07, "loss": 0.4404, "step": 3049 }, { "epoch": 5.341506129597198, "grad_norm": 0.17940358817577362, "learning_rate": 3.125912096480621e-07, "loss": 0.4356, "step": 3050 }, { "epoch": 5.343257443082312, "grad_norm": 0.17598369717597961, "learning_rate": 3.1094813111464004e-07, "loss": 0.438, "step": 3051 }, { "epoch": 5.345008756567426, "grad_norm": 0.17069900035858154, "learning_rate": 3.0930924364304117e-07, "loss": 0.4398, "step": 3052 }, { "epoch": 5.3467600700525395, "grad_norm": 0.16994261741638184, "learning_rate": 3.076745486981003e-07, "loss": 0.4371, "step": 3053 }, { "epoch": 5.348511383537653, "grad_norm": 0.17698051035404205, "learning_rate": 3.060440477409005e-07, "loss": 0.4354, "step": 3054 }, { "epoch": 5.350262697022767, "grad_norm": 0.17916439473628998, "learning_rate": 3.0441774222878194e-07, "loss": 0.4342, "step": 3055 }, { "epoch": 5.352014010507881, "grad_norm": 0.17683953046798706, "learning_rate": 3.027956336153298e-07, "loss": 0.4489, "step": 3056 }, { "epoch": 5.353765323992995, "grad_norm": 0.1699439436197281, "learning_rate": 3.011777233503821e-07, "loss": 0.4379, "step": 3057 }, { "epoch": 5.355516637478108, "grad_norm": 0.17524494230747223, "learning_rate": 2.995640128800209e-07, "loss": 0.434, "step": 3058 }, { "epoch": 5.357267950963222, "grad_norm": 0.17189376056194305, "learning_rate": 2.9795450364657865e-07, "loss": 0.438, "step": 3059 }, { "epoch": 5.359019264448336, "grad_norm": 0.1767062544822693, "learning_rate": 2.963491970886273e-07, "loss": 0.4422, "step": 3060 }, { "epoch": 5.36077057793345, "grad_norm": 0.17249172925949097, "learning_rate": 2.947480946409875e-07, "loss": 0.4397, "step": 3061 }, { "epoch": 5.362521891418564, "grad_norm": 0.18039987981319427, "learning_rate": 2.9315119773472067e-07, "loss": 0.4388, "step": 3062 }, { "epoch": 5.364273204903678, "grad_norm": 0.17195789515972137, "learning_rate": 2.9155850779712723e-07, "loss": 0.4405, "step": 3063 }, { "epoch": 5.366024518388792, "grad_norm": 0.177845299243927, "learning_rate": 2.899700262517502e-07, "loss": 0.4451, "step": 3064 }, { "epoch": 5.367775831873906, "grad_norm": 0.1795899122953415, "learning_rate": 2.8838575451837046e-07, "loss": 0.4398, "step": 3065 }, { "epoch": 5.36952714535902, "grad_norm": 0.16783587634563446, "learning_rate": 2.868056940130048e-07, "loss": 0.4435, "step": 3066 }, { "epoch": 5.371278458844133, "grad_norm": 0.17229382693767548, "learning_rate": 2.852298461479075e-07, "loss": 0.4431, "step": 3067 }, { "epoch": 5.373029772329247, "grad_norm": 0.1763918399810791, "learning_rate": 2.8365821233156677e-07, "loss": 0.434, "step": 3068 }, { "epoch": 5.3747810858143605, "grad_norm": 0.17558953166007996, "learning_rate": 2.8209079396870454e-07, "loss": 0.4379, "step": 3069 }, { "epoch": 5.3765323992994745, "grad_norm": 0.16633668541908264, "learning_rate": 2.8052759246027626e-07, "loss": 0.4358, "step": 3070 }, { "epoch": 5.378283712784588, "grad_norm": 0.17430154979228973, "learning_rate": 2.7896860920346605e-07, "loss": 0.433, "step": 3071 }, { "epoch": 5.380035026269702, "grad_norm": 0.1752610206604004, "learning_rate": 2.774138455916897e-07, "loss": 0.4433, "step": 3072 }, { "epoch": 5.381786339754816, "grad_norm": 0.17011594772338867, "learning_rate": 2.758633030145891e-07, "loss": 0.4338, "step": 3073 }, { "epoch": 5.38353765323993, "grad_norm": 0.16925902664661407, "learning_rate": 2.743169828580372e-07, "loss": 0.4414, "step": 3074 }, { "epoch": 5.385288966725044, "grad_norm": 0.17471835017204285, "learning_rate": 2.727748865041291e-07, "loss": 0.4334, "step": 3075 }, { "epoch": 5.387040280210158, "grad_norm": 0.17703384160995483, "learning_rate": 2.712370153311872e-07, "loss": 0.4444, "step": 3076 }, { "epoch": 5.388791593695271, "grad_norm": 0.17192551493644714, "learning_rate": 2.6970337071375584e-07, "loss": 0.4447, "step": 3077 }, { "epoch": 5.390542907180385, "grad_norm": 0.17984390258789062, "learning_rate": 2.681739540226036e-07, "loss": 0.4424, "step": 3078 }, { "epoch": 5.392294220665499, "grad_norm": 0.17474453151226044, "learning_rate": 2.6664876662471697e-07, "loss": 0.4369, "step": 3079 }, { "epoch": 5.394045534150613, "grad_norm": 0.16928963363170624, "learning_rate": 2.6512780988330654e-07, "loss": 0.4442, "step": 3080 }, { "epoch": 5.395796847635727, "grad_norm": 0.172274649143219, "learning_rate": 2.6361108515779777e-07, "loss": 0.441, "step": 3081 }, { "epoch": 5.397548161120841, "grad_norm": 0.1743282973766327, "learning_rate": 2.6209859380383673e-07, "loss": 0.4345, "step": 3082 }, { "epoch": 5.399299474605955, "grad_norm": 0.17648088932037354, "learning_rate": 2.6059033717328154e-07, "loss": 0.4279, "step": 3083 }, { "epoch": 5.4010507880910685, "grad_norm": 0.1674264371395111, "learning_rate": 2.590863166142105e-07, "loss": 0.441, "step": 3084 }, { "epoch": 5.4028021015761825, "grad_norm": 0.17715856432914734, "learning_rate": 2.5758653347091103e-07, "loss": 0.4486, "step": 3085 }, { "epoch": 5.404553415061296, "grad_norm": 0.1779506653547287, "learning_rate": 2.560909890838864e-07, "loss": 0.4353, "step": 3086 }, { "epoch": 5.406304728546409, "grad_norm": 0.18662282824516296, "learning_rate": 2.5459968478985074e-07, "loss": 0.4417, "step": 3087 }, { "epoch": 5.408056042031523, "grad_norm": 0.17400725185871124, "learning_rate": 2.531126219217256e-07, "loss": 0.4306, "step": 3088 }, { "epoch": 5.409807355516637, "grad_norm": 0.17001307010650635, "learning_rate": 2.5162980180864605e-07, "loss": 0.435, "step": 3089 }, { "epoch": 5.411558669001751, "grad_norm": 0.17127738893032074, "learning_rate": 2.5015122577595084e-07, "loss": 0.4298, "step": 3090 }, { "epoch": 5.413309982486865, "grad_norm": 0.17574961483478546, "learning_rate": 2.48676895145189e-07, "loss": 0.4407, "step": 3091 }, { "epoch": 5.415061295971979, "grad_norm": 0.17373836040496826, "learning_rate": 2.472068112341114e-07, "loss": 0.4316, "step": 3092 }, { "epoch": 5.416812609457093, "grad_norm": 0.1742943376302719, "learning_rate": 2.4574097535667696e-07, "loss": 0.4336, "step": 3093 }, { "epoch": 5.418563922942207, "grad_norm": 0.1730850636959076, "learning_rate": 2.4427938882304383e-07, "loss": 0.4327, "step": 3094 }, { "epoch": 5.420315236427321, "grad_norm": 0.17222850024700165, "learning_rate": 2.428220529395753e-07, "loss": 0.4311, "step": 3095 }, { "epoch": 5.422066549912435, "grad_norm": 0.1775062084197998, "learning_rate": 2.413689690088339e-07, "loss": 0.4317, "step": 3096 }, { "epoch": 5.423817863397548, "grad_norm": 0.17715953290462494, "learning_rate": 2.399201383295824e-07, "loss": 0.428, "step": 3097 }, { "epoch": 5.425569176882662, "grad_norm": 0.1741517037153244, "learning_rate": 2.3847556219678105e-07, "loss": 0.4439, "step": 3098 }, { "epoch": 5.427320490367776, "grad_norm": 0.17616590857505798, "learning_rate": 2.370352419015892e-07, "loss": 0.4364, "step": 3099 }, { "epoch": 5.42907180385289, "grad_norm": 0.1694258600473404, "learning_rate": 2.3559917873135996e-07, "loss": 0.4351, "step": 3100 }, { "epoch": 5.4308231173380035, "grad_norm": 0.18136680126190186, "learning_rate": 2.3416737396964373e-07, "loss": 0.4388, "step": 3101 }, { "epoch": 5.432574430823117, "grad_norm": 0.16963878273963928, "learning_rate": 2.3273982889618186e-07, "loss": 0.441, "step": 3102 }, { "epoch": 5.434325744308231, "grad_norm": 0.1718294769525528, "learning_rate": 2.3131654478691258e-07, "loss": 0.4403, "step": 3103 }, { "epoch": 5.436077057793345, "grad_norm": 0.1746172159910202, "learning_rate": 2.2989752291396218e-07, "loss": 0.4344, "step": 3104 }, { "epoch": 5.437828371278459, "grad_norm": 0.17074638605117798, "learning_rate": 2.2848276454564943e-07, "loss": 0.4351, "step": 3105 }, { "epoch": 5.439579684763572, "grad_norm": 0.1717109978199005, "learning_rate": 2.270722709464801e-07, "loss": 0.4273, "step": 3106 }, { "epoch": 5.441330998248686, "grad_norm": 0.16843710839748383, "learning_rate": 2.2566604337715127e-07, "loss": 0.4374, "step": 3107 }, { "epoch": 5.4430823117338, "grad_norm": 0.16889427602291107, "learning_rate": 2.242640830945436e-07, "loss": 0.4293, "step": 3108 }, { "epoch": 5.444833625218914, "grad_norm": 0.17384794354438782, "learning_rate": 2.228663913517265e-07, "loss": 0.4404, "step": 3109 }, { "epoch": 5.446584938704028, "grad_norm": 0.17383615672588348, "learning_rate": 2.2147296939795226e-07, "loss": 0.4366, "step": 3110 }, { "epoch": 5.448336252189142, "grad_norm": 0.17334918677806854, "learning_rate": 2.2008381847865911e-07, "loss": 0.4322, "step": 3111 }, { "epoch": 5.450087565674256, "grad_norm": 0.1811426281929016, "learning_rate": 2.1869893983546442e-07, "loss": 0.4428, "step": 3112 }, { "epoch": 5.45183887915937, "grad_norm": 0.17078621685504913, "learning_rate": 2.173183347061697e-07, "loss": 0.4341, "step": 3113 }, { "epoch": 5.453590192644484, "grad_norm": 0.17333611845970154, "learning_rate": 2.1594200432475675e-07, "loss": 0.4337, "step": 3114 }, { "epoch": 5.455341506129598, "grad_norm": 0.17212922871112823, "learning_rate": 2.145699499213849e-07, "loss": 0.4423, "step": 3115 }, { "epoch": 5.4570928196147115, "grad_norm": 0.17884641885757446, "learning_rate": 2.132021727223943e-07, "loss": 0.4411, "step": 3116 }, { "epoch": 5.4588441330998245, "grad_norm": 0.1779637634754181, "learning_rate": 2.1183867395029812e-07, "loss": 0.4325, "step": 3117 }, { "epoch": 5.4605954465849385, "grad_norm": 0.1762283444404602, "learning_rate": 2.1047945482378985e-07, "loss": 0.4386, "step": 3118 }, { "epoch": 5.462346760070052, "grad_norm": 0.17964288592338562, "learning_rate": 2.091245165577349e-07, "loss": 0.4546, "step": 3119 }, { "epoch": 5.464098073555166, "grad_norm": 0.17922194302082062, "learning_rate": 2.0777386036317404e-07, "loss": 0.4406, "step": 3120 }, { "epoch": 5.46584938704028, "grad_norm": 0.1767795830965042, "learning_rate": 2.0642748744731878e-07, "loss": 0.4401, "step": 3121 }, { "epoch": 5.467600700525394, "grad_norm": 0.1673709750175476, "learning_rate": 2.050853990135554e-07, "loss": 0.4377, "step": 3122 }, { "epoch": 5.469352014010508, "grad_norm": 0.17348125576972961, "learning_rate": 2.0374759626143714e-07, "loss": 0.4441, "step": 3123 }, { "epoch": 5.471103327495622, "grad_norm": 0.17424534261226654, "learning_rate": 2.0241408038668976e-07, "loss": 0.441, "step": 3124 }, { "epoch": 5.472854640980736, "grad_norm": 0.17427121102809906, "learning_rate": 2.0108485258120536e-07, "loss": 0.4361, "step": 3125 }, { "epoch": 5.474605954465849, "grad_norm": 0.17401234805583954, "learning_rate": 1.9975991403304529e-07, "loss": 0.4388, "step": 3126 }, { "epoch": 5.476357267950963, "grad_norm": 0.17399689555168152, "learning_rate": 1.9843926592643392e-07, "loss": 0.4349, "step": 3127 }, { "epoch": 5.478108581436077, "grad_norm": 0.17410141229629517, "learning_rate": 1.9712290944176537e-07, "loss": 0.4361, "step": 3128 }, { "epoch": 5.479859894921191, "grad_norm": 0.17544011771678925, "learning_rate": 1.9581084575559352e-07, "loss": 0.442, "step": 3129 }, { "epoch": 5.481611208406305, "grad_norm": 0.18126288056373596, "learning_rate": 1.94503076040638e-07, "loss": 0.4321, "step": 3130 }, { "epoch": 5.483362521891419, "grad_norm": 0.17377562820911407, "learning_rate": 1.9319960146578055e-07, "loss": 0.4334, "step": 3131 }, { "epoch": 5.4851138353765325, "grad_norm": 0.1764148622751236, "learning_rate": 1.919004231960625e-07, "loss": 0.4397, "step": 3132 }, { "epoch": 5.4868651488616464, "grad_norm": 0.17616155743598938, "learning_rate": 1.9060554239268613e-07, "loss": 0.4428, "step": 3133 }, { "epoch": 5.48861646234676, "grad_norm": 0.17109334468841553, "learning_rate": 1.8931496021301178e-07, "loss": 0.4329, "step": 3134 }, { "epoch": 5.490367775831874, "grad_norm": 0.17657902836799622, "learning_rate": 1.8802867781055943e-07, "loss": 0.438, "step": 3135 }, { "epoch": 5.492119089316987, "grad_norm": 0.16984206438064575, "learning_rate": 1.8674669633500342e-07, "loss": 0.4365, "step": 3136 }, { "epoch": 5.493870402802101, "grad_norm": 0.17706407606601715, "learning_rate": 1.8546901693217655e-07, "loss": 0.4385, "step": 3137 }, { "epoch": 5.495621716287215, "grad_norm": 0.1783907115459442, "learning_rate": 1.8419564074406427e-07, "loss": 0.4409, "step": 3138 }, { "epoch": 5.497373029772329, "grad_norm": 0.17246200144290924, "learning_rate": 1.8292656890880722e-07, "loss": 0.4376, "step": 3139 }, { "epoch": 5.499124343257443, "grad_norm": 0.17568419873714447, "learning_rate": 1.8166180256069753e-07, "loss": 0.4297, "step": 3140 }, { "epoch": 5.500875656742557, "grad_norm": 0.17136606574058533, "learning_rate": 1.8040134283018207e-07, "loss": 0.4363, "step": 3141 }, { "epoch": 5.502626970227671, "grad_norm": 0.17559997737407684, "learning_rate": 1.7914519084385407e-07, "loss": 0.4415, "step": 3142 }, { "epoch": 5.504378283712785, "grad_norm": 0.16959697008132935, "learning_rate": 1.7789334772445987e-07, "loss": 0.4315, "step": 3143 }, { "epoch": 5.506129597197899, "grad_norm": 0.17859020829200745, "learning_rate": 1.7664581459089335e-07, "loss": 0.43, "step": 3144 }, { "epoch": 5.507880910683012, "grad_norm": 0.1776002198457718, "learning_rate": 1.754025925581959e-07, "loss": 0.4501, "step": 3145 }, { "epoch": 5.509632224168126, "grad_norm": 0.1730557084083557, "learning_rate": 1.7416368273755646e-07, "loss": 0.4415, "step": 3146 }, { "epoch": 5.51138353765324, "grad_norm": 0.16926303505897522, "learning_rate": 1.7292908623630866e-07, "loss": 0.425, "step": 3147 }, { "epoch": 5.5131348511383536, "grad_norm": 0.17582599818706512, "learning_rate": 1.7169880415793206e-07, "loss": 0.4351, "step": 3148 }, { "epoch": 5.5148861646234675, "grad_norm": 0.16952812671661377, "learning_rate": 1.7047283760204925e-07, "loss": 0.4436, "step": 3149 }, { "epoch": 5.516637478108581, "grad_norm": 0.1739274561405182, "learning_rate": 1.6925118766442538e-07, "loss": 0.4293, "step": 3150 }, { "epoch": 5.518388791593695, "grad_norm": 0.171127051115036, "learning_rate": 1.6803385543696872e-07, "loss": 0.4374, "step": 3151 }, { "epoch": 5.520140105078809, "grad_norm": 0.1711086481809616, "learning_rate": 1.6682084200772663e-07, "loss": 0.4462, "step": 3152 }, { "epoch": 5.521891418563923, "grad_norm": 0.178030326962471, "learning_rate": 1.65612148460888e-07, "loss": 0.4396, "step": 3153 }, { "epoch": 5.523642732049037, "grad_norm": 0.17508868873119354, "learning_rate": 1.6440777587677924e-07, "loss": 0.444, "step": 3154 }, { "epoch": 5.525394045534151, "grad_norm": 0.17116321623325348, "learning_rate": 1.6320772533186536e-07, "loss": 0.4351, "step": 3155 }, { "epoch": 5.527145359019264, "grad_norm": 0.17824143171310425, "learning_rate": 1.6201199789874833e-07, "loss": 0.436, "step": 3156 }, { "epoch": 5.528896672504378, "grad_norm": 0.17407330870628357, "learning_rate": 1.6082059464616662e-07, "loss": 0.4401, "step": 3157 }, { "epoch": 5.530647985989492, "grad_norm": 0.17104989290237427, "learning_rate": 1.5963351663899284e-07, "loss": 0.4431, "step": 3158 }, { "epoch": 5.532399299474606, "grad_norm": 0.17024724185466766, "learning_rate": 1.5845076493823331e-07, "loss": 0.4449, "step": 3159 }, { "epoch": 5.53415061295972, "grad_norm": 0.17384310066699982, "learning_rate": 1.572723406010296e-07, "loss": 0.44, "step": 3160 }, { "epoch": 5.535901926444834, "grad_norm": 0.1758754998445511, "learning_rate": 1.5609824468065306e-07, "loss": 0.4438, "step": 3161 }, { "epoch": 5.537653239929948, "grad_norm": 0.16902753710746765, "learning_rate": 1.549284782265087e-07, "loss": 0.4412, "step": 3162 }, { "epoch": 5.5394045534150615, "grad_norm": 0.1760496348142624, "learning_rate": 1.537630422841291e-07, "loss": 0.4444, "step": 3163 }, { "epoch": 5.5411558669001755, "grad_norm": 0.16742749512195587, "learning_rate": 1.526019378951793e-07, "loss": 0.4309, "step": 3164 }, { "epoch": 5.5429071803852885, "grad_norm": 0.17274701595306396, "learning_rate": 1.5144516609744975e-07, "loss": 0.4459, "step": 3165 }, { "epoch": 5.544658493870402, "grad_norm": 0.16740649938583374, "learning_rate": 1.5029272792486116e-07, "loss": 0.4363, "step": 3166 }, { "epoch": 5.546409807355516, "grad_norm": 0.17751780152320862, "learning_rate": 1.4914462440745847e-07, "loss": 0.4412, "step": 3167 }, { "epoch": 5.54816112084063, "grad_norm": 0.1763242930173874, "learning_rate": 1.480008565714147e-07, "loss": 0.4432, "step": 3168 }, { "epoch": 5.549912434325744, "grad_norm": 0.17876876890659332, "learning_rate": 1.4686142543902548e-07, "loss": 0.4472, "step": 3169 }, { "epoch": 5.551663747810858, "grad_norm": 0.1752900928258896, "learning_rate": 1.4572633202871222e-07, "loss": 0.4449, "step": 3170 }, { "epoch": 5.553415061295972, "grad_norm": 0.1757035106420517, "learning_rate": 1.4459557735501671e-07, "loss": 0.4421, "step": 3171 }, { "epoch": 5.555166374781086, "grad_norm": 0.16979941725730896, "learning_rate": 1.4346916242860664e-07, "loss": 0.4347, "step": 3172 }, { "epoch": 5.5569176882662, "grad_norm": 0.18074661493301392, "learning_rate": 1.4234708825626664e-07, "loss": 0.427, "step": 3173 }, { "epoch": 5.558669001751314, "grad_norm": 0.17147329449653625, "learning_rate": 1.4122935584090447e-07, "loss": 0.4482, "step": 3174 }, { "epoch": 5.560420315236428, "grad_norm": 0.17978663742542267, "learning_rate": 1.4011596618154655e-07, "loss": 0.436, "step": 3175 }, { "epoch": 5.562171628721541, "grad_norm": 0.16839028894901276, "learning_rate": 1.3900692027333628e-07, "loss": 0.4461, "step": 3176 }, { "epoch": 5.563922942206655, "grad_norm": 0.17067769169807434, "learning_rate": 1.37902219107538e-07, "loss": 0.4399, "step": 3177 }, { "epoch": 5.565674255691769, "grad_norm": 0.17075173556804657, "learning_rate": 1.3680186367152848e-07, "loss": 0.4448, "step": 3178 }, { "epoch": 5.567425569176883, "grad_norm": 0.172356516122818, "learning_rate": 1.3570585494880328e-07, "loss": 0.4349, "step": 3179 }, { "epoch": 5.5691768826619965, "grad_norm": 0.1824556291103363, "learning_rate": 1.3461419391897268e-07, "loss": 0.4352, "step": 3180 }, { "epoch": 5.57092819614711, "grad_norm": 0.1769983321428299, "learning_rate": 1.3352688155775896e-07, "loss": 0.4399, "step": 3181 }, { "epoch": 5.572679509632224, "grad_norm": 0.16916704177856445, "learning_rate": 1.3244391883700025e-07, "loss": 0.4426, "step": 3182 }, { "epoch": 5.574430823117338, "grad_norm": 0.17393191158771515, "learning_rate": 1.313653067246451e-07, "loss": 0.4363, "step": 3183 }, { "epoch": 5.576182136602452, "grad_norm": 0.1775522083044052, "learning_rate": 1.3029104618475397e-07, "loss": 0.4352, "step": 3184 }, { "epoch": 5.577933450087565, "grad_norm": 0.17496418952941895, "learning_rate": 1.292211381774988e-07, "loss": 0.4318, "step": 3185 }, { "epoch": 5.579684763572679, "grad_norm": 0.18009530007839203, "learning_rate": 1.2815558365915916e-07, "loss": 0.439, "step": 3186 }, { "epoch": 5.581436077057793, "grad_norm": 0.16915638744831085, "learning_rate": 1.2709438358212644e-07, "loss": 0.4376, "step": 3187 }, { "epoch": 5.583187390542907, "grad_norm": 0.17013020813465118, "learning_rate": 1.2603753889489645e-07, "loss": 0.4431, "step": 3188 }, { "epoch": 5.584938704028021, "grad_norm": 0.16884258389472961, "learning_rate": 1.2498505054207578e-07, "loss": 0.4404, "step": 3189 }, { "epoch": 5.586690017513135, "grad_norm": 0.17102716863155365, "learning_rate": 1.2393691946437414e-07, "loss": 0.4339, "step": 3190 }, { "epoch": 5.588441330998249, "grad_norm": 0.17540892958641052, "learning_rate": 1.228931465986094e-07, "loss": 0.4378, "step": 3191 }, { "epoch": 5.590192644483363, "grad_norm": 0.16777098178863525, "learning_rate": 1.2185373287770254e-07, "loss": 0.4217, "step": 3192 }, { "epoch": 5.591943957968477, "grad_norm": 0.17321659624576569, "learning_rate": 1.2081867923067925e-07, "loss": 0.44, "step": 3193 }, { "epoch": 5.593695271453591, "grad_norm": 0.16668735444545746, "learning_rate": 1.1978798658266678e-07, "loss": 0.4319, "step": 3194 }, { "epoch": 5.5954465849387045, "grad_norm": 0.17647089064121246, "learning_rate": 1.1876165585489596e-07, "loss": 0.4481, "step": 3195 }, { "epoch": 5.5971978984238175, "grad_norm": 0.17642100155353546, "learning_rate": 1.1773968796469748e-07, "loss": 0.444, "step": 3196 }, { "epoch": 5.5989492119089315, "grad_norm": 0.1748809814453125, "learning_rate": 1.1672208382550509e-07, "loss": 0.4445, "step": 3197 }, { "epoch": 5.600700525394045, "grad_norm": 0.17743046581745148, "learning_rate": 1.1570884434684959e-07, "loss": 0.4407, "step": 3198 }, { "epoch": 5.602451838879159, "grad_norm": 0.17289437353610992, "learning_rate": 1.1469997043436154e-07, "loss": 0.4483, "step": 3199 }, { "epoch": 5.604203152364273, "grad_norm": 0.17218148708343506, "learning_rate": 1.1369546298977018e-07, "loss": 0.4426, "step": 3200 }, { "epoch": 5.605954465849387, "grad_norm": 0.1682223081588745, "learning_rate": 1.1269532291090124e-07, "loss": 0.4347, "step": 3201 }, { "epoch": 5.607705779334501, "grad_norm": 0.17390166223049164, "learning_rate": 1.1169955109167741e-07, "loss": 0.4419, "step": 3202 }, { "epoch": 5.609457092819615, "grad_norm": 0.16547738015651703, "learning_rate": 1.1070814842211675e-07, "loss": 0.4375, "step": 3203 }, { "epoch": 5.611208406304728, "grad_norm": 0.1670323759317398, "learning_rate": 1.0972111578833266e-07, "loss": 0.4351, "step": 3204 }, { "epoch": 5.612959719789842, "grad_norm": 0.1762978732585907, "learning_rate": 1.0873845407253114e-07, "loss": 0.445, "step": 3205 }, { "epoch": 5.614711033274956, "grad_norm": 0.17253799736499786, "learning_rate": 1.0776016415301405e-07, "loss": 0.4363, "step": 3206 }, { "epoch": 5.61646234676007, "grad_norm": 0.17038656771183014, "learning_rate": 1.067862469041725e-07, "loss": 0.4384, "step": 3207 }, { "epoch": 5.618213660245184, "grad_norm": 0.17730577290058136, "learning_rate": 1.0581670319649295e-07, "loss": 0.44, "step": 3208 }, { "epoch": 5.619964973730298, "grad_norm": 0.1738574355840683, "learning_rate": 1.0485153389654945e-07, "loss": 0.449, "step": 3209 }, { "epoch": 5.621716287215412, "grad_norm": 0.1719108372926712, "learning_rate": 1.038907398670086e-07, "loss": 0.4406, "step": 3210 }, { "epoch": 5.6234676007005255, "grad_norm": 0.1728861778974533, "learning_rate": 1.0293432196662512e-07, "loss": 0.4257, "step": 3211 }, { "epoch": 5.6252189141856395, "grad_norm": 0.1743529736995697, "learning_rate": 1.01982281050243e-07, "loss": 0.4298, "step": 3212 }, { "epoch": 5.626970227670753, "grad_norm": 0.178388774394989, "learning_rate": 1.0103461796879265e-07, "loss": 0.4383, "step": 3213 }, { "epoch": 5.628721541155867, "grad_norm": 0.1816776841878891, "learning_rate": 1.000913335692949e-07, "loss": 0.4426, "step": 3214 }, { "epoch": 5.63047285464098, "grad_norm": 0.17102472484111786, "learning_rate": 9.915242869485309e-08, "loss": 0.4374, "step": 3215 }, { "epoch": 5.632224168126094, "grad_norm": 0.1717868596315384, "learning_rate": 9.821790418465815e-08, "loss": 0.4442, "step": 3216 }, { "epoch": 5.633975481611208, "grad_norm": 0.1759672462940216, "learning_rate": 9.728776087398583e-08, "loss": 0.4394, "step": 3217 }, { "epoch": 5.635726795096322, "grad_norm": 0.1647067368030548, "learning_rate": 9.636199959419557e-08, "loss": 0.433, "step": 3218 }, { "epoch": 5.637478108581436, "grad_norm": 0.1710233986377716, "learning_rate": 9.544062117273045e-08, "loss": 0.4413, "step": 3219 }, { "epoch": 5.63922942206655, "grad_norm": 0.17059938609600067, "learning_rate": 9.452362643311619e-08, "loss": 0.4394, "step": 3220 }, { "epoch": 5.640980735551664, "grad_norm": 0.1760408729314804, "learning_rate": 9.361101619495994e-08, "loss": 0.4407, "step": 3221 }, { "epoch": 5.642732049036778, "grad_norm": 0.1708299219608307, "learning_rate": 9.270279127394977e-08, "loss": 0.4336, "step": 3222 }, { "epoch": 5.644483362521892, "grad_norm": 0.17218463122844696, "learning_rate": 9.179895248185522e-08, "loss": 0.4389, "step": 3223 }, { "epoch": 5.646234676007005, "grad_norm": 0.17241954803466797, "learning_rate": 9.089950062652508e-08, "loss": 0.4354, "step": 3224 }, { "epoch": 5.647985989492119, "grad_norm": 0.17596521973609924, "learning_rate": 9.000443651188628e-08, "loss": 0.441, "step": 3225 }, { "epoch": 5.649737302977233, "grad_norm": 0.1667822301387787, "learning_rate": 8.911376093794555e-08, "loss": 0.4419, "step": 3226 }, { "epoch": 5.651488616462347, "grad_norm": 0.16586104035377502, "learning_rate": 8.822747470078552e-08, "loss": 0.441, "step": 3227 }, { "epoch": 5.6532399299474605, "grad_norm": 0.17107515037059784, "learning_rate": 8.734557859256698e-08, "loss": 0.4499, "step": 3228 }, { "epoch": 5.654991243432574, "grad_norm": 0.1719045788049698, "learning_rate": 8.646807340152607e-08, "loss": 0.4402, "step": 3229 }, { "epoch": 5.656742556917688, "grad_norm": 0.16991625726222992, "learning_rate": 8.55949599119743e-08, "loss": 0.4321, "step": 3230 }, { "epoch": 5.658493870402802, "grad_norm": 0.17116208374500275, "learning_rate": 8.47262389042991e-08, "loss": 0.434, "step": 3231 }, { "epoch": 5.660245183887916, "grad_norm": 0.1707262247800827, "learning_rate": 8.386191115495934e-08, "loss": 0.4462, "step": 3232 }, { "epoch": 5.66199649737303, "grad_norm": 0.17645332217216492, "learning_rate": 8.300197743649041e-08, "loss": 0.4343, "step": 3233 }, { "epoch": 5.663747810858144, "grad_norm": 0.17797423899173737, "learning_rate": 8.214643851749748e-08, "loss": 0.4555, "step": 3234 }, { "epoch": 5.665499124343257, "grad_norm": 0.17499491572380066, "learning_rate": 8.129529516266e-08, "loss": 0.4452, "step": 3235 }, { "epoch": 5.667250437828371, "grad_norm": 0.1747831255197525, "learning_rate": 8.044854813272662e-08, "loss": 0.4421, "step": 3236 }, { "epoch": 5.669001751313485, "grad_norm": 0.1729750633239746, "learning_rate": 7.960619818451865e-08, "loss": 0.4376, "step": 3237 }, { "epoch": 5.670753064798599, "grad_norm": 0.16944245994091034, "learning_rate": 7.876824607092437e-08, "loss": 0.4379, "step": 3238 }, { "epoch": 5.672504378283713, "grad_norm": 0.1740196794271469, "learning_rate": 7.793469254090524e-08, "loss": 0.4412, "step": 3239 }, { "epoch": 5.674255691768827, "grad_norm": 0.17038215696811676, "learning_rate": 7.710553833948753e-08, "loss": 0.4497, "step": 3240 }, { "epoch": 5.676007005253941, "grad_norm": 0.16905799508094788, "learning_rate": 7.628078420776786e-08, "loss": 0.4419, "step": 3241 }, { "epoch": 5.677758318739055, "grad_norm": 0.17036326229572296, "learning_rate": 7.546043088290878e-08, "loss": 0.4316, "step": 3242 }, { "epoch": 5.6795096322241685, "grad_norm": 0.1669592559337616, "learning_rate": 7.464447909814043e-08, "loss": 0.44, "step": 3243 }, { "epoch": 5.6812609457092815, "grad_norm": 0.1766437441110611, "learning_rate": 7.383292958275778e-08, "loss": 0.4397, "step": 3244 }, { "epoch": 5.6830122591943955, "grad_norm": 0.1696997433900833, "learning_rate": 7.302578306212116e-08, "loss": 0.4388, "step": 3245 }, { "epoch": 5.684763572679509, "grad_norm": 0.16974177956581116, "learning_rate": 7.222304025765736e-08, "loss": 0.4383, "step": 3246 }, { "epoch": 5.686514886164623, "grad_norm": 0.17005132138729095, "learning_rate": 7.142470188685413e-08, "loss": 0.4476, "step": 3247 }, { "epoch": 5.688266199649737, "grad_norm": 0.1741643249988556, "learning_rate": 7.063076866326568e-08, "loss": 0.4393, "step": 3248 }, { "epoch": 5.690017513134851, "grad_norm": 0.1734933704137802, "learning_rate": 6.984124129650605e-08, "loss": 0.4419, "step": 3249 }, { "epoch": 5.691768826619965, "grad_norm": 0.17386986315250397, "learning_rate": 6.905612049225352e-08, "loss": 0.4403, "step": 3250 }, { "epoch": 5.693520140105079, "grad_norm": 0.17525798082351685, "learning_rate": 6.827540695224622e-08, "loss": 0.4363, "step": 3251 }, { "epoch": 5.695271453590193, "grad_norm": 0.1706988364458084, "learning_rate": 6.749910137428484e-08, "loss": 0.4412, "step": 3252 }, { "epoch": 5.697022767075307, "grad_norm": 0.1715410202741623, "learning_rate": 6.672720445222769e-08, "loss": 0.4361, "step": 3253 }, { "epoch": 5.698774080560421, "grad_norm": 0.1732766032218933, "learning_rate": 6.595971687599567e-08, "loss": 0.4459, "step": 3254 }, { "epoch": 5.700525394045534, "grad_norm": 0.174923837184906, "learning_rate": 6.519663933156562e-08, "loss": 0.4412, "step": 3255 }, { "epoch": 5.702276707530648, "grad_norm": 0.1806623637676239, "learning_rate": 6.443797250097528e-08, "loss": 0.4441, "step": 3256 }, { "epoch": 5.704028021015762, "grad_norm": 0.17096135020256042, "learning_rate": 6.368371706231891e-08, "loss": 0.4252, "step": 3257 }, { "epoch": 5.705779334500876, "grad_norm": 0.17156869173049927, "learning_rate": 6.29338736897478e-08, "loss": 0.4415, "step": 3258 }, { "epoch": 5.7075306479859895, "grad_norm": 0.17268310487270355, "learning_rate": 6.218844305346916e-08, "loss": 0.4435, "step": 3259 }, { "epoch": 5.7092819614711035, "grad_norm": 0.1735750436782837, "learning_rate": 6.144742581974839e-08, "loss": 0.4402, "step": 3260 }, { "epoch": 5.711033274956217, "grad_norm": 0.17237108945846558, "learning_rate": 6.071082265090344e-08, "loss": 0.4366, "step": 3261 }, { "epoch": 5.712784588441331, "grad_norm": 0.17374956607818604, "learning_rate": 5.99786342053088e-08, "loss": 0.433, "step": 3262 }, { "epoch": 5.714535901926444, "grad_norm": 0.1769857406616211, "learning_rate": 5.925086113739209e-08, "loss": 0.4262, "step": 3263 }, { "epoch": 5.716287215411558, "grad_norm": 0.17170462012290955, "learning_rate": 5.852750409763575e-08, "loss": 0.4444, "step": 3264 }, { "epoch": 5.718038528896672, "grad_norm": 0.1728275567293167, "learning_rate": 5.7808563732573155e-08, "loss": 0.4524, "step": 3265 }, { "epoch": 5.719789842381786, "grad_norm": 0.1804431527853012, "learning_rate": 5.709404068479196e-08, "loss": 0.4378, "step": 3266 }, { "epoch": 5.7215411558669, "grad_norm": 0.17352168262004852, "learning_rate": 5.638393559293076e-08, "loss": 0.4328, "step": 3267 }, { "epoch": 5.723292469352014, "grad_norm": 0.17234793305397034, "learning_rate": 5.5678249091679624e-08, "loss": 0.4454, "step": 3268 }, { "epoch": 5.725043782837128, "grad_norm": 0.1759670078754425, "learning_rate": 5.4976981811779575e-08, "loss": 0.4446, "step": 3269 }, { "epoch": 5.726795096322242, "grad_norm": 0.17718161642551422, "learning_rate": 5.4280134380020336e-08, "loss": 0.438, "step": 3270 }, { "epoch": 5.728546409807356, "grad_norm": 0.1715938299894333, "learning_rate": 5.358770741924368e-08, "loss": 0.4181, "step": 3271 }, { "epoch": 5.73029772329247, "grad_norm": 0.17949502170085907, "learning_rate": 5.28997015483379e-08, "loss": 0.4314, "step": 3272 }, { "epoch": 5.732049036777584, "grad_norm": 0.1726769655942917, "learning_rate": 5.2216117382241084e-08, "loss": 0.4268, "step": 3273 }, { "epoch": 5.7338003502626975, "grad_norm": 0.16768142580986023, "learning_rate": 5.1536955531938937e-08, "loss": 0.4362, "step": 3274 }, { "epoch": 5.735551663747811, "grad_norm": 0.1731017529964447, "learning_rate": 5.086221660446422e-08, "loss": 0.4451, "step": 3275 }, { "epoch": 5.7373029772329245, "grad_norm": 0.17383445799350739, "learning_rate": 5.0191901202897295e-08, "loss": 0.4371, "step": 3276 }, { "epoch": 5.739054290718038, "grad_norm": 0.17267975211143494, "learning_rate": 4.952600992636392e-08, "loss": 0.432, "step": 3277 }, { "epoch": 5.740805604203152, "grad_norm": 0.1689492166042328, "learning_rate": 4.8864543370035764e-08, "loss": 0.4389, "step": 3278 }, { "epoch": 5.742556917688266, "grad_norm": 0.17043599486351013, "learning_rate": 4.820750212513048e-08, "loss": 0.4284, "step": 3279 }, { "epoch": 5.74430823117338, "grad_norm": 0.16956056654453278, "learning_rate": 4.755488677890829e-08, "loss": 0.4365, "step": 3280 }, { "epoch": 5.746059544658494, "grad_norm": 0.16831637918949127, "learning_rate": 4.6906697914677055e-08, "loss": 0.4304, "step": 3281 }, { "epoch": 5.747810858143608, "grad_norm": 0.1703185737133026, "learning_rate": 4.626293611178445e-08, "loss": 0.4388, "step": 3282 }, { "epoch": 5.749562171628721, "grad_norm": 0.16543163359165192, "learning_rate": 4.5623601945624096e-08, "loss": 0.4387, "step": 3283 }, { "epoch": 5.751313485113835, "grad_norm": 0.17056521773338318, "learning_rate": 4.498869598763056e-08, "loss": 0.4441, "step": 3284 }, { "epoch": 5.753064798598949, "grad_norm": 0.17401225864887238, "learning_rate": 4.435821880528157e-08, "loss": 0.4401, "step": 3285 }, { "epoch": 5.754816112084063, "grad_norm": 0.16678465902805328, "learning_rate": 4.373217096209526e-08, "loss": 0.431, "step": 3286 }, { "epoch": 5.756567425569177, "grad_norm": 0.17196443676948547, "learning_rate": 4.3110553017631786e-08, "loss": 0.4249, "step": 3287 }, { "epoch": 5.758318739054291, "grad_norm": 0.17641869187355042, "learning_rate": 4.249336552749172e-08, "loss": 0.4257, "step": 3288 }, { "epoch": 5.760070052539405, "grad_norm": 0.17034536600112915, "learning_rate": 4.188060904331548e-08, "loss": 0.4428, "step": 3289 }, { "epoch": 5.7618213660245186, "grad_norm": 0.17908018827438354, "learning_rate": 4.127228411278383e-08, "loss": 0.434, "step": 3290 }, { "epoch": 5.7635726795096325, "grad_norm": 0.17565733194351196, "learning_rate": 4.0668391279614085e-08, "loss": 0.4379, "step": 3291 }, { "epoch": 5.765323992994746, "grad_norm": 0.16608615219593048, "learning_rate": 4.0068931083566134e-08, "loss": 0.4322, "step": 3292 }, { "epoch": 5.76707530647986, "grad_norm": 0.1778799295425415, "learning_rate": 3.947390406043472e-08, "loss": 0.4272, "step": 3293 }, { "epoch": 5.768826619964973, "grad_norm": 0.17243194580078125, "learning_rate": 3.8883310742053316e-08, "loss": 0.4497, "step": 3294 }, { "epoch": 5.770577933450087, "grad_norm": 0.17176561057567596, "learning_rate": 3.8297151656293e-08, "loss": 0.4376, "step": 3295 }, { "epoch": 5.772329246935201, "grad_norm": 0.1704113632440567, "learning_rate": 3.771542732706135e-08, "loss": 0.4406, "step": 3296 }, { "epoch": 5.774080560420315, "grad_norm": 0.16995082795619965, "learning_rate": 3.713813827430135e-08, "loss": 0.4442, "step": 3297 }, { "epoch": 5.775831873905429, "grad_norm": 0.1701585054397583, "learning_rate": 3.656528501399359e-08, "loss": 0.4436, "step": 3298 }, { "epoch": 5.777583187390543, "grad_norm": 0.1743958443403244, "learning_rate": 3.599686805815128e-08, "loss": 0.4521, "step": 3299 }, { "epoch": 5.779334500875657, "grad_norm": 0.182917058467865, "learning_rate": 3.543288791482469e-08, "loss": 0.4496, "step": 3300 }, { "epoch": 5.781085814360771, "grad_norm": 0.1694219559431076, "learning_rate": 3.4873345088097255e-08, "loss": 0.4292, "step": 3301 }, { "epoch": 5.782837127845885, "grad_norm": 0.17522276937961578, "learning_rate": 3.4318240078087816e-08, "loss": 0.4341, "step": 3302 }, { "epoch": 5.784588441330998, "grad_norm": 0.1674177646636963, "learning_rate": 3.3767573380946717e-08, "loss": 0.4366, "step": 3303 }, { "epoch": 5.786339754816112, "grad_norm": 0.17699368298053741, "learning_rate": 3.3221345488859134e-08, "loss": 0.4428, "step": 3304 }, { "epoch": 5.788091068301226, "grad_norm": 0.17353492975234985, "learning_rate": 3.267955689004176e-08, "loss": 0.4303, "step": 3305 }, { "epoch": 5.78984238178634, "grad_norm": 0.17077843844890594, "learning_rate": 3.214220806874335e-08, "loss": 0.4398, "step": 3306 }, { "epoch": 5.7915936952714535, "grad_norm": 0.17145124077796936, "learning_rate": 3.1609299505245275e-08, "loss": 0.4451, "step": 3307 }, { "epoch": 5.793345008756567, "grad_norm": 0.1712695062160492, "learning_rate": 3.1080831675859845e-08, "loss": 0.4414, "step": 3308 }, { "epoch": 5.795096322241681, "grad_norm": 0.17091913521289825, "learning_rate": 3.055680505292979e-08, "loss": 0.4287, "step": 3309 }, { "epoch": 5.796847635726795, "grad_norm": 0.17074044048786163, "learning_rate": 3.0037220104829344e-08, "loss": 0.423, "step": 3310 }, { "epoch": 5.798598949211909, "grad_norm": 0.17411068081855774, "learning_rate": 2.952207729596146e-08, "loss": 0.4414, "step": 3311 }, { "epoch": 5.800350262697023, "grad_norm": 0.16789673268795013, "learning_rate": 2.9011377086759496e-08, "loss": 0.4315, "step": 3312 }, { "epoch": 5.802101576182137, "grad_norm": 0.17422978579998016, "learning_rate": 2.85051199336861e-08, "loss": 0.4381, "step": 3313 }, { "epoch": 5.80385288966725, "grad_norm": 0.1645941585302353, "learning_rate": 2.800330628923209e-08, "loss": 0.442, "step": 3314 }, { "epoch": 5.805604203152364, "grad_norm": 0.17029091715812683, "learning_rate": 2.7505936601918137e-08, "loss": 0.4348, "step": 3315 }, { "epoch": 5.807355516637478, "grad_norm": 0.170720636844635, "learning_rate": 2.701301131629086e-08, "loss": 0.4373, "step": 3316 }, { "epoch": 5.809106830122592, "grad_norm": 0.16621440649032593, "learning_rate": 2.652453087292617e-08, "loss": 0.4315, "step": 3317 }, { "epoch": 5.810858143607706, "grad_norm": 0.1730540692806244, "learning_rate": 2.6040495708425928e-08, "loss": 0.4477, "step": 3318 }, { "epoch": 5.81260945709282, "grad_norm": 0.17134837806224823, "learning_rate": 2.5560906255420737e-08, "loss": 0.4382, "step": 3319 }, { "epoch": 5.814360770577934, "grad_norm": 0.1753847599029541, "learning_rate": 2.5085762942564927e-08, "loss": 0.4432, "step": 3320 }, { "epoch": 5.816112084063048, "grad_norm": 0.17469382286071777, "learning_rate": 2.4615066194541015e-08, "loss": 0.4435, "step": 3321 }, { "epoch": 5.8178633975481615, "grad_norm": 0.1731780767440796, "learning_rate": 2.41488164320558e-08, "loss": 0.4383, "step": 3322 }, { "epoch": 5.8196147110332745, "grad_norm": 0.18626071512699127, "learning_rate": 2.368701407184315e-08, "loss": 0.4449, "step": 3323 }, { "epoch": 5.8213660245183885, "grad_norm": 0.16707304120063782, "learning_rate": 2.322965952666012e-08, "loss": 0.4407, "step": 3324 }, { "epoch": 5.823117338003502, "grad_norm": 0.17357873916625977, "learning_rate": 2.277675320528916e-08, "loss": 0.4414, "step": 3325 }, { "epoch": 5.824868651488616, "grad_norm": 0.1696147620677948, "learning_rate": 2.2328295512535902e-08, "loss": 0.4364, "step": 3326 }, { "epoch": 5.82661996497373, "grad_norm": 0.17475558817386627, "learning_rate": 2.188428684923194e-08, "loss": 0.447, "step": 3327 }, { "epoch": 5.828371278458844, "grad_norm": 0.17341896891593933, "learning_rate": 2.144472761222982e-08, "loss": 0.4347, "step": 3328 }, { "epoch": 5.830122591943958, "grad_norm": 0.172355055809021, "learning_rate": 2.10096181944075e-08, "loss": 0.433, "step": 3329 }, { "epoch": 5.831873905429072, "grad_norm": 0.17361091077327728, "learning_rate": 2.0578958984663888e-08, "loss": 0.4424, "step": 3330 }, { "epoch": 5.833625218914186, "grad_norm": 0.16856062412261963, "learning_rate": 2.015275036792108e-08, "loss": 0.4336, "step": 3331 }, { "epoch": 5.8353765323993, "grad_norm": 0.17127853631973267, "learning_rate": 1.973099272512269e-08, "loss": 0.4426, "step": 3332 }, { "epoch": 5.837127845884414, "grad_norm": 0.1765851080417633, "learning_rate": 1.9313686433236057e-08, "loss": 0.4226, "step": 3333 }, { "epoch": 5.838879159369527, "grad_norm": 0.16681037843227386, "learning_rate": 1.8900831865246717e-08, "loss": 0.4412, "step": 3334 }, { "epoch": 5.840630472854641, "grad_norm": 0.1667599380016327, "learning_rate": 1.849242939016449e-08, "loss": 0.4456, "step": 3335 }, { "epoch": 5.842381786339755, "grad_norm": 0.16965456306934357, "learning_rate": 1.8088479373016833e-08, "loss": 0.4295, "step": 3336 }, { "epoch": 5.844133099824869, "grad_norm": 0.17780576646327972, "learning_rate": 1.768898217485493e-08, "loss": 0.4454, "step": 3337 }, { "epoch": 5.8458844133099825, "grad_norm": 0.16954724490642548, "learning_rate": 1.7293938152746493e-08, "loss": 0.4391, "step": 3338 }, { "epoch": 5.8476357267950965, "grad_norm": 0.174340158700943, "learning_rate": 1.6903347659781856e-08, "loss": 0.4379, "step": 3339 }, { "epoch": 5.84938704028021, "grad_norm": 0.17039935290813446, "learning_rate": 1.6517211045069537e-08, "loss": 0.4206, "step": 3340 }, { "epoch": 5.851138353765324, "grad_norm": 0.1692134290933609, "learning_rate": 1.6135528653737353e-08, "loss": 0.4316, "step": 3341 }, { "epoch": 5.852889667250437, "grad_norm": 0.17281252145767212, "learning_rate": 1.575830082693186e-08, "loss": 0.4462, "step": 3342 }, { "epoch": 5.854640980735551, "grad_norm": 0.17349451780319214, "learning_rate": 1.5385527901818907e-08, "loss": 0.4432, "step": 3343 }, { "epoch": 5.856392294220665, "grad_norm": 0.17281298339366913, "learning_rate": 1.5017210211580868e-08, "loss": 0.43, "step": 3344 }, { "epoch": 5.858143607705779, "grad_norm": 0.17443081736564636, "learning_rate": 1.4653348085419961e-08, "loss": 0.4354, "step": 3345 }, { "epoch": 5.859894921190893, "grad_norm": 0.17698606848716736, "learning_rate": 1.4293941848554926e-08, "loss": 0.4335, "step": 3346 }, { "epoch": 5.861646234676007, "grad_norm": 0.170796200633049, "learning_rate": 1.3938991822222137e-08, "loss": 0.4387, "step": 3347 }, { "epoch": 5.863397548161121, "grad_norm": 0.17669326066970825, "learning_rate": 1.3588498323674482e-08, "loss": 0.4462, "step": 3348 }, { "epoch": 5.865148861646235, "grad_norm": 0.16665245592594147, "learning_rate": 1.3242461666182483e-08, "loss": 0.4376, "step": 3349 }, { "epoch": 5.866900175131349, "grad_norm": 0.17532989382743835, "learning_rate": 1.2900882159033178e-08, "loss": 0.4395, "step": 3350 }, { "epoch": 5.868651488616463, "grad_norm": 0.17438919842243195, "learning_rate": 1.2563760107528466e-08, "loss": 0.4413, "step": 3351 }, { "epoch": 5.870402802101577, "grad_norm": 0.17368347942829132, "learning_rate": 1.223109581298787e-08, "loss": 0.43, "step": 3352 }, { "epoch": 5.87215411558669, "grad_norm": 0.17284400761127472, "learning_rate": 1.1902889572745213e-08, "loss": 0.4403, "step": 3353 }, { "epoch": 5.873905429071804, "grad_norm": 0.17239230871200562, "learning_rate": 1.1579141680150841e-08, "loss": 0.4342, "step": 3354 }, { "epoch": 5.8756567425569175, "grad_norm": 0.17725518345832825, "learning_rate": 1.125985242456995e-08, "loss": 0.4412, "step": 3355 }, { "epoch": 5.877408056042031, "grad_norm": 0.17030663788318634, "learning_rate": 1.094502209138204e-08, "loss": 0.437, "step": 3356 }, { "epoch": 5.879159369527145, "grad_norm": 0.16781379282474518, "learning_rate": 1.0634650961981462e-08, "loss": 0.433, "step": 3357 }, { "epoch": 5.880910683012259, "grad_norm": 0.17331275343894958, "learning_rate": 1.0328739313776869e-08, "loss": 0.4348, "step": 3358 }, { "epoch": 5.882661996497373, "grad_norm": 0.1685137301683426, "learning_rate": 1.0027287420192322e-08, "loss": 0.4379, "step": 3359 }, { "epoch": 5.884413309982487, "grad_norm": 0.16752517223358154, "learning_rate": 9.730295550663959e-09, "loss": 0.4301, "step": 3360 }, { "epoch": 5.886164623467601, "grad_norm": 0.16789115965366364, "learning_rate": 9.437763970642223e-09, "loss": 0.4503, "step": 3361 }, { "epoch": 5.887915936952714, "grad_norm": 0.16929715871810913, "learning_rate": 9.149692941590738e-09, "loss": 0.4354, "step": 3362 }, { "epoch": 5.889667250437828, "grad_norm": 0.17013132572174072, "learning_rate": 8.866082720987989e-09, "loss": 0.4316, "step": 3363 }, { "epoch": 5.891418563922942, "grad_norm": 0.17241939902305603, "learning_rate": 8.58693356232232e-09, "loss": 0.4535, "step": 3364 }, { "epoch": 5.893169877408056, "grad_norm": 0.16884790360927582, "learning_rate": 8.312245715097478e-09, "loss": 0.4345, "step": 3365 }, { "epoch": 5.89492119089317, "grad_norm": 0.17026962339878082, "learning_rate": 8.042019424828739e-09, "loss": 0.4322, "step": 3366 }, { "epoch": 5.896672504378284, "grad_norm": 0.17954979836940765, "learning_rate": 7.776254933043458e-09, "loss": 0.4259, "step": 3367 }, { "epoch": 5.898423817863398, "grad_norm": 0.17374953627586365, "learning_rate": 7.514952477281067e-09, "loss": 0.4306, "step": 3368 }, { "epoch": 5.900175131348512, "grad_norm": 0.17606769502162933, "learning_rate": 7.258112291093078e-09, "loss": 0.4427, "step": 3369 }, { "epoch": 5.9019264448336255, "grad_norm": 0.17232105135917664, "learning_rate": 7.005734604041969e-09, "loss": 0.4378, "step": 3370 }, { "epoch": 5.903677758318739, "grad_norm": 0.17063938081264496, "learning_rate": 6.757819641703966e-09, "loss": 0.4482, "step": 3371 }, { "epoch": 5.905429071803853, "grad_norm": 0.16984008252620697, "learning_rate": 6.514367625662932e-09, "loss": 0.4379, "step": 3372 }, { "epoch": 5.907180385288966, "grad_norm": 0.1703241914510727, "learning_rate": 6.275378773516472e-09, "loss": 0.4449, "step": 3373 }, { "epoch": 5.90893169877408, "grad_norm": 0.1752474308013916, "learning_rate": 6.040853298872051e-09, "loss": 0.4332, "step": 3374 }, { "epoch": 5.910683012259194, "grad_norm": 0.17063473165035248, "learning_rate": 5.8107914113475494e-09, "loss": 0.4477, "step": 3375 }, { "epoch": 5.912434325744308, "grad_norm": 0.18454699218273163, "learning_rate": 5.585193316572368e-09, "loss": 0.4408, "step": 3376 }, { "epoch": 5.914185639229422, "grad_norm": 0.1731073409318924, "learning_rate": 5.364059216185213e-09, "loss": 0.448, "step": 3377 }, { "epoch": 5.915936952714536, "grad_norm": 0.1695115566253662, "learning_rate": 5.147389307834649e-09, "loss": 0.4378, "step": 3378 }, { "epoch": 5.91768826619965, "grad_norm": 0.17055736482143402, "learning_rate": 4.935183785180209e-09, "loss": 0.4332, "step": 3379 }, { "epoch": 5.919439579684764, "grad_norm": 0.17015299201011658, "learning_rate": 4.727442837890173e-09, "loss": 0.4459, "step": 3380 }, { "epoch": 5.921190893169878, "grad_norm": 0.18402382731437683, "learning_rate": 4.5241666516432355e-09, "loss": 0.4387, "step": 3381 }, { "epoch": 5.922942206654991, "grad_norm": 0.16788598895072937, "learning_rate": 4.325355408127396e-09, "loss": 0.4266, "step": 3382 }, { "epoch": 5.924693520140105, "grad_norm": 0.17438051104545593, "learning_rate": 4.131009285038845e-09, "loss": 0.4456, "step": 3383 }, { "epoch": 5.926444833625219, "grad_norm": 0.17182444036006927, "learning_rate": 3.941128456083631e-09, "loss": 0.4379, "step": 3384 }, { "epoch": 5.928196147110333, "grad_norm": 0.16978712379932404, "learning_rate": 3.755713090977664e-09, "loss": 0.4342, "step": 3385 }, { "epoch": 5.9299474605954465, "grad_norm": 0.17983481287956238, "learning_rate": 3.5747633554433782e-09, "loss": 0.448, "step": 3386 }, { "epoch": 5.9316987740805605, "grad_norm": 0.17474865913391113, "learning_rate": 3.3982794112141783e-09, "loss": 0.4457, "step": 3387 }, { "epoch": 5.933450087565674, "grad_norm": 0.17059017717838287, "learning_rate": 3.226261416030552e-09, "loss": 0.4462, "step": 3388 }, { "epoch": 5.935201401050788, "grad_norm": 0.17224717140197754, "learning_rate": 3.0587095236417342e-09, "loss": 0.4389, "step": 3389 }, { "epoch": 5.936952714535902, "grad_norm": 0.16956286132335663, "learning_rate": 2.8956238838051544e-09, "loss": 0.4337, "step": 3390 }, { "epoch": 5.938704028021016, "grad_norm": 0.16643649339675903, "learning_rate": 2.737004642286434e-09, "loss": 0.4393, "step": 3391 }, { "epoch": 5.94045534150613, "grad_norm": 0.18163904547691345, "learning_rate": 2.5828519408593877e-09, "loss": 0.4395, "step": 3392 }, { "epoch": 5.942206654991243, "grad_norm": 0.17112119495868683, "learning_rate": 2.4331659173038038e-09, "loss": 0.4278, "step": 3393 }, { "epoch": 5.943957968476357, "grad_norm": 0.168158158659935, "learning_rate": 2.2879467054104378e-09, "loss": 0.4294, "step": 3394 }, { "epoch": 5.945709281961471, "grad_norm": 0.17687764763832092, "learning_rate": 2.1471944349749086e-09, "loss": 0.442, "step": 3395 }, { "epoch": 5.947460595446585, "grad_norm": 0.17431797087192535, "learning_rate": 2.0109092318010283e-09, "loss": 0.4361, "step": 3396 }, { "epoch": 5.949211908931699, "grad_norm": 0.16621404886245728, "learning_rate": 1.8790912177008015e-09, "loss": 0.4296, "step": 3397 }, { "epoch": 5.950963222416813, "grad_norm": 0.1735975295305252, "learning_rate": 1.7517405104922057e-09, "loss": 0.436, "step": 3398 }, { "epoch": 5.952714535901927, "grad_norm": 0.16897691786289215, "learning_rate": 1.6288572240014123e-09, "loss": 0.4354, "step": 3399 }, { "epoch": 5.954465849387041, "grad_norm": 0.16867290437221527, "learning_rate": 1.510441468061119e-09, "loss": 0.4328, "step": 3400 }, { "epoch": 5.956217162872154, "grad_norm": 0.17421278357505798, "learning_rate": 1.3964933485105526e-09, "loss": 0.4305, "step": 3401 }, { "epoch": 5.957968476357268, "grad_norm": 0.1654721349477768, "learning_rate": 1.287012967196577e-09, "loss": 0.429, "step": 3402 }, { "epoch": 5.9597197898423815, "grad_norm": 0.17075689136981964, "learning_rate": 1.1820004219725845e-09, "loss": 0.4347, "step": 3403 }, { "epoch": 5.961471103327495, "grad_norm": 0.16690115630626678, "learning_rate": 1.0814558066979397e-09, "loss": 0.4274, "step": 3404 }, { "epoch": 5.963222416812609, "grad_norm": 0.17001157999038696, "learning_rate": 9.853792112402005e-10, "loss": 0.4492, "step": 3405 }, { "epoch": 5.964973730297723, "grad_norm": 0.16445551812648773, "learning_rate": 8.937707214712321e-10, "loss": 0.4416, "step": 3406 }, { "epoch": 5.966725043782837, "grad_norm": 0.16672343015670776, "learning_rate": 8.066304192710927e-10, "loss": 0.4393, "step": 3407 }, { "epoch": 5.968476357267951, "grad_norm": 0.1742452085018158, "learning_rate": 7.239583825252583e-10, "loss": 0.4463, "step": 3408 }, { "epoch": 5.970227670753065, "grad_norm": 0.1736869513988495, "learning_rate": 6.457546851262875e-10, "loss": 0.4408, "step": 3409 }, { "epoch": 5.971978984238179, "grad_norm": 0.1674644649028778, "learning_rate": 5.720193969716015e-10, "loss": 0.4387, "step": 3410 }, { "epoch": 5.973730297723293, "grad_norm": 0.1681266874074936, "learning_rate": 5.027525839662594e-10, "loss": 0.4271, "step": 3411 }, { "epoch": 5.975481611208406, "grad_norm": 0.171678364276886, "learning_rate": 4.3795430802073825e-10, "loss": 0.4478, "step": 3412 }, { "epoch": 5.97723292469352, "grad_norm": 0.17598727345466614, "learning_rate": 3.7762462705093204e-10, "loss": 0.4523, "step": 3413 }, { "epoch": 5.978984238178634, "grad_norm": 0.1778467744588852, "learning_rate": 3.2176359498037325e-10, "loss": 0.4418, "step": 3414 }, { "epoch": 5.980735551663748, "grad_norm": 0.17961090803146362, "learning_rate": 2.703712617363463e-10, "loss": 0.4434, "step": 3415 }, { "epoch": 5.982486865148862, "grad_norm": 0.17788732051849365, "learning_rate": 2.234476732537738e-10, "loss": 0.4387, "step": 3416 }, { "epoch": 5.984238178633976, "grad_norm": 0.18892410397529602, "learning_rate": 1.8099287147355095e-10, "loss": 0.4437, "step": 3417 }, { "epoch": 5.9859894921190895, "grad_norm": 0.17075549066066742, "learning_rate": 1.4300689434032512e-10, "loss": 0.4384, "step": 3418 }, { "epoch": 5.987740805604203, "grad_norm": 0.17863821983337402, "learning_rate": 1.0948977580638176e-10, "loss": 0.4415, "step": 3419 }, { "epoch": 5.989492119089317, "grad_norm": 0.1691814661026001, "learning_rate": 8.044154582942388e-11, "loss": 0.4398, "step": 3420 }, { "epoch": 5.99124343257443, "grad_norm": 0.1722111999988556, "learning_rate": 5.586223037257199e-11, "loss": 0.4496, "step": 3421 }, { "epoch": 5.992994746059544, "grad_norm": 0.1697397232055664, "learning_rate": 3.575185140436421e-11, "loss": 0.438, "step": 3422 }, { "epoch": 5.994746059544658, "grad_norm": 0.16722774505615234, "learning_rate": 2.0110426900421532e-11, "loss": 0.4317, "step": 3423 }, { "epoch": 5.996497373029772, "grad_norm": 0.1661006063222885, "learning_rate": 8.937970840117161e-12, "loss": 0.4308, "step": 3424 }, { "epoch": 5.998248686514886, "grad_norm": 0.17609171569347382, "learning_rate": 2.2344932093520955e-12, "loss": 0.435, "step": 3425 }, { "epoch": 6.0, "grad_norm": 0.16996462643146515, "learning_rate": 0.0, "loss": 0.4446, "step": 3426 }, { "epoch": 6.0, "step": 3426, "total_flos": 7183858667618304.0, "train_loss": 0.4827731969808182, "train_runtime": 116347.75, "train_samples_per_second": 1.884, "train_steps_per_second": 0.029 } ], "logging_steps": 1, "max_steps": 3426, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 571, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7183858667618304.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }