{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0005646527385659, "eval_steps": 222, "global_step": 443, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002258610954263128, "grad_norm": 0.696092426776886, "learning_rate": 3.3333333333333333e-06, "loss": 2.1698, "step": 1 }, { "epoch": 0.002258610954263128, "eval_loss": 2.083223819732666, "eval_runtime": 250.3739, "eval_samples_per_second": 2.98, "eval_steps_per_second": 0.375, "step": 1 }, { "epoch": 0.004517221908526256, "grad_norm": 0.7258424758911133, "learning_rate": 6.666666666666667e-06, "loss": 2.2383, "step": 2 }, { "epoch": 0.006775832862789385, "grad_norm": 0.999031662940979, "learning_rate": 1e-05, "loss": 1.9518, "step": 3 }, { "epoch": 0.009034443817052512, "grad_norm": 0.8558158278465271, "learning_rate": 1.3333333333333333e-05, "loss": 2.3031, "step": 4 }, { "epoch": 0.01129305477131564, "grad_norm": 1.0158196687698364, "learning_rate": 1.6666666666666667e-05, "loss": 2.3383, "step": 5 }, { "epoch": 0.01355166572557877, "grad_norm": 1.3310142755508423, "learning_rate": 2e-05, "loss": 2.3352, "step": 6 }, { "epoch": 0.015810276679841896, "grad_norm": 0.7580591440200806, "learning_rate": 2.3333333333333336e-05, "loss": 1.6814, "step": 7 }, { "epoch": 0.018068887634105024, "grad_norm": 0.7518572211265564, "learning_rate": 2.6666666666666667e-05, "loss": 1.7568, "step": 8 }, { "epoch": 0.020327498588368152, "grad_norm": 2.006964683532715, "learning_rate": 3e-05, "loss": 2.6346, "step": 9 }, { "epoch": 0.02258610954263128, "grad_norm": 0.6915990114212036, "learning_rate": 3.3333333333333335e-05, "loss": 2.3084, "step": 10 }, { "epoch": 0.024844720496894408, "grad_norm": 0.8575783967971802, "learning_rate": 3.6666666666666666e-05, "loss": 2.0648, "step": 11 }, { "epoch": 0.02710333145115754, "grad_norm": 1.1422725915908813, "learning_rate": 4e-05, "loss": 2.1607, "step": 12 }, { "epoch": 0.029361942405420668, "grad_norm": 1.0447956323623657, "learning_rate": 4.3333333333333334e-05, "loss": 1.9577, "step": 13 }, { "epoch": 0.03162055335968379, "grad_norm": 1.0838192701339722, "learning_rate": 4.666666666666667e-05, "loss": 1.8649, "step": 14 }, { "epoch": 0.03387916431394692, "grad_norm": 1.2682744264602661, "learning_rate": 5e-05, "loss": 2.0382, "step": 15 }, { "epoch": 0.03613777526821005, "grad_norm": 1.5851482152938843, "learning_rate": 5.333333333333333e-05, "loss": 1.9882, "step": 16 }, { "epoch": 0.038396386222473176, "grad_norm": 1.1369619369506836, "learning_rate": 5.666666666666667e-05, "loss": 1.609, "step": 17 }, { "epoch": 0.040654997176736304, "grad_norm": 1.489020824432373, "learning_rate": 6e-05, "loss": 1.6515, "step": 18 }, { "epoch": 0.04291360813099943, "grad_norm": 1.0247050523757935, "learning_rate": 6.333333333333333e-05, "loss": 1.7473, "step": 19 }, { "epoch": 0.04517221908526256, "grad_norm": 0.913925051689148, "learning_rate": 6.666666666666667e-05, "loss": 1.7019, "step": 20 }, { "epoch": 0.04743083003952569, "grad_norm": 1.1576547622680664, "learning_rate": 7e-05, "loss": 1.5537, "step": 21 }, { "epoch": 0.049689440993788817, "grad_norm": 1.4580681324005127, "learning_rate": 7.333333333333333e-05, "loss": 1.6705, "step": 22 }, { "epoch": 0.05194805194805195, "grad_norm": 1.6036103963851929, "learning_rate": 7.666666666666667e-05, "loss": 1.6114, "step": 23 }, { "epoch": 0.05420666290231508, "grad_norm": 1.218241572380066, "learning_rate": 8e-05, "loss": 1.8225, "step": 24 }, { "epoch": 0.05646527385657821, "grad_norm": 1.334812879562378, "learning_rate": 8.333333333333334e-05, "loss": 1.6176, "step": 25 }, { "epoch": 0.058723884810841336, "grad_norm": 1.0912553071975708, "learning_rate": 8.666666666666667e-05, "loss": 1.7103, "step": 26 }, { "epoch": 0.060982495765104464, "grad_norm": 0.8983953595161438, "learning_rate": 9e-05, "loss": 1.826, "step": 27 }, { "epoch": 0.06324110671936758, "grad_norm": 1.1796810626983643, "learning_rate": 9.333333333333334e-05, "loss": 1.3892, "step": 28 }, { "epoch": 0.06549971767363072, "grad_norm": 0.9814470410346985, "learning_rate": 9.666666666666667e-05, "loss": 1.4582, "step": 29 }, { "epoch": 0.06775832862789384, "grad_norm": 1.072553038597107, "learning_rate": 0.0001, "loss": 1.4753, "step": 30 }, { "epoch": 0.07001693958215698, "grad_norm": 1.1693260669708252, "learning_rate": 9.999855343632036e-05, "loss": 1.6425, "step": 31 }, { "epoch": 0.0722755505364201, "grad_norm": 0.8432520031929016, "learning_rate": 9.999421382898329e-05, "loss": 1.6913, "step": 32 }, { "epoch": 0.07453416149068323, "grad_norm": 0.6651790738105774, "learning_rate": 9.998698142908953e-05, "loss": 1.2049, "step": 33 }, { "epoch": 0.07679277244494635, "grad_norm": 1.2236829996109009, "learning_rate": 9.997685665512418e-05, "loss": 1.3681, "step": 34 }, { "epoch": 0.07905138339920949, "grad_norm": 0.5604356527328491, "learning_rate": 9.99638400929324e-05, "loss": 1.2572, "step": 35 }, { "epoch": 0.08130999435347261, "grad_norm": 0.5676224827766418, "learning_rate": 9.994793249568569e-05, "loss": 1.3594, "step": 36 }, { "epoch": 0.08356860530773574, "grad_norm": 0.5219910144805908, "learning_rate": 9.99291347838381e-05, "loss": 1.597, "step": 37 }, { "epoch": 0.08582721626199886, "grad_norm": 0.7226600646972656, "learning_rate": 9.990744804507315e-05, "loss": 1.3577, "step": 38 }, { "epoch": 0.088085827216262, "grad_norm": 0.5216960310935974, "learning_rate": 9.988287353424077e-05, "loss": 1.7175, "step": 39 }, { "epoch": 0.09034443817052512, "grad_norm": 0.5965225100517273, "learning_rate": 9.985541267328477e-05, "loss": 1.5252, "step": 40 }, { "epoch": 0.09260304912478826, "grad_norm": 0.5782445669174194, "learning_rate": 9.98250670511605e-05, "loss": 1.9063, "step": 41 }, { "epoch": 0.09486166007905138, "grad_norm": 0.9046345353126526, "learning_rate": 9.979183842374293e-05, "loss": 1.371, "step": 42 }, { "epoch": 0.09712027103331451, "grad_norm": 0.7153818607330322, "learning_rate": 9.975572871372513e-05, "loss": 1.3616, "step": 43 }, { "epoch": 0.09937888198757763, "grad_norm": 1.479604959487915, "learning_rate": 9.971674001050686e-05, "loss": 1.4922, "step": 44 }, { "epoch": 0.10163749294184077, "grad_norm": 0.7710794806480408, "learning_rate": 9.967487457007381e-05, "loss": 1.0812, "step": 45 }, { "epoch": 0.1038961038961039, "grad_norm": 0.792734682559967, "learning_rate": 9.963013481486703e-05, "loss": 1.5412, "step": 46 }, { "epoch": 0.10615471485036702, "grad_norm": 0.6719285249710083, "learning_rate": 9.958252333364267e-05, "loss": 1.1906, "step": 47 }, { "epoch": 0.10841332580463016, "grad_norm": 1.0701615810394287, "learning_rate": 9.953204288132234e-05, "loss": 1.3426, "step": 48 }, { "epoch": 0.11067193675889328, "grad_norm": 1.067221760749817, "learning_rate": 9.947869637883358e-05, "loss": 1.435, "step": 49 }, { "epoch": 0.11293054771315642, "grad_norm": 0.6624462008476257, "learning_rate": 9.942248691294093e-05, "loss": 1.148, "step": 50 }, { "epoch": 0.11518915866741954, "grad_norm": 1.10640549659729, "learning_rate": 9.936341773606723e-05, "loss": 1.4428, "step": 51 }, { "epoch": 0.11744776962168267, "grad_norm": 0.7549408674240112, "learning_rate": 9.930149226610554e-05, "loss": 1.4182, "step": 52 }, { "epoch": 0.11970638057594579, "grad_norm": 0.6992340683937073, "learning_rate": 9.923671408622129e-05, "loss": 1.3721, "step": 53 }, { "epoch": 0.12196499153020893, "grad_norm": 0.6394023895263672, "learning_rate": 9.916908694464492e-05, "loss": 1.5456, "step": 54 }, { "epoch": 0.12422360248447205, "grad_norm": 0.8388894200325012, "learning_rate": 9.909861475445517e-05, "loss": 1.6266, "step": 55 }, { "epoch": 0.12648221343873517, "grad_norm": 0.5841717720031738, "learning_rate": 9.902530159335243e-05, "loss": 1.6483, "step": 56 }, { "epoch": 0.1287408243929983, "grad_norm": 0.635645866394043, "learning_rate": 9.894915170342295e-05, "loss": 1.2798, "step": 57 }, { "epoch": 0.13099943534726144, "grad_norm": 0.8329370617866516, "learning_rate": 9.887016949089333e-05, "loss": 1.5148, "step": 58 }, { "epoch": 0.13325804630152457, "grad_norm": 0.7651770710945129, "learning_rate": 9.878835952587559e-05, "loss": 1.4382, "step": 59 }, { "epoch": 0.13551665725578768, "grad_norm": 0.6698804497718811, "learning_rate": 9.870372654210265e-05, "loss": 1.4922, "step": 60 }, { "epoch": 0.13777526821005082, "grad_norm": 0.807618260383606, "learning_rate": 9.861627543665456e-05, "loss": 1.8907, "step": 61 }, { "epoch": 0.14003387916431395, "grad_norm": 0.610428512096405, "learning_rate": 9.852601126967502e-05, "loss": 1.4187, "step": 62 }, { "epoch": 0.1422924901185771, "grad_norm": 0.5798942446708679, "learning_rate": 9.843293926407866e-05, "loss": 1.4308, "step": 63 }, { "epoch": 0.1445511010728402, "grad_norm": 0.5696601271629333, "learning_rate": 9.833706480524878e-05, "loss": 1.6111, "step": 64 }, { "epoch": 0.14680971202710333, "grad_norm": 0.5774890780448914, "learning_rate": 9.82383934407258e-05, "loss": 1.2417, "step": 65 }, { "epoch": 0.14906832298136646, "grad_norm": 0.6529080271720886, "learning_rate": 9.81369308798862e-05, "loss": 1.3103, "step": 66 }, { "epoch": 0.1513269339356296, "grad_norm": 0.47429534792900085, "learning_rate": 9.803268299361217e-05, "loss": 1.2813, "step": 67 }, { "epoch": 0.1535855448898927, "grad_norm": 0.9300480484962463, "learning_rate": 9.7925655813952e-05, "loss": 1.5553, "step": 68 }, { "epoch": 0.15584415584415584, "grad_norm": 0.6896660327911377, "learning_rate": 9.781585553377085e-05, "loss": 1.3391, "step": 69 }, { "epoch": 0.15810276679841898, "grad_norm": 0.513652503490448, "learning_rate": 9.770328850639268e-05, "loss": 1.2952, "step": 70 }, { "epoch": 0.1603613777526821, "grad_norm": 0.6096103191375732, "learning_rate": 9.758796124523239e-05, "loss": 1.6856, "step": 71 }, { "epoch": 0.16261998870694522, "grad_norm": 0.9341354370117188, "learning_rate": 9.746988042341906e-05, "loss": 1.2378, "step": 72 }, { "epoch": 0.16487859966120835, "grad_norm": 0.616295337677002, "learning_rate": 9.734905287340985e-05, "loss": 1.4577, "step": 73 }, { "epoch": 0.1671372106154715, "grad_norm": 0.5131121277809143, "learning_rate": 9.722548558659457e-05, "loss": 1.6515, "step": 74 }, { "epoch": 0.16939582156973462, "grad_norm": 0.7272030711174011, "learning_rate": 9.709918571289114e-05, "loss": 1.3936, "step": 75 }, { "epoch": 0.17165443252399773, "grad_norm": 0.6986638903617859, "learning_rate": 9.697016056033201e-05, "loss": 1.5857, "step": 76 }, { "epoch": 0.17391304347826086, "grad_norm": 0.5578714609146118, "learning_rate": 9.683841759464113e-05, "loss": 1.4122, "step": 77 }, { "epoch": 0.176171654432524, "grad_norm": 0.49430081248283386, "learning_rate": 9.670396443880208e-05, "loss": 1.5742, "step": 78 }, { "epoch": 0.17843026538678713, "grad_norm": 0.7834444642066956, "learning_rate": 9.656680887261693e-05, "loss": 1.4708, "step": 79 }, { "epoch": 0.18068887634105024, "grad_norm": 0.7123986482620239, "learning_rate": 9.64269588322561e-05, "loss": 1.6196, "step": 80 }, { "epoch": 0.18294748729531338, "grad_norm": 0.7276789546012878, "learning_rate": 9.628442240979916e-05, "loss": 1.38, "step": 81 }, { "epoch": 0.1852060982495765, "grad_norm": 0.580315351486206, "learning_rate": 9.613920785276656e-05, "loss": 1.62, "step": 82 }, { "epoch": 0.18746470920383965, "grad_norm": 0.8134167790412903, "learning_rate": 9.599132356364247e-05, "loss": 1.3458, "step": 83 }, { "epoch": 0.18972332015810275, "grad_norm": 0.9041950702667236, "learning_rate": 9.584077809938855e-05, "loss": 1.3943, "step": 84 }, { "epoch": 0.1919819311123659, "grad_norm": 0.6255788207054138, "learning_rate": 9.568758017094883e-05, "loss": 1.2289, "step": 85 }, { "epoch": 0.19424054206662902, "grad_norm": 0.5172642469406128, "learning_rate": 9.553173864274567e-05, "loss": 1.4261, "step": 86 }, { "epoch": 0.19649915302089216, "grad_norm": 0.4685104489326477, "learning_rate": 9.537326253216685e-05, "loss": 1.4084, "step": 87 }, { "epoch": 0.19875776397515527, "grad_norm": 0.5454607605934143, "learning_rate": 9.521216100904378e-05, "loss": 1.5901, "step": 88 }, { "epoch": 0.2010163749294184, "grad_norm": 0.5021257996559143, "learning_rate": 9.504844339512095e-05, "loss": 1.4189, "step": 89 }, { "epoch": 0.20327498588368154, "grad_norm": 0.6921396851539612, "learning_rate": 9.488211916351656e-05, "loss": 1.3857, "step": 90 }, { "epoch": 0.20553359683794467, "grad_norm": 0.6917213797569275, "learning_rate": 9.471319793817426e-05, "loss": 1.6815, "step": 91 }, { "epoch": 0.2077922077922078, "grad_norm": 1.099507451057434, "learning_rate": 9.454168949330645e-05, "loss": 1.4097, "step": 92 }, { "epoch": 0.2100508187464709, "grad_norm": 0.5121451616287231, "learning_rate": 9.436760375282859e-05, "loss": 1.4267, "step": 93 }, { "epoch": 0.21230942970073405, "grad_norm": 0.5218266248703003, "learning_rate": 9.419095078978506e-05, "loss": 1.2902, "step": 94 }, { "epoch": 0.21456804065499718, "grad_norm": 0.5271095633506775, "learning_rate": 9.40117408257663e-05, "loss": 1.402, "step": 95 }, { "epoch": 0.21682665160926032, "grad_norm": 0.5236210227012634, "learning_rate": 9.382998423031727e-05, "loss": 1.7724, "step": 96 }, { "epoch": 0.21908526256352343, "grad_norm": 0.5875529646873474, "learning_rate": 9.364569152033756e-05, "loss": 1.7372, "step": 97 }, { "epoch": 0.22134387351778656, "grad_norm": 0.6892306208610535, "learning_rate": 9.345887335947281e-05, "loss": 1.6303, "step": 98 }, { "epoch": 0.2236024844720497, "grad_norm": 0.5231241583824158, "learning_rate": 9.326954055749767e-05, "loss": 1.618, "step": 99 }, { "epoch": 0.22586109542631283, "grad_norm": 0.6638787388801575, "learning_rate": 9.30777040696903e-05, "loss": 1.1699, "step": 100 }, { "epoch": 0.22811970638057594, "grad_norm": 0.604011058807373, "learning_rate": 9.288337499619857e-05, "loss": 1.3338, "step": 101 }, { "epoch": 0.23037831733483907, "grad_norm": 0.5968809127807617, "learning_rate": 9.268656458139762e-05, "loss": 1.4609, "step": 102 }, { "epoch": 0.2326369282891022, "grad_norm": 0.5389347672462463, "learning_rate": 9.248728421323941e-05, "loss": 1.5215, "step": 103 }, { "epoch": 0.23489553924336534, "grad_norm": 0.712431013584137, "learning_rate": 9.22855454225936e-05, "loss": 1.0865, "step": 104 }, { "epoch": 0.23715415019762845, "grad_norm": 0.7650911808013916, "learning_rate": 9.208135988258051e-05, "loss": 1.478, "step": 105 }, { "epoch": 0.23941276115189158, "grad_norm": 0.9736595153808594, "learning_rate": 9.187473940789557e-05, "loss": 1.4201, "step": 106 }, { "epoch": 0.24167137210615472, "grad_norm": 0.703115701675415, "learning_rate": 9.166569595412575e-05, "loss": 1.3264, "step": 107 }, { "epoch": 0.24392998306041785, "grad_norm": 0.9081875085830688, "learning_rate": 9.145424161705776e-05, "loss": 1.3541, "step": 108 }, { "epoch": 0.24618859401468096, "grad_norm": 0.5371877551078796, "learning_rate": 9.124038863197818e-05, "loss": 1.4782, "step": 109 }, { "epoch": 0.2484472049689441, "grad_norm": 0.5240530967712402, "learning_rate": 9.10241493729654e-05, "loss": 1.5567, "step": 110 }, { "epoch": 0.25070581592320723, "grad_norm": 0.6024327874183655, "learning_rate": 9.08055363521738e-05, "loss": 1.565, "step": 111 }, { "epoch": 0.25296442687747034, "grad_norm": 0.6612941026687622, "learning_rate": 9.058456221910956e-05, "loss": 1.489, "step": 112 }, { "epoch": 0.2552230378317335, "grad_norm": 0.6889235377311707, "learning_rate": 9.036123975989892e-05, "loss": 1.4708, "step": 113 }, { "epoch": 0.2574816487859966, "grad_norm": 0.611516535282135, "learning_rate": 9.013558189654819e-05, "loss": 1.4802, "step": 114 }, { "epoch": 0.2597402597402597, "grad_norm": 0.5951672196388245, "learning_rate": 8.990760168619615e-05, "loss": 1.2703, "step": 115 }, { "epoch": 0.2619988706945229, "grad_norm": 0.670804500579834, "learning_rate": 8.967731232035847e-05, "loss": 1.8547, "step": 116 }, { "epoch": 0.264257481648786, "grad_norm": 0.4999150037765503, "learning_rate": 8.944472712416447e-05, "loss": 1.44, "step": 117 }, { "epoch": 0.26651609260304915, "grad_norm": 0.5740485191345215, "learning_rate": 8.9209859555586e-05, "loss": 1.2195, "step": 118 }, { "epoch": 0.26877470355731226, "grad_norm": 0.5336365103721619, "learning_rate": 8.897272320465887e-05, "loss": 1.4808, "step": 119 }, { "epoch": 0.27103331451157536, "grad_norm": 0.5398093461990356, "learning_rate": 8.873333179269635e-05, "loss": 1.4662, "step": 120 }, { "epoch": 0.2732919254658385, "grad_norm": 0.5843866467475891, "learning_rate": 8.849169917149531e-05, "loss": 1.6004, "step": 121 }, { "epoch": 0.27555053642010163, "grad_norm": 0.5099990963935852, "learning_rate": 8.82478393225347e-05, "loss": 1.6265, "step": 122 }, { "epoch": 0.27780914737436474, "grad_norm": 0.6847774386405945, "learning_rate": 8.800176635616657e-05, "loss": 1.7536, "step": 123 }, { "epoch": 0.2800677583286279, "grad_norm": 0.6712337732315063, "learning_rate": 8.775349451079948e-05, "loss": 1.3252, "step": 124 }, { "epoch": 0.282326369282891, "grad_norm": 0.7772756814956665, "learning_rate": 8.750303815207486e-05, "loss": 1.6567, "step": 125 }, { "epoch": 0.2845849802371542, "grad_norm": 0.6570346355438232, "learning_rate": 8.725041177203554e-05, "loss": 1.2232, "step": 126 }, { "epoch": 0.2868435911914173, "grad_norm": 0.6152611374855042, "learning_rate": 8.699562998828738e-05, "loss": 1.42, "step": 127 }, { "epoch": 0.2891022021456804, "grad_norm": 0.5262449979782104, "learning_rate": 8.673870754315336e-05, "loss": 1.3769, "step": 128 }, { "epoch": 0.29136081309994355, "grad_norm": 0.644555926322937, "learning_rate": 8.647965930282059e-05, "loss": 1.4069, "step": 129 }, { "epoch": 0.29361942405420666, "grad_norm": 0.74615079164505, "learning_rate": 8.621850025648009e-05, "loss": 1.2758, "step": 130 }, { "epoch": 0.29587803500846976, "grad_norm": 1.079114317893982, "learning_rate": 8.59552455154595e-05, "loss": 1.7415, "step": 131 }, { "epoch": 0.2981366459627329, "grad_norm": 0.522380530834198, "learning_rate": 8.56899103123487e-05, "loss": 1.3716, "step": 132 }, { "epoch": 0.30039525691699603, "grad_norm": 0.6177389621734619, "learning_rate": 8.54225100001184e-05, "loss": 1.2745, "step": 133 }, { "epoch": 0.3026538678712592, "grad_norm": 0.5246083736419678, "learning_rate": 8.51530600512318e-05, "loss": 1.4917, "step": 134 }, { "epoch": 0.3049124788255223, "grad_norm": 1.4917463064193726, "learning_rate": 8.488157605674925e-05, "loss": 1.2877, "step": 135 }, { "epoch": 0.3071710897797854, "grad_norm": 0.5848981738090515, "learning_rate": 8.460807372542618e-05, "loss": 1.2985, "step": 136 }, { "epoch": 0.3094297007340486, "grad_norm": 0.8174607157707214, "learning_rate": 8.43325688828042e-05, "loss": 1.183, "step": 137 }, { "epoch": 0.3116883116883117, "grad_norm": 0.467540442943573, "learning_rate": 8.405507747029523e-05, "loss": 1.3237, "step": 138 }, { "epoch": 0.31394692264257484, "grad_norm": 0.43935471773147583, "learning_rate": 8.377561554425922e-05, "loss": 1.1914, "step": 139 }, { "epoch": 0.31620553359683795, "grad_norm": 0.5460615754127502, "learning_rate": 8.349419927507505e-05, "loss": 1.474, "step": 140 }, { "epoch": 0.31846414455110106, "grad_norm": 0.5589008331298828, "learning_rate": 8.321084494620488e-05, "loss": 1.5669, "step": 141 }, { "epoch": 0.3207227555053642, "grad_norm": 0.5428789854049683, "learning_rate": 8.292556895325194e-05, "loss": 1.2317, "step": 142 }, { "epoch": 0.32298136645962733, "grad_norm": 0.5516197085380554, "learning_rate": 8.263838780301182e-05, "loss": 1.374, "step": 143 }, { "epoch": 0.32523997741389044, "grad_norm": 0.6941014528274536, "learning_rate": 8.234931811251739e-05, "loss": 1.1839, "step": 144 }, { "epoch": 0.3274985883681536, "grad_norm": 0.7662340402603149, "learning_rate": 8.205837660807725e-05, "loss": 1.6056, "step": 145 }, { "epoch": 0.3297571993224167, "grad_norm": 0.6064298748970032, "learning_rate": 8.176558012430791e-05, "loss": 1.2736, "step": 146 }, { "epoch": 0.33201581027667987, "grad_norm": 0.7128156423568726, "learning_rate": 8.147094560315977e-05, "loss": 1.156, "step": 147 }, { "epoch": 0.334274421230943, "grad_norm": 0.5706272721290588, "learning_rate": 8.117449009293668e-05, "loss": 1.3529, "step": 148 }, { "epoch": 0.3365330321852061, "grad_norm": 0.4535754919052124, "learning_rate": 8.08762307473096e-05, "loss": 1.2878, "step": 149 }, { "epoch": 0.33879164313946925, "grad_norm": 0.5142524242401123, "learning_rate": 8.057618482432399e-05, "loss": 1.4953, "step": 150 }, { "epoch": 0.34105025409373235, "grad_norm": 0.4920322895050049, "learning_rate": 8.027436968540123e-05, "loss": 1.4318, "step": 151 }, { "epoch": 0.34330886504799546, "grad_norm": 0.5119665861129761, "learning_rate": 7.997080279433402e-05, "loss": 1.2697, "step": 152 }, { "epoch": 0.3455674760022586, "grad_norm": 0.6550598740577698, "learning_rate": 7.966550171627592e-05, "loss": 1.3713, "step": 153 }, { "epoch": 0.34782608695652173, "grad_norm": 0.47998011112213135, "learning_rate": 7.9358484116725e-05, "loss": 1.3111, "step": 154 }, { "epoch": 0.3500846979107849, "grad_norm": 0.535916268825531, "learning_rate": 7.904976776050156e-05, "loss": 1.5861, "step": 155 }, { "epoch": 0.352343308865048, "grad_norm": 0.4098435640335083, "learning_rate": 7.873937051072035e-05, "loss": 1.1488, "step": 156 }, { "epoch": 0.3546019198193111, "grad_norm": 0.6115076541900635, "learning_rate": 7.842731032775687e-05, "loss": 1.4248, "step": 157 }, { "epoch": 0.35686053077357427, "grad_norm": 0.6232655644416809, "learning_rate": 7.81136052682082e-05, "loss": 1.3005, "step": 158 }, { "epoch": 0.3591191417278374, "grad_norm": 0.9366422295570374, "learning_rate": 7.779827348384813e-05, "loss": 1.6395, "step": 159 }, { "epoch": 0.3613777526821005, "grad_norm": 0.709642231464386, "learning_rate": 7.748133322057693e-05, "loss": 1.5304, "step": 160 }, { "epoch": 0.36363636363636365, "grad_norm": 0.5911598205566406, "learning_rate": 7.716280281736551e-05, "loss": 1.6141, "step": 161 }, { "epoch": 0.36589497459062675, "grad_norm": 0.45050862431526184, "learning_rate": 7.68427007051944e-05, "loss": 1.3448, "step": 162 }, { "epoch": 0.3681535855448899, "grad_norm": 0.5454220771789551, "learning_rate": 7.652104540598712e-05, "loss": 1.2553, "step": 163 }, { "epoch": 0.370412196499153, "grad_norm": 0.5132555365562439, "learning_rate": 7.619785553153864e-05, "loss": 1.4322, "step": 164 }, { "epoch": 0.37267080745341613, "grad_norm": 0.48642510175704956, "learning_rate": 7.58731497824383e-05, "loss": 1.2141, "step": 165 }, { "epoch": 0.3749294184076793, "grad_norm": 0.6442435383796692, "learning_rate": 7.554694694698784e-05, "loss": 1.3274, "step": 166 }, { "epoch": 0.3771880293619424, "grad_norm": 0.6116953492164612, "learning_rate": 7.521926590011418e-05, "loss": 1.5006, "step": 167 }, { "epoch": 0.3794466403162055, "grad_norm": 3.9714138507843018, "learning_rate": 7.489012560227742e-05, "loss": 1.193, "step": 168 }, { "epoch": 0.38170525127046867, "grad_norm": 0.5512414574623108, "learning_rate": 7.455954509837352e-05, "loss": 1.4702, "step": 169 }, { "epoch": 0.3839638622247318, "grad_norm": 0.5597122311592102, "learning_rate": 7.422754351663252e-05, "loss": 1.4636, "step": 170 }, { "epoch": 0.38622247317899494, "grad_norm": 0.4637336730957031, "learning_rate": 7.389414006751158e-05, "loss": 1.4661, "step": 171 }, { "epoch": 0.38848108413325805, "grad_norm": 1.08772873878479, "learning_rate": 7.355935404258354e-05, "loss": 1.4758, "step": 172 }, { "epoch": 0.39073969508752115, "grad_norm": 0.537563681602478, "learning_rate": 7.322320481342054e-05, "loss": 1.4603, "step": 173 }, { "epoch": 0.3929983060417843, "grad_norm": 0.6120302677154541, "learning_rate": 7.288571183047322e-05, "loss": 1.1336, "step": 174 }, { "epoch": 0.3952569169960474, "grad_norm": 0.5834792852401733, "learning_rate": 7.254689462194522e-05, "loss": 1.1359, "step": 175 }, { "epoch": 0.39751552795031053, "grad_norm": 0.5482034683227539, "learning_rate": 7.220677279266327e-05, "loss": 1.416, "step": 176 }, { "epoch": 0.3997741389045737, "grad_norm": 0.6085582375526428, "learning_rate": 7.186536602294278e-05, "loss": 1.5104, "step": 177 }, { "epoch": 0.4020327498588368, "grad_norm": 0.5022709369659424, "learning_rate": 7.152269406744903e-05, "loss": 1.8106, "step": 178 }, { "epoch": 0.40429136081309996, "grad_norm": 0.6673846244812012, "learning_rate": 7.117877675405427e-05, "loss": 1.4334, "step": 179 }, { "epoch": 0.40654997176736307, "grad_norm": 0.6298004388809204, "learning_rate": 7.083363398269022e-05, "loss": 1.4593, "step": 180 }, { "epoch": 0.4088085827216262, "grad_norm": 0.5097036361694336, "learning_rate": 7.04872857241968e-05, "loss": 1.3969, "step": 181 }, { "epoch": 0.41106719367588934, "grad_norm": 0.5180112719535828, "learning_rate": 7.013975201916648e-05, "loss": 1.7094, "step": 182 }, { "epoch": 0.41332580463015245, "grad_norm": 0.803783655166626, "learning_rate": 6.979105297678462e-05, "loss": 1.4688, "step": 183 }, { "epoch": 0.4155844155844156, "grad_norm": 0.5442876219749451, "learning_rate": 6.944120877366604e-05, "loss": 1.5643, "step": 184 }, { "epoch": 0.4178430265386787, "grad_norm": 0.6098489165306091, "learning_rate": 6.909023965268746e-05, "loss": 1.3152, "step": 185 }, { "epoch": 0.4201016374929418, "grad_norm": 0.8846858143806458, "learning_rate": 6.873816592181617e-05, "loss": 1.1858, "step": 186 }, { "epoch": 0.422360248447205, "grad_norm": 0.5681151747703552, "learning_rate": 6.838500795293505e-05, "loss": 1.3645, "step": 187 }, { "epoch": 0.4246188594014681, "grad_norm": 0.5397717356681824, "learning_rate": 6.803078618066378e-05, "loss": 1.4444, "step": 188 }, { "epoch": 0.4268774703557312, "grad_norm": 0.5300354361534119, "learning_rate": 6.767552110117631e-05, "loss": 1.4324, "step": 189 }, { "epoch": 0.42913608130999437, "grad_norm": 0.4856337010860443, "learning_rate": 6.73192332710151e-05, "loss": 1.3462, "step": 190 }, { "epoch": 0.4313946922642575, "grad_norm": 0.6010419130325317, "learning_rate": 6.696194330590151e-05, "loss": 1.6966, "step": 191 }, { "epoch": 0.43365330321852064, "grad_norm": 0.5775107145309448, "learning_rate": 6.660367187954304e-05, "loss": 1.4229, "step": 192 }, { "epoch": 0.43591191417278374, "grad_norm": 0.6146912574768066, "learning_rate": 6.624443972243698e-05, "loss": 1.2737, "step": 193 }, { "epoch": 0.43817052512704685, "grad_norm": 1.4060486555099487, "learning_rate": 6.5884267620671e-05, "loss": 1.2221, "step": 194 }, { "epoch": 0.44042913608131, "grad_norm": 1.0087642669677734, "learning_rate": 6.552317641472026e-05, "loss": 1.504, "step": 195 }, { "epoch": 0.4426877470355731, "grad_norm": 0.4778720438480377, "learning_rate": 6.516118699824178e-05, "loss": 1.2763, "step": 196 }, { "epoch": 0.4449463579898362, "grad_norm": 0.5498337745666504, "learning_rate": 6.479832031686521e-05, "loss": 1.4852, "step": 197 }, { "epoch": 0.4472049689440994, "grad_norm": 0.5609453916549683, "learning_rate": 6.443459736698105e-05, "loss": 1.6017, "step": 198 }, { "epoch": 0.4494635798983625, "grad_norm": 0.5456724166870117, "learning_rate": 6.407003919452564e-05, "loss": 1.2885, "step": 199 }, { "epoch": 0.45172219085262566, "grad_norm": 0.7034818530082703, "learning_rate": 6.370466689376342e-05, "loss": 1.4856, "step": 200 }, { "epoch": 0.45398080180688877, "grad_norm": 0.5647463202476501, "learning_rate": 6.33385016060664e-05, "loss": 1.745, "step": 201 }, { "epoch": 0.4562394127611519, "grad_norm": 0.5447326302528381, "learning_rate": 6.297156451869082e-05, "loss": 1.407, "step": 202 }, { "epoch": 0.45849802371541504, "grad_norm": 0.5139984488487244, "learning_rate": 6.260387686355121e-05, "loss": 1.3284, "step": 203 }, { "epoch": 0.46075663466967814, "grad_norm": 0.44981297850608826, "learning_rate": 6.223545991599184e-05, "loss": 1.4727, "step": 204 }, { "epoch": 0.46301524562394125, "grad_norm": 0.508630096912384, "learning_rate": 6.186633499355576e-05, "loss": 1.6505, "step": 205 }, { "epoch": 0.4652738565782044, "grad_norm": 0.5300900340080261, "learning_rate": 6.149652345475118e-05, "loss": 1.3005, "step": 206 }, { "epoch": 0.4675324675324675, "grad_norm": 0.8527065515518188, "learning_rate": 6.112604669781572e-05, "loss": 1.4615, "step": 207 }, { "epoch": 0.4697910784867307, "grad_norm": 0.6106131672859192, "learning_rate": 6.075492615947823e-05, "loss": 1.3441, "step": 208 }, { "epoch": 0.4720496894409938, "grad_norm": 0.4431338608264923, "learning_rate": 6.038318331371836e-05, "loss": 1.5369, "step": 209 }, { "epoch": 0.4743083003952569, "grad_norm": 0.6845422983169556, "learning_rate": 6.001083967052408e-05, "loss": 1.6926, "step": 210 }, { "epoch": 0.47656691134952006, "grad_norm": 0.48047032952308655, "learning_rate": 5.963791677464696e-05, "loss": 1.7314, "step": 211 }, { "epoch": 0.47882552230378317, "grad_norm": 0.7335491180419922, "learning_rate": 5.9264436204355724e-05, "loss": 1.2773, "step": 212 }, { "epoch": 0.4810841332580463, "grad_norm": 0.7190502285957336, "learning_rate": 5.889041957018745e-05, "loss": 1.2519, "step": 213 }, { "epoch": 0.48334274421230944, "grad_norm": 0.547556459903717, "learning_rate": 5.85158885136973e-05, "loss": 1.1781, "step": 214 }, { "epoch": 0.48560135516657255, "grad_norm": 0.6162766814231873, "learning_rate": 5.81408647062062e-05, "loss": 1.4605, "step": 215 }, { "epoch": 0.4878599661208357, "grad_norm": 0.7362288236618042, "learning_rate": 5.7765369847546916e-05, "loss": 1.5264, "step": 216 }, { "epoch": 0.4901185770750988, "grad_norm": 0.8551615476608276, "learning_rate": 5.7389425664808396e-05, "loss": 1.7111, "step": 217 }, { "epoch": 0.4923771880293619, "grad_norm": 0.5673311948776245, "learning_rate": 5.7013053911078677e-05, "loss": 1.4621, "step": 218 }, { "epoch": 0.4946357989836251, "grad_norm": 0.4883188307285309, "learning_rate": 5.6636276364186105e-05, "loss": 1.4759, "step": 219 }, { "epoch": 0.4968944099378882, "grad_norm": 0.5014625191688538, "learning_rate": 5.6259114825439275e-05, "loss": 1.5336, "step": 220 }, { "epoch": 0.4991530208921513, "grad_norm": 0.5976287722587585, "learning_rate": 5.588159111836553e-05, "loss": 1.3577, "step": 221 }, { "epoch": 0.5014116318464145, "grad_norm": 0.5646213889122009, "learning_rate": 5.550372708744815e-05, "loss": 1.4191, "step": 222 }, { "epoch": 0.5014116318464145, "eval_loss": 1.4101738929748535, "eval_runtime": 96.1988, "eval_samples_per_second": 7.755, "eval_steps_per_second": 0.977, "step": 222 }, { "epoch": 0.5036702428006776, "grad_norm": 0.48547643423080444, "learning_rate": 5.51255445968625e-05, "loss": 1.4118, "step": 223 }, { "epoch": 0.5059288537549407, "grad_norm": 0.7442102432250977, "learning_rate": 5.4747065529210736e-05, "loss": 1.3571, "step": 224 }, { "epoch": 0.5081874647092038, "grad_norm": 0.6088778376579285, "learning_rate": 5.436831178425582e-05, "loss": 1.3432, "step": 225 }, { "epoch": 0.510446075663467, "grad_norm": 0.8041794896125793, "learning_rate": 5.3989305277654156e-05, "loss": 1.4741, "step": 226 }, { "epoch": 0.51270468661773, "grad_norm": 0.47421059012413025, "learning_rate": 5.361006793968764e-05, "loss": 1.2545, "step": 227 }, { "epoch": 0.5149632975719932, "grad_norm": 0.6154189109802246, "learning_rate": 5.32306217139946e-05, "loss": 1.4776, "step": 228 }, { "epoch": 0.5172219085262564, "grad_norm": 0.5990487337112427, "learning_rate": 5.28509885563002e-05, "loss": 1.2825, "step": 229 }, { "epoch": 0.5194805194805194, "grad_norm": 0.6563726663589478, "learning_rate": 5.247119043314592e-05, "loss": 1.4692, "step": 230 }, { "epoch": 0.5217391304347826, "grad_norm": 0.5759412050247192, "learning_rate": 5.209124932061862e-05, "loss": 1.599, "step": 231 }, { "epoch": 0.5239977413890458, "grad_norm": 0.5122321248054504, "learning_rate": 5.1711187203078824e-05, "loss": 1.4495, "step": 232 }, { "epoch": 0.5262563523433089, "grad_norm": 0.5724780559539795, "learning_rate": 5.133102607188874e-05, "loss": 1.4343, "step": 233 }, { "epoch": 0.528514963297572, "grad_norm": 0.6793734431266785, "learning_rate": 5.0950787924139764e-05, "loss": 1.2671, "step": 234 }, { "epoch": 0.5307735742518351, "grad_norm": 0.5128283500671387, "learning_rate": 5.057049476137967e-05, "loss": 1.1848, "step": 235 }, { "epoch": 0.5330321852060983, "grad_norm": 0.5823889970779419, "learning_rate": 5.0190168588339536e-05, "loss": 1.2724, "step": 236 }, { "epoch": 0.5352907961603613, "grad_norm": 0.41200748085975647, "learning_rate": 4.9809831411660476e-05, "loss": 1.3234, "step": 237 }, { "epoch": 0.5375494071146245, "grad_norm": 0.5784906148910522, "learning_rate": 4.942950523862033e-05, "loss": 1.4, "step": 238 }, { "epoch": 0.5398080180688877, "grad_norm": 0.9113706946372986, "learning_rate": 4.904921207586024e-05, "loss": 1.3569, "step": 239 }, { "epoch": 0.5420666290231507, "grad_norm": 0.5041924118995667, "learning_rate": 4.866897392811126e-05, "loss": 1.1772, "step": 240 }, { "epoch": 0.5443252399774139, "grad_norm": 0.7096860408782959, "learning_rate": 4.828881279692119e-05, "loss": 1.6993, "step": 241 }, { "epoch": 0.546583850931677, "grad_norm": 0.4959520101547241, "learning_rate": 4.7908750679381384e-05, "loss": 1.6176, "step": 242 }, { "epoch": 0.5488424618859401, "grad_norm": 0.5700163841247559, "learning_rate": 4.752880956685407e-05, "loss": 1.0817, "step": 243 }, { "epoch": 0.5511010728402033, "grad_norm": 0.5470147132873535, "learning_rate": 4.7149011443699814e-05, "loss": 1.4854, "step": 244 }, { "epoch": 0.5533596837944664, "grad_norm": 0.7049497961997986, "learning_rate": 4.676937828600542e-05, "loss": 1.4135, "step": 245 }, { "epoch": 0.5556182947487295, "grad_norm": 0.7252426743507385, "learning_rate": 4.638993206031237e-05, "loss": 1.4154, "step": 246 }, { "epoch": 0.5578769057029926, "grad_norm": 0.575930118560791, "learning_rate": 4.601069472234584e-05, "loss": 1.0767, "step": 247 }, { "epoch": 0.5601355166572558, "grad_norm": 1.1477127075195312, "learning_rate": 4.56316882157442e-05, "loss": 1.332, "step": 248 }, { "epoch": 0.562394127611519, "grad_norm": 0.5191376209259033, "learning_rate": 4.525293447078927e-05, "loss": 1.7038, "step": 249 }, { "epoch": 0.564652738565782, "grad_norm": 0.554302990436554, "learning_rate": 4.4874455403137514e-05, "loss": 1.7356, "step": 250 }, { "epoch": 0.5669113495200452, "grad_norm": 1.1342941522598267, "learning_rate": 4.449627291255184e-05, "loss": 1.6412, "step": 251 }, { "epoch": 0.5691699604743083, "grad_norm": 0.5467344522476196, "learning_rate": 4.411840888163449e-05, "loss": 1.3031, "step": 252 }, { "epoch": 0.5714285714285714, "grad_norm": 0.4619009494781494, "learning_rate": 4.3740885174560736e-05, "loss": 1.4958, "step": 253 }, { "epoch": 0.5736871823828346, "grad_norm": 0.6281998157501221, "learning_rate": 4.336372363581391e-05, "loss": 1.4351, "step": 254 }, { "epoch": 0.5759457933370977, "grad_norm": 0.6288211345672607, "learning_rate": 4.298694608892134e-05, "loss": 1.7786, "step": 255 }, { "epoch": 0.5782044042913608, "grad_norm": 1.0841307640075684, "learning_rate": 4.2610574335191615e-05, "loss": 1.4286, "step": 256 }, { "epoch": 0.5804630152456239, "grad_norm": 0.7780132293701172, "learning_rate": 4.2234630152453116e-05, "loss": 1.385, "step": 257 }, { "epoch": 0.5827216261998871, "grad_norm": 0.49743711948394775, "learning_rate": 4.185913529379381e-05, "loss": 1.1707, "step": 258 }, { "epoch": 0.5849802371541502, "grad_norm": 0.7499518394470215, "learning_rate": 4.1484111486302704e-05, "loss": 1.2754, "step": 259 }, { "epoch": 0.5872388481084133, "grad_norm": 0.5472561717033386, "learning_rate": 4.110958042981255e-05, "loss": 1.4632, "step": 260 }, { "epoch": 0.5894974590626765, "grad_norm": 0.6489593386650085, "learning_rate": 4.0735563795644294e-05, "loss": 1.5454, "step": 261 }, { "epoch": 0.5917560700169395, "grad_norm": 0.5738951563835144, "learning_rate": 4.0362083225353046e-05, "loss": 1.4356, "step": 262 }, { "epoch": 0.5940146809712027, "grad_norm": 0.4810039699077606, "learning_rate": 3.998916032947594e-05, "loss": 1.3287, "step": 263 }, { "epoch": 0.5962732919254659, "grad_norm": 0.6687359809875488, "learning_rate": 3.961681668628164e-05, "loss": 1.2772, "step": 264 }, { "epoch": 0.598531902879729, "grad_norm": 0.5497633814811707, "learning_rate": 3.9245073840521765e-05, "loss": 1.4254, "step": 265 }, { "epoch": 0.6007905138339921, "grad_norm": 0.5534753203392029, "learning_rate": 3.887395330218429e-05, "loss": 1.3866, "step": 266 }, { "epoch": 0.6030491247882552, "grad_norm": 1.2714121341705322, "learning_rate": 3.850347654524883e-05, "loss": 1.4241, "step": 267 }, { "epoch": 0.6053077357425184, "grad_norm": 0.6475468277931213, "learning_rate": 3.8133665006444255e-05, "loss": 1.2423, "step": 268 }, { "epoch": 0.6075663466967814, "grad_norm": 0.6580026745796204, "learning_rate": 3.776454008400816e-05, "loss": 1.7248, "step": 269 }, { "epoch": 0.6098249576510446, "grad_norm": 0.47581014037132263, "learning_rate": 3.7396123136448824e-05, "loss": 1.5206, "step": 270 }, { "epoch": 0.6120835686053078, "grad_norm": 0.6214271783828735, "learning_rate": 3.70284354813092e-05, "loss": 1.3408, "step": 271 }, { "epoch": 0.6143421795595708, "grad_norm": 0.6450148224830627, "learning_rate": 3.666149839393361e-05, "loss": 1.6774, "step": 272 }, { "epoch": 0.616600790513834, "grad_norm": 0.4316481947898865, "learning_rate": 3.629533310623658e-05, "loss": 1.0687, "step": 273 }, { "epoch": 0.6188594014680971, "grad_norm": 0.6752032041549683, "learning_rate": 3.592996080547438e-05, "loss": 1.7115, "step": 274 }, { "epoch": 0.6211180124223602, "grad_norm": 0.6112974882125854, "learning_rate": 3.556540263301896e-05, "loss": 1.5235, "step": 275 }, { "epoch": 0.6233766233766234, "grad_norm": 0.5716705322265625, "learning_rate": 3.520167968313479e-05, "loss": 1.4077, "step": 276 }, { "epoch": 0.6256352343308865, "grad_norm": 0.443218469619751, "learning_rate": 3.483881300175823e-05, "loss": 1.5901, "step": 277 }, { "epoch": 0.6278938452851497, "grad_norm": 0.6423050761222839, "learning_rate": 3.447682358527974e-05, "loss": 1.2472, "step": 278 }, { "epoch": 0.6301524562394127, "grad_norm": 0.6637226939201355, "learning_rate": 3.411573237932904e-05, "loss": 1.416, "step": 279 }, { "epoch": 0.6324110671936759, "grad_norm": 0.6266383528709412, "learning_rate": 3.3755560277563023e-05, "loss": 1.3596, "step": 280 }, { "epoch": 0.6346696781479391, "grad_norm": 0.4592457711696625, "learning_rate": 3.339632812045696e-05, "loss": 1.4775, "step": 281 }, { "epoch": 0.6369282891022021, "grad_norm": 0.4435870945453644, "learning_rate": 3.303805669409848e-05, "loss": 1.2606, "step": 282 }, { "epoch": 0.6391869000564653, "grad_norm": 0.8418070673942566, "learning_rate": 3.268076672898492e-05, "loss": 1.292, "step": 283 }, { "epoch": 0.6414455110107284, "grad_norm": 0.46599528193473816, "learning_rate": 3.2324478898823705e-05, "loss": 1.4636, "step": 284 }, { "epoch": 0.6437041219649915, "grad_norm": 0.5560009479522705, "learning_rate": 3.196921381933624e-05, "loss": 1.3654, "step": 285 }, { "epoch": 0.6459627329192547, "grad_norm": 0.5951889753341675, "learning_rate": 3.1614992047064945e-05, "loss": 1.2783, "step": 286 }, { "epoch": 0.6482213438735178, "grad_norm": 0.5692819952964783, "learning_rate": 3.126183407818384e-05, "loss": 1.3873, "step": 287 }, { "epoch": 0.6504799548277809, "grad_norm": 0.7618367671966553, "learning_rate": 3.090976034731257e-05, "loss": 1.3041, "step": 288 }, { "epoch": 0.652738565782044, "grad_norm": 0.6365513205528259, "learning_rate": 3.055879122633397e-05, "loss": 1.3736, "step": 289 }, { "epoch": 0.6549971767363072, "grad_norm": 0.6634312272071838, "learning_rate": 3.020894702321539e-05, "loss": 1.5358, "step": 290 }, { "epoch": 0.6572557876905702, "grad_norm": 0.4850894510746002, "learning_rate": 2.9860247980833532e-05, "loss": 1.2746, "step": 291 }, { "epoch": 0.6595143986448334, "grad_norm": 0.6311604976654053, "learning_rate": 2.951271427580321e-05, "loss": 1.3607, "step": 292 }, { "epoch": 0.6617730095990966, "grad_norm": 0.6195625066757202, "learning_rate": 2.91663660173098e-05, "loss": 1.3412, "step": 293 }, { "epoch": 0.6640316205533597, "grad_norm": 0.6908369660377502, "learning_rate": 2.882122324594575e-05, "loss": 1.5332, "step": 294 }, { "epoch": 0.6662902315076228, "grad_norm": 0.9951730966567993, "learning_rate": 2.847730593255097e-05, "loss": 1.4562, "step": 295 }, { "epoch": 0.668548842461886, "grad_norm": 0.4664572775363922, "learning_rate": 2.8134633977057235e-05, "loss": 1.608, "step": 296 }, { "epoch": 0.6708074534161491, "grad_norm": 0.5350282788276672, "learning_rate": 2.779322720733673e-05, "loss": 1.6665, "step": 297 }, { "epoch": 0.6730660643704122, "grad_norm": 0.4838089346885681, "learning_rate": 2.745310537805479e-05, "loss": 1.3432, "step": 298 }, { "epoch": 0.6753246753246753, "grad_norm": 0.5749621987342834, "learning_rate": 2.7114288169526793e-05, "loss": 1.5841, "step": 299 }, { "epoch": 0.6775832862789385, "grad_norm": 0.44221848249435425, "learning_rate": 2.6776795186579468e-05, "loss": 1.2465, "step": 300 }, { "epoch": 0.6798418972332015, "grad_norm": 0.5502551794052124, "learning_rate": 2.6440645957416484e-05, "loss": 1.7499, "step": 301 }, { "epoch": 0.6821005081874647, "grad_norm": 0.51673823595047, "learning_rate": 2.610585993248843e-05, "loss": 1.4352, "step": 302 }, { "epoch": 0.6843591191417279, "grad_norm": 0.5909664034843445, "learning_rate": 2.5772456483367497e-05, "loss": 1.2685, "step": 303 }, { "epoch": 0.6866177300959909, "grad_norm": 0.5207455158233643, "learning_rate": 2.5440454901626486e-05, "loss": 1.4422, "step": 304 }, { "epoch": 0.6888763410502541, "grad_norm": 0.6264783143997192, "learning_rate": 2.510987439772261e-05, "loss": 1.1306, "step": 305 }, { "epoch": 0.6911349520045172, "grad_norm": 0.5614155530929565, "learning_rate": 2.4780734099885833e-05, "loss": 1.9378, "step": 306 }, { "epoch": 0.6933935629587803, "grad_norm": 0.5099676251411438, "learning_rate": 2.4453053053012187e-05, "loss": 1.4991, "step": 307 }, { "epoch": 0.6956521739130435, "grad_norm": 0.6968526244163513, "learning_rate": 2.4126850217561698e-05, "loss": 1.5211, "step": 308 }, { "epoch": 0.6979107848673066, "grad_norm": 0.4605090320110321, "learning_rate": 2.3802144468461367e-05, "loss": 1.5865, "step": 309 }, { "epoch": 0.7001693958215698, "grad_norm": 0.8055828809738159, "learning_rate": 2.347895459401288e-05, "loss": 1.4216, "step": 310 }, { "epoch": 0.7024280067758328, "grad_norm": 0.5503789186477661, "learning_rate": 2.3157299294805613e-05, "loss": 1.378, "step": 311 }, { "epoch": 0.704686617730096, "grad_norm": 0.6064462661743164, "learning_rate": 2.2837197182634483e-05, "loss": 1.6254, "step": 312 }, { "epoch": 0.7069452286843592, "grad_norm": 0.7875195741653442, "learning_rate": 2.2518666779423074e-05, "loss": 1.459, "step": 313 }, { "epoch": 0.7092038396386222, "grad_norm": 0.5678854584693909, "learning_rate": 2.2201726516151882e-05, "loss": 1.4943, "step": 314 }, { "epoch": 0.7114624505928854, "grad_norm": 0.6135299801826477, "learning_rate": 2.1886394731791816e-05, "loss": 1.5494, "step": 315 }, { "epoch": 0.7137210615471485, "grad_norm": 0.49695029854774475, "learning_rate": 2.157268967224314e-05, "loss": 1.4126, "step": 316 }, { "epoch": 0.7159796725014116, "grad_norm": 0.5455291271209717, "learning_rate": 2.126062948927966e-05, "loss": 1.5505, "step": 317 }, { "epoch": 0.7182382834556748, "grad_norm": 0.6816684603691101, "learning_rate": 2.0950232239498446e-05, "loss": 1.1783, "step": 318 }, { "epoch": 0.7204968944099379, "grad_norm": 0.548568069934845, "learning_rate": 2.064151588327501e-05, "loss": 1.5532, "step": 319 }, { "epoch": 0.722755505364201, "grad_norm": 0.44020459055900574, "learning_rate": 2.0334498283724078e-05, "loss": 1.7366, "step": 320 }, { "epoch": 0.7250141163184641, "grad_norm": 0.5120103359222412, "learning_rate": 2.002919720566599e-05, "loss": 1.2849, "step": 321 }, { "epoch": 0.7272727272727273, "grad_norm": 1.1308592557907104, "learning_rate": 1.9725630314598782e-05, "loss": 1.3336, "step": 322 }, { "epoch": 0.7295313382269905, "grad_norm": 0.7465949058532715, "learning_rate": 1.9423815175676025e-05, "loss": 1.4263, "step": 323 }, { "epoch": 0.7317899491812535, "grad_norm": 0.6979694962501526, "learning_rate": 1.912376925269041e-05, "loss": 1.2292, "step": 324 }, { "epoch": 0.7340485601355167, "grad_norm": 0.6051463484764099, "learning_rate": 1.8825509907063327e-05, "loss": 1.6008, "step": 325 }, { "epoch": 0.7363071710897798, "grad_norm": 0.5719790458679199, "learning_rate": 1.8529054396840234e-05, "loss": 1.5861, "step": 326 }, { "epoch": 0.7385657820440429, "grad_norm": 0.4965897798538208, "learning_rate": 1.8234419875692105e-05, "loss": 1.9165, "step": 327 }, { "epoch": 0.740824392998306, "grad_norm": 0.6010226011276245, "learning_rate": 1.7941623391922772e-05, "loss": 1.2716, "step": 328 }, { "epoch": 0.7430830039525692, "grad_norm": 0.6560564637184143, "learning_rate": 1.7650681887482628e-05, "loss": 1.5902, "step": 329 }, { "epoch": 0.7453416149068323, "grad_norm": 0.7512021660804749, "learning_rate": 1.7361612196988174e-05, "loss": 1.4901, "step": 330 }, { "epoch": 0.7476002258610954, "grad_norm": 0.5139881372451782, "learning_rate": 1.7074431046748075e-05, "loss": 1.366, "step": 331 }, { "epoch": 0.7498588368153586, "grad_norm": 0.5443575978279114, "learning_rate": 1.678915505379513e-05, "loss": 1.2283, "step": 332 }, { "epoch": 0.7521174477696216, "grad_norm": 0.5868397951126099, "learning_rate": 1.650580072492496e-05, "loss": 1.396, "step": 333 }, { "epoch": 0.7543760587238848, "grad_norm": 0.5796262621879578, "learning_rate": 1.6224384455740788e-05, "loss": 1.522, "step": 334 }, { "epoch": 0.756634669678148, "grad_norm": 0.5410485863685608, "learning_rate": 1.5944922529704777e-05, "loss": 1.3733, "step": 335 }, { "epoch": 0.758893280632411, "grad_norm": 0.5935060977935791, "learning_rate": 1.5667431117195814e-05, "loss": 1.4833, "step": 336 }, { "epoch": 0.7611518915866742, "grad_norm": 0.6086990833282471, "learning_rate": 1.539192627457382e-05, "loss": 1.2748, "step": 337 }, { "epoch": 0.7634105025409373, "grad_norm": 0.9184845089912415, "learning_rate": 1.5118423943250771e-05, "loss": 1.5052, "step": 338 }, { "epoch": 0.7656691134952005, "grad_norm": 0.5554947853088379, "learning_rate": 1.4846939948768218e-05, "loss": 1.8114, "step": 339 }, { "epoch": 0.7679277244494636, "grad_norm": 0.5822046399116516, "learning_rate": 1.45774899998816e-05, "loss": 1.1491, "step": 340 }, { "epoch": 0.7701863354037267, "grad_norm": 0.5082180500030518, "learning_rate": 1.4310089687651301e-05, "loss": 1.2931, "step": 341 }, { "epoch": 0.7724449463579899, "grad_norm": 0.7087529897689819, "learning_rate": 1.40447544845405e-05, "loss": 1.4946, "step": 342 }, { "epoch": 0.7747035573122529, "grad_norm": 0.566061794757843, "learning_rate": 1.378149974351991e-05, "loss": 1.31, "step": 343 }, { "epoch": 0.7769621682665161, "grad_norm": 0.48755723237991333, "learning_rate": 1.3520340697179406e-05, "loss": 1.5299, "step": 344 }, { "epoch": 0.7792207792207793, "grad_norm": 0.6160857677459717, "learning_rate": 1.3261292456846647e-05, "loss": 1.468, "step": 345 }, { "epoch": 0.7814793901750423, "grad_norm": 0.9709001779556274, "learning_rate": 1.3004370011712624e-05, "loss": 1.3858, "step": 346 }, { "epoch": 0.7837380011293055, "grad_norm": 0.5348103046417236, "learning_rate": 1.2749588227964465e-05, "loss": 1.2159, "step": 347 }, { "epoch": 0.7859966120835686, "grad_norm": 0.6455698609352112, "learning_rate": 1.2496961847925153e-05, "loss": 1.4438, "step": 348 }, { "epoch": 0.7882552230378317, "grad_norm": 0.7605583071708679, "learning_rate": 1.2246505489200532e-05, "loss": 1.2351, "step": 349 }, { "epoch": 0.7905138339920948, "grad_norm": 0.6536300182342529, "learning_rate": 1.1998233643833457e-05, "loss": 1.3756, "step": 350 }, { "epoch": 0.792772444946358, "grad_norm": 0.5800183415412903, "learning_rate": 1.1752160677465286e-05, "loss": 1.3849, "step": 351 }, { "epoch": 0.7950310559006211, "grad_norm": 0.5287428498268127, "learning_rate": 1.150830082850468e-05, "loss": 1.4647, "step": 352 }, { "epoch": 0.7972896668548842, "grad_norm": 0.4410305619239807, "learning_rate": 1.126666820730366e-05, "loss": 1.3785, "step": 353 }, { "epoch": 0.7995482778091474, "grad_norm": 0.4867897927761078, "learning_rate": 1.1027276795341135e-05, "loss": 1.4087, "step": 354 }, { "epoch": 0.8018068887634106, "grad_norm": 0.7169702053070068, "learning_rate": 1.0790140444414e-05, "loss": 1.1061, "step": 355 }, { "epoch": 0.8040654997176736, "grad_norm": 1.4617806673049927, "learning_rate": 1.0555272875835537e-05, "loss": 1.3367, "step": 356 }, { "epoch": 0.8063241106719368, "grad_norm": 0.37601158022880554, "learning_rate": 1.0322687679641523e-05, "loss": 1.3613, "step": 357 }, { "epoch": 0.8085827216261999, "grad_norm": 0.6595029830932617, "learning_rate": 1.0092398313803863e-05, "loss": 1.6219, "step": 358 }, { "epoch": 0.810841332580463, "grad_norm": 0.5519759058952332, "learning_rate": 9.864418103451828e-06, "loss": 1.5668, "step": 359 }, { "epoch": 0.8130999435347261, "grad_norm": 0.5467275977134705, "learning_rate": 9.638760240101102e-06, "loss": 1.2981, "step": 360 }, { "epoch": 0.8153585544889893, "grad_norm": 0.5513888597488403, "learning_rate": 9.415437780890451e-06, "loss": 1.319, "step": 361 }, { "epoch": 0.8176171654432524, "grad_norm": 0.6388846039772034, "learning_rate": 9.194463647826223e-06, "loss": 1.6887, "step": 362 }, { "epoch": 0.8198757763975155, "grad_norm": 0.6353502869606018, "learning_rate": 8.975850627034604e-06, "loss": 1.3972, "step": 363 }, { "epoch": 0.8221343873517787, "grad_norm": 0.6129746437072754, "learning_rate": 8.759611368021831e-06, "loss": 1.3317, "step": 364 }, { "epoch": 0.8243929983060417, "grad_norm": 0.6045680046081543, "learning_rate": 8.545758382942232e-06, "loss": 1.2371, "step": 365 }, { "epoch": 0.8266516092603049, "grad_norm": 0.5147614479064941, "learning_rate": 8.334304045874247e-06, "loss": 1.498, "step": 366 }, { "epoch": 0.8289102202145681, "grad_norm": 0.41877540946006775, "learning_rate": 8.125260592104445e-06, "loss": 1.4604, "step": 367 }, { "epoch": 0.8311688311688312, "grad_norm": 0.5674257278442383, "learning_rate": 7.918640117419507e-06, "loss": 1.2845, "step": 368 }, { "epoch": 0.8334274421230943, "grad_norm": 0.7951464056968689, "learning_rate": 7.71445457740641e-06, "loss": 1.3372, "step": 369 }, { "epoch": 0.8356860530773574, "grad_norm": 0.6295720934867859, "learning_rate": 7.512715786760605e-06, "loss": 1.6711, "step": 370 }, { "epoch": 0.8379446640316206, "grad_norm": 0.5587788224220276, "learning_rate": 7.313435418602388e-06, "loss": 1.4419, "step": 371 }, { "epoch": 0.8402032749858837, "grad_norm": 0.8322708010673523, "learning_rate": 7.116625003801436e-06, "loss": 1.3559, "step": 372 }, { "epoch": 0.8424618859401468, "grad_norm": 0.6834149956703186, "learning_rate": 6.922295930309691e-06, "loss": 1.5552, "step": 373 }, { "epoch": 0.84472049689441, "grad_norm": 0.7518057823181152, "learning_rate": 6.730459442502329e-06, "loss": 1.5805, "step": 374 }, { "epoch": 0.846979107848673, "grad_norm": 0.5238639712333679, "learning_rate": 6.541126640527195e-06, "loss": 1.6161, "step": 375 }, { "epoch": 0.8492377188029362, "grad_norm": 0.5006189346313477, "learning_rate": 6.354308479662446e-06, "loss": 1.5392, "step": 376 }, { "epoch": 0.8514963297571994, "grad_norm": 0.5511265397071838, "learning_rate": 6.170015769682741e-06, "loss": 1.6469, "step": 377 }, { "epoch": 0.8537549407114624, "grad_norm": 0.5680528283119202, "learning_rate": 5.988259174233713e-06, "loss": 1.3312, "step": 378 }, { "epoch": 0.8560135516657256, "grad_norm": 0.773024320602417, "learning_rate": 5.80904921021494e-06, "loss": 1.4814, "step": 379 }, { "epoch": 0.8582721626199887, "grad_norm": 0.47784173488616943, "learning_rate": 5.6323962471714286e-06, "loss": 1.3431, "step": 380 }, { "epoch": 0.8605307735742518, "grad_norm": 0.5493645668029785, "learning_rate": 5.458310506693571e-06, "loss": 1.5127, "step": 381 }, { "epoch": 0.862789384528515, "grad_norm": 0.5539724826812744, "learning_rate": 5.286802061825752e-06, "loss": 1.4401, "step": 382 }, { "epoch": 0.8650479954827781, "grad_norm": 0.4766329824924469, "learning_rate": 5.117880836483452e-06, "loss": 1.5344, "step": 383 }, { "epoch": 0.8673066064370413, "grad_norm": 0.46693822741508484, "learning_rate": 4.951556604879048e-06, "loss": 1.3009, "step": 384 }, { "epoch": 0.8695652173913043, "grad_norm": 0.7873904705047607, "learning_rate": 4.7878389909562285e-06, "loss": 1.4413, "step": 385 }, { "epoch": 0.8718238283455675, "grad_norm": 0.5490197539329529, "learning_rate": 4.62673746783317e-06, "loss": 1.2131, "step": 386 }, { "epoch": 0.8740824392998306, "grad_norm": 0.4545706808567047, "learning_rate": 4.468261357254339e-06, "loss": 1.1735, "step": 387 }, { "epoch": 0.8763410502540937, "grad_norm": 0.5005091428756714, "learning_rate": 4.312419829051173e-06, "loss": 1.5943, "step": 388 }, { "epoch": 0.8785996612083569, "grad_norm": 0.546002984046936, "learning_rate": 4.15922190061146e-06, "loss": 1.3671, "step": 389 }, { "epoch": 0.88085827216262, "grad_norm": 0.6055895686149597, "learning_rate": 4.008676436357539e-06, "loss": 1.4737, "step": 390 }, { "epoch": 0.8831168831168831, "grad_norm": 0.7526850700378418, "learning_rate": 3.86079214723345e-06, "loss": 1.6545, "step": 391 }, { "epoch": 0.8853754940711462, "grad_norm": 0.6517372727394104, "learning_rate": 3.7155775902008526e-06, "loss": 1.2377, "step": 392 }, { "epoch": 0.8876341050254094, "grad_norm": 0.590649425983429, "learning_rate": 3.5730411677439125e-06, "loss": 1.5282, "step": 393 }, { "epoch": 0.8898927159796725, "grad_norm": 0.7884031534194946, "learning_rate": 3.4331911273830784e-06, "loss": 1.2815, "step": 394 }, { "epoch": 0.8921513269339356, "grad_norm": 0.4688451886177063, "learning_rate": 3.2960355611979245e-06, "loss": 1.2642, "step": 395 }, { "epoch": 0.8944099378881988, "grad_norm": 0.5359352231025696, "learning_rate": 3.161582405358876e-06, "loss": 1.4675, "step": 396 }, { "epoch": 0.8966685488424618, "grad_norm": 0.5633996725082397, "learning_rate": 3.029839439668003e-06, "loss": 1.5731, "step": 397 }, { "epoch": 0.898927159796725, "grad_norm": 0.6047767400741577, "learning_rate": 2.9008142871088663e-06, "loss": 1.4189, "step": 398 }, { "epoch": 0.9011857707509882, "grad_norm": 0.47979751229286194, "learning_rate": 2.7745144134054433e-06, "loss": 1.3879, "step": 399 }, { "epoch": 0.9034443817052513, "grad_norm": 0.4920281171798706, "learning_rate": 2.6509471265901477e-06, "loss": 1.423, "step": 400 }, { "epoch": 0.9057029926595144, "grad_norm": 0.5504844784736633, "learning_rate": 2.530119576580936e-06, "loss": 1.3404, "step": 401 }, { "epoch": 0.9079616036137775, "grad_norm": 0.4826345145702362, "learning_rate": 2.412038754767626e-06, "loss": 1.2597, "step": 402 }, { "epoch": 0.9102202145680407, "grad_norm": 0.7517184615135193, "learning_rate": 2.296711493607334e-06, "loss": 1.3037, "step": 403 }, { "epoch": 0.9124788255223037, "grad_norm": 0.6532472372055054, "learning_rate": 2.1841444662291543e-06, "loss": 1.4865, "step": 404 }, { "epoch": 0.9147374364765669, "grad_norm": 0.5889557600021362, "learning_rate": 2.074344186048022e-06, "loss": 1.2332, "step": 405 }, { "epoch": 0.9169960474308301, "grad_norm": 0.4623706638813019, "learning_rate": 1.967317006387831e-06, "loss": 1.5374, "step": 406 }, { "epoch": 0.9192546583850931, "grad_norm": 0.6362307071685791, "learning_rate": 1.863069120113814e-06, "loss": 1.0415, "step": 407 }, { "epoch": 0.9215132693393563, "grad_norm": 0.5328913927078247, "learning_rate": 1.7616065592742038e-06, "loss": 1.3665, "step": 408 }, { "epoch": 0.9237718802936195, "grad_norm": 0.61063551902771, "learning_rate": 1.6629351947512195e-06, "loss": 1.4575, "step": 409 }, { "epoch": 0.9260304912478825, "grad_norm": 0.5676048994064331, "learning_rate": 1.567060735921344e-06, "loss": 1.5069, "step": 410 }, { "epoch": 0.9282891022021457, "grad_norm": 0.6710496544837952, "learning_rate": 1.4739887303249877e-06, "loss": 1.4816, "step": 411 }, { "epoch": 0.9305477131564088, "grad_norm": 0.5965588092803955, "learning_rate": 1.383724563345451e-06, "loss": 1.6784, "step": 412 }, { "epoch": 0.932806324110672, "grad_norm": 0.6151901483535767, "learning_rate": 1.2962734578973568e-06, "loss": 1.4018, "step": 413 }, { "epoch": 0.935064935064935, "grad_norm": 0.6532612442970276, "learning_rate": 1.2116404741244203e-06, "loss": 1.415, "step": 414 }, { "epoch": 0.9373235460191982, "grad_norm": 0.5151341557502747, "learning_rate": 1.1298305091066664e-06, "loss": 1.3436, "step": 415 }, { "epoch": 0.9395821569734614, "grad_norm": 0.6843656301498413, "learning_rate": 1.0508482965770505e-06, "loss": 1.5971, "step": 416 }, { "epoch": 0.9418407679277244, "grad_norm": 0.5822469592094421, "learning_rate": 9.746984066475729e-07, "loss": 1.3158, "step": 417 }, { "epoch": 0.9440993788819876, "grad_norm": 0.8204763531684875, "learning_rate": 9.013852455448335e-07, "loss": 1.2683, "step": 418 }, { "epoch": 0.9463579898362507, "grad_norm": 0.9668586254119873, "learning_rate": 8.309130553550815e-07, "loss": 1.4966, "step": 419 }, { "epoch": 0.9486166007905138, "grad_norm": 0.3911222815513611, "learning_rate": 7.63285913778733e-07, "loss": 1.3486, "step": 420 }, { "epoch": 0.950875211744777, "grad_norm": 0.5275930166244507, "learning_rate": 6.985077338944657e-07, "loss": 1.044, "step": 421 }, { "epoch": 0.9531338226990401, "grad_norm": 0.5202364921569824, "learning_rate": 6.365822639327723e-07, "loss": 1.262, "step": 422 }, { "epoch": 0.9553924336533032, "grad_norm": 0.5621113181114197, "learning_rate": 5.775130870590783e-07, "loss": 1.3838, "step": 423 }, { "epoch": 0.9576510446075663, "grad_norm": 0.5562776923179626, "learning_rate": 5.213036211664191e-07, "loss": 1.4821, "step": 424 }, { "epoch": 0.9599096555618295, "grad_norm": 0.5257914066314697, "learning_rate": 4.6795711867766436e-07, "loss": 1.6874, "step": 425 }, { "epoch": 0.9621682665160926, "grad_norm": 0.7182251811027527, "learning_rate": 4.1747666635733597e-07, "loss": 1.3114, "step": 426 }, { "epoch": 0.9644268774703557, "grad_norm": 0.7395084500312805, "learning_rate": 3.698651851329837e-07, "loss": 1.4869, "step": 427 }, { "epoch": 0.9666854884246189, "grad_norm": 0.5292602777481079, "learning_rate": 3.251254299261874e-07, "loss": 1.6935, "step": 428 }, { "epoch": 0.968944099378882, "grad_norm": 0.7703830599784851, "learning_rate": 2.8325998949314536e-07, "loss": 1.581, "step": 429 }, { "epoch": 0.9712027103331451, "grad_norm": 0.4652191400527954, "learning_rate": 2.442712862748775e-07, "loss": 1.7907, "step": 430 }, { "epoch": 0.9734613212874083, "grad_norm": 0.7090250849723816, "learning_rate": 2.0816157625706545e-07, "loss": 1.3354, "step": 431 }, { "epoch": 0.9757199322416714, "grad_norm": 0.6182876825332642, "learning_rate": 1.749329488395124e-07, "loss": 1.6426, "step": 432 }, { "epoch": 0.9779785431959345, "grad_norm": 0.42393407225608826, "learning_rate": 1.4458732671523977e-07, "loss": 1.2865, "step": 433 }, { "epoch": 0.9802371541501976, "grad_norm": 0.7182570695877075, "learning_rate": 1.1712646575922637e-07, "loss": 1.3558, "step": 434 }, { "epoch": 0.9824957651044608, "grad_norm": 0.5384182929992676, "learning_rate": 9.255195492685609e-08, "loss": 1.4792, "step": 435 }, { "epoch": 0.9847543760587238, "grad_norm": 0.6862145662307739, "learning_rate": 7.086521616190279e-08, "loss": 1.4509, "step": 436 }, { "epoch": 0.987012987012987, "grad_norm": 0.9429339170455933, "learning_rate": 5.2067504314323723e-08, "loss": 1.2235, "step": 437 }, { "epoch": 0.9892715979672502, "grad_norm": 0.48626038432121277, "learning_rate": 3.6159907067601085e-08, "loss": 1.3711, "step": 438 }, { "epoch": 0.9915302089215132, "grad_norm": 0.6334356665611267, "learning_rate": 2.3143344875831142e-08, "loss": 1.3192, "step": 439 }, { "epoch": 0.9937888198757764, "grad_norm": 0.49483799934387207, "learning_rate": 1.3018570910466877e-08, "loss": 1.1924, "step": 440 }, { "epoch": 0.9960474308300395, "grad_norm": 0.5049090385437012, "learning_rate": 5.786171016708419e-09, "loss": 1.5949, "step": 441 }, { "epoch": 0.9983060417843026, "grad_norm": 0.6922277808189392, "learning_rate": 1.446563679641244e-09, "loss": 1.4329, "step": 442 }, { "epoch": 1.0005646527385659, "grad_norm": 0.656385064125061, "learning_rate": 0.0, "loss": 1.5656, "step": 443 } ], "logging_steps": 1, "max_steps": 443, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.630732306700042e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }