{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990726429675425, "eval_steps": 500, "global_step": 202, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004945904173106646, "grad_norm": 54.500160217285156, "learning_rate": 0.0, "loss": 9.5291, "step": 1 }, { "epoch": 0.009891808346213293, "grad_norm": 53.72835922241211, "learning_rate": 4e-05, "loss": 9.4814, "step": 2 }, { "epoch": 0.014837712519319939, "grad_norm": 17.406553268432617, "learning_rate": 8e-05, "loss": 9.1288, "step": 3 }, { "epoch": 0.019783616692426585, "grad_norm": 3.4949991703033447, "learning_rate": 0.00012, "loss": 8.8408, "step": 4 }, { "epoch": 0.02472952086553323, "grad_norm": 2.9090073108673096, "learning_rate": 0.00016, "loss": 8.7705, "step": 5 }, { "epoch": 0.029675425038639878, "grad_norm": 3.398167371749878, "learning_rate": 0.0002, "loss": 8.6466, "step": 6 }, { "epoch": 0.03462132921174652, "grad_norm": 1.6190311908721924, "learning_rate": 0.00019898477157360406, "loss": 8.5125, "step": 7 }, { "epoch": 0.03956723338485317, "grad_norm": 1.8773953914642334, "learning_rate": 0.00019796954314720813, "loss": 8.5322, "step": 8 }, { "epoch": 0.04451313755795981, "grad_norm": 1.283807396888733, "learning_rate": 0.00019695431472081218, "loss": 8.4917, "step": 9 }, { "epoch": 0.04945904173106646, "grad_norm": 1.9215106964111328, "learning_rate": 0.00019593908629441626, "loss": 8.3638, "step": 10 }, { "epoch": 0.05440494590417311, "grad_norm": 1.5560728311538696, "learning_rate": 0.00019492385786802033, "loss": 8.3021, "step": 11 }, { "epoch": 0.059350850077279756, "grad_norm": 1.4610416889190674, "learning_rate": 0.00019390862944162438, "loss": 8.3058, "step": 12 }, { "epoch": 0.0642967542503864, "grad_norm": 1.4304499626159668, "learning_rate": 0.00019289340101522843, "loss": 8.2576, "step": 13 }, { "epoch": 0.06924265842349304, "grad_norm": 1.2287720441818237, "learning_rate": 0.0001918781725888325, "loss": 8.0443, "step": 14 }, { "epoch": 0.07418856259659969, "grad_norm": 1.3729023933410645, "learning_rate": 0.00019086294416243655, "loss": 8.1255, "step": 15 }, { "epoch": 0.07913446676970634, "grad_norm": 1.2619420289993286, "learning_rate": 0.0001898477157360406, "loss": 8.032, "step": 16 }, { "epoch": 0.08408037094281298, "grad_norm": 1.4744280576705933, "learning_rate": 0.0001888324873096447, "loss": 7.8637, "step": 17 }, { "epoch": 0.08902627511591962, "grad_norm": 1.6214470863342285, "learning_rate": 0.00018781725888324875, "loss": 7.9172, "step": 18 }, { "epoch": 0.09397217928902628, "grad_norm": 1.283504605293274, "learning_rate": 0.0001868020304568528, "loss": 7.8251, "step": 19 }, { "epoch": 0.09891808346213292, "grad_norm": 1.0794684886932373, "learning_rate": 0.00018578680203045687, "loss": 7.7431, "step": 20 }, { "epoch": 0.10386398763523957, "grad_norm": 1.1826306581497192, "learning_rate": 0.00018477157360406092, "loss": 7.6118, "step": 21 }, { "epoch": 0.10880989180834622, "grad_norm": 1.5493848323822021, "learning_rate": 0.00018375634517766497, "loss": 7.5928, "step": 22 }, { "epoch": 0.11375579598145286, "grad_norm": 2.191657304763794, "learning_rate": 0.00018274111675126904, "loss": 7.596, "step": 23 }, { "epoch": 0.11870170015455951, "grad_norm": 1.2168949842453003, "learning_rate": 0.0001817258883248731, "loss": 7.5224, "step": 24 }, { "epoch": 0.12364760432766615, "grad_norm": 1.1562331914901733, "learning_rate": 0.00018071065989847717, "loss": 7.4952, "step": 25 }, { "epoch": 0.1285935085007728, "grad_norm": 1.9624497890472412, "learning_rate": 0.00017969543147208124, "loss": 7.459, "step": 26 }, { "epoch": 0.13353941267387945, "grad_norm": 2.2458877563476562, "learning_rate": 0.0001786802030456853, "loss": 7.3465, "step": 27 }, { "epoch": 0.1384853168469861, "grad_norm": 1.3750243186950684, "learning_rate": 0.00017766497461928934, "loss": 7.3891, "step": 28 }, { "epoch": 0.14343122102009273, "grad_norm": 1.2398021221160889, "learning_rate": 0.0001766497461928934, "loss": 7.3127, "step": 29 }, { "epoch": 0.14837712519319937, "grad_norm": 2.071115732192993, "learning_rate": 0.00017563451776649746, "loss": 7.2548, "step": 30 }, { "epoch": 0.15332302936630604, "grad_norm": 2.288498640060425, "learning_rate": 0.0001746192893401015, "loss": 7.1908, "step": 31 }, { "epoch": 0.15826893353941268, "grad_norm": 1.2050567865371704, "learning_rate": 0.0001736040609137056, "loss": 7.1467, "step": 32 }, { "epoch": 0.16321483771251932, "grad_norm": 1.4064340591430664, "learning_rate": 0.00017258883248730966, "loss": 7.035, "step": 33 }, { "epoch": 0.16816074188562596, "grad_norm": 1.2630614042282104, "learning_rate": 0.0001715736040609137, "loss": 7.0536, "step": 34 }, { "epoch": 0.1731066460587326, "grad_norm": 1.8433802127838135, "learning_rate": 0.00017055837563451778, "loss": 7.0115, "step": 35 }, { "epoch": 0.17805255023183925, "grad_norm": 1.744345784187317, "learning_rate": 0.00016954314720812183, "loss": 7.038, "step": 36 }, { "epoch": 0.18299845440494591, "grad_norm": 1.679824709892273, "learning_rate": 0.00016852791878172588, "loss": 6.8946, "step": 37 }, { "epoch": 0.18794435857805256, "grad_norm": 1.4559205770492554, "learning_rate": 0.00016751269035532995, "loss": 6.9053, "step": 38 }, { "epoch": 0.1928902627511592, "grad_norm": 1.7544541358947754, "learning_rate": 0.00016649746192893403, "loss": 6.9277, "step": 39 }, { "epoch": 0.19783616692426584, "grad_norm": 1.594734787940979, "learning_rate": 0.00016548223350253808, "loss": 6.912, "step": 40 }, { "epoch": 0.20278207109737248, "grad_norm": 1.3439960479736328, "learning_rate": 0.00016446700507614215, "loss": 6.8592, "step": 41 }, { "epoch": 0.20772797527047915, "grad_norm": 1.4330651760101318, "learning_rate": 0.0001634517766497462, "loss": 6.8965, "step": 42 }, { "epoch": 0.2126738794435858, "grad_norm": 2.439265489578247, "learning_rate": 0.00016243654822335025, "loss": 6.8126, "step": 43 }, { "epoch": 0.21761978361669243, "grad_norm": 1.2343510389328003, "learning_rate": 0.00016142131979695432, "loss": 6.8057, "step": 44 }, { "epoch": 0.22256568778979907, "grad_norm": 1.15224027633667, "learning_rate": 0.00016040609137055837, "loss": 6.6727, "step": 45 }, { "epoch": 0.2275115919629057, "grad_norm": 1.6769089698791504, "learning_rate": 0.00015939086294416242, "loss": 6.7457, "step": 46 }, { "epoch": 0.23245749613601235, "grad_norm": 2.4642043113708496, "learning_rate": 0.00015837563451776652, "loss": 6.742, "step": 47 }, { "epoch": 0.23740340030911902, "grad_norm": 1.1713383197784424, "learning_rate": 0.00015736040609137057, "loss": 6.7022, "step": 48 }, { "epoch": 0.24234930448222566, "grad_norm": 1.5891178846359253, "learning_rate": 0.00015634517766497462, "loss": 6.6446, "step": 49 }, { "epoch": 0.2472952086553323, "grad_norm": 2.0845682621002197, "learning_rate": 0.0001553299492385787, "loss": 6.5948, "step": 50 }, { "epoch": 0.252241112828439, "grad_norm": 1.4469300508499146, "learning_rate": 0.00015431472081218274, "loss": 6.5604, "step": 51 }, { "epoch": 0.2571870170015456, "grad_norm": 1.0141685009002686, "learning_rate": 0.0001532994923857868, "loss": 6.5418, "step": 52 }, { "epoch": 0.26213292117465226, "grad_norm": 2.21588134765625, "learning_rate": 0.00015228426395939087, "loss": 6.4273, "step": 53 }, { "epoch": 0.2670788253477589, "grad_norm": 1.4307092428207397, "learning_rate": 0.00015126903553299494, "loss": 6.4938, "step": 54 }, { "epoch": 0.27202472952086554, "grad_norm": 1.4310742616653442, "learning_rate": 0.000150253807106599, "loss": 6.4357, "step": 55 }, { "epoch": 0.2769706336939722, "grad_norm": 1.1520801782608032, "learning_rate": 0.00014923857868020306, "loss": 6.5101, "step": 56 }, { "epoch": 0.2819165378670788, "grad_norm": 1.0513254404067993, "learning_rate": 0.0001482233502538071, "loss": 6.4536, "step": 57 }, { "epoch": 0.28686244204018546, "grad_norm": 1.5814175605773926, "learning_rate": 0.00014720812182741116, "loss": 6.4139, "step": 58 }, { "epoch": 0.2918083462132921, "grad_norm": 1.5383965969085693, "learning_rate": 0.00014619289340101523, "loss": 6.3318, "step": 59 }, { "epoch": 0.29675425038639874, "grad_norm": 1.0093541145324707, "learning_rate": 0.00014517766497461928, "loss": 6.4279, "step": 60 }, { "epoch": 0.3017001545595054, "grad_norm": 1.4959982633590698, "learning_rate": 0.00014416243654822336, "loss": 6.3061, "step": 61 }, { "epoch": 0.3066460587326121, "grad_norm": 1.649026870727539, "learning_rate": 0.00014314720812182743, "loss": 6.274, "step": 62 }, { "epoch": 0.3115919629057187, "grad_norm": 0.9700078964233398, "learning_rate": 0.00014213197969543148, "loss": 6.4123, "step": 63 }, { "epoch": 0.31653786707882536, "grad_norm": 1.0136897563934326, "learning_rate": 0.00014111675126903553, "loss": 6.3055, "step": 64 }, { "epoch": 0.321483771251932, "grad_norm": 1.6081498861312866, "learning_rate": 0.0001401015228426396, "loss": 6.3642, "step": 65 }, { "epoch": 0.32642967542503865, "grad_norm": 1.1522279977798462, "learning_rate": 0.00013908629441624365, "loss": 6.2726, "step": 66 }, { "epoch": 0.3313755795981453, "grad_norm": 0.8351190686225891, "learning_rate": 0.00013807106598984773, "loss": 6.2645, "step": 67 }, { "epoch": 0.33632148377125193, "grad_norm": 1.1132313013076782, "learning_rate": 0.00013705583756345178, "loss": 6.2681, "step": 68 }, { "epoch": 0.34126738794435857, "grad_norm": 1.2936571836471558, "learning_rate": 0.00013604060913705585, "loss": 6.2473, "step": 69 }, { "epoch": 0.3462132921174652, "grad_norm": 1.250172734260559, "learning_rate": 0.0001350253807106599, "loss": 6.2264, "step": 70 }, { "epoch": 0.35115919629057185, "grad_norm": 1.0878709554672241, "learning_rate": 0.00013401015228426397, "loss": 6.1898, "step": 71 }, { "epoch": 0.3561051004636785, "grad_norm": 0.9934064149856567, "learning_rate": 0.00013299492385786802, "loss": 6.2047, "step": 72 }, { "epoch": 0.3610510046367852, "grad_norm": 0.8686928749084473, "learning_rate": 0.00013197969543147207, "loss": 6.1214, "step": 73 }, { "epoch": 0.36599690880989183, "grad_norm": 0.858200192451477, "learning_rate": 0.00013096446700507615, "loss": 6.0784, "step": 74 }, { "epoch": 0.37094281298299847, "grad_norm": 0.8108780980110168, "learning_rate": 0.0001299492385786802, "loss": 6.1899, "step": 75 }, { "epoch": 0.3758887171561051, "grad_norm": 0.8366422653198242, "learning_rate": 0.00012893401015228427, "loss": 6.131, "step": 76 }, { "epoch": 0.38083462132921175, "grad_norm": 1.2487200498580933, "learning_rate": 0.00012791878172588834, "loss": 6.1158, "step": 77 }, { "epoch": 0.3857805255023184, "grad_norm": 1.0677459239959717, "learning_rate": 0.0001269035532994924, "loss": 6.0873, "step": 78 }, { "epoch": 0.39072642967542504, "grad_norm": 0.9405259490013123, "learning_rate": 0.00012588832487309644, "loss": 6.0409, "step": 79 }, { "epoch": 0.3956723338485317, "grad_norm": 1.488607406616211, "learning_rate": 0.00012487309644670052, "loss": 5.9868, "step": 80 }, { "epoch": 0.4006182380216383, "grad_norm": 0.9067093729972839, "learning_rate": 0.00012385786802030456, "loss": 6.0035, "step": 81 }, { "epoch": 0.40556414219474496, "grad_norm": 1.1395992040634155, "learning_rate": 0.00012284263959390864, "loss": 5.9638, "step": 82 }, { "epoch": 0.4105100463678516, "grad_norm": 1.4701273441314697, "learning_rate": 0.0001218274111675127, "loss": 6.0212, "step": 83 }, { "epoch": 0.4154559505409583, "grad_norm": 0.8167937397956848, "learning_rate": 0.00012081218274111676, "loss": 6.0759, "step": 84 }, { "epoch": 0.42040185471406494, "grad_norm": 1.398577332496643, "learning_rate": 0.00011979695431472082, "loss": 5.9284, "step": 85 }, { "epoch": 0.4253477588871716, "grad_norm": 1.0022815465927124, "learning_rate": 0.00011878172588832489, "loss": 5.9638, "step": 86 }, { "epoch": 0.4302936630602782, "grad_norm": 1.1316360235214233, "learning_rate": 0.00011776649746192893, "loss": 5.8901, "step": 87 }, { "epoch": 0.43523956723338486, "grad_norm": 1.1034351587295532, "learning_rate": 0.000116751269035533, "loss": 5.9288, "step": 88 }, { "epoch": 0.4401854714064915, "grad_norm": 0.9991883039474487, "learning_rate": 0.00011573604060913706, "loss": 5.9447, "step": 89 }, { "epoch": 0.44513137557959814, "grad_norm": 1.4334654808044434, "learning_rate": 0.00011472081218274113, "loss": 5.8657, "step": 90 }, { "epoch": 0.4500772797527048, "grad_norm": 1.0602012872695923, "learning_rate": 0.0001137055837563452, "loss": 5.8563, "step": 91 }, { "epoch": 0.4550231839258114, "grad_norm": 0.9210672378540039, "learning_rate": 0.00011269035532994925, "loss": 5.8811, "step": 92 }, { "epoch": 0.45996908809891807, "grad_norm": 0.9101308584213257, "learning_rate": 0.0001116751269035533, "loss": 5.9572, "step": 93 }, { "epoch": 0.4649149922720247, "grad_norm": 0.8447904586791992, "learning_rate": 0.00011065989847715736, "loss": 5.8762, "step": 94 }, { "epoch": 0.46986089644513135, "grad_norm": 0.7616278529167175, "learning_rate": 0.00010964467005076143, "loss": 5.9493, "step": 95 }, { "epoch": 0.47480680061823805, "grad_norm": 1.0465595722198486, "learning_rate": 0.00010862944162436547, "loss": 5.8367, "step": 96 }, { "epoch": 0.4797527047913447, "grad_norm": 1.4627708196640015, "learning_rate": 0.00010761421319796954, "loss": 5.8301, "step": 97 }, { "epoch": 0.4846986089644513, "grad_norm": 1.0495349168777466, "learning_rate": 0.00010659898477157362, "loss": 5.8782, "step": 98 }, { "epoch": 0.48964451313755797, "grad_norm": 0.9480841755867004, "learning_rate": 0.00010558375634517767, "loss": 5.7681, "step": 99 }, { "epoch": 0.4945904173106646, "grad_norm": 0.8606300354003906, "learning_rate": 0.00010456852791878173, "loss": 5.7448, "step": 100 }, { "epoch": 0.49953632148377125, "grad_norm": 0.9947773218154907, "learning_rate": 0.0001035532994923858, "loss": 5.8485, "step": 101 }, { "epoch": 0.504482225656878, "grad_norm": 1.0647828578948975, "learning_rate": 0.00010253807106598984, "loss": 5.7214, "step": 102 }, { "epoch": 0.5094281298299845, "grad_norm": 1.1592961549758911, "learning_rate": 0.0001015228426395939, "loss": 5.7393, "step": 103 }, { "epoch": 0.5143740340030912, "grad_norm": 0.8949771523475647, "learning_rate": 0.00010050761421319797, "loss": 5.7635, "step": 104 }, { "epoch": 0.5193199381761978, "grad_norm": 0.8713933229446411, "learning_rate": 9.949238578680203e-05, "loss": 5.7227, "step": 105 }, { "epoch": 0.5242658423493045, "grad_norm": 0.8814818859100342, "learning_rate": 9.847715736040609e-05, "loss": 5.7516, "step": 106 }, { "epoch": 0.5292117465224111, "grad_norm": 0.9553707838058472, "learning_rate": 9.746192893401017e-05, "loss": 5.7522, "step": 107 }, { "epoch": 0.5341576506955178, "grad_norm": 0.8567320704460144, "learning_rate": 9.644670050761421e-05, "loss": 5.6508, "step": 108 }, { "epoch": 0.5391035548686244, "grad_norm": 1.0081580877304077, "learning_rate": 9.543147208121828e-05, "loss": 5.642, "step": 109 }, { "epoch": 0.5440494590417311, "grad_norm": 1.1526085138320923, "learning_rate": 9.441624365482235e-05, "loss": 5.7423, "step": 110 }, { "epoch": 0.5489953632148377, "grad_norm": 1.2273470163345337, "learning_rate": 9.34010152284264e-05, "loss": 5.7094, "step": 111 }, { "epoch": 0.5539412673879444, "grad_norm": 0.830719530582428, "learning_rate": 9.238578680203046e-05, "loss": 5.7365, "step": 112 }, { "epoch": 0.558887171561051, "grad_norm": 1.1520576477050781, "learning_rate": 9.137055837563452e-05, "loss": 5.7391, "step": 113 }, { "epoch": 0.5638330757341576, "grad_norm": 1.1414787769317627, "learning_rate": 9.035532994923858e-05, "loss": 5.7288, "step": 114 }, { "epoch": 0.5687789799072643, "grad_norm": 0.9615758061408997, "learning_rate": 8.934010152284265e-05, "loss": 5.5568, "step": 115 }, { "epoch": 0.5737248840803709, "grad_norm": 0.8781617879867554, "learning_rate": 8.83248730964467e-05, "loss": 5.6264, "step": 116 }, { "epoch": 0.5786707882534776, "grad_norm": 1.1544886827468872, "learning_rate": 8.730964467005075e-05, "loss": 5.6724, "step": 117 }, { "epoch": 0.5836166924265842, "grad_norm": 0.931874692440033, "learning_rate": 8.629441624365483e-05, "loss": 5.6046, "step": 118 }, { "epoch": 0.5885625965996909, "grad_norm": 0.7856680750846863, "learning_rate": 8.527918781725889e-05, "loss": 5.6521, "step": 119 }, { "epoch": 0.5935085007727975, "grad_norm": 1.162001609802246, "learning_rate": 8.426395939086294e-05, "loss": 5.5843, "step": 120 }, { "epoch": 0.5984544049459042, "grad_norm": 0.8572034239768982, "learning_rate": 8.324873096446701e-05, "loss": 5.6526, "step": 121 }, { "epoch": 0.6034003091190108, "grad_norm": 0.9555945992469788, "learning_rate": 8.223350253807108e-05, "loss": 5.6673, "step": 122 }, { "epoch": 0.6083462132921175, "grad_norm": 0.880160927772522, "learning_rate": 8.121827411167512e-05, "loss": 5.498, "step": 123 }, { "epoch": 0.6132921174652242, "grad_norm": 1.1022496223449707, "learning_rate": 8.020304568527919e-05, "loss": 5.5833, "step": 124 }, { "epoch": 0.6182380216383307, "grad_norm": 0.9595851898193359, "learning_rate": 7.918781725888326e-05, "loss": 5.6384, "step": 125 }, { "epoch": 0.6231839258114374, "grad_norm": 1.4313597679138184, "learning_rate": 7.817258883248731e-05, "loss": 5.5478, "step": 126 }, { "epoch": 0.628129829984544, "grad_norm": 0.9351322054862976, "learning_rate": 7.715736040609137e-05, "loss": 5.5652, "step": 127 }, { "epoch": 0.6330757341576507, "grad_norm": 1.251789927482605, "learning_rate": 7.614213197969543e-05, "loss": 5.5387, "step": 128 }, { "epoch": 0.6380216383307573, "grad_norm": 0.98284912109375, "learning_rate": 7.51269035532995e-05, "loss": 5.5338, "step": 129 }, { "epoch": 0.642967542503864, "grad_norm": 1.0421977043151855, "learning_rate": 7.411167512690356e-05, "loss": 5.5774, "step": 130 }, { "epoch": 0.6479134466769706, "grad_norm": 1.0751053094863892, "learning_rate": 7.309644670050762e-05, "loss": 5.5642, "step": 131 }, { "epoch": 0.6528593508500773, "grad_norm": 1.089376449584961, "learning_rate": 7.208121827411168e-05, "loss": 5.505, "step": 132 }, { "epoch": 0.6578052550231839, "grad_norm": 1.0731728076934814, "learning_rate": 7.106598984771574e-05, "loss": 5.5514, "step": 133 }, { "epoch": 0.6627511591962906, "grad_norm": 1.2262444496154785, "learning_rate": 7.00507614213198e-05, "loss": 5.5723, "step": 134 }, { "epoch": 0.6676970633693973, "grad_norm": 1.0487595796585083, "learning_rate": 6.903553299492386e-05, "loss": 5.5587, "step": 135 }, { "epoch": 0.6726429675425039, "grad_norm": 1.084671139717102, "learning_rate": 6.802030456852793e-05, "loss": 5.4868, "step": 136 }, { "epoch": 0.6775888717156106, "grad_norm": 1.1871248483657837, "learning_rate": 6.700507614213199e-05, "loss": 5.5475, "step": 137 }, { "epoch": 0.6825347758887171, "grad_norm": 0.960493803024292, "learning_rate": 6.598984771573604e-05, "loss": 5.5006, "step": 138 }, { "epoch": 0.6874806800618238, "grad_norm": 1.053593397140503, "learning_rate": 6.49746192893401e-05, "loss": 5.5389, "step": 139 }, { "epoch": 0.6924265842349304, "grad_norm": 0.8886996507644653, "learning_rate": 6.395939086294417e-05, "loss": 5.4616, "step": 140 }, { "epoch": 0.6973724884080371, "grad_norm": 1.1852856874465942, "learning_rate": 6.294416243654822e-05, "loss": 5.498, "step": 141 }, { "epoch": 0.7023183925811437, "grad_norm": 0.8381466865539551, "learning_rate": 6.192893401015228e-05, "loss": 5.4977, "step": 142 }, { "epoch": 0.7072642967542504, "grad_norm": 1.01845121383667, "learning_rate": 6.091370558375635e-05, "loss": 5.4162, "step": 143 }, { "epoch": 0.712210200927357, "grad_norm": 0.9204426407814026, "learning_rate": 5.989847715736041e-05, "loss": 5.4654, "step": 144 }, { "epoch": 0.7171561051004637, "grad_norm": 1.0901105403900146, "learning_rate": 5.8883248730964467e-05, "loss": 5.4262, "step": 145 }, { "epoch": 0.7221020092735704, "grad_norm": 0.9842381477355957, "learning_rate": 5.786802030456853e-05, "loss": 5.4622, "step": 146 }, { "epoch": 0.727047913446677, "grad_norm": 1.1234885454177856, "learning_rate": 5.68527918781726e-05, "loss": 5.4668, "step": 147 }, { "epoch": 0.7319938176197837, "grad_norm": 1.0685431957244873, "learning_rate": 5.583756345177665e-05, "loss": 5.4649, "step": 148 }, { "epoch": 0.7369397217928902, "grad_norm": 1.086138367652893, "learning_rate": 5.482233502538071e-05, "loss": 5.336, "step": 149 }, { "epoch": 0.7418856259659969, "grad_norm": 1.0806076526641846, "learning_rate": 5.380710659898477e-05, "loss": 5.3463, "step": 150 }, { "epoch": 0.7468315301391035, "grad_norm": 1.1613116264343262, "learning_rate": 5.2791878172588836e-05, "loss": 5.4095, "step": 151 }, { "epoch": 0.7517774343122102, "grad_norm": 1.1117639541625977, "learning_rate": 5.17766497461929e-05, "loss": 5.4248, "step": 152 }, { "epoch": 0.7567233384853168, "grad_norm": 0.9730443954467773, "learning_rate": 5.076142131979695e-05, "loss": 5.5573, "step": 153 }, { "epoch": 0.7616692426584235, "grad_norm": 1.0216584205627441, "learning_rate": 4.9746192893401014e-05, "loss": 5.3337, "step": 154 }, { "epoch": 0.7666151468315301, "grad_norm": 0.9828229546546936, "learning_rate": 4.873096446700508e-05, "loss": 5.3757, "step": 155 }, { "epoch": 0.7715610510046368, "grad_norm": 1.0315641164779663, "learning_rate": 4.771573604060914e-05, "loss": 5.4465, "step": 156 }, { "epoch": 0.7765069551777435, "grad_norm": 1.1969993114471436, "learning_rate": 4.67005076142132e-05, "loss": 5.4018, "step": 157 }, { "epoch": 0.7814528593508501, "grad_norm": 0.7633097171783447, "learning_rate": 4.568527918781726e-05, "loss": 5.5137, "step": 158 }, { "epoch": 0.7863987635239568, "grad_norm": 0.8312305212020874, "learning_rate": 4.467005076142132e-05, "loss": 5.4078, "step": 159 }, { "epoch": 0.7913446676970634, "grad_norm": 0.9463878870010376, "learning_rate": 4.365482233502538e-05, "loss": 5.3738, "step": 160 }, { "epoch": 0.79629057187017, "grad_norm": 0.8046661615371704, "learning_rate": 4.2639593908629446e-05, "loss": 5.455, "step": 161 }, { "epoch": 0.8012364760432766, "grad_norm": 1.0929735898971558, "learning_rate": 4.162436548223351e-05, "loss": 5.4263, "step": 162 }, { "epoch": 0.8061823802163833, "grad_norm": 1.0323022603988647, "learning_rate": 4.060913705583756e-05, "loss": 5.4503, "step": 163 }, { "epoch": 0.8111282843894899, "grad_norm": 0.7212726473808289, "learning_rate": 3.959390862944163e-05, "loss": 5.3904, "step": 164 }, { "epoch": 0.8160741885625966, "grad_norm": 0.8705483078956604, "learning_rate": 3.8578680203045685e-05, "loss": 5.2958, "step": 165 }, { "epoch": 0.8210200927357032, "grad_norm": 0.9705776572227478, "learning_rate": 3.756345177664975e-05, "loss": 5.3806, "step": 166 }, { "epoch": 0.8259659969088099, "grad_norm": 0.7694171667098999, "learning_rate": 3.654822335025381e-05, "loss": 5.3446, "step": 167 }, { "epoch": 0.8309119010819166, "grad_norm": 1.0148179531097412, "learning_rate": 3.553299492385787e-05, "loss": 5.4316, "step": 168 }, { "epoch": 0.8358578052550232, "grad_norm": 1.0124086141586304, "learning_rate": 3.451776649746193e-05, "loss": 5.2903, "step": 169 }, { "epoch": 0.8408037094281299, "grad_norm": 0.8755667209625244, "learning_rate": 3.3502538071065994e-05, "loss": 5.2636, "step": 170 }, { "epoch": 0.8457496136012365, "grad_norm": 0.992751955986023, "learning_rate": 3.248730964467005e-05, "loss": 5.3662, "step": 171 }, { "epoch": 0.8506955177743432, "grad_norm": 0.676480770111084, "learning_rate": 3.147208121827411e-05, "loss": 5.3912, "step": 172 }, { "epoch": 0.8556414219474497, "grad_norm": 0.8479735851287842, "learning_rate": 3.0456852791878175e-05, "loss": 5.5655, "step": 173 }, { "epoch": 0.8605873261205564, "grad_norm": 0.8780114054679871, "learning_rate": 2.9441624365482233e-05, "loss": 5.4011, "step": 174 }, { "epoch": 0.865533230293663, "grad_norm": 0.7192287445068359, "learning_rate": 2.84263959390863e-05, "loss": 5.46, "step": 175 }, { "epoch": 0.8704791344667697, "grad_norm": 0.9556674957275391, "learning_rate": 2.7411167512690357e-05, "loss": 5.4278, "step": 176 }, { "epoch": 0.8754250386398763, "grad_norm": 0.7303546667098999, "learning_rate": 2.6395939086294418e-05, "loss": 5.3822, "step": 177 }, { "epoch": 0.880370942812983, "grad_norm": 0.7659119963645935, "learning_rate": 2.5380710659898476e-05, "loss": 5.3925, "step": 178 }, { "epoch": 0.8853168469860896, "grad_norm": 0.8511722087860107, "learning_rate": 2.436548223350254e-05, "loss": 5.3318, "step": 179 }, { "epoch": 0.8902627511591963, "grad_norm": 0.8240477442741394, "learning_rate": 2.33502538071066e-05, "loss": 5.2479, "step": 180 }, { "epoch": 0.895208655332303, "grad_norm": 0.8193429112434387, "learning_rate": 2.233502538071066e-05, "loss": 5.4237, "step": 181 }, { "epoch": 0.9001545595054096, "grad_norm": 0.8074966669082642, "learning_rate": 2.1319796954314723e-05, "loss": 5.5029, "step": 182 }, { "epoch": 0.9051004636785163, "grad_norm": 0.6603164076805115, "learning_rate": 2.030456852791878e-05, "loss": 5.3007, "step": 183 }, { "epoch": 0.9100463678516229, "grad_norm": 0.633477509021759, "learning_rate": 1.9289340101522843e-05, "loss": 5.396, "step": 184 }, { "epoch": 0.9149922720247295, "grad_norm": 0.6681249141693115, "learning_rate": 1.8274111675126904e-05, "loss": 5.3733, "step": 185 }, { "epoch": 0.9199381761978361, "grad_norm": 0.756808340549469, "learning_rate": 1.7258883248730966e-05, "loss": 5.3439, "step": 186 }, { "epoch": 0.9248840803709428, "grad_norm": 0.64524906873703, "learning_rate": 1.6243654822335024e-05, "loss": 5.4027, "step": 187 }, { "epoch": 0.9298299845440494, "grad_norm": 0.7147576212882996, "learning_rate": 1.5228426395939088e-05, "loss": 5.3111, "step": 188 }, { "epoch": 0.9347758887171561, "grad_norm": 0.6565448641777039, "learning_rate": 1.421319796954315e-05, "loss": 5.3649, "step": 189 }, { "epoch": 0.9397217928902627, "grad_norm": 0.6476154923439026, "learning_rate": 1.3197969543147209e-05, "loss": 5.3617, "step": 190 }, { "epoch": 0.9446676970633694, "grad_norm": 0.6315869092941284, "learning_rate": 1.218274111675127e-05, "loss": 5.3247, "step": 191 }, { "epoch": 0.9496136012364761, "grad_norm": 0.6404466032981873, "learning_rate": 1.116751269035533e-05, "loss": 5.3402, "step": 192 }, { "epoch": 0.9545595054095827, "grad_norm": 0.6863434314727783, "learning_rate": 1.015228426395939e-05, "loss": 5.3436, "step": 193 }, { "epoch": 0.9595054095826894, "grad_norm": 0.6492709517478943, "learning_rate": 9.137055837563452e-06, "loss": 5.2449, "step": 194 }, { "epoch": 0.964451313755796, "grad_norm": 0.647345781326294, "learning_rate": 8.121827411167512e-06, "loss": 5.3811, "step": 195 }, { "epoch": 0.9693972179289027, "grad_norm": 0.711609423160553, "learning_rate": 7.106598984771575e-06, "loss": 5.3612, "step": 196 }, { "epoch": 0.9743431221020092, "grad_norm": 0.610159158706665, "learning_rate": 6.091370558375635e-06, "loss": 5.3041, "step": 197 }, { "epoch": 0.9792890262751159, "grad_norm": 0.61027592420578, "learning_rate": 5.076142131979695e-06, "loss": 5.3324, "step": 198 }, { "epoch": 0.9842349304482225, "grad_norm": 0.5848086476325989, "learning_rate": 4.060913705583756e-06, "loss": 5.3446, "step": 199 }, { "epoch": 0.9891808346213292, "grad_norm": 0.5617231130599976, "learning_rate": 3.0456852791878177e-06, "loss": 5.3997, "step": 200 }, { "epoch": 0.9941267387944358, "grad_norm": 0.6468728184700012, "learning_rate": 2.030456852791878e-06, "loss": 5.3444, "step": 201 }, { "epoch": 0.9990726429675425, "grad_norm": 0.629033088684082, "learning_rate": 1.015228426395939e-06, "loss": 5.3283, "step": 202 } ], "logging_steps": 1, "max_steps": 202, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5526784012305408.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }