{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9990726429675425,
  "eval_steps": 500,
  "global_step": 202,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004945904173106646,
      "grad_norm": 54.500160217285156,
      "learning_rate": 0.0,
      "loss": 9.5291,
      "step": 1
    },
    {
      "epoch": 0.009891808346213293,
      "grad_norm": 53.72835922241211,
      "learning_rate": 4e-05,
      "loss": 9.4814,
      "step": 2
    },
    {
      "epoch": 0.014837712519319939,
      "grad_norm": 17.406553268432617,
      "learning_rate": 8e-05,
      "loss": 9.1288,
      "step": 3
    },
    {
      "epoch": 0.019783616692426585,
      "grad_norm": 3.4949991703033447,
      "learning_rate": 0.00012,
      "loss": 8.8408,
      "step": 4
    },
    {
      "epoch": 0.02472952086553323,
      "grad_norm": 2.9090073108673096,
      "learning_rate": 0.00016,
      "loss": 8.7705,
      "step": 5
    },
    {
      "epoch": 0.029675425038639878,
      "grad_norm": 3.398167371749878,
      "learning_rate": 0.0002,
      "loss": 8.6466,
      "step": 6
    },
    {
      "epoch": 0.03462132921174652,
      "grad_norm": 1.6190311908721924,
      "learning_rate": 0.00019898477157360406,
      "loss": 8.5125,
      "step": 7
    },
    {
      "epoch": 0.03956723338485317,
      "grad_norm": 1.8773953914642334,
      "learning_rate": 0.00019796954314720813,
      "loss": 8.5322,
      "step": 8
    },
    {
      "epoch": 0.04451313755795981,
      "grad_norm": 1.283807396888733,
      "learning_rate": 0.00019695431472081218,
      "loss": 8.4917,
      "step": 9
    },
    {
      "epoch": 0.04945904173106646,
      "grad_norm": 1.9215106964111328,
      "learning_rate": 0.00019593908629441626,
      "loss": 8.3638,
      "step": 10
    },
    {
      "epoch": 0.05440494590417311,
      "grad_norm": 1.5560728311538696,
      "learning_rate": 0.00019492385786802033,
      "loss": 8.3021,
      "step": 11
    },
    {
      "epoch": 0.059350850077279756,
      "grad_norm": 1.4610416889190674,
      "learning_rate": 0.00019390862944162438,
      "loss": 8.3058,
      "step": 12
    },
    {
      "epoch": 0.0642967542503864,
      "grad_norm": 1.4304499626159668,
      "learning_rate": 0.00019289340101522843,
      "loss": 8.2576,
      "step": 13
    },
    {
      "epoch": 0.06924265842349304,
      "grad_norm": 1.2287720441818237,
      "learning_rate": 0.0001918781725888325,
      "loss": 8.0443,
      "step": 14
    },
    {
      "epoch": 0.07418856259659969,
      "grad_norm": 1.3729023933410645,
      "learning_rate": 0.00019086294416243655,
      "loss": 8.1255,
      "step": 15
    },
    {
      "epoch": 0.07913446676970634,
      "grad_norm": 1.2619420289993286,
      "learning_rate": 0.0001898477157360406,
      "loss": 8.032,
      "step": 16
    },
    {
      "epoch": 0.08408037094281298,
      "grad_norm": 1.4744280576705933,
      "learning_rate": 0.0001888324873096447,
      "loss": 7.8637,
      "step": 17
    },
    {
      "epoch": 0.08902627511591962,
      "grad_norm": 1.6214470863342285,
      "learning_rate": 0.00018781725888324875,
      "loss": 7.9172,
      "step": 18
    },
    {
      "epoch": 0.09397217928902628,
      "grad_norm": 1.283504605293274,
      "learning_rate": 0.0001868020304568528,
      "loss": 7.8251,
      "step": 19
    },
    {
      "epoch": 0.09891808346213292,
      "grad_norm": 1.0794684886932373,
      "learning_rate": 0.00018578680203045687,
      "loss": 7.7431,
      "step": 20
    },
    {
      "epoch": 0.10386398763523957,
      "grad_norm": 1.1826306581497192,
      "learning_rate": 0.00018477157360406092,
      "loss": 7.6118,
      "step": 21
    },
    {
      "epoch": 0.10880989180834622,
      "grad_norm": 1.5493848323822021,
      "learning_rate": 0.00018375634517766497,
      "loss": 7.5928,
      "step": 22
    },
    {
      "epoch": 0.11375579598145286,
      "grad_norm": 2.191657304763794,
      "learning_rate": 0.00018274111675126904,
      "loss": 7.596,
      "step": 23
    },
    {
      "epoch": 0.11870170015455951,
      "grad_norm": 1.2168949842453003,
      "learning_rate": 0.0001817258883248731,
      "loss": 7.5224,
      "step": 24
    },
    {
      "epoch": 0.12364760432766615,
      "grad_norm": 1.1562331914901733,
      "learning_rate": 0.00018071065989847717,
      "loss": 7.4952,
      "step": 25
    },
    {
      "epoch": 0.1285935085007728,
      "grad_norm": 1.9624497890472412,
      "learning_rate": 0.00017969543147208124,
      "loss": 7.459,
      "step": 26
    },
    {
      "epoch": 0.13353941267387945,
      "grad_norm": 2.2458877563476562,
      "learning_rate": 0.0001786802030456853,
      "loss": 7.3465,
      "step": 27
    },
    {
      "epoch": 0.1384853168469861,
      "grad_norm": 1.3750243186950684,
      "learning_rate": 0.00017766497461928934,
      "loss": 7.3891,
      "step": 28
    },
    {
      "epoch": 0.14343122102009273,
      "grad_norm": 1.2398021221160889,
      "learning_rate": 0.0001766497461928934,
      "loss": 7.3127,
      "step": 29
    },
    {
      "epoch": 0.14837712519319937,
      "grad_norm": 2.071115732192993,
      "learning_rate": 0.00017563451776649746,
      "loss": 7.2548,
      "step": 30
    },
    {
      "epoch": 0.15332302936630604,
      "grad_norm": 2.288498640060425,
      "learning_rate": 0.0001746192893401015,
      "loss": 7.1908,
      "step": 31
    },
    {
      "epoch": 0.15826893353941268,
      "grad_norm": 1.2050567865371704,
      "learning_rate": 0.0001736040609137056,
      "loss": 7.1467,
      "step": 32
    },
    {
      "epoch": 0.16321483771251932,
      "grad_norm": 1.4064340591430664,
      "learning_rate": 0.00017258883248730966,
      "loss": 7.035,
      "step": 33
    },
    {
      "epoch": 0.16816074188562596,
      "grad_norm": 1.2630614042282104,
      "learning_rate": 0.0001715736040609137,
      "loss": 7.0536,
      "step": 34
    },
    {
      "epoch": 0.1731066460587326,
      "grad_norm": 1.8433802127838135,
      "learning_rate": 0.00017055837563451778,
      "loss": 7.0115,
      "step": 35
    },
    {
      "epoch": 0.17805255023183925,
      "grad_norm": 1.744345784187317,
      "learning_rate": 0.00016954314720812183,
      "loss": 7.038,
      "step": 36
    },
    {
      "epoch": 0.18299845440494591,
      "grad_norm": 1.679824709892273,
      "learning_rate": 0.00016852791878172588,
      "loss": 6.8946,
      "step": 37
    },
    {
      "epoch": 0.18794435857805256,
      "grad_norm": 1.4559205770492554,
      "learning_rate": 0.00016751269035532995,
      "loss": 6.9053,
      "step": 38
    },
    {
      "epoch": 0.1928902627511592,
      "grad_norm": 1.7544541358947754,
      "learning_rate": 0.00016649746192893403,
      "loss": 6.9277,
      "step": 39
    },
    {
      "epoch": 0.19783616692426584,
      "grad_norm": 1.594734787940979,
      "learning_rate": 0.00016548223350253808,
      "loss": 6.912,
      "step": 40
    },
    {
      "epoch": 0.20278207109737248,
      "grad_norm": 1.3439960479736328,
      "learning_rate": 0.00016446700507614215,
      "loss": 6.8592,
      "step": 41
    },
    {
      "epoch": 0.20772797527047915,
      "grad_norm": 1.4330651760101318,
      "learning_rate": 0.0001634517766497462,
      "loss": 6.8965,
      "step": 42
    },
    {
      "epoch": 0.2126738794435858,
      "grad_norm": 2.439265489578247,
      "learning_rate": 0.00016243654822335025,
      "loss": 6.8126,
      "step": 43
    },
    {
      "epoch": 0.21761978361669243,
      "grad_norm": 1.2343510389328003,
      "learning_rate": 0.00016142131979695432,
      "loss": 6.8057,
      "step": 44
    },
    {
      "epoch": 0.22256568778979907,
      "grad_norm": 1.15224027633667,
      "learning_rate": 0.00016040609137055837,
      "loss": 6.6727,
      "step": 45
    },
    {
      "epoch": 0.2275115919629057,
      "grad_norm": 1.6769089698791504,
      "learning_rate": 0.00015939086294416242,
      "loss": 6.7457,
      "step": 46
    },
    {
      "epoch": 0.23245749613601235,
      "grad_norm": 2.4642043113708496,
      "learning_rate": 0.00015837563451776652,
      "loss": 6.742,
      "step": 47
    },
    {
      "epoch": 0.23740340030911902,
      "grad_norm": 1.1713383197784424,
      "learning_rate": 0.00015736040609137057,
      "loss": 6.7022,
      "step": 48
    },
    {
      "epoch": 0.24234930448222566,
      "grad_norm": 1.5891178846359253,
      "learning_rate": 0.00015634517766497462,
      "loss": 6.6446,
      "step": 49
    },
    {
      "epoch": 0.2472952086553323,
      "grad_norm": 2.0845682621002197,
      "learning_rate": 0.0001553299492385787,
      "loss": 6.5948,
      "step": 50
    },
    {
      "epoch": 0.252241112828439,
      "grad_norm": 1.4469300508499146,
      "learning_rate": 0.00015431472081218274,
      "loss": 6.5604,
      "step": 51
    },
    {
      "epoch": 0.2571870170015456,
      "grad_norm": 1.0141685009002686,
      "learning_rate": 0.0001532994923857868,
      "loss": 6.5418,
      "step": 52
    },
    {
      "epoch": 0.26213292117465226,
      "grad_norm": 2.21588134765625,
      "learning_rate": 0.00015228426395939087,
      "loss": 6.4273,
      "step": 53
    },
    {
      "epoch": 0.2670788253477589,
      "grad_norm": 1.4307092428207397,
      "learning_rate": 0.00015126903553299494,
      "loss": 6.4938,
      "step": 54
    },
    {
      "epoch": 0.27202472952086554,
      "grad_norm": 1.4310742616653442,
      "learning_rate": 0.000150253807106599,
      "loss": 6.4357,
      "step": 55
    },
    {
      "epoch": 0.2769706336939722,
      "grad_norm": 1.1520801782608032,
      "learning_rate": 0.00014923857868020306,
      "loss": 6.5101,
      "step": 56
    },
    {
      "epoch": 0.2819165378670788,
      "grad_norm": 1.0513254404067993,
      "learning_rate": 0.0001482233502538071,
      "loss": 6.4536,
      "step": 57
    },
    {
      "epoch": 0.28686244204018546,
      "grad_norm": 1.5814175605773926,
      "learning_rate": 0.00014720812182741116,
      "loss": 6.4139,
      "step": 58
    },
    {
      "epoch": 0.2918083462132921,
      "grad_norm": 1.5383965969085693,
      "learning_rate": 0.00014619289340101523,
      "loss": 6.3318,
      "step": 59
    },
    {
      "epoch": 0.29675425038639874,
      "grad_norm": 1.0093541145324707,
      "learning_rate": 0.00014517766497461928,
      "loss": 6.4279,
      "step": 60
    },
    {
      "epoch": 0.3017001545595054,
      "grad_norm": 1.4959982633590698,
      "learning_rate": 0.00014416243654822336,
      "loss": 6.3061,
      "step": 61
    },
    {
      "epoch": 0.3066460587326121,
      "grad_norm": 1.649026870727539,
      "learning_rate": 0.00014314720812182743,
      "loss": 6.274,
      "step": 62
    },
    {
      "epoch": 0.3115919629057187,
      "grad_norm": 0.9700078964233398,
      "learning_rate": 0.00014213197969543148,
      "loss": 6.4123,
      "step": 63
    },
    {
      "epoch": 0.31653786707882536,
      "grad_norm": 1.0136897563934326,
      "learning_rate": 0.00014111675126903553,
      "loss": 6.3055,
      "step": 64
    },
    {
      "epoch": 0.321483771251932,
      "grad_norm": 1.6081498861312866,
      "learning_rate": 0.0001401015228426396,
      "loss": 6.3642,
      "step": 65
    },
    {
      "epoch": 0.32642967542503865,
      "grad_norm": 1.1522279977798462,
      "learning_rate": 0.00013908629441624365,
      "loss": 6.2726,
      "step": 66
    },
    {
      "epoch": 0.3313755795981453,
      "grad_norm": 0.8351190686225891,
      "learning_rate": 0.00013807106598984773,
      "loss": 6.2645,
      "step": 67
    },
    {
      "epoch": 0.33632148377125193,
      "grad_norm": 1.1132313013076782,
      "learning_rate": 0.00013705583756345178,
      "loss": 6.2681,
      "step": 68
    },
    {
      "epoch": 0.34126738794435857,
      "grad_norm": 1.2936571836471558,
      "learning_rate": 0.00013604060913705585,
      "loss": 6.2473,
      "step": 69
    },
    {
      "epoch": 0.3462132921174652,
      "grad_norm": 1.250172734260559,
      "learning_rate": 0.0001350253807106599,
      "loss": 6.2264,
      "step": 70
    },
    {
      "epoch": 0.35115919629057185,
      "grad_norm": 1.0878709554672241,
      "learning_rate": 0.00013401015228426397,
      "loss": 6.1898,
      "step": 71
    },
    {
      "epoch": 0.3561051004636785,
      "grad_norm": 0.9934064149856567,
      "learning_rate": 0.00013299492385786802,
      "loss": 6.2047,
      "step": 72
    },
    {
      "epoch": 0.3610510046367852,
      "grad_norm": 0.8686928749084473,
      "learning_rate": 0.00013197969543147207,
      "loss": 6.1214,
      "step": 73
    },
    {
      "epoch": 0.36599690880989183,
      "grad_norm": 0.858200192451477,
      "learning_rate": 0.00013096446700507615,
      "loss": 6.0784,
      "step": 74
    },
    {
      "epoch": 0.37094281298299847,
      "grad_norm": 0.8108780980110168,
      "learning_rate": 0.0001299492385786802,
      "loss": 6.1899,
      "step": 75
    },
    {
      "epoch": 0.3758887171561051,
      "grad_norm": 0.8366422653198242,
      "learning_rate": 0.00012893401015228427,
      "loss": 6.131,
      "step": 76
    },
    {
      "epoch": 0.38083462132921175,
      "grad_norm": 1.2487200498580933,
      "learning_rate": 0.00012791878172588834,
      "loss": 6.1158,
      "step": 77
    },
    {
      "epoch": 0.3857805255023184,
      "grad_norm": 1.0677459239959717,
      "learning_rate": 0.0001269035532994924,
      "loss": 6.0873,
      "step": 78
    },
    {
      "epoch": 0.39072642967542504,
      "grad_norm": 0.9405259490013123,
      "learning_rate": 0.00012588832487309644,
      "loss": 6.0409,
      "step": 79
    },
    {
      "epoch": 0.3956723338485317,
      "grad_norm": 1.488607406616211,
      "learning_rate": 0.00012487309644670052,
      "loss": 5.9868,
      "step": 80
    },
    {
      "epoch": 0.4006182380216383,
      "grad_norm": 0.9067093729972839,
      "learning_rate": 0.00012385786802030456,
      "loss": 6.0035,
      "step": 81
    },
    {
      "epoch": 0.40556414219474496,
      "grad_norm": 1.1395992040634155,
      "learning_rate": 0.00012284263959390864,
      "loss": 5.9638,
      "step": 82
    },
    {
      "epoch": 0.4105100463678516,
      "grad_norm": 1.4701273441314697,
      "learning_rate": 0.0001218274111675127,
      "loss": 6.0212,
      "step": 83
    },
    {
      "epoch": 0.4154559505409583,
      "grad_norm": 0.8167937397956848,
      "learning_rate": 0.00012081218274111676,
      "loss": 6.0759,
      "step": 84
    },
    {
      "epoch": 0.42040185471406494,
      "grad_norm": 1.398577332496643,
      "learning_rate": 0.00011979695431472082,
      "loss": 5.9284,
      "step": 85
    },
    {
      "epoch": 0.4253477588871716,
      "grad_norm": 1.0022815465927124,
      "learning_rate": 0.00011878172588832489,
      "loss": 5.9638,
      "step": 86
    },
    {
      "epoch": 0.4302936630602782,
      "grad_norm": 1.1316360235214233,
      "learning_rate": 0.00011776649746192893,
      "loss": 5.8901,
      "step": 87
    },
    {
      "epoch": 0.43523956723338486,
      "grad_norm": 1.1034351587295532,
      "learning_rate": 0.000116751269035533,
      "loss": 5.9288,
      "step": 88
    },
    {
      "epoch": 0.4401854714064915,
      "grad_norm": 0.9991883039474487,
      "learning_rate": 0.00011573604060913706,
      "loss": 5.9447,
      "step": 89
    },
    {
      "epoch": 0.44513137557959814,
      "grad_norm": 1.4334654808044434,
      "learning_rate": 0.00011472081218274113,
      "loss": 5.8657,
      "step": 90
    },
    {
      "epoch": 0.4500772797527048,
      "grad_norm": 1.0602012872695923,
      "learning_rate": 0.0001137055837563452,
      "loss": 5.8563,
      "step": 91
    },
    {
      "epoch": 0.4550231839258114,
      "grad_norm": 0.9210672378540039,
      "learning_rate": 0.00011269035532994925,
      "loss": 5.8811,
      "step": 92
    },
    {
      "epoch": 0.45996908809891807,
      "grad_norm": 0.9101308584213257,
      "learning_rate": 0.0001116751269035533,
      "loss": 5.9572,
      "step": 93
    },
    {
      "epoch": 0.4649149922720247,
      "grad_norm": 0.8447904586791992,
      "learning_rate": 0.00011065989847715736,
      "loss": 5.8762,
      "step": 94
    },
    {
      "epoch": 0.46986089644513135,
      "grad_norm": 0.7616278529167175,
      "learning_rate": 0.00010964467005076143,
      "loss": 5.9493,
      "step": 95
    },
    {
      "epoch": 0.47480680061823805,
      "grad_norm": 1.0465595722198486,
      "learning_rate": 0.00010862944162436547,
      "loss": 5.8367,
      "step": 96
    },
    {
      "epoch": 0.4797527047913447,
      "grad_norm": 1.4627708196640015,
      "learning_rate": 0.00010761421319796954,
      "loss": 5.8301,
      "step": 97
    },
    {
      "epoch": 0.4846986089644513,
      "grad_norm": 1.0495349168777466,
      "learning_rate": 0.00010659898477157362,
      "loss": 5.8782,
      "step": 98
    },
    {
      "epoch": 0.48964451313755797,
      "grad_norm": 0.9480841755867004,
      "learning_rate": 0.00010558375634517767,
      "loss": 5.7681,
      "step": 99
    },
    {
      "epoch": 0.4945904173106646,
      "grad_norm": 0.8606300354003906,
      "learning_rate": 0.00010456852791878173,
      "loss": 5.7448,
      "step": 100
    },
    {
      "epoch": 0.49953632148377125,
      "grad_norm": 0.9947773218154907,
      "learning_rate": 0.0001035532994923858,
      "loss": 5.8485,
      "step": 101
    },
    {
      "epoch": 0.504482225656878,
      "grad_norm": 1.0647828578948975,
      "learning_rate": 0.00010253807106598984,
      "loss": 5.7214,
      "step": 102
    },
    {
      "epoch": 0.5094281298299845,
      "grad_norm": 1.1592961549758911,
      "learning_rate": 0.0001015228426395939,
      "loss": 5.7393,
      "step": 103
    },
    {
      "epoch": 0.5143740340030912,
      "grad_norm": 0.8949771523475647,
      "learning_rate": 0.00010050761421319797,
      "loss": 5.7635,
      "step": 104
    },
    {
      "epoch": 0.5193199381761978,
      "grad_norm": 0.8713933229446411,
      "learning_rate": 9.949238578680203e-05,
      "loss": 5.7227,
      "step": 105
    },
    {
      "epoch": 0.5242658423493045,
      "grad_norm": 0.8814818859100342,
      "learning_rate": 9.847715736040609e-05,
      "loss": 5.7516,
      "step": 106
    },
    {
      "epoch": 0.5292117465224111,
      "grad_norm": 0.9553707838058472,
      "learning_rate": 9.746192893401017e-05,
      "loss": 5.7522,
      "step": 107
    },
    {
      "epoch": 0.5341576506955178,
      "grad_norm": 0.8567320704460144,
      "learning_rate": 9.644670050761421e-05,
      "loss": 5.6508,
      "step": 108
    },
    {
      "epoch": 0.5391035548686244,
      "grad_norm": 1.0081580877304077,
      "learning_rate": 9.543147208121828e-05,
      "loss": 5.642,
      "step": 109
    },
    {
      "epoch": 0.5440494590417311,
      "grad_norm": 1.1526085138320923,
      "learning_rate": 9.441624365482235e-05,
      "loss": 5.7423,
      "step": 110
    },
    {
      "epoch": 0.5489953632148377,
      "grad_norm": 1.2273470163345337,
      "learning_rate": 9.34010152284264e-05,
      "loss": 5.7094,
      "step": 111
    },
    {
      "epoch": 0.5539412673879444,
      "grad_norm": 0.830719530582428,
      "learning_rate": 9.238578680203046e-05,
      "loss": 5.7365,
      "step": 112
    },
    {
      "epoch": 0.558887171561051,
      "grad_norm": 1.1520576477050781,
      "learning_rate": 9.137055837563452e-05,
      "loss": 5.7391,
      "step": 113
    },
    {
      "epoch": 0.5638330757341576,
      "grad_norm": 1.1414787769317627,
      "learning_rate": 9.035532994923858e-05,
      "loss": 5.7288,
      "step": 114
    },
    {
      "epoch": 0.5687789799072643,
      "grad_norm": 0.9615758061408997,
      "learning_rate": 8.934010152284265e-05,
      "loss": 5.5568,
      "step": 115
    },
    {
      "epoch": 0.5737248840803709,
      "grad_norm": 0.8781617879867554,
      "learning_rate": 8.83248730964467e-05,
      "loss": 5.6264,
      "step": 116
    },
    {
      "epoch": 0.5786707882534776,
      "grad_norm": 1.1544886827468872,
      "learning_rate": 8.730964467005075e-05,
      "loss": 5.6724,
      "step": 117
    },
    {
      "epoch": 0.5836166924265842,
      "grad_norm": 0.931874692440033,
      "learning_rate": 8.629441624365483e-05,
      "loss": 5.6046,
      "step": 118
    },
    {
      "epoch": 0.5885625965996909,
      "grad_norm": 0.7856680750846863,
      "learning_rate": 8.527918781725889e-05,
      "loss": 5.6521,
      "step": 119
    },
    {
      "epoch": 0.5935085007727975,
      "grad_norm": 1.162001609802246,
      "learning_rate": 8.426395939086294e-05,
      "loss": 5.5843,
      "step": 120
    },
    {
      "epoch": 0.5984544049459042,
      "grad_norm": 0.8572034239768982,
      "learning_rate": 8.324873096446701e-05,
      "loss": 5.6526,
      "step": 121
    },
    {
      "epoch": 0.6034003091190108,
      "grad_norm": 0.9555945992469788,
      "learning_rate": 8.223350253807108e-05,
      "loss": 5.6673,
      "step": 122
    },
    {
      "epoch": 0.6083462132921175,
      "grad_norm": 0.880160927772522,
      "learning_rate": 8.121827411167512e-05,
      "loss": 5.498,
      "step": 123
    },
    {
      "epoch": 0.6132921174652242,
      "grad_norm": 1.1022496223449707,
      "learning_rate": 8.020304568527919e-05,
      "loss": 5.5833,
      "step": 124
    },
    {
      "epoch": 0.6182380216383307,
      "grad_norm": 0.9595851898193359,
      "learning_rate": 7.918781725888326e-05,
      "loss": 5.6384,
      "step": 125
    },
    {
      "epoch": 0.6231839258114374,
      "grad_norm": 1.4313597679138184,
      "learning_rate": 7.817258883248731e-05,
      "loss": 5.5478,
      "step": 126
    },
    {
      "epoch": 0.628129829984544,
      "grad_norm": 0.9351322054862976,
      "learning_rate": 7.715736040609137e-05,
      "loss": 5.5652,
      "step": 127
    },
    {
      "epoch": 0.6330757341576507,
      "grad_norm": 1.251789927482605,
      "learning_rate": 7.614213197969543e-05,
      "loss": 5.5387,
      "step": 128
    },
    {
      "epoch": 0.6380216383307573,
      "grad_norm": 0.98284912109375,
      "learning_rate": 7.51269035532995e-05,
      "loss": 5.5338,
      "step": 129
    },
    {
      "epoch": 0.642967542503864,
      "grad_norm": 1.0421977043151855,
      "learning_rate": 7.411167512690356e-05,
      "loss": 5.5774,
      "step": 130
    },
    {
      "epoch": 0.6479134466769706,
      "grad_norm": 1.0751053094863892,
      "learning_rate": 7.309644670050762e-05,
      "loss": 5.5642,
      "step": 131
    },
    {
      "epoch": 0.6528593508500773,
      "grad_norm": 1.089376449584961,
      "learning_rate": 7.208121827411168e-05,
      "loss": 5.505,
      "step": 132
    },
    {
      "epoch": 0.6578052550231839,
      "grad_norm": 1.0731728076934814,
      "learning_rate": 7.106598984771574e-05,
      "loss": 5.5514,
      "step": 133
    },
    {
      "epoch": 0.6627511591962906,
      "grad_norm": 1.2262444496154785,
      "learning_rate": 7.00507614213198e-05,
      "loss": 5.5723,
      "step": 134
    },
    {
      "epoch": 0.6676970633693973,
      "grad_norm": 1.0487595796585083,
      "learning_rate": 6.903553299492386e-05,
      "loss": 5.5587,
      "step": 135
    },
    {
      "epoch": 0.6726429675425039,
      "grad_norm": 1.084671139717102,
      "learning_rate": 6.802030456852793e-05,
      "loss": 5.4868,
      "step": 136
    },
    {
      "epoch": 0.6775888717156106,
      "grad_norm": 1.1871248483657837,
      "learning_rate": 6.700507614213199e-05,
      "loss": 5.5475,
      "step": 137
    },
    {
      "epoch": 0.6825347758887171,
      "grad_norm": 0.960493803024292,
      "learning_rate": 6.598984771573604e-05,
      "loss": 5.5006,
      "step": 138
    },
    {
      "epoch": 0.6874806800618238,
      "grad_norm": 1.053593397140503,
      "learning_rate": 6.49746192893401e-05,
      "loss": 5.5389,
      "step": 139
    },
    {
      "epoch": 0.6924265842349304,
      "grad_norm": 0.8886996507644653,
      "learning_rate": 6.395939086294417e-05,
      "loss": 5.4616,
      "step": 140
    },
    {
      "epoch": 0.6973724884080371,
      "grad_norm": 1.1852856874465942,
      "learning_rate": 6.294416243654822e-05,
      "loss": 5.498,
      "step": 141
    },
    {
      "epoch": 0.7023183925811437,
      "grad_norm": 0.8381466865539551,
      "learning_rate": 6.192893401015228e-05,
      "loss": 5.4977,
      "step": 142
    },
    {
      "epoch": 0.7072642967542504,
      "grad_norm": 1.01845121383667,
      "learning_rate": 6.091370558375635e-05,
      "loss": 5.4162,
      "step": 143
    },
    {
      "epoch": 0.712210200927357,
      "grad_norm": 0.9204426407814026,
      "learning_rate": 5.989847715736041e-05,
      "loss": 5.4654,
      "step": 144
    },
    {
      "epoch": 0.7171561051004637,
      "grad_norm": 1.0901105403900146,
      "learning_rate": 5.8883248730964467e-05,
      "loss": 5.4262,
      "step": 145
    },
    {
      "epoch": 0.7221020092735704,
      "grad_norm": 0.9842381477355957,
      "learning_rate": 5.786802030456853e-05,
      "loss": 5.4622,
      "step": 146
    },
    {
      "epoch": 0.727047913446677,
      "grad_norm": 1.1234885454177856,
      "learning_rate": 5.68527918781726e-05,
      "loss": 5.4668,
      "step": 147
    },
    {
      "epoch": 0.7319938176197837,
      "grad_norm": 1.0685431957244873,
      "learning_rate": 5.583756345177665e-05,
      "loss": 5.4649,
      "step": 148
    },
    {
      "epoch": 0.7369397217928902,
      "grad_norm": 1.086138367652893,
      "learning_rate": 5.482233502538071e-05,
      "loss": 5.336,
      "step": 149
    },
    {
      "epoch": 0.7418856259659969,
      "grad_norm": 1.0806076526641846,
      "learning_rate": 5.380710659898477e-05,
      "loss": 5.3463,
      "step": 150
    },
    {
      "epoch": 0.7468315301391035,
      "grad_norm": 1.1613116264343262,
      "learning_rate": 5.2791878172588836e-05,
      "loss": 5.4095,
      "step": 151
    },
    {
      "epoch": 0.7517774343122102,
      "grad_norm": 1.1117639541625977,
      "learning_rate": 5.17766497461929e-05,
      "loss": 5.4248,
      "step": 152
    },
    {
      "epoch": 0.7567233384853168,
      "grad_norm": 0.9730443954467773,
      "learning_rate": 5.076142131979695e-05,
      "loss": 5.5573,
      "step": 153
    },
    {
      "epoch": 0.7616692426584235,
      "grad_norm": 1.0216584205627441,
      "learning_rate": 4.9746192893401014e-05,
      "loss": 5.3337,
      "step": 154
    },
    {
      "epoch": 0.7666151468315301,
      "grad_norm": 0.9828229546546936,
      "learning_rate": 4.873096446700508e-05,
      "loss": 5.3757,
      "step": 155
    },
    {
      "epoch": 0.7715610510046368,
      "grad_norm": 1.0315641164779663,
      "learning_rate": 4.771573604060914e-05,
      "loss": 5.4465,
      "step": 156
    },
    {
      "epoch": 0.7765069551777435,
      "grad_norm": 1.1969993114471436,
      "learning_rate": 4.67005076142132e-05,
      "loss": 5.4018,
      "step": 157
    },
    {
      "epoch": 0.7814528593508501,
      "grad_norm": 0.7633097171783447,
      "learning_rate": 4.568527918781726e-05,
      "loss": 5.5137,
      "step": 158
    },
    {
      "epoch": 0.7863987635239568,
      "grad_norm": 0.8312305212020874,
      "learning_rate": 4.467005076142132e-05,
      "loss": 5.4078,
      "step": 159
    },
    {
      "epoch": 0.7913446676970634,
      "grad_norm": 0.9463878870010376,
      "learning_rate": 4.365482233502538e-05,
      "loss": 5.3738,
      "step": 160
    },
    {
      "epoch": 0.79629057187017,
      "grad_norm": 0.8046661615371704,
      "learning_rate": 4.2639593908629446e-05,
      "loss": 5.455,
      "step": 161
    },
    {
      "epoch": 0.8012364760432766,
      "grad_norm": 1.0929735898971558,
      "learning_rate": 4.162436548223351e-05,
      "loss": 5.4263,
      "step": 162
    },
    {
      "epoch": 0.8061823802163833,
      "grad_norm": 1.0323022603988647,
      "learning_rate": 4.060913705583756e-05,
      "loss": 5.4503,
      "step": 163
    },
    {
      "epoch": 0.8111282843894899,
      "grad_norm": 0.7212726473808289,
      "learning_rate": 3.959390862944163e-05,
      "loss": 5.3904,
      "step": 164
    },
    {
      "epoch": 0.8160741885625966,
      "grad_norm": 0.8705483078956604,
      "learning_rate": 3.8578680203045685e-05,
      "loss": 5.2958,
      "step": 165
    },
    {
      "epoch": 0.8210200927357032,
      "grad_norm": 0.9705776572227478,
      "learning_rate": 3.756345177664975e-05,
      "loss": 5.3806,
      "step": 166
    },
    {
      "epoch": 0.8259659969088099,
      "grad_norm": 0.7694171667098999,
      "learning_rate": 3.654822335025381e-05,
      "loss": 5.3446,
      "step": 167
    },
    {
      "epoch": 0.8309119010819166,
      "grad_norm": 1.0148179531097412,
      "learning_rate": 3.553299492385787e-05,
      "loss": 5.4316,
      "step": 168
    },
    {
      "epoch": 0.8358578052550232,
      "grad_norm": 1.0124086141586304,
      "learning_rate": 3.451776649746193e-05,
      "loss": 5.2903,
      "step": 169
    },
    {
      "epoch": 0.8408037094281299,
      "grad_norm": 0.8755667209625244,
      "learning_rate": 3.3502538071065994e-05,
      "loss": 5.2636,
      "step": 170
    },
    {
      "epoch": 0.8457496136012365,
      "grad_norm": 0.992751955986023,
      "learning_rate": 3.248730964467005e-05,
      "loss": 5.3662,
      "step": 171
    },
    {
      "epoch": 0.8506955177743432,
      "grad_norm": 0.676480770111084,
      "learning_rate": 3.147208121827411e-05,
      "loss": 5.3912,
      "step": 172
    },
    {
      "epoch": 0.8556414219474497,
      "grad_norm": 0.8479735851287842,
      "learning_rate": 3.0456852791878175e-05,
      "loss": 5.5655,
      "step": 173
    },
    {
      "epoch": 0.8605873261205564,
      "grad_norm": 0.8780114054679871,
      "learning_rate": 2.9441624365482233e-05,
      "loss": 5.4011,
      "step": 174
    },
    {
      "epoch": 0.865533230293663,
      "grad_norm": 0.7192287445068359,
      "learning_rate": 2.84263959390863e-05,
      "loss": 5.46,
      "step": 175
    },
    {
      "epoch": 0.8704791344667697,
      "grad_norm": 0.9556674957275391,
      "learning_rate": 2.7411167512690357e-05,
      "loss": 5.4278,
      "step": 176
    },
    {
      "epoch": 0.8754250386398763,
      "grad_norm": 0.7303546667098999,
      "learning_rate": 2.6395939086294418e-05,
      "loss": 5.3822,
      "step": 177
    },
    {
      "epoch": 0.880370942812983,
      "grad_norm": 0.7659119963645935,
      "learning_rate": 2.5380710659898476e-05,
      "loss": 5.3925,
      "step": 178
    },
    {
      "epoch": 0.8853168469860896,
      "grad_norm": 0.8511722087860107,
      "learning_rate": 2.436548223350254e-05,
      "loss": 5.3318,
      "step": 179
    },
    {
      "epoch": 0.8902627511591963,
      "grad_norm": 0.8240477442741394,
      "learning_rate": 2.33502538071066e-05,
      "loss": 5.2479,
      "step": 180
    },
    {
      "epoch": 0.895208655332303,
      "grad_norm": 0.8193429112434387,
      "learning_rate": 2.233502538071066e-05,
      "loss": 5.4237,
      "step": 181
    },
    {
      "epoch": 0.9001545595054096,
      "grad_norm": 0.8074966669082642,
      "learning_rate": 2.1319796954314723e-05,
      "loss": 5.5029,
      "step": 182
    },
    {
      "epoch": 0.9051004636785163,
      "grad_norm": 0.6603164076805115,
      "learning_rate": 2.030456852791878e-05,
      "loss": 5.3007,
      "step": 183
    },
    {
      "epoch": 0.9100463678516229,
      "grad_norm": 0.633477509021759,
      "learning_rate": 1.9289340101522843e-05,
      "loss": 5.396,
      "step": 184
    },
    {
      "epoch": 0.9149922720247295,
      "grad_norm": 0.6681249141693115,
      "learning_rate": 1.8274111675126904e-05,
      "loss": 5.3733,
      "step": 185
    },
    {
      "epoch": 0.9199381761978361,
      "grad_norm": 0.756808340549469,
      "learning_rate": 1.7258883248730966e-05,
      "loss": 5.3439,
      "step": 186
    },
    {
      "epoch": 0.9248840803709428,
      "grad_norm": 0.64524906873703,
      "learning_rate": 1.6243654822335024e-05,
      "loss": 5.4027,
      "step": 187
    },
    {
      "epoch": 0.9298299845440494,
      "grad_norm": 0.7147576212882996,
      "learning_rate": 1.5228426395939088e-05,
      "loss": 5.3111,
      "step": 188
    },
    {
      "epoch": 0.9347758887171561,
      "grad_norm": 0.6565448641777039,
      "learning_rate": 1.421319796954315e-05,
      "loss": 5.3649,
      "step": 189
    },
    {
      "epoch": 0.9397217928902627,
      "grad_norm": 0.6476154923439026,
      "learning_rate": 1.3197969543147209e-05,
      "loss": 5.3617,
      "step": 190
    },
    {
      "epoch": 0.9446676970633694,
      "grad_norm": 0.6315869092941284,
      "learning_rate": 1.218274111675127e-05,
      "loss": 5.3247,
      "step": 191
    },
    {
      "epoch": 0.9496136012364761,
      "grad_norm": 0.6404466032981873,
      "learning_rate": 1.116751269035533e-05,
      "loss": 5.3402,
      "step": 192
    },
    {
      "epoch": 0.9545595054095827,
      "grad_norm": 0.6863434314727783,
      "learning_rate": 1.015228426395939e-05,
      "loss": 5.3436,
      "step": 193
    },
    {
      "epoch": 0.9595054095826894,
      "grad_norm": 0.6492709517478943,
      "learning_rate": 9.137055837563452e-06,
      "loss": 5.2449,
      "step": 194
    },
    {
      "epoch": 0.964451313755796,
      "grad_norm": 0.647345781326294,
      "learning_rate": 8.121827411167512e-06,
      "loss": 5.3811,
      "step": 195
    },
    {
      "epoch": 0.9693972179289027,
      "grad_norm": 0.711609423160553,
      "learning_rate": 7.106598984771575e-06,
      "loss": 5.3612,
      "step": 196
    },
    {
      "epoch": 0.9743431221020092,
      "grad_norm": 0.610159158706665,
      "learning_rate": 6.091370558375635e-06,
      "loss": 5.3041,
      "step": 197
    },
    {
      "epoch": 0.9792890262751159,
      "grad_norm": 0.61027592420578,
      "learning_rate": 5.076142131979695e-06,
      "loss": 5.3324,
      "step": 198
    },
    {
      "epoch": 0.9842349304482225,
      "grad_norm": 0.5848086476325989,
      "learning_rate": 4.060913705583756e-06,
      "loss": 5.3446,
      "step": 199
    },
    {
      "epoch": 0.9891808346213292,
      "grad_norm": 0.5617231130599976,
      "learning_rate": 3.0456852791878177e-06,
      "loss": 5.3997,
      "step": 200
    },
    {
      "epoch": 0.9941267387944358,
      "grad_norm": 0.6468728184700012,
      "learning_rate": 2.030456852791878e-06,
      "loss": 5.3444,
      "step": 201
    },
    {
      "epoch": 0.9990726429675425,
      "grad_norm": 0.629033088684082,
      "learning_rate": 1.015228426395939e-06,
      "loss": 5.3283,
      "step": 202
    }
  ],
  "logging_steps": 1,
  "max_steps": 202,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5526784012305408.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}