|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.2352456766401319, |
|
"eval_steps": 500, |
|
"global_step": 9000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01372495196266813, |
|
"grad_norm": 23.625333786010742, |
|
"learning_rate": 6.378600823045268e-07, |
|
"loss": 3.631, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02744990392533626, |
|
"grad_norm": 26.834190368652344, |
|
"learning_rate": 1.3237311385459534e-06, |
|
"loss": 0.887, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04117485588800439, |
|
"grad_norm": 20.897249221801758, |
|
"learning_rate": 2.00960219478738e-06, |
|
"loss": 0.5548, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.05489980785067252, |
|
"grad_norm": 14.432657241821289, |
|
"learning_rate": 2.6954732510288067e-06, |
|
"loss": 0.353, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.06862475981334065, |
|
"grad_norm": 45.2662239074707, |
|
"learning_rate": 3.3813443072702336e-06, |
|
"loss": 0.3153, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.08234971177600878, |
|
"grad_norm": 10.040751457214355, |
|
"learning_rate": 4.06721536351166e-06, |
|
"loss": 0.2679, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.09607466373867692, |
|
"grad_norm": 26.088348388671875, |
|
"learning_rate": 4.753086419753087e-06, |
|
"loss": 0.308, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.10979961570134504, |
|
"grad_norm": 5.486932754516602, |
|
"learning_rate": 5.438957475994513e-06, |
|
"loss": 0.2776, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.12352456766401318, |
|
"grad_norm": 25.890995025634766, |
|
"learning_rate": 6.12482853223594e-06, |
|
"loss": 0.2864, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.1372495196266813, |
|
"grad_norm": 6.877874851226807, |
|
"learning_rate": 6.810699588477366e-06, |
|
"loss": 0.2502, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.15097447158934943, |
|
"grad_norm": 8.569019317626953, |
|
"learning_rate": 7.496570644718793e-06, |
|
"loss": 0.2392, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.16469942355201755, |
|
"grad_norm": 26.53949546813965, |
|
"learning_rate": 8.18244170096022e-06, |
|
"loss": 0.2599, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.1784243755146857, |
|
"grad_norm": 31.634700775146484, |
|
"learning_rate": 8.868312757201646e-06, |
|
"loss": 0.2638, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.19214932747735383, |
|
"grad_norm": 35.01237869262695, |
|
"learning_rate": 9.554183813443072e-06, |
|
"loss": 0.2672, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.20587427944002196, |
|
"grad_norm": 15.365808486938477, |
|
"learning_rate": 9.97331096538051e-06, |
|
"loss": 0.2573, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.21959923140269008, |
|
"grad_norm": 2.571429491043091, |
|
"learning_rate": 9.897056580753394e-06, |
|
"loss": 0.2461, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.23332418336535823, |
|
"grad_norm": 2.3058722019195557, |
|
"learning_rate": 9.820802196126277e-06, |
|
"loss": 0.2688, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.24704913532802636, |
|
"grad_norm": 24.54079246520996, |
|
"learning_rate": 9.744547811499163e-06, |
|
"loss": 0.2524, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.26077408729069446, |
|
"grad_norm": 4.643412113189697, |
|
"learning_rate": 9.668293426872047e-06, |
|
"loss": 0.2797, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.2744990392533626, |
|
"grad_norm": 11.36482048034668, |
|
"learning_rate": 9.592039042244929e-06, |
|
"loss": 0.2426, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.28822399121603076, |
|
"grad_norm": 30.41916275024414, |
|
"learning_rate": 9.515784657617814e-06, |
|
"loss": 0.245, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.30194894317869886, |
|
"grad_norm": 31.319168090820312, |
|
"learning_rate": 9.439530272990698e-06, |
|
"loss": 0.2365, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.315673895141367, |
|
"grad_norm": 2.134601593017578, |
|
"learning_rate": 9.363275888363582e-06, |
|
"loss": 0.214, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.3293988471040351, |
|
"grad_norm": 18.417972564697266, |
|
"learning_rate": 9.287021503736465e-06, |
|
"loss": 0.2669, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.34312379906670326, |
|
"grad_norm": 3.676449775695801, |
|
"learning_rate": 9.21076711910935e-06, |
|
"loss": 0.2301, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3568487510293714, |
|
"grad_norm": 21.718795776367188, |
|
"learning_rate": 9.134512734482233e-06, |
|
"loss": 0.2332, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.3705737029920395, |
|
"grad_norm": 15.048168182373047, |
|
"learning_rate": 9.058258349855117e-06, |
|
"loss": 0.1976, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.38429865495470766, |
|
"grad_norm": 17.38605308532715, |
|
"learning_rate": 8.982003965228e-06, |
|
"loss": 0.2236, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.3980236069173758, |
|
"grad_norm": 30.235185623168945, |
|
"learning_rate": 8.905749580600886e-06, |
|
"loss": 0.1932, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.4117485588800439, |
|
"grad_norm": 21.414888381958008, |
|
"learning_rate": 8.82949519597377e-06, |
|
"loss": 0.2256, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.42547351084271207, |
|
"grad_norm": 2.553528070449829, |
|
"learning_rate": 8.753240811346654e-06, |
|
"loss": 0.2145, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.43919846280538016, |
|
"grad_norm": 5.725930213928223, |
|
"learning_rate": 8.676986426719537e-06, |
|
"loss": 0.2089, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.4529234147680483, |
|
"grad_norm": 4.194558620452881, |
|
"learning_rate": 8.600732042092421e-06, |
|
"loss": 0.2349, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.46664836673071647, |
|
"grad_norm": 4.438018321990967, |
|
"learning_rate": 8.524477657465305e-06, |
|
"loss": 0.2161, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.48037331869338457, |
|
"grad_norm": 4.64914608001709, |
|
"learning_rate": 8.448223272838189e-06, |
|
"loss": 0.1956, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.4940982706560527, |
|
"grad_norm": 11.573904991149902, |
|
"learning_rate": 8.371968888211072e-06, |
|
"loss": 0.1946, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.5078232226187208, |
|
"grad_norm": 10.568695068359375, |
|
"learning_rate": 8.295714503583956e-06, |
|
"loss": 0.2072, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.5215481745813889, |
|
"grad_norm": 1.9153978824615479, |
|
"learning_rate": 8.21946011895684e-06, |
|
"loss": 0.1983, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.5352731265440571, |
|
"grad_norm": 19.77039337158203, |
|
"learning_rate": 8.143205734329724e-06, |
|
"loss": 0.2112, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.5489980785067252, |
|
"grad_norm": 2.6344797611236572, |
|
"learning_rate": 8.06695134970261e-06, |
|
"loss": 0.1957, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5627230304693933, |
|
"grad_norm": 41.998775482177734, |
|
"learning_rate": 7.990696965075493e-06, |
|
"loss": 0.1756, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.5764479824320615, |
|
"grad_norm": 9.939011573791504, |
|
"learning_rate": 7.914442580448377e-06, |
|
"loss": 0.1856, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.5901729343947296, |
|
"grad_norm": 32.17470932006836, |
|
"learning_rate": 7.83818819582126e-06, |
|
"loss": 0.205, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.6038978863573977, |
|
"grad_norm": 19.29119110107422, |
|
"learning_rate": 7.761933811194144e-06, |
|
"loss": 0.1973, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.6176228383200659, |
|
"grad_norm": 2.429710865020752, |
|
"learning_rate": 7.685679426567028e-06, |
|
"loss": 0.2009, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.631347790282734, |
|
"grad_norm": 3.910445213317871, |
|
"learning_rate": 7.609425041939912e-06, |
|
"loss": 0.1725, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.6450727422454021, |
|
"grad_norm": 13.112035751342773, |
|
"learning_rate": 7.5331706573127965e-06, |
|
"loss": 0.1939, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.6587976942080702, |
|
"grad_norm": 18.058584213256836, |
|
"learning_rate": 7.45691627268568e-06, |
|
"loss": 0.1751, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.6725226461707384, |
|
"grad_norm": 11.847052574157715, |
|
"learning_rate": 7.380661888058563e-06, |
|
"loss": 0.1903, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.6862475981334065, |
|
"grad_norm": 7.183902740478516, |
|
"learning_rate": 7.304407503431448e-06, |
|
"loss": 0.1795, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6999725500960746, |
|
"grad_norm": 11.07944107055664, |
|
"learning_rate": 7.2281531188043316e-06, |
|
"loss": 0.1854, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.7136975020587428, |
|
"grad_norm": 14.285106658935547, |
|
"learning_rate": 7.151898734177216e-06, |
|
"loss": 0.1885, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.7274224540214109, |
|
"grad_norm": 20.710567474365234, |
|
"learning_rate": 7.075644349550099e-06, |
|
"loss": 0.1911, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.741147405984079, |
|
"grad_norm": 20.737842559814453, |
|
"learning_rate": 6.999389964922984e-06, |
|
"loss": 0.1826, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.7548723579467472, |
|
"grad_norm": 15.488795280456543, |
|
"learning_rate": 6.9231355802958675e-06, |
|
"loss": 0.1777, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.7685973099094153, |
|
"grad_norm": 20.551816940307617, |
|
"learning_rate": 6.846881195668752e-06, |
|
"loss": 0.1692, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.7823222618720834, |
|
"grad_norm": 4.317481517791748, |
|
"learning_rate": 6.770626811041635e-06, |
|
"loss": 0.1596, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.7960472138347516, |
|
"grad_norm": 11.099555015563965, |
|
"learning_rate": 6.69437242641452e-06, |
|
"loss": 0.1891, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.8097721657974197, |
|
"grad_norm": 0.8909740447998047, |
|
"learning_rate": 6.6181180417874034e-06, |
|
"loss": 0.1728, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.8234971177600878, |
|
"grad_norm": 5.078292369842529, |
|
"learning_rate": 6.541863657160286e-06, |
|
"loss": 0.1856, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.8372220697227559, |
|
"grad_norm": 7.8703532218933105, |
|
"learning_rate": 6.465609272533171e-06, |
|
"loss": 0.183, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.8509470216854241, |
|
"grad_norm": 15.550518989562988, |
|
"learning_rate": 6.389354887906056e-06, |
|
"loss": 0.1631, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.8646719736480922, |
|
"grad_norm": 9.581011772155762, |
|
"learning_rate": 6.313100503278939e-06, |
|
"loss": 0.148, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.8783969256107603, |
|
"grad_norm": 3.0507142543792725, |
|
"learning_rate": 6.236846118651822e-06, |
|
"loss": 0.1925, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.8921218775734285, |
|
"grad_norm": 10.670945167541504, |
|
"learning_rate": 6.160591734024707e-06, |
|
"loss": 0.1487, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.9058468295360966, |
|
"grad_norm": 19.39102554321289, |
|
"learning_rate": 6.084337349397591e-06, |
|
"loss": 0.1754, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.9195717814987647, |
|
"grad_norm": 35.748374938964844, |
|
"learning_rate": 6.008082964770475e-06, |
|
"loss": 0.1712, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.9332967334614329, |
|
"grad_norm": 2.9247872829437256, |
|
"learning_rate": 5.931828580143358e-06, |
|
"loss": 0.1531, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.947021685424101, |
|
"grad_norm": 2.437190294265747, |
|
"learning_rate": 5.855574195516243e-06, |
|
"loss": 0.1546, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.9607466373867691, |
|
"grad_norm": 3.3819751739501953, |
|
"learning_rate": 5.779319810889127e-06, |
|
"loss": 0.1485, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.9744715893494372, |
|
"grad_norm": 2.5567026138305664, |
|
"learning_rate": 5.70306542626201e-06, |
|
"loss": 0.1529, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.9881965413121054, |
|
"grad_norm": 3.1037750244140625, |
|
"learning_rate": 5.626811041634894e-06, |
|
"loss": 0.1583, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.0019214932747735, |
|
"grad_norm": 8.52094554901123, |
|
"learning_rate": 5.550556657007779e-06, |
|
"loss": 0.1814, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.0156464452374416, |
|
"grad_norm": 2.9368324279785156, |
|
"learning_rate": 5.4743022723806626e-06, |
|
"loss": 0.1437, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.0293713972001097, |
|
"grad_norm": 5.706370830535889, |
|
"learning_rate": 5.3980478877535455e-06, |
|
"loss": 0.1474, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.0430963491627778, |
|
"grad_norm": 18.225793838500977, |
|
"learning_rate": 5.32179350312643e-06, |
|
"loss": 0.1339, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.0568213011254461, |
|
"grad_norm": 7.467564105987549, |
|
"learning_rate": 5.245539118499314e-06, |
|
"loss": 0.1244, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.0705462530881142, |
|
"grad_norm": 1.9714295864105225, |
|
"learning_rate": 5.1692847338721985e-06, |
|
"loss": 0.1376, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.0842712050507823, |
|
"grad_norm": 4.701173782348633, |
|
"learning_rate": 5.0930303492450814e-06, |
|
"loss": 0.1232, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.0979961570134504, |
|
"grad_norm": 9.477237701416016, |
|
"learning_rate": 5.016775964617966e-06, |
|
"loss": 0.1327, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.1117211089761185, |
|
"grad_norm": 2.713070869445801, |
|
"learning_rate": 4.94052157999085e-06, |
|
"loss": 0.1184, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.1254460609387866, |
|
"grad_norm": 27.41179847717285, |
|
"learning_rate": 4.864267195363734e-06, |
|
"loss": 0.1236, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.1391710129014547, |
|
"grad_norm": 1.996870994567871, |
|
"learning_rate": 4.788012810736617e-06, |
|
"loss": 0.1391, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.152895964864123, |
|
"grad_norm": 2.3036551475524902, |
|
"learning_rate": 4.711758426109502e-06, |
|
"loss": 0.1368, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.1666209168267911, |
|
"grad_norm": 14.458354949951172, |
|
"learning_rate": 4.635504041482386e-06, |
|
"loss": 0.1433, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.1803458687894592, |
|
"grad_norm": 3.8688037395477295, |
|
"learning_rate": 4.5592496568552695e-06, |
|
"loss": 0.1483, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.1940708207521273, |
|
"grad_norm": 25.26601791381836, |
|
"learning_rate": 4.482995272228153e-06, |
|
"loss": 0.1444, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.2077957727147954, |
|
"grad_norm": 3.356841802597046, |
|
"learning_rate": 4.406740887601038e-06, |
|
"loss": 0.1361, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.2215207246774635, |
|
"grad_norm": 3.6947531700134277, |
|
"learning_rate": 4.330486502973921e-06, |
|
"loss": 0.1265, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.2352456766401319, |
|
"grad_norm": 3.885274648666382, |
|
"learning_rate": 4.2542321183468055e-06, |
|
"loss": 0.1208, |
|
"step": 9000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 14572, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|