{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.802654408616688,
  "eval_steps": 500,
  "global_step": 80000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011266590053854301,
      "grad_norm": 3268.47607421875,
      "learning_rate": 3.125e-06,
      "loss": 80243.872,
      "step": 500
    },
    {
      "epoch": 0.022533180107708602,
      "grad_norm": 1101.09423828125,
      "learning_rate": 6.25e-06,
      "loss": 455.9426,
      "step": 1000
    },
    {
      "epoch": 0.0337997701615629,
      "grad_norm": 1345.1741943359375,
      "learning_rate": 9.375000000000001e-06,
      "loss": 398.2565,
      "step": 1500
    },
    {
      "epoch": 0.045066360215417205,
      "grad_norm": 957.8522338867188,
      "learning_rate": 1.25e-05,
      "loss": 361.662,
      "step": 2000
    },
    {
      "epoch": 0.0563329502692715,
      "grad_norm": 1153.3565673828125,
      "learning_rate": 1.5625e-05,
      "loss": 351.6057,
      "step": 2500
    },
    {
      "epoch": 0.0675995403231258,
      "grad_norm": 3220.947021484375,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 324.4043,
      "step": 3000
    },
    {
      "epoch": 0.0788661303769801,
      "grad_norm": 1318.7249755859375,
      "learning_rate": 2.1875e-05,
      "loss": 303.6339,
      "step": 3500
    },
    {
      "epoch": 0.09013272043083441,
      "grad_norm": 1085.81982421875,
      "learning_rate": 2.5e-05,
      "loss": 290.8328,
      "step": 4000
    },
    {
      "epoch": 0.1013993104846887,
      "grad_norm": 1732.97412109375,
      "learning_rate": 2.8125000000000003e-05,
      "loss": 286.8244,
      "step": 4500
    },
    {
      "epoch": 0.112665900538543,
      "grad_norm": 1018.1327514648438,
      "learning_rate": 3.125e-05,
      "loss": 274.4126,
      "step": 5000
    },
    {
      "epoch": 0.1239324905923973,
      "grad_norm": 1115.255859375,
      "learning_rate": 3.4375e-05,
      "loss": 258.1825,
      "step": 5500
    },
    {
      "epoch": 0.1351990806462516,
      "grad_norm": 2264.675537109375,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 249.4162,
      "step": 6000
    },
    {
      "epoch": 0.1464656707001059,
      "grad_norm": 1202.668212890625,
      "learning_rate": 4.0625000000000005e-05,
      "loss": 238.756,
      "step": 6500
    },
    {
      "epoch": 0.1577322607539602,
      "grad_norm": 1271.8997802734375,
      "learning_rate": 4.375e-05,
      "loss": 234.6415,
      "step": 7000
    },
    {
      "epoch": 0.16899885080781452,
      "grad_norm": 880.48291015625,
      "learning_rate": 4.6875e-05,
      "loss": 220.2277,
      "step": 7500
    },
    {
      "epoch": 0.18026544086166882,
      "grad_norm": 1199.511962890625,
      "learning_rate": 5e-05,
      "loss": 212.284,
      "step": 8000
    },
    {
      "epoch": 0.19153203091552312,
      "grad_norm": 885.7015991210938,
      "learning_rate": 4.999405067699773e-05,
      "loss": 197.9255,
      "step": 8500
    },
    {
      "epoch": 0.2027986209693774,
      "grad_norm": 1130.3941650390625,
      "learning_rate": 4.997620553954645e-05,
      "loss": 201.8366,
      "step": 9000
    },
    {
      "epoch": 0.2140652110232317,
      "grad_norm": 1032.0145263671875,
      "learning_rate": 4.994647308096509e-05,
      "loss": 195.4686,
      "step": 9500
    },
    {
      "epoch": 0.225331801077086,
      "grad_norm": 1835.482666015625,
      "learning_rate": 4.990486745229364e-05,
      "loss": 186.7062,
      "step": 10000
    },
    {
      "epoch": 0.2365983911309403,
      "grad_norm": 1477.0096435546875,
      "learning_rate": 4.985140845555799e-05,
      "loss": 188.2111,
      "step": 10500
    },
    {
      "epoch": 0.2478649811847946,
      "grad_norm": 688.68310546875,
      "learning_rate": 4.9786121534345265e-05,
      "loss": 180.3121,
      "step": 11000
    },
    {
      "epoch": 0.2591315712386489,
      "grad_norm": 1981.5882568359375,
      "learning_rate": 4.970903776169402e-05,
      "loss": 171.624,
      "step": 11500
    },
    {
      "epoch": 0.2703981612925032,
      "grad_norm": 1986.2034912109375,
      "learning_rate": 4.962019382530521e-05,
      "loss": 178.433,
      "step": 12000
    },
    {
      "epoch": 0.2816647513463575,
      "grad_norm": 1576.7353515625,
      "learning_rate": 4.951963201008076e-05,
      "loss": 164.5357,
      "step": 12500
    },
    {
      "epoch": 0.2929313414002118,
      "grad_norm": 1520.091796875,
      "learning_rate": 4.940740017799833e-05,
      "loss": 162.9244,
      "step": 13000
    },
    {
      "epoch": 0.3041979314540661,
      "grad_norm": 1361.819091796875,
      "learning_rate": 4.9283551745331534e-05,
      "loss": 164.2202,
      "step": 13500
    },
    {
      "epoch": 0.3154645215079204,
      "grad_norm": 1085.12548828125,
      "learning_rate": 4.914814565722671e-05,
      "loss": 164.809,
      "step": 14000
    },
    {
      "epoch": 0.3267311115617747,
      "grad_norm": 1452.0218505859375,
      "learning_rate": 4.9001246359648224e-05,
      "loss": 162.6041,
      "step": 14500
    },
    {
      "epoch": 0.33799770161562903,
      "grad_norm": 1728.5352783203125,
      "learning_rate": 4.884292376870567e-05,
      "loss": 156.4712,
      "step": 15000
    },
    {
      "epoch": 0.3492642916694833,
      "grad_norm": 1888.999267578125,
      "learning_rate": 4.867325323737765e-05,
      "loss": 151.6581,
      "step": 15500
    },
    {
      "epoch": 0.36053088172333764,
      "grad_norm": 1127.0830078125,
      "learning_rate": 4.849231551964771e-05,
      "loss": 154.7625,
      "step": 16000
    },
    {
      "epoch": 0.3717974717771919,
      "grad_norm": 1579.301513671875,
      "learning_rate": 4.830019673206997e-05,
      "loss": 158.3963,
      "step": 16500
    },
    {
      "epoch": 0.38306406183104624,
      "grad_norm": 1466.5936279296875,
      "learning_rate": 4.8096988312782174e-05,
      "loss": 151.9632,
      "step": 17000
    },
    {
      "epoch": 0.3943306518849005,
      "grad_norm": 1501.413330078125,
      "learning_rate": 4.788278697798618e-05,
      "loss": 152.0479,
      "step": 17500
    },
    {
      "epoch": 0.4055972419387548,
      "grad_norm": 938.9967651367188,
      "learning_rate": 4.765769467591625e-05,
      "loss": 147.7795,
      "step": 18000
    },
    {
      "epoch": 0.4168638319926091,
      "grad_norm": 1407.6708984375,
      "learning_rate": 4.742181853831721e-05,
      "loss": 145.714,
      "step": 18500
    },
    {
      "epoch": 0.4281304220464634,
      "grad_norm": 1046.4781494140625,
      "learning_rate": 4.717527082945554e-05,
      "loss": 147.87,
      "step": 19000
    },
    {
      "epoch": 0.43939701210031773,
      "grad_norm": 1437.9764404296875,
      "learning_rate": 4.69181688926877e-05,
      "loss": 139.7023,
      "step": 19500
    },
    {
      "epoch": 0.450663602154172,
      "grad_norm": 1155.10595703125,
      "learning_rate": 4.665063509461097e-05,
      "loss": 146.1232,
      "step": 20000
    },
    {
      "epoch": 0.46193019220802634,
      "grad_norm": 1317.321044921875,
      "learning_rate": 4.637279676682367e-05,
      "loss": 139.9448,
      "step": 20500
    },
    {
      "epoch": 0.4731967822618806,
      "grad_norm": 1005.6251831054688,
      "learning_rate": 4.608478614532215e-05,
      "loss": 142.7613,
      "step": 21000
    },
    {
      "epoch": 0.48446337231573494,
      "grad_norm": 2557.992919921875,
      "learning_rate": 4.5786740307563636e-05,
      "loss": 139.5429,
      "step": 21500
    },
    {
      "epoch": 0.4957299623695892,
      "grad_norm": 1912.8707275390625,
      "learning_rate": 4.54788011072248e-05,
      "loss": 138.4188,
      "step": 22000
    },
    {
      "epoch": 0.5069965524234435,
      "grad_norm": 1349.5655517578125,
      "learning_rate": 4.516111510668707e-05,
      "loss": 135.984,
      "step": 22500
    },
    {
      "epoch": 0.5182631424772978,
      "grad_norm": 2255.250732421875,
      "learning_rate": 4.4833833507280884e-05,
      "loss": 136.2522,
      "step": 23000
    },
    {
      "epoch": 0.5295297325311521,
      "grad_norm": 947.2132568359375,
      "learning_rate": 4.4497112077322044e-05,
      "loss": 135.873,
      "step": 23500
    },
    {
      "epoch": 0.5407963225850064,
      "grad_norm": 1092.2021484375,
      "learning_rate": 4.415111107797445e-05,
      "loss": 136.0968,
      "step": 24000
    },
    {
      "epoch": 0.5520629126388608,
      "grad_norm": 2551.856201171875,
      "learning_rate": 4.379599518697444e-05,
      "loss": 133.4127,
      "step": 24500
    },
    {
      "epoch": 0.563329502692715,
      "grad_norm": 1087.750732421875,
      "learning_rate": 4.34319334202531e-05,
      "loss": 137.0111,
      "step": 25000
    },
    {
      "epoch": 0.5745960927465693,
      "grad_norm": 710.9840698242188,
      "learning_rate": 4.305909905149389e-05,
      "loss": 128.7961,
      "step": 25500
    },
    {
      "epoch": 0.5858626828004236,
      "grad_norm": 1028.0732421875,
      "learning_rate": 4.267766952966369e-05,
      "loss": 131.5538,
      "step": 26000
    },
    {
      "epoch": 0.597129272854278,
      "grad_norm": 1650.9874267578125,
      "learning_rate": 4.228782639455674e-05,
      "loss": 131.9968,
      "step": 26500
    },
    {
      "epoch": 0.6083958629081322,
      "grad_norm": 3326.53564453125,
      "learning_rate": 4.188975519039151e-05,
      "loss": 129.6315,
      "step": 27000
    },
    {
      "epoch": 0.6196624529619865,
      "grad_norm": 2218.9794921875,
      "learning_rate": 4.148364537750172e-05,
      "loss": 126.7289,
      "step": 27500
    },
    {
      "epoch": 0.6309290430158409,
      "grad_norm": 1092.4063720703125,
      "learning_rate": 4.1069690242163484e-05,
      "loss": 131.7434,
      "step": 28000
    },
    {
      "epoch": 0.6421956330696951,
      "grad_norm": 1642.626220703125,
      "learning_rate": 4.064808680460148e-05,
      "loss": 131.2289,
      "step": 28500
    },
    {
      "epoch": 0.6534622231235494,
      "grad_norm": 2752.57470703125,
      "learning_rate": 4.021903572521802e-05,
      "loss": 125.4669,
      "step": 29000
    },
    {
      "epoch": 0.6647288131774037,
      "grad_norm": 2134.545654296875,
      "learning_rate": 3.978274120908956e-05,
      "loss": 128.8268,
      "step": 29500
    },
    {
      "epoch": 0.6759954032312581,
      "grad_norm": 1844.7005615234375,
      "learning_rate": 3.933941090877615e-05,
      "loss": 126.0543,
      "step": 30000
    },
    {
      "epoch": 0.6872619932851123,
      "grad_norm": 911.3765869140625,
      "learning_rate": 3.888925582549006e-05,
      "loss": 124.5692,
      "step": 30500
    },
    {
      "epoch": 0.6985285833389666,
      "grad_norm": 1240.784423828125,
      "learning_rate": 3.84324902086706e-05,
      "loss": 129.9127,
      "step": 31000
    },
    {
      "epoch": 0.7097951733928209,
      "grad_norm": 1387.1654052734375,
      "learning_rate": 3.796933145401304e-05,
      "loss": 128.229,
      "step": 31500
    },
    {
      "epoch": 0.7210617634466753,
      "grad_norm": 5207.958984375,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 123.6102,
      "step": 32000
    },
    {
      "epoch": 0.7323283535005295,
      "grad_norm": 1478.238525390625,
      "learning_rate": 3.702471922298469e-05,
      "loss": 122.5027,
      "step": 32500
    },
    {
      "epoch": 0.7435949435543838,
      "grad_norm": 1219.868408203125,
      "learning_rate": 3.654371533087586e-05,
      "loss": 121.6549,
      "step": 33000
    },
    {
      "epoch": 0.7548615336082382,
      "grad_norm": 1724.080078125,
      "learning_rate": 3.6057217255475034e-05,
      "loss": 122.2066,
      "step": 33500
    },
    {
      "epoch": 0.7661281236620925,
      "grad_norm": 2315.779052734375,
      "learning_rate": 3.556545654351749e-05,
      "loss": 124.5871,
      "step": 34000
    },
    {
      "epoch": 0.7773947137159467,
      "grad_norm": 1415.927978515625,
      "learning_rate": 3.5068667246468436e-05,
      "loss": 119.864,
      "step": 34500
    },
    {
      "epoch": 0.788661303769801,
      "grad_norm": 2292.79736328125,
      "learning_rate": 3.456708580912725e-05,
      "loss": 124.4397,
      "step": 35000
    },
    {
      "epoch": 0.7999278938236554,
      "grad_norm": 2198.923583984375,
      "learning_rate": 3.406095095709254e-05,
      "loss": 119.8706,
      "step": 35500
    },
    {
      "epoch": 0.8111944838775096,
      "grad_norm": 774.8341064453125,
      "learning_rate": 3.355050358314172e-05,
      "loss": 122.3335,
      "step": 36000
    },
    {
      "epoch": 0.8224610739313639,
      "grad_norm": 1356.7291259765625,
      "learning_rate": 3.303598663257904e-05,
      "loss": 117.6119,
      "step": 36500
    },
    {
      "epoch": 0.8337276639852182,
      "grad_norm": 0.0,
      "learning_rate": 3.251764498760683e-05,
      "loss": 122.5679,
      "step": 37000
    },
    {
      "epoch": 0.8449942540390726,
      "grad_norm": 0.0,
      "learning_rate": 3.1995725350774806e-05,
      "loss": 116.4365,
      "step": 37500
    },
    {
      "epoch": 0.8562608440929268,
      "grad_norm": 1365.280517578125,
      "learning_rate": 3.147047612756302e-05,
      "loss": 119.871,
      "step": 38000
    },
    {
      "epoch": 0.8675274341467811,
      "grad_norm": 1635.288330078125,
      "learning_rate": 3.094214730815433e-05,
      "loss": 116.4286,
      "step": 38500
    },
    {
      "epoch": 0.8787940242006355,
      "grad_norm": 1430.511962890625,
      "learning_rate": 3.0410990348452573e-05,
      "loss": 113.9206,
      "step": 39000
    },
    {
      "epoch": 0.8900606142544898,
      "grad_norm": 1990.450927734375,
      "learning_rate": 2.9877258050403212e-05,
      "loss": 118.977,
      "step": 39500
    },
    {
      "epoch": 0.901327204308344,
      "grad_norm": 875.0726928710938,
      "learning_rate": 2.9341204441673266e-05,
      "loss": 114.4106,
      "step": 40000
    },
    {
      "epoch": 0.9125937943621983,
      "grad_norm": 1655.7935791015625,
      "learning_rate": 2.8803084654747918e-05,
      "loss": 115.3111,
      "step": 40500
    },
    {
      "epoch": 0.9238603844160527,
      "grad_norm": 1956.72216796875,
      "learning_rate": 2.8263154805501297e-05,
      "loss": 117.3135,
      "step": 41000
    },
    {
      "epoch": 0.9351269744699069,
      "grad_norm": 937.9488525390625,
      "learning_rate": 2.7721671871299116e-05,
      "loss": 116.4852,
      "step": 41500
    },
    {
      "epoch": 0.9463935645237612,
      "grad_norm": 1579.9736328125,
      "learning_rate": 2.717889356869146e-05,
      "loss": 118.1533,
      "step": 42000
    },
    {
      "epoch": 0.9576601545776156,
      "grad_norm": 1284.749755859375,
      "learning_rate": 2.663507823075358e-05,
      "loss": 113.1541,
      "step": 42500
    },
    {
      "epoch": 0.9689267446314699,
      "grad_norm": 1050.632080078125,
      "learning_rate": 2.6090484684133404e-05,
      "loss": 115.3209,
      "step": 43000
    },
    {
      "epoch": 0.9801933346853241,
      "grad_norm": 1058.0616455078125,
      "learning_rate": 2.5545372125864032e-05,
      "loss": 119.323,
      "step": 43500
    },
    {
      "epoch": 0.9914599247391784,
      "grad_norm": 1310.6236572265625,
      "learning_rate": 2.5e-05,
      "loss": 111.4601,
      "step": 44000
    },
    {
      "epoch": 1.0,
      "eval_loss": 114.70393371582031,
      "eval_runtime": 1385.5714,
      "eval_samples_per_second": 14.238,
      "eval_steps_per_second": 3.56,
      "step": 44379
    },
    {
      "epoch": 1.0027265147930327,
      "grad_norm": 1699.930419921875,
      "learning_rate": 2.4454627874135974e-05,
      "loss": 111.6484,
      "step": 44500
    },
    {
      "epoch": 1.013993104846887,
      "grad_norm": 0.0,
      "learning_rate": 2.3909515315866605e-05,
      "loss": 111.7967,
      "step": 45000
    },
    {
      "epoch": 1.0252596949007413,
      "grad_norm": 1440.404052734375,
      "learning_rate": 2.3364921769246423e-05,
      "loss": 107.7449,
      "step": 45500
    },
    {
      "epoch": 1.0365262849545958,
      "grad_norm": 1180.1439208984375,
      "learning_rate": 2.2821106431308544e-05,
      "loss": 109.0394,
      "step": 46000
    },
    {
      "epoch": 1.04779287500845,
      "grad_norm": 1600.07568359375,
      "learning_rate": 2.2278328128700893e-05,
      "loss": 104.1056,
      "step": 46500
    },
    {
      "epoch": 1.0590594650623042,
      "grad_norm": 1731.480224609375,
      "learning_rate": 2.173684519449872e-05,
      "loss": 106.6714,
      "step": 47000
    },
    {
      "epoch": 1.0703260551161586,
      "grad_norm": 955.4271240234375,
      "learning_rate": 2.1196915345252084e-05,
      "loss": 105.6627,
      "step": 47500
    },
    {
      "epoch": 1.0815926451700129,
      "grad_norm": 2818.5283203125,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 106.0866,
      "step": 48000
    },
    {
      "epoch": 1.092859235223867,
      "grad_norm": 806.58447265625,
      "learning_rate": 2.0122741949596797e-05,
      "loss": 111.0945,
      "step": 48500
    },
    {
      "epoch": 1.1041258252777215,
      "grad_norm": 1397.9573974609375,
      "learning_rate": 1.958900965154743e-05,
      "loss": 104.8343,
      "step": 49000
    },
    {
      "epoch": 1.1153924153315757,
      "grad_norm": 827.4343872070312,
      "learning_rate": 1.9057852691845677e-05,
      "loss": 103.2378,
      "step": 49500
    },
    {
      "epoch": 1.12665900538543,
      "grad_norm": 2430.602294921875,
      "learning_rate": 1.852952387243698e-05,
      "loss": 107.4349,
      "step": 50000
    },
    {
      "epoch": 1.1379255954392844,
      "grad_norm": 1653.0751953125,
      "learning_rate": 1.80042746492252e-05,
      "loss": 111.0688,
      "step": 50500
    },
    {
      "epoch": 1.1491921854931386,
      "grad_norm": 1318.4361572265625,
      "learning_rate": 1.7482355012393177e-05,
      "loss": 107.7423,
      "step": 51000
    },
    {
      "epoch": 1.160458775546993,
      "grad_norm": 1956.568603515625,
      "learning_rate": 1.6964013367420966e-05,
      "loss": 103.8008,
      "step": 51500
    },
    {
      "epoch": 1.1717253656008473,
      "grad_norm": 764.591552734375,
      "learning_rate": 1.6449496416858284e-05,
      "loss": 104.8202,
      "step": 52000
    },
    {
      "epoch": 1.1829919556547015,
      "grad_norm": 1741.2701416015625,
      "learning_rate": 1.5939049042907462e-05,
      "loss": 107.5796,
      "step": 52500
    },
    {
      "epoch": 1.194258545708556,
      "grad_norm": 2550.640625,
      "learning_rate": 1.5432914190872757e-05,
      "loss": 101.9078,
      "step": 53000
    },
    {
      "epoch": 1.2055251357624102,
      "grad_norm": 698.9768676757812,
      "learning_rate": 1.4931332753531574e-05,
      "loss": 108.1872,
      "step": 53500
    },
    {
      "epoch": 1.2167917258162644,
      "grad_norm": 3405.810546875,
      "learning_rate": 1.443454345648252e-05,
      "loss": 99.6448,
      "step": 54000
    },
    {
      "epoch": 1.2280583158701188,
      "grad_norm": 3267.17236328125,
      "learning_rate": 1.3942782744524973e-05,
      "loss": 108.0154,
      "step": 54500
    },
    {
      "epoch": 1.239324905923973,
      "grad_norm": 1127.849853515625,
      "learning_rate": 1.3456284669124158e-05,
      "loss": 100.0244,
      "step": 55000
    },
    {
      "epoch": 1.2505914959778273,
      "grad_norm": 1082.1241455078125,
      "learning_rate": 1.2975280777015314e-05,
      "loss": 104.4581,
      "step": 55500
    },
    {
      "epoch": 1.2618580860316817,
      "grad_norm": 1860.104248046875,
      "learning_rate": 1.2500000000000006e-05,
      "loss": 103.1496,
      "step": 56000
    },
    {
      "epoch": 1.273124676085536,
      "grad_norm": 1682.26708984375,
      "learning_rate": 1.2030668545986959e-05,
      "loss": 107.3074,
      "step": 56500
    },
    {
      "epoch": 1.2843912661393904,
      "grad_norm": 616.0757446289062,
      "learning_rate": 1.1567509791329401e-05,
      "loss": 102.4632,
      "step": 57000
    },
    {
      "epoch": 1.2956578561932446,
      "grad_norm": 624.0526733398438,
      "learning_rate": 1.1110744174509952e-05,
      "loss": 100.0244,
      "step": 57500
    },
    {
      "epoch": 1.3069244462470988,
      "grad_norm": 1387.1529541015625,
      "learning_rate": 1.0660589091223855e-05,
      "loss": 105.9446,
      "step": 58000
    },
    {
      "epoch": 1.318191036300953,
      "grad_norm": 1828.0877685546875,
      "learning_rate": 1.0217258790910448e-05,
      "loss": 104.6986,
      "step": 58500
    },
    {
      "epoch": 1.3294576263548075,
      "grad_norm": 650.5953369140625,
      "learning_rate": 9.780964274781984e-06,
      "loss": 105.4167,
      "step": 59000
    },
    {
      "epoch": 1.3407242164086617,
      "grad_norm": 996.328125,
      "learning_rate": 9.351913195398524e-06,
      "loss": 103.2462,
      "step": 59500
    },
    {
      "epoch": 1.3519908064625161,
      "grad_norm": 1035.3780517578125,
      "learning_rate": 8.930309757836517e-06,
      "loss": 103.7156,
      "step": 60000
    },
    {
      "epoch": 1.3632573965163703,
      "grad_norm": 1865.9571533203125,
      "learning_rate": 8.51635462249828e-06,
      "loss": 105.3619,
      "step": 60500
    },
    {
      "epoch": 1.3745239865702246,
      "grad_norm": 2139.88671875,
      "learning_rate": 8.110244809608495e-06,
      "loss": 103.2506,
      "step": 61000
    },
    {
      "epoch": 1.385790576624079,
      "grad_norm": 2942.61083984375,
      "learning_rate": 7.712173605443269e-06,
      "loss": 102.9886,
      "step": 61500
    },
    {
      "epoch": 1.3970571666779332,
      "grad_norm": 415.38153076171875,
      "learning_rate": 7.3223304703363135e-06,
      "loss": 102.6045,
      "step": 62000
    },
    {
      "epoch": 1.4083237567317877,
      "grad_norm": 1247.396728515625,
      "learning_rate": 6.940900948506113e-06,
      "loss": 102.5344,
      "step": 62500
    },
    {
      "epoch": 1.4195903467856419,
      "grad_norm": 740.3623046875,
      "learning_rate": 6.568066579746901e-06,
      "loss": 104.4504,
      "step": 63000
    },
    {
      "epoch": 1.430856936839496,
      "grad_norm": 1221.581298828125,
      "learning_rate": 6.204004813025568e-06,
      "loss": 101.7631,
      "step": 63500
    },
    {
      "epoch": 1.4421235268933503,
      "grad_norm": 1514.0115966796875,
      "learning_rate": 5.848888922025553e-06,
      "loss": 99.2285,
      "step": 64000
    },
    {
      "epoch": 1.4533901169472048,
      "grad_norm": 821.0801391601562,
      "learning_rate": 5.50288792267796e-06,
      "loss": 102.3846,
      "step": 64500
    },
    {
      "epoch": 1.464656707001059,
      "grad_norm": 1313.3104248046875,
      "learning_rate": 5.166166492719124e-06,
      "loss": 103.6035,
      "step": 65000
    },
    {
      "epoch": 1.4759232970549134,
      "grad_norm": 1329.42919921875,
      "learning_rate": 4.8388848933129335e-06,
      "loss": 104.2133,
      "step": 65500
    },
    {
      "epoch": 1.4871898871087676,
      "grad_norm": 3106.43505859375,
      "learning_rate": 4.521198892775203e-06,
      "loss": 100.1638,
      "step": 66000
    },
    {
      "epoch": 1.4984564771626219,
      "grad_norm": 996.626220703125,
      "learning_rate": 4.213259692436367e-06,
      "loss": 97.5407,
      "step": 66500
    },
    {
      "epoch": 1.5097230672164763,
      "grad_norm": 1821.3323974609375,
      "learning_rate": 3.9152138546778625e-06,
      "loss": 102.4095,
      "step": 67000
    },
    {
      "epoch": 1.5209896572703305,
      "grad_norm": 1764.0323486328125,
      "learning_rate": 3.6272032331763408e-06,
      "loss": 96.4079,
      "step": 67500
    },
    {
      "epoch": 1.532256247324185,
      "grad_norm": 1377.978271484375,
      "learning_rate": 3.3493649053890326e-06,
      "loss": 104.5846,
      "step": 68000
    },
    {
      "epoch": 1.5435228373780392,
      "grad_norm": 1291.5908203125,
      "learning_rate": 3.081831107312308e-06,
      "loss": 100.9904,
      "step": 68500
    },
    {
      "epoch": 1.5547894274318934,
      "grad_norm": 2216.6796875,
      "learning_rate": 2.8247291705444575e-06,
      "loss": 99.7059,
      "step": 69000
    },
    {
      "epoch": 1.5660560174857476,
      "grad_norm": 1271.5609130859375,
      "learning_rate": 2.578181461682794e-06,
      "loss": 99.0976,
      "step": 69500
    },
    {
      "epoch": 1.577322607539602,
      "grad_norm": 2712.908203125,
      "learning_rate": 2.3423053240837515e-06,
      "loss": 99.1392,
      "step": 70000
    },
    {
      "epoch": 1.5885891975934565,
      "grad_norm": 1653.12890625,
      "learning_rate": 2.1172130220138226e-06,
      "loss": 101.0942,
      "step": 70500
    },
    {
      "epoch": 1.5998557876473107,
      "grad_norm": 1816.012451171875,
      "learning_rate": 1.9030116872178316e-06,
      "loss": 99.6448,
      "step": 71000
    },
    {
      "epoch": 1.611122377701165,
      "grad_norm": 1233.8719482421875,
      "learning_rate": 1.6998032679300391e-06,
      "loss": 101.785,
      "step": 71500
    },
    {
      "epoch": 1.6223889677550192,
      "grad_norm": 988.7132568359375,
      "learning_rate": 1.5076844803522922e-06,
      "loss": 95.7829,
      "step": 72000
    },
    {
      "epoch": 1.6336555578088736,
      "grad_norm": 1261.22021484375,
      "learning_rate": 1.3267467626223606e-06,
      "loss": 105.201,
      "step": 72500
    },
    {
      "epoch": 1.6449221478627278,
      "grad_norm": 934.3240966796875,
      "learning_rate": 1.1570762312943295e-06,
      "loss": 98.9473,
      "step": 73000
    },
    {
      "epoch": 1.6561887379165823,
      "grad_norm": 664.5089111328125,
      "learning_rate": 9.98753640351785e-07,
      "loss": 95.5199,
      "step": 73500
    },
    {
      "epoch": 1.6674553279704365,
      "grad_norm": 1963.7371826171875,
      "learning_rate": 8.51854342773295e-07,
      "loss": 101.1854,
      "step": 74000
    },
    {
      "epoch": 1.6787219180242907,
      "grad_norm": 2443.130126953125,
      "learning_rate": 7.164482546684642e-07,
      "loss": 97.2825,
      "step": 74500
    },
    {
      "epoch": 1.689988508078145,
      "grad_norm": 1137.326416015625,
      "learning_rate": 5.925998220016659e-07,
      "loss": 100.3116,
      "step": 75000
    },
    {
      "epoch": 1.7012550981319994,
      "grad_norm": 1520.0399169921875,
      "learning_rate": 4.803679899192392e-07,
      "loss": 100.4226,
      "step": 75500
    },
    {
      "epoch": 1.7125216881858538,
      "grad_norm": 1065.4630126953125,
      "learning_rate": 3.7980617469479953e-07,
      "loss": 101.6046,
      "step": 76000
    },
    {
      "epoch": 1.723788278239708,
      "grad_norm": 1042.942138671875,
      "learning_rate": 2.909622383059835e-07,
      "loss": 99.1135,
      "step": 76500
    },
    {
      "epoch": 1.7350548682935623,
      "grad_norm": 841.5830078125,
      "learning_rate": 2.1387846565474045e-07,
      "loss": 101.7611,
      "step": 77000
    },
    {
      "epoch": 1.7463214583474165,
      "grad_norm": 3369.8330078125,
      "learning_rate": 1.4859154444200884e-07,
      "loss": 99.4417,
      "step": 77500
    },
    {
      "epoch": 1.757588048401271,
      "grad_norm": 2432.385498046875,
      "learning_rate": 9.513254770636137e-08,
      "loss": 100.2592,
      "step": 78000
    },
    {
      "epoch": 1.7688546384551251,
      "grad_norm": 1680.1993408203125,
      "learning_rate": 5.352691903491303e-08,
      "loss": 98.8878,
      "step": 78500
    },
    {
      "epoch": 1.7801212285089796,
      "grad_norm": 682.8377075195312,
      "learning_rate": 2.3794460453555047e-08,
      "loss": 97.455,
      "step": 79000
    },
    {
      "epoch": 1.7913878185628338,
      "grad_norm": 0.0,
      "learning_rate": 5.94932300227169e-09,
      "loss": 102.4157,
      "step": 79500
    },
    {
      "epoch": 1.802654408616688,
      "grad_norm": 3429.1298828125,
      "learning_rate": 0.0,
      "loss": 97.4711,
      "step": 80000
    }
  ],
  "logging_steps": 500,
  "max_steps": 80000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 20000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}