|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 47.05882352941177, |
|
"eval_steps": 500, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.20885063707828522, |
|
"learning_rate": 4.9980725906018074e-05, |
|
"loss": 0.8318, |
|
"num_input_tokens_seen": 121824, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 1.1764705882352942, |
|
"grad_norm": 0.21794493496418, |
|
"learning_rate": 4.99229333433282e-05, |
|
"loss": 0.7891, |
|
"num_input_tokens_seen": 239760, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.7647058823529411, |
|
"grad_norm": 0.20168891549110413, |
|
"learning_rate": 4.982671142387316e-05, |
|
"loss": 0.7678, |
|
"num_input_tokens_seen": 364912, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 0.20661190152168274, |
|
"learning_rate": 4.9692208514878444e-05, |
|
"loss": 0.728, |
|
"num_input_tokens_seen": 487440, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"grad_norm": 0.2073347568511963, |
|
"learning_rate": 4.951963201008076e-05, |
|
"loss": 0.7364, |
|
"num_input_tokens_seen": 607888, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 3.5294117647058822, |
|
"grad_norm": 0.19631442427635193, |
|
"learning_rate": 4.9309248009941914e-05, |
|
"loss": 0.7217, |
|
"num_input_tokens_seen": 728656, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 4.117647058823529, |
|
"grad_norm": 0.22293810546398163, |
|
"learning_rate": 4.906138091134118e-05, |
|
"loss": 0.6901, |
|
"num_input_tokens_seen": 849216, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 4.705882352941177, |
|
"grad_norm": 0.2156902402639389, |
|
"learning_rate": 4.877641290737884e-05, |
|
"loss": 0.6761, |
|
"num_input_tokens_seen": 971440, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 5.294117647058823, |
|
"grad_norm": 0.22460030019283295, |
|
"learning_rate": 4.8454783398062106e-05, |
|
"loss": 0.6601, |
|
"num_input_tokens_seen": 1091264, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 5.882352941176471, |
|
"grad_norm": 0.2591679096221924, |
|
"learning_rate": 4.8096988312782174e-05, |
|
"loss": 0.6439, |
|
"num_input_tokens_seen": 1211184, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 6.470588235294118, |
|
"grad_norm": 0.26881489157676697, |
|
"learning_rate": 4.7703579345627035e-05, |
|
"loss": 0.6181, |
|
"num_input_tokens_seen": 1334112, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 7.0588235294117645, |
|
"grad_norm": 0.3284054100513458, |
|
"learning_rate": 4.72751631047092e-05, |
|
"loss": 0.6145, |
|
"num_input_tokens_seen": 1454512, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 7.647058823529412, |
|
"grad_norm": 0.2977285385131836, |
|
"learning_rate": 4.681240017681993e-05, |
|
"loss": 0.5834, |
|
"num_input_tokens_seen": 1576640, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 8.235294117647058, |
|
"grad_norm": 0.3388771116733551, |
|
"learning_rate": 4.6316004108852305e-05, |
|
"loss": 0.5632, |
|
"num_input_tokens_seen": 1698624, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 8.823529411764707, |
|
"grad_norm": 0.3815699815750122, |
|
"learning_rate": 4.5786740307563636e-05, |
|
"loss": 0.5549, |
|
"num_input_tokens_seen": 1818688, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 9.411764705882353, |
|
"grad_norm": 0.37038519978523254, |
|
"learning_rate": 4.522542485937369e-05, |
|
"loss": 0.5151, |
|
"num_input_tokens_seen": 1942112, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.4679271876811981, |
|
"learning_rate": 4.463292327201862e-05, |
|
"loss": 0.5147, |
|
"num_input_tokens_seen": 2061552, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 10.588235294117647, |
|
"grad_norm": 0.4134647846221924, |
|
"learning_rate": 4.401014914000078e-05, |
|
"loss": 0.4635, |
|
"num_input_tokens_seen": 2185344, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 11.176470588235293, |
|
"grad_norm": 0.45239707827568054, |
|
"learning_rate": 4.335806273589214e-05, |
|
"loss": 0.4585, |
|
"num_input_tokens_seen": 2306256, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 11.764705882352942, |
|
"grad_norm": 0.5336123704910278, |
|
"learning_rate": 4.267766952966369e-05, |
|
"loss": 0.4291, |
|
"num_input_tokens_seen": 2426592, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 12.352941176470589, |
|
"grad_norm": 0.5823401212692261, |
|
"learning_rate": 4.197001863832355e-05, |
|
"loss": 0.3997, |
|
"num_input_tokens_seen": 2548672, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 12.941176470588236, |
|
"grad_norm": 0.5824088454246521, |
|
"learning_rate": 4.123620120825459e-05, |
|
"loss": 0.3797, |
|
"num_input_tokens_seen": 2667984, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 13.529411764705882, |
|
"grad_norm": 0.7273723483085632, |
|
"learning_rate": 4.047734873274586e-05, |
|
"loss": 0.3412, |
|
"num_input_tokens_seen": 2791904, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 14.117647058823529, |
|
"grad_norm": 0.6384756565093994, |
|
"learning_rate": 3.969463130731183e-05, |
|
"loss": 0.3298, |
|
"num_input_tokens_seen": 2910560, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 14.705882352941176, |
|
"grad_norm": 0.684781014919281, |
|
"learning_rate": 3.888925582549006e-05, |
|
"loss": 0.2863, |
|
"num_input_tokens_seen": 3034368, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 15.294117647058824, |
|
"grad_norm": 0.7853628396987915, |
|
"learning_rate": 3.8062464117898724e-05, |
|
"loss": 0.2738, |
|
"num_input_tokens_seen": 3153984, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 15.882352941176471, |
|
"grad_norm": 0.7987646460533142, |
|
"learning_rate": 3.721553103742388e-05, |
|
"loss": 0.2367, |
|
"num_input_tokens_seen": 3278336, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 16.470588235294116, |
|
"grad_norm": 0.74590665102005, |
|
"learning_rate": 3.634976249348867e-05, |
|
"loss": 0.2189, |
|
"num_input_tokens_seen": 3398224, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 17.058823529411764, |
|
"grad_norm": 0.8422712683677673, |
|
"learning_rate": 3.54664934384357e-05, |
|
"loss": 0.1971, |
|
"num_input_tokens_seen": 3519168, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 17.647058823529413, |
|
"grad_norm": 0.8479442000389099, |
|
"learning_rate": 3.456708580912725e-05, |
|
"loss": 0.1705, |
|
"num_input_tokens_seen": 3641392, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 18.235294117647058, |
|
"grad_norm": 0.8197467923164368, |
|
"learning_rate": 3.365292642693732e-05, |
|
"loss": 0.1454, |
|
"num_input_tokens_seen": 3764240, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 18.823529411764707, |
|
"grad_norm": 1.0131207704544067, |
|
"learning_rate": 3.272542485937369e-05, |
|
"loss": 0.1387, |
|
"num_input_tokens_seen": 3882896, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 19.41176470588235, |
|
"grad_norm": 0.858586311340332, |
|
"learning_rate": 3.178601124662686e-05, |
|
"loss": 0.1191, |
|
"num_input_tokens_seen": 4004560, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.9852003455162048, |
|
"learning_rate": 3.083613409639764e-05, |
|
"loss": 0.1101, |
|
"num_input_tokens_seen": 4127360, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 20.58823529411765, |
|
"grad_norm": 0.885619580745697, |
|
"learning_rate": 2.9877258050403212e-05, |
|
"loss": 0.0885, |
|
"num_input_tokens_seen": 4248640, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 21.176470588235293, |
|
"grad_norm": 0.6304395794868469, |
|
"learning_rate": 2.8910861626005776e-05, |
|
"loss": 0.0865, |
|
"num_input_tokens_seen": 4369872, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 21.764705882352942, |
|
"grad_norm": 0.7621514797210693, |
|
"learning_rate": 2.7938434936445945e-05, |
|
"loss": 0.0708, |
|
"num_input_tokens_seen": 4490688, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 22.352941176470587, |
|
"grad_norm": 0.8263904452323914, |
|
"learning_rate": 2.6961477393196126e-05, |
|
"loss": 0.0715, |
|
"num_input_tokens_seen": 4612144, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 22.941176470588236, |
|
"grad_norm": 0.5912930965423584, |
|
"learning_rate": 2.598149539397672e-05, |
|
"loss": 0.0584, |
|
"num_input_tokens_seen": 4733632, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 23.529411764705884, |
|
"grad_norm": 0.6392534971237183, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.0517, |
|
"num_input_tokens_seen": 4854048, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 24.11764705882353, |
|
"grad_norm": 0.44309210777282715, |
|
"learning_rate": 2.4018504606023293e-05, |
|
"loss": 0.0518, |
|
"num_input_tokens_seen": 4976208, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 24.705882352941178, |
|
"grad_norm": 0.5509380102157593, |
|
"learning_rate": 2.303852260680388e-05, |
|
"loss": 0.0416, |
|
"num_input_tokens_seen": 5096976, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 25.294117647058822, |
|
"grad_norm": 0.47966495156288147, |
|
"learning_rate": 2.2061565063554064e-05, |
|
"loss": 0.0425, |
|
"num_input_tokens_seen": 5217056, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 25.88235294117647, |
|
"grad_norm": 0.5966067910194397, |
|
"learning_rate": 2.1089138373994223e-05, |
|
"loss": 0.0377, |
|
"num_input_tokens_seen": 5337440, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 26.470588235294116, |
|
"grad_norm": 0.37054604291915894, |
|
"learning_rate": 2.0122741949596797e-05, |
|
"loss": 0.0396, |
|
"num_input_tokens_seen": 5457008, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 27.058823529411764, |
|
"grad_norm": 0.40190204977989197, |
|
"learning_rate": 1.9163865903602374e-05, |
|
"loss": 0.0322, |
|
"num_input_tokens_seen": 5582160, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 27.647058823529413, |
|
"grad_norm": 0.34666651487350464, |
|
"learning_rate": 1.8213988753373146e-05, |
|
"loss": 0.0341, |
|
"num_input_tokens_seen": 5702928, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 28.235294117647058, |
|
"grad_norm": 0.27824667096138, |
|
"learning_rate": 1.7274575140626318e-05, |
|
"loss": 0.028, |
|
"num_input_tokens_seen": 5824768, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 28.823529411764707, |
|
"grad_norm": 0.32425612211227417, |
|
"learning_rate": 1.6347073573062672e-05, |
|
"loss": 0.027, |
|
"num_input_tokens_seen": 5946656, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 29.41176470588235, |
|
"grad_norm": 0.3999980092048645, |
|
"learning_rate": 1.5432914190872757e-05, |
|
"loss": 0.0258, |
|
"num_input_tokens_seen": 6070048, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 0.6484708189964294, |
|
"learning_rate": 1.4533506561564306e-05, |
|
"loss": 0.0247, |
|
"num_input_tokens_seen": 6191376, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 30.58823529411765, |
|
"grad_norm": 0.28348207473754883, |
|
"learning_rate": 1.3650237506511331e-05, |
|
"loss": 0.0233, |
|
"num_input_tokens_seen": 6312544, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 31.176470588235293, |
|
"grad_norm": 0.28730762004852295, |
|
"learning_rate": 1.2784468962576136e-05, |
|
"loss": 0.0229, |
|
"num_input_tokens_seen": 6435232, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 31.764705882352942, |
|
"grad_norm": 0.23221123218536377, |
|
"learning_rate": 1.1937535882101281e-05, |
|
"loss": 0.0243, |
|
"num_input_tokens_seen": 6558816, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 32.35294117647059, |
|
"grad_norm": 0.21052616834640503, |
|
"learning_rate": 1.1110744174509952e-05, |
|
"loss": 0.0221, |
|
"num_input_tokens_seen": 6681456, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 32.94117647058823, |
|
"grad_norm": 0.2625516355037689, |
|
"learning_rate": 1.0305368692688174e-05, |
|
"loss": 0.0189, |
|
"num_input_tokens_seen": 6800352, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 33.529411764705884, |
|
"grad_norm": 0.267925500869751, |
|
"learning_rate": 9.522651267254149e-06, |
|
"loss": 0.0188, |
|
"num_input_tokens_seen": 6923584, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 34.11764705882353, |
|
"grad_norm": 0.3179719150066376, |
|
"learning_rate": 8.763798791745411e-06, |
|
"loss": 0.0198, |
|
"num_input_tokens_seen": 7041648, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 34.705882352941174, |
|
"grad_norm": 0.22515079379081726, |
|
"learning_rate": 8.029981361676456e-06, |
|
"loss": 0.0195, |
|
"num_input_tokens_seen": 7166416, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 35.294117647058826, |
|
"grad_norm": 0.22048750519752502, |
|
"learning_rate": 7.3223304703363135e-06, |
|
"loss": 0.0222, |
|
"num_input_tokens_seen": 7283072, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 35.88235294117647, |
|
"grad_norm": 0.3719558119773865, |
|
"learning_rate": 6.641937264107867e-06, |
|
"loss": 0.015, |
|
"num_input_tokens_seen": 7405936, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 36.470588235294116, |
|
"grad_norm": 0.244638592004776, |
|
"learning_rate": 5.989850859999227e-06, |
|
"loss": 0.0162, |
|
"num_input_tokens_seen": 7526224, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 37.05882352941177, |
|
"grad_norm": 0.20829229056835175, |
|
"learning_rate": 5.367076727981382e-06, |
|
"loss": 0.0191, |
|
"num_input_tokens_seen": 7648896, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 37.64705882352941, |
|
"grad_norm": 0.18424373865127563, |
|
"learning_rate": 4.7745751406263165e-06, |
|
"loss": 0.0173, |
|
"num_input_tokens_seen": 7768768, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 38.23529411764706, |
|
"grad_norm": 0.17650990188121796, |
|
"learning_rate": 4.213259692436367e-06, |
|
"loss": 0.0185, |
|
"num_input_tokens_seen": 7892064, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 38.8235294117647, |
|
"grad_norm": 0.2193550169467926, |
|
"learning_rate": 3.6839958911476957e-06, |
|
"loss": 0.0164, |
|
"num_input_tokens_seen": 8011552, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 39.411764705882355, |
|
"grad_norm": 0.18098759651184082, |
|
"learning_rate": 3.187599823180071e-06, |
|
"loss": 0.0167, |
|
"num_input_tokens_seen": 8137664, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 0.14081740379333496, |
|
"learning_rate": 2.7248368952908053e-06, |
|
"loss": 0.0163, |
|
"num_input_tokens_seen": 8256400, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 40.588235294117645, |
|
"grad_norm": 0.15283998847007751, |
|
"learning_rate": 2.296420654372966e-06, |
|
"loss": 0.0162, |
|
"num_input_tokens_seen": 8379376, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 41.1764705882353, |
|
"grad_norm": 0.16390979290008545, |
|
"learning_rate": 1.9030116872178316e-06, |
|
"loss": 0.0158, |
|
"num_input_tokens_seen": 8499712, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 41.76470588235294, |
|
"grad_norm": 0.1703239530324936, |
|
"learning_rate": 1.5452166019378989e-06, |
|
"loss": 0.0169, |
|
"num_input_tokens_seen": 8620768, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 42.35294117647059, |
|
"grad_norm": 0.1654016375541687, |
|
"learning_rate": 1.2235870926211619e-06, |
|
"loss": 0.015, |
|
"num_input_tokens_seen": 8744400, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 42.94117647058823, |
|
"grad_norm": 0.18267235159873962, |
|
"learning_rate": 9.386190886588208e-07, |
|
"loss": 0.0167, |
|
"num_input_tokens_seen": 8863312, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 43.529411764705884, |
|
"grad_norm": 0.17041648924350739, |
|
"learning_rate": 6.907519900580861e-07, |
|
"loss": 0.0153, |
|
"num_input_tokens_seen": 8986880, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 44.11764705882353, |
|
"grad_norm": 0.16261689364910126, |
|
"learning_rate": 4.803679899192392e-07, |
|
"loss": 0.0157, |
|
"num_input_tokens_seen": 9104640, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 44.705882352941174, |
|
"grad_norm": 0.1600603610277176, |
|
"learning_rate": 3.077914851215585e-07, |
|
"loss": 0.0167, |
|
"num_input_tokens_seen": 9225360, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 45.294117647058826, |
|
"grad_norm": 0.14950668811798096, |
|
"learning_rate": 1.732885761268427e-07, |
|
"loss": 0.0146, |
|
"num_input_tokens_seen": 9345008, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 45.88235294117647, |
|
"grad_norm": 0.1505534052848816, |
|
"learning_rate": 7.706665667180091e-08, |
|
"loss": 0.0154, |
|
"num_input_tokens_seen": 9465872, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 46.470588235294116, |
|
"grad_norm": 0.1543729603290558, |
|
"learning_rate": 1.9274093981927478e-08, |
|
"loss": 0.0173, |
|
"num_input_tokens_seen": 9589456, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 47.05882352941177, |
|
"grad_norm": 0.2318650335073471, |
|
"learning_rate": 0.0, |
|
"loss": 0.0163, |
|
"num_input_tokens_seen": 9708528, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 9708528, |
|
"num_train_epochs": 50, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1615729589682176e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|