{ "best_metric": null, "best_model_checkpoint": null, "epoch": 47.05882352941177, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5882352941176471, "grad_norm": 0.20885063707828522, "learning_rate": 4.9980725906018074e-05, "loss": 0.8318, "num_input_tokens_seen": 121824, "step": 5 }, { "epoch": 1.1764705882352942, "grad_norm": 0.21794493496418, "learning_rate": 4.99229333433282e-05, "loss": 0.7891, "num_input_tokens_seen": 239760, "step": 10 }, { "epoch": 1.7647058823529411, "grad_norm": 0.20168891549110413, "learning_rate": 4.982671142387316e-05, "loss": 0.7678, "num_input_tokens_seen": 364912, "step": 15 }, { "epoch": 2.3529411764705883, "grad_norm": 0.20661190152168274, "learning_rate": 4.9692208514878444e-05, "loss": 0.728, "num_input_tokens_seen": 487440, "step": 20 }, { "epoch": 2.9411764705882355, "grad_norm": 0.2073347568511963, "learning_rate": 4.951963201008076e-05, "loss": 0.7364, "num_input_tokens_seen": 607888, "step": 25 }, { "epoch": 3.5294117647058822, "grad_norm": 0.19631442427635193, "learning_rate": 4.9309248009941914e-05, "loss": 0.7217, "num_input_tokens_seen": 728656, "step": 30 }, { "epoch": 4.117647058823529, "grad_norm": 0.22293810546398163, "learning_rate": 4.906138091134118e-05, "loss": 0.6901, "num_input_tokens_seen": 849216, "step": 35 }, { "epoch": 4.705882352941177, "grad_norm": 0.2156902402639389, "learning_rate": 4.877641290737884e-05, "loss": 0.6761, "num_input_tokens_seen": 971440, "step": 40 }, { "epoch": 5.294117647058823, "grad_norm": 0.22460030019283295, "learning_rate": 4.8454783398062106e-05, "loss": 0.6601, "num_input_tokens_seen": 1091264, "step": 45 }, { "epoch": 5.882352941176471, "grad_norm": 0.2591679096221924, "learning_rate": 4.8096988312782174e-05, "loss": 0.6439, "num_input_tokens_seen": 1211184, "step": 50 }, { "epoch": 6.470588235294118, "grad_norm": 0.26881489157676697, "learning_rate": 4.7703579345627035e-05, "loss": 0.6181, "num_input_tokens_seen": 1334112, "step": 55 }, { "epoch": 7.0588235294117645, "grad_norm": 0.3284054100513458, "learning_rate": 4.72751631047092e-05, "loss": 0.6145, "num_input_tokens_seen": 1454512, "step": 60 }, { "epoch": 7.647058823529412, "grad_norm": 0.2977285385131836, "learning_rate": 4.681240017681993e-05, "loss": 0.5834, "num_input_tokens_seen": 1576640, "step": 65 }, { "epoch": 8.235294117647058, "grad_norm": 0.3388771116733551, "learning_rate": 4.6316004108852305e-05, "loss": 0.5632, "num_input_tokens_seen": 1698624, "step": 70 }, { "epoch": 8.823529411764707, "grad_norm": 0.3815699815750122, "learning_rate": 4.5786740307563636e-05, "loss": 0.5549, "num_input_tokens_seen": 1818688, "step": 75 }, { "epoch": 9.411764705882353, "grad_norm": 0.37038519978523254, "learning_rate": 4.522542485937369e-05, "loss": 0.5151, "num_input_tokens_seen": 1942112, "step": 80 }, { "epoch": 10.0, "grad_norm": 0.4679271876811981, "learning_rate": 4.463292327201862e-05, "loss": 0.5147, "num_input_tokens_seen": 2061552, "step": 85 }, { "epoch": 10.588235294117647, "grad_norm": 0.4134647846221924, "learning_rate": 4.401014914000078e-05, "loss": 0.4635, "num_input_tokens_seen": 2185344, "step": 90 }, { "epoch": 11.176470588235293, "grad_norm": 0.45239707827568054, "learning_rate": 4.335806273589214e-05, "loss": 0.4585, "num_input_tokens_seen": 2306256, "step": 95 }, { "epoch": 11.764705882352942, "grad_norm": 0.5336123704910278, "learning_rate": 4.267766952966369e-05, "loss": 0.4291, "num_input_tokens_seen": 2426592, "step": 100 }, { "epoch": 12.352941176470589, "grad_norm": 0.5823401212692261, "learning_rate": 4.197001863832355e-05, "loss": 0.3997, "num_input_tokens_seen": 2548672, "step": 105 }, { "epoch": 12.941176470588236, "grad_norm": 0.5824088454246521, "learning_rate": 4.123620120825459e-05, "loss": 0.3797, "num_input_tokens_seen": 2667984, "step": 110 }, { "epoch": 13.529411764705882, "grad_norm": 0.7273723483085632, "learning_rate": 4.047734873274586e-05, "loss": 0.3412, "num_input_tokens_seen": 2791904, "step": 115 }, { "epoch": 14.117647058823529, "grad_norm": 0.6384756565093994, "learning_rate": 3.969463130731183e-05, "loss": 0.3298, "num_input_tokens_seen": 2910560, "step": 120 }, { "epoch": 14.705882352941176, "grad_norm": 0.684781014919281, "learning_rate": 3.888925582549006e-05, "loss": 0.2863, "num_input_tokens_seen": 3034368, "step": 125 }, { "epoch": 15.294117647058824, "grad_norm": 0.7853628396987915, "learning_rate": 3.8062464117898724e-05, "loss": 0.2738, "num_input_tokens_seen": 3153984, "step": 130 }, { "epoch": 15.882352941176471, "grad_norm": 0.7987646460533142, "learning_rate": 3.721553103742388e-05, "loss": 0.2367, "num_input_tokens_seen": 3278336, "step": 135 }, { "epoch": 16.470588235294116, "grad_norm": 0.74590665102005, "learning_rate": 3.634976249348867e-05, "loss": 0.2189, "num_input_tokens_seen": 3398224, "step": 140 }, { "epoch": 17.058823529411764, "grad_norm": 0.8422712683677673, "learning_rate": 3.54664934384357e-05, "loss": 0.1971, "num_input_tokens_seen": 3519168, "step": 145 }, { "epoch": 17.647058823529413, "grad_norm": 0.8479442000389099, "learning_rate": 3.456708580912725e-05, "loss": 0.1705, "num_input_tokens_seen": 3641392, "step": 150 }, { "epoch": 18.235294117647058, "grad_norm": 0.8197467923164368, "learning_rate": 3.365292642693732e-05, "loss": 0.1454, "num_input_tokens_seen": 3764240, "step": 155 }, { "epoch": 18.823529411764707, "grad_norm": 1.0131207704544067, "learning_rate": 3.272542485937369e-05, "loss": 0.1387, "num_input_tokens_seen": 3882896, "step": 160 }, { "epoch": 19.41176470588235, "grad_norm": 0.858586311340332, "learning_rate": 3.178601124662686e-05, "loss": 0.1191, "num_input_tokens_seen": 4004560, "step": 165 }, { "epoch": 20.0, "grad_norm": 0.9852003455162048, "learning_rate": 3.083613409639764e-05, "loss": 0.1101, "num_input_tokens_seen": 4127360, "step": 170 }, { "epoch": 20.58823529411765, "grad_norm": 0.885619580745697, "learning_rate": 2.9877258050403212e-05, "loss": 0.0885, "num_input_tokens_seen": 4248640, "step": 175 }, { "epoch": 21.176470588235293, "grad_norm": 0.6304395794868469, "learning_rate": 2.8910861626005776e-05, "loss": 0.0865, "num_input_tokens_seen": 4369872, "step": 180 }, { "epoch": 21.764705882352942, "grad_norm": 0.7621514797210693, "learning_rate": 2.7938434936445945e-05, "loss": 0.0708, "num_input_tokens_seen": 4490688, "step": 185 }, { "epoch": 22.352941176470587, "grad_norm": 0.8263904452323914, "learning_rate": 2.6961477393196126e-05, "loss": 0.0715, "num_input_tokens_seen": 4612144, "step": 190 }, { "epoch": 22.941176470588236, "grad_norm": 0.5912930965423584, "learning_rate": 2.598149539397672e-05, "loss": 0.0584, "num_input_tokens_seen": 4733632, "step": 195 }, { "epoch": 23.529411764705884, "grad_norm": 0.6392534971237183, "learning_rate": 2.5e-05, "loss": 0.0517, "num_input_tokens_seen": 4854048, "step": 200 }, { "epoch": 24.11764705882353, "grad_norm": 0.44309210777282715, "learning_rate": 2.4018504606023293e-05, "loss": 0.0518, "num_input_tokens_seen": 4976208, "step": 205 }, { "epoch": 24.705882352941178, "grad_norm": 0.5509380102157593, "learning_rate": 2.303852260680388e-05, "loss": 0.0416, "num_input_tokens_seen": 5096976, "step": 210 }, { "epoch": 25.294117647058822, "grad_norm": 0.47966495156288147, "learning_rate": 2.2061565063554064e-05, "loss": 0.0425, "num_input_tokens_seen": 5217056, "step": 215 }, { "epoch": 25.88235294117647, "grad_norm": 0.5966067910194397, "learning_rate": 2.1089138373994223e-05, "loss": 0.0377, "num_input_tokens_seen": 5337440, "step": 220 }, { "epoch": 26.470588235294116, "grad_norm": 0.37054604291915894, "learning_rate": 2.0122741949596797e-05, "loss": 0.0396, "num_input_tokens_seen": 5457008, "step": 225 }, { "epoch": 27.058823529411764, "grad_norm": 0.40190204977989197, "learning_rate": 1.9163865903602374e-05, "loss": 0.0322, "num_input_tokens_seen": 5582160, "step": 230 }, { "epoch": 27.647058823529413, "grad_norm": 0.34666651487350464, "learning_rate": 1.8213988753373146e-05, "loss": 0.0341, "num_input_tokens_seen": 5702928, "step": 235 }, { "epoch": 28.235294117647058, "grad_norm": 0.27824667096138, "learning_rate": 1.7274575140626318e-05, "loss": 0.028, "num_input_tokens_seen": 5824768, "step": 240 }, { "epoch": 28.823529411764707, "grad_norm": 0.32425612211227417, "learning_rate": 1.6347073573062672e-05, "loss": 0.027, "num_input_tokens_seen": 5946656, "step": 245 }, { "epoch": 29.41176470588235, "grad_norm": 0.3999980092048645, "learning_rate": 1.5432914190872757e-05, "loss": 0.0258, "num_input_tokens_seen": 6070048, "step": 250 }, { "epoch": 30.0, "grad_norm": 0.6484708189964294, "learning_rate": 1.4533506561564306e-05, "loss": 0.0247, "num_input_tokens_seen": 6191376, "step": 255 }, { "epoch": 30.58823529411765, "grad_norm": 0.28348207473754883, "learning_rate": 1.3650237506511331e-05, "loss": 0.0233, "num_input_tokens_seen": 6312544, "step": 260 }, { "epoch": 31.176470588235293, "grad_norm": 0.28730762004852295, "learning_rate": 1.2784468962576136e-05, "loss": 0.0229, "num_input_tokens_seen": 6435232, "step": 265 }, { "epoch": 31.764705882352942, "grad_norm": 0.23221123218536377, "learning_rate": 1.1937535882101281e-05, "loss": 0.0243, "num_input_tokens_seen": 6558816, "step": 270 }, { "epoch": 32.35294117647059, "grad_norm": 0.21052616834640503, "learning_rate": 1.1110744174509952e-05, "loss": 0.0221, "num_input_tokens_seen": 6681456, "step": 275 }, { "epoch": 32.94117647058823, "grad_norm": 0.2625516355037689, "learning_rate": 1.0305368692688174e-05, "loss": 0.0189, "num_input_tokens_seen": 6800352, "step": 280 }, { "epoch": 33.529411764705884, "grad_norm": 0.267925500869751, "learning_rate": 9.522651267254149e-06, "loss": 0.0188, "num_input_tokens_seen": 6923584, "step": 285 }, { "epoch": 34.11764705882353, "grad_norm": 0.3179719150066376, "learning_rate": 8.763798791745411e-06, "loss": 0.0198, "num_input_tokens_seen": 7041648, "step": 290 }, { "epoch": 34.705882352941174, "grad_norm": 0.22515079379081726, "learning_rate": 8.029981361676456e-06, "loss": 0.0195, "num_input_tokens_seen": 7166416, "step": 295 }, { "epoch": 35.294117647058826, "grad_norm": 0.22048750519752502, "learning_rate": 7.3223304703363135e-06, "loss": 0.0222, "num_input_tokens_seen": 7283072, "step": 300 }, { "epoch": 35.88235294117647, "grad_norm": 0.3719558119773865, "learning_rate": 6.641937264107867e-06, "loss": 0.015, "num_input_tokens_seen": 7405936, "step": 305 }, { "epoch": 36.470588235294116, "grad_norm": 0.244638592004776, "learning_rate": 5.989850859999227e-06, "loss": 0.0162, "num_input_tokens_seen": 7526224, "step": 310 }, { "epoch": 37.05882352941177, "grad_norm": 0.20829229056835175, "learning_rate": 5.367076727981382e-06, "loss": 0.0191, "num_input_tokens_seen": 7648896, "step": 315 }, { "epoch": 37.64705882352941, "grad_norm": 0.18424373865127563, "learning_rate": 4.7745751406263165e-06, "loss": 0.0173, "num_input_tokens_seen": 7768768, "step": 320 }, { "epoch": 38.23529411764706, "grad_norm": 0.17650990188121796, "learning_rate": 4.213259692436367e-06, "loss": 0.0185, "num_input_tokens_seen": 7892064, "step": 325 }, { "epoch": 38.8235294117647, "grad_norm": 0.2193550169467926, "learning_rate": 3.6839958911476957e-06, "loss": 0.0164, "num_input_tokens_seen": 8011552, "step": 330 }, { "epoch": 39.411764705882355, "grad_norm": 0.18098759651184082, "learning_rate": 3.187599823180071e-06, "loss": 0.0167, "num_input_tokens_seen": 8137664, "step": 335 }, { "epoch": 40.0, "grad_norm": 0.14081740379333496, "learning_rate": 2.7248368952908053e-06, "loss": 0.0163, "num_input_tokens_seen": 8256400, "step": 340 }, { "epoch": 40.588235294117645, "grad_norm": 0.15283998847007751, "learning_rate": 2.296420654372966e-06, "loss": 0.0162, "num_input_tokens_seen": 8379376, "step": 345 }, { "epoch": 41.1764705882353, "grad_norm": 0.16390979290008545, "learning_rate": 1.9030116872178316e-06, "loss": 0.0158, "num_input_tokens_seen": 8499712, "step": 350 }, { "epoch": 41.76470588235294, "grad_norm": 0.1703239530324936, "learning_rate": 1.5452166019378989e-06, "loss": 0.0169, "num_input_tokens_seen": 8620768, "step": 355 }, { "epoch": 42.35294117647059, "grad_norm": 0.1654016375541687, "learning_rate": 1.2235870926211619e-06, "loss": 0.015, "num_input_tokens_seen": 8744400, "step": 360 }, { "epoch": 42.94117647058823, "grad_norm": 0.18267235159873962, "learning_rate": 9.386190886588208e-07, "loss": 0.0167, "num_input_tokens_seen": 8863312, "step": 365 }, { "epoch": 43.529411764705884, "grad_norm": 0.17041648924350739, "learning_rate": 6.907519900580861e-07, "loss": 0.0153, "num_input_tokens_seen": 8986880, "step": 370 }, { "epoch": 44.11764705882353, "grad_norm": 0.16261689364910126, "learning_rate": 4.803679899192392e-07, "loss": 0.0157, "num_input_tokens_seen": 9104640, "step": 375 }, { "epoch": 44.705882352941174, "grad_norm": 0.1600603610277176, "learning_rate": 3.077914851215585e-07, "loss": 0.0167, "num_input_tokens_seen": 9225360, "step": 380 }, { "epoch": 45.294117647058826, "grad_norm": 0.14950668811798096, "learning_rate": 1.732885761268427e-07, "loss": 0.0146, "num_input_tokens_seen": 9345008, "step": 385 }, { "epoch": 45.88235294117647, "grad_norm": 0.1505534052848816, "learning_rate": 7.706665667180091e-08, "loss": 0.0154, "num_input_tokens_seen": 9465872, "step": 390 }, { "epoch": 46.470588235294116, "grad_norm": 0.1543729603290558, "learning_rate": 1.9274093981927478e-08, "loss": 0.0173, "num_input_tokens_seen": 9589456, "step": 395 }, { "epoch": 47.05882352941177, "grad_norm": 0.2318650335073471, "learning_rate": 0.0, "loss": 0.0163, "num_input_tokens_seen": 9708528, "step": 400 } ], "logging_steps": 5, "max_steps": 400, "num_input_tokens_seen": 9708528, "num_train_epochs": 50, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1615729589682176e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }