|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 124,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008064516129032258,
      "grad_norm": 82.1250991821289,
      "learning_rate": 4.999197688241076e-07,
      "loss": 3.3188,
      "mean_token_accuracy": 0.4619899392127991,
      "num_tokens": 3582.0,
      "step": 1
    },
    {
      "epoch": 0.016129032258064516,
      "grad_norm": 85.84162139892578,
      "learning_rate": 4.996791267927632e-07,
      "loss": 3.1019,
      "mean_token_accuracy": 0.48747536540031433,
      "num_tokens": 7139.0,
      "step": 2
    },
    {
      "epoch": 0.024193548387096774,
      "grad_norm": 77.11351776123047,
      "learning_rate": 4.992782283619118e-07,
      "loss": 3.0994,
      "mean_token_accuracy": 0.46798279881477356,
      "num_tokens": 11094.0,
      "step": 3
    },
    {
      "epoch": 0.03225806451612903,
      "grad_norm": 74.11935424804688,
      "learning_rate": 4.987173308479737e-07,
      "loss": 3.0893,
      "mean_token_accuracy": 0.4673512279987335,
      "num_tokens": 14896.0,
      "step": 4
    },
    {
      "epoch": 0.04032258064516129,
      "grad_norm": 58.4622917175293,
      "learning_rate": 4.979967942626857e-07,
      "loss": 2.9759,
      "mean_token_accuracy": 0.4749932289123535,
      "num_tokens": 18599.0,
      "step": 5
    },
    {
      "epoch": 0.04838709677419355,
      "grad_norm": 71.54608154296875,
      "learning_rate": 4.971170810820278e-07,
      "loss": 2.8627,
      "mean_token_accuracy": 0.489130437374115,
      "num_tokens": 22099.0,
      "step": 6
    },
    {
      "epoch": 0.056451612903225805,
      "grad_norm": 67.31790924072266,
      "learning_rate": 4.960787559493836e-07,
      "loss": 2.9878,
      "mean_token_accuracy": 0.49747729301452637,
      "num_tokens": 25076.0,
      "step": 7
    },
    {
      "epoch": 0.06451612903225806,
      "grad_norm": 51.87714767456055,
      "learning_rate": 4.948824853131236e-07,
      "loss": 2.8431,
      "mean_token_accuracy": 0.4804079532623291,
      "num_tokens": 28806.0,
      "step": 8
    },
    {
      "epoch": 0.07258064516129033,
      "grad_norm": 50.492496490478516,
      "learning_rate": 4.935290369988468e-07,
      "loss": 2.7265,
      "mean_token_accuracy": 0.4897351861000061,
      "num_tokens": 32171.0,
      "step": 9
    },
    {
      "epoch": 0.08064516129032258,
      "grad_norm": 50.19187927246094,
      "learning_rate": 4.920192797165511e-07,
      "loss": 2.5757,
      "mean_token_accuracy": 0.5013927817344666,
      "num_tokens": 35765.0,
      "step": 10
    },
    {
      "epoch": 0.08870967741935484,
      "grad_norm": 53.92109298706055,
      "learning_rate": 4.903541825030531e-07,
      "loss": 2.5328,
      "mean_token_accuracy": 0.5096573233604431,
      "num_tokens": 38979.0,
      "step": 11
    },
    {
      "epoch": 0.0967741935483871,
      "grad_norm": 131.06643676757812,
      "learning_rate": 4.885348141000122e-07,
      "loss": 2.4167,
      "mean_token_accuracy": 0.507402777671814,
      "num_tokens": 42968.0,
      "step": 12
    },
    {
      "epoch": 0.10483870967741936,
      "grad_norm": 42.45437240600586,
      "learning_rate": 4.865623422679592e-07,
      "loss": 2.3694,
      "mean_token_accuracy": 0.5128635168075562,
      "num_tokens": 46548.0,
      "step": 13
    },
    {
      "epoch": 0.11290322580645161,
      "grad_norm": 40.65642547607422,
      "learning_rate": 4.844380330367701e-07,
      "loss": 2.2659,
      "mean_token_accuracy": 0.5277366042137146,
      "num_tokens": 49923.0,
      "step": 14
    },
    {
      "epoch": 0.12096774193548387,
      "grad_norm": 40.563663482666016,
      "learning_rate": 4.821632498930656e-07,
      "loss": 2.325,
      "mean_token_accuracy": 0.5243242979049683,
      "num_tokens": 53257.0,
      "step": 15
    },
    {
      "epoch": 0.12903225806451613,
      "grad_norm": 29.71602439880371,
      "learning_rate": 4.797394529050577e-07,
      "loss": 2.2635,
      "mean_token_accuracy": 0.5110940933227539,
      "num_tokens": 57182.0,
      "step": 16
    },
    {
      "epoch": 0.13709677419354838,
      "grad_norm": 36.85178756713867,
      "learning_rate": 4.771681977854062e-07,
      "loss": 2.2947,
      "mean_token_accuracy": 0.49719932675361633,
      "num_tokens": 60221.0,
      "step": 17
    },
    {
      "epoch": 0.14516129032258066,
      "grad_norm": 40.669857025146484,
      "learning_rate": 4.744511348926854e-07,
      "loss": 2.1618,
      "mean_token_accuracy": 0.5076530575752258,
      "num_tokens": 63753.0,
      "step": 18
    },
    {
      "epoch": 0.1532258064516129,
      "grad_norm": 31.07614517211914,
      "learning_rate": 4.7159000817210204e-07,
      "loss": 2.2338,
      "mean_token_accuracy": 0.5,
      "num_tokens": 67491.0,
      "step": 19
    },
    {
      "epoch": 0.16129032258064516,
      "grad_norm": 39.705753326416016,
      "learning_rate": 4.685866540361455e-07,
      "loss": 2.1938,
      "mean_token_accuracy": 0.5107913613319397,
      "num_tokens": 70831.0,
      "step": 20
    },
    {
      "epoch": 0.1693548387096774,
      "grad_norm": 40.17426681518555,
      "learning_rate": 4.654430001858874e-07,
      "loss": 2.1139,
      "mean_token_accuracy": 0.5218411087989807,
      "num_tokens": 74246.0,
      "step": 21
    },
    {
      "epoch": 0.1774193548387097,
      "grad_norm": 34.13058853149414,
      "learning_rate": 4.6216106437368775e-07,
      "loss": 2.0303,
      "mean_token_accuracy": 0.5338891744613647,
      "num_tokens": 77968.0,
      "step": 22
    },
    {
      "epoch": 0.18548387096774194,
      "grad_norm": 25.600557327270508,
      "learning_rate": 4.5874295310810185e-07,
      "loss": 2.0356,
      "mean_token_accuracy": 0.515510618686676,
      "num_tokens": 80841.0,
      "step": 23
    },
    {
      "epoch": 0.1935483870967742,
      "grad_norm": 14.962992668151855,
      "learning_rate": 4.551908603018191e-07,
      "loss": 1.9918,
      "mean_token_accuracy": 0.5420023202896118,
      "num_tokens": 84321.0,
      "step": 24
    },
    {
      "epoch": 0.20161290322580644,
      "grad_norm": 10.336100578308105,
      "learning_rate": 4.5150706586350127e-07,
      "loss": 2.0338,
      "mean_token_accuracy": 0.5391054153442383,
      "num_tokens": 88148.0,
      "step": 25
    },
    {
      "epoch": 0.20967741935483872,
      "grad_norm": 11.232644081115723,
      "learning_rate": 4.476939342344246e-07,
      "loss": 2.0344,
      "mean_token_accuracy": 0.5330302119255066,
      "num_tokens": 91891.0,
      "step": 26
    },
    {
      "epoch": 0.21774193548387097,
      "grad_norm": 8.730783462524414,
      "learning_rate": 4.437539128708647e-07,
      "loss": 2.0291,
      "mean_token_accuracy": 0.5340464115142822,
      "num_tokens": 95728.0,
      "step": 27
    },
    {
      "epoch": 0.22580645161290322,
      "grad_norm": 9.901666641235352,
      "learning_rate": 4.396895306731977e-07,
      "loss": 1.9593,
      "mean_token_accuracy": 0.5452925562858582,
      "num_tokens": 99099.0,
      "step": 28
    },
    {
      "epoch": 0.23387096774193547,
      "grad_norm": 8.69468879699707,
      "learning_rate": 4.355033963627277e-07,
      "loss": 1.9023,
      "mean_token_accuracy": 0.5690703988075256,
      "num_tokens": 102556.0,
      "step": 29
    },
    {
      "epoch": 0.24193548387096775,
      "grad_norm": 8.984487533569336,
      "learning_rate": 4.3119819680727996e-07,
      "loss": 1.938,
      "mean_token_accuracy": 0.5465425252914429,
      "num_tokens": 106320.0,
      "step": 30
    },
    {
      "epoch": 0.25,
      "grad_norm": 9.061673164367676,
      "learning_rate": 4.2677669529663686e-07,
      "loss": 1.9321,
      "mean_token_accuracy": 0.5477131605148315,
      "num_tokens": 109866.0,
      "step": 31
    },
    {
      "epoch": 0.25806451612903225,
      "grad_norm": 8.572089195251465,
      "learning_rate": 4.2224172976892166e-07,
      "loss": 1.9352,
      "mean_token_accuracy": 0.5485040545463562,
      "num_tokens": 113179.0,
      "step": 32
    },
    {
      "epoch": 0.2661290322580645,
      "grad_norm": 8.716224670410156,
      "learning_rate": 4.175962109890696e-07,
      "loss": 1.8432,
      "mean_token_accuracy": 0.550000011920929,
      "num_tokens": 116683.0,
      "step": 33
    },
    {
      "epoch": 0.27419354838709675,
      "grad_norm": 11.871490478515625,
      "learning_rate": 4.128431206805556e-07,
      "loss": 1.8008,
      "mean_token_accuracy": 0.5759973526000977,
      "num_tokens": 119720.0,
      "step": 34
    },
    {
      "epoch": 0.28225806451612906,
      "grad_norm": 9.3665132522583,
      "learning_rate": 4.0798550961157595e-07,
      "loss": 1.8901,
      "mean_token_accuracy": 0.552654504776001,
      "num_tokens": 123171.0,
      "step": 35
    },
    {
      "epoch": 0.2903225806451613,
      "grad_norm": 9.704719543457031,
      "learning_rate": 4.030264956369157e-07,
      "loss": 1.8121,
      "mean_token_accuracy": 0.5599347352981567,
      "num_tokens": 126854.0,
      "step": 36
    },
    {
      "epoch": 0.29838709677419356,
      "grad_norm": 10.195917129516602,
      "learning_rate": 3.9796926169675424e-07,
      "loss": 1.885,
      "mean_token_accuracy": 0.5533230304718018,
      "num_tokens": 130740.0,
      "step": 37
    },
    {
      "epoch": 0.3064516129032258,
      "grad_norm": 9.518072128295898,
      "learning_rate": 3.9281705377369805e-07,
      "loss": 1.9059,
      "mean_token_accuracy": 0.5531600713729858,
      "num_tokens": 134130.0,
      "step": 38
    },
    {
      "epoch": 0.31451612903225806,
      "grad_norm": 7.841033458709717,
      "learning_rate": 3.875731788093478e-07,
      "loss": 1.8547,
      "mean_token_accuracy": 0.5586956739425659,
      "num_tokens": 137814.0,
      "step": 39
    },
    {
      "epoch": 0.3225806451612903,
      "grad_norm": 10.610803604125977,
      "learning_rate": 3.822410025817406e-07,
      "loss": 1.8147,
      "mean_token_accuracy": 0.5637563467025757,
      "num_tokens": 141151.0,
      "step": 40
    },
    {
      "epoch": 0.33064516129032256,
      "grad_norm": 10.157751083374023,
      "learning_rate": 3.768239475450268e-07,
      "loss": 1.8701,
      "mean_token_accuracy": 0.549227237701416,
      "num_tokens": 144649.0,
      "step": 41
    },
    {
      "epoch": 0.3387096774193548,
      "grad_norm": 8.627851486206055,
      "learning_rate": 3.713254906327703e-07,
      "loss": 1.8508,
      "mean_token_accuracy": 0.557908833026886,
      "num_tokens": 148383.0,
      "step": 42
    },
    {
      "epoch": 0.3467741935483871,
      "grad_norm": 7.615920543670654,
      "learning_rate": 3.657491610262802e-07,
      "loss": 1.8584,
      "mean_token_accuracy": 0.5659451484680176,
      "num_tokens": 151852.0,
      "step": 43
    },
    {
      "epoch": 0.3548387096774194,
      "grad_norm": 8.200079917907715,
      "learning_rate": 3.6009853788940856e-07,
      "loss": 1.7812,
      "mean_token_accuracy": 0.5586913228034973,
      "num_tokens": 155860.0,
      "step": 44
    },
    {
      "epoch": 0.3629032258064516,
      "grad_norm": 7.837497234344482,
      "learning_rate": 3.543772480712658e-07,
      "loss": 1.7635,
      "mean_token_accuracy": 0.5686478018760681,
      "num_tokens": 159717.0,
      "step": 45
    },
    {
      "epoch": 0.3709677419354839,
      "grad_norm": 8.181253433227539,
      "learning_rate": 3.4858896377832965e-07,
      "loss": 1.7004,
      "mean_token_accuracy": 0.5794797539710999,
      "num_tokens": 163181.0,
      "step": 46
    },
    {
      "epoch": 0.3790322580645161,
      "grad_norm": 6.781255722045898,
      "learning_rate": 3.42737400217442e-07,
      "loss": 1.8779,
      "mean_token_accuracy": 0.5520778298377991,
      "num_tokens": 166987.0,
      "step": 47
    },
    {
      "epoch": 0.3870967741935484,
      "grad_norm": 7.666635036468506,
      "learning_rate": 3.36826313211205e-07,
      "loss": 1.8343,
      "mean_token_accuracy": 0.5685471296310425,
      "num_tokens": 170164.0,
      "step": 48
    },
    {
      "epoch": 0.3951612903225806,
      "grad_norm": 8.650765419006348,
      "learning_rate": 3.308594967873095e-07,
      "loss": 1.7022,
      "mean_token_accuracy": 0.5841107368469238,
      "num_tokens": 173491.0,
      "step": 49
    },
    {
      "epoch": 0.4032258064516129,
      "grad_norm": 7.792832851409912,
      "learning_rate": 3.2484078074333956e-07,
      "loss": 1.7076,
      "mean_token_accuracy": 0.5851721167564392,
      "num_tokens": 176894.0,
      "step": 50
    },
    {
      "epoch": 0.4112903225806452,
      "grad_norm": 9.63578987121582,
      "learning_rate": 3.1877402818861946e-07,
      "loss": 1.7369,
      "mean_token_accuracy": 0.576729416847229,
      "num_tokens": 180671.0,
      "step": 51
    },
    {
      "epoch": 0.41935483870967744,
      "grad_norm": 7.068120002746582,
      "learning_rate": 3.126631330646801e-07,
      "loss": 1.7723,
      "mean_token_accuracy": 0.5763046741485596,
      "num_tokens": 184220.0,
      "step": 52
    },
    {
      "epoch": 0.4274193548387097,
      "grad_norm": 7.844783782958984,
      "learning_rate": 3.065120176459337e-07,
      "loss": 1.7888,
      "mean_token_accuracy": 0.5663133859634399,
      "num_tokens": 187715.0,
      "step": 53
    },
    {
      "epoch": 0.43548387096774194,
      "grad_norm": 6.963573932647705,
      "learning_rate": 3.00324630022165e-07,
      "loss": 1.6962,
      "mean_token_accuracy": 0.5830934047698975,
      "num_tokens": 191540.0,
      "step": 54
    },
    {
      "epoch": 0.4435483870967742,
      "grad_norm": 7.507209777832031,
      "learning_rate": 2.9410494156445216e-07,
      "loss": 1.7422,
      "mean_token_accuracy": 0.5805253982543945,
      "num_tokens": 195046.0,
      "step": 55
    },
    {
      "epoch": 0.45161290322580644,
      "grad_norm": 7.959713459014893,
      "learning_rate": 2.8785694437614416e-07,
      "loss": 1.7368,
      "mean_token_accuracy": 0.5760632753372192,
      "num_tokens": 198718.0,
      "step": 56
    },
    {
      "epoch": 0.4596774193548387,
      "grad_norm": 7.953831195831299,
      "learning_rate": 2.8158464873053234e-07,
      "loss": 1.7707,
      "mean_token_accuracy": 0.5705274939537048,
      "num_tokens": 202210.0,
      "step": 57
    },
    {
      "epoch": 0.46774193548387094,
      "grad_norm": 8.072004318237305,
      "learning_rate": 2.7529208049685804e-07,
      "loss": 1.7197,
      "mean_token_accuracy": 0.5858024954795837,
      "num_tokens": 205454.0,
      "step": 58
    },
    {
      "epoch": 0.47580645161290325,
      "grad_norm": 8.691699028015137,
      "learning_rate": 2.6898327855631154e-07,
      "loss": 1.807,
      "mean_token_accuracy": 0.560819149017334,
      "num_tokens": 209511.0,
      "step": 59
    },
    {
      "epoch": 0.4838709677419355,
      "grad_norm": 6.996568202972412,
      "learning_rate": 2.626622922096782e-07,
      "loss": 1.7697,
      "mean_token_accuracy": 0.5675398111343384,
      "num_tokens": 213409.0,
      "step": 60
    },
    {
      "epoch": 0.49193548387096775,
      "grad_norm": 8.156956672668457,
      "learning_rate": 2.5633317857829693e-07,
      "loss": 1.6853,
      "mean_token_accuracy": 0.5745296478271484,
      "num_tokens": 216868.0,
      "step": 61
    },
    {
      "epoch": 0.5,
      "grad_norm": 8.733380317687988,
      "learning_rate": 2.5e-07,
      "loss": 1.6566,
      "mean_token_accuracy": 0.5956632494926453,
      "num_tokens": 220008.0,
      "step": 62
    },
    {
      "epoch": 0.5080645161290323,
      "grad_norm": 6.773242473602295,
      "learning_rate": 2.4366682142170305e-07,
      "loss": 1.7601,
      "mean_token_accuracy": 0.5683262944221497,
      "num_tokens": 223788.0,
      "step": 63
    },
    {
      "epoch": 0.5161290322580645,
      "grad_norm": 6.594358444213867,
      "learning_rate": 2.3733770779032184e-07,
      "loss": 1.754,
      "mean_token_accuracy": 0.5651482939720154,
      "num_tokens": 227568.0,
      "step": 64
    },
    {
      "epoch": 0.5241935483870968,
      "grad_norm": 7.253048419952393,
      "learning_rate": 2.3101672144368846e-07,
      "loss": 1.7411,
      "mean_token_accuracy": 0.5777088403701782,
      "num_tokens": 231439.0,
      "step": 65
    },
    {
      "epoch": 0.532258064516129,
      "grad_norm": 6.758819580078125,
      "learning_rate": 2.2470791950314196e-07,
      "loss": 1.6841,
      "mean_token_accuracy": 0.583798885345459,
      "num_tokens": 235023.0,
      "step": 66
    },
    {
      "epoch": 0.5403225806451613,
      "grad_norm": 7.620910167694092,
      "learning_rate": 2.1841535126946775e-07,
      "loss": 1.7435,
      "mean_token_accuracy": 0.5806363224983215,
      "num_tokens": 238673.0,
      "step": 67
    },
    {
      "epoch": 0.5483870967741935,
      "grad_norm": 7.808475017547607,
      "learning_rate": 2.121430556238559e-07,
      "loss": 1.7878,
      "mean_token_accuracy": 0.5660945773124695,
      "num_tokens": 241998.0,
      "step": 68
    },
    {
      "epoch": 0.5564516129032258,
      "grad_norm": 10.025287628173828,
      "learning_rate": 2.0589505843554795e-07,
      "loss": 1.8058,
      "mean_token_accuracy": 0.566918134689331,
      "num_tokens": 245581.0,
      "step": 69
    },
    {
      "epoch": 0.5645161290322581,
      "grad_norm": 7.763560771942139,
      "learning_rate": 1.9967536997783493e-07,
      "loss": 1.6668,
      "mean_token_accuracy": 0.5830458402633667,
      "num_tokens": 249643.0,
      "step": 70
    },
    {
      "epoch": 0.5725806451612904,
      "grad_norm": 7.218297958374023,
      "learning_rate": 1.9348798235406626e-07,
      "loss": 1.6796,
      "mean_token_accuracy": 0.5924479365348816,
      "num_tokens": 253487.0,
      "step": 71
    },
    {
      "epoch": 0.5806451612903226,
      "grad_norm": 7.534745693206787,
      "learning_rate": 1.8733686693531982e-07,
      "loss": 1.6667,
      "mean_token_accuracy": 0.5781828761100769,
      "num_tokens": 257002.0,
      "step": 72
    },
    {
      "epoch": 0.5887096774193549,
      "grad_norm": 9.321051597595215,
      "learning_rate": 1.8122597181138047e-07,
      "loss": 1.6971,
      "mean_token_accuracy": 0.5793435573577881,
      "num_tokens": 260327.0,
      "step": 73
    },
    {
      "epoch": 0.5967741935483871,
      "grad_norm": 7.620558261871338,
      "learning_rate": 1.751592192566605e-07,
      "loss": 1.8091,
      "mean_token_accuracy": 0.5647743940353394,
      "num_tokens": 263766.0,
      "step": 74
    },
    {
      "epoch": 0.6048387096774194,
      "grad_norm": 6.960766315460205,
      "learning_rate": 1.6914050321269047e-07,
      "loss": 1.7232,
      "mean_token_accuracy": 0.5714285969734192,
      "num_tokens": 267431.0,
      "step": 75
    },
    {
      "epoch": 0.6129032258064516,
      "grad_norm": 8.637341499328613,
      "learning_rate": 1.6317368678879496e-07,
      "loss": 1.8332,
      "mean_token_accuracy": 0.5513805747032166,
      "num_tokens": 270948.0,
      "step": 76
    },
    {
      "epoch": 0.6209677419354839,
      "grad_norm": 6.999222278594971,
      "learning_rate": 1.5726259978255807e-07,
      "loss": 1.737,
      "mean_token_accuracy": 0.578661322593689,
      "num_tokens": 274448.0,
      "step": 77
    },
    {
      "epoch": 0.6290322580645161,
      "grad_norm": 6.716799736022949,
      "learning_rate": 1.514110362216704e-07,
      "loss": 1.5999,
      "mean_token_accuracy": 0.6048541069030762,
      "num_tokens": 278119.0,
      "step": 78
    },
    {
      "epoch": 0.6370967741935484,
      "grad_norm": 7.0507659912109375,
      "learning_rate": 1.4562275192873428e-07,
      "loss": 1.7363,
      "mean_token_accuracy": 0.5828737020492554,
      "num_tokens": 281568.0,
      "step": 79
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 8.561986923217773,
      "learning_rate": 1.3990146211059139e-07,
      "loss": 1.6661,
      "mean_token_accuracy": 0.591465175151825,
      "num_tokens": 285087.0,
      "step": 80
    },
    {
      "epoch": 0.6532258064516129,
      "grad_norm": 7.29276704788208,
      "learning_rate": 1.342508389737198e-07,
      "loss": 1.6752,
      "mean_token_accuracy": 0.5835070013999939,
      "num_tokens": 288450.0,
      "step": 81
    },
    {
      "epoch": 0.6612903225806451,
      "grad_norm": 7.706023216247559,
      "learning_rate": 1.2867450936722978e-07,
      "loss": 1.7451,
      "mean_token_accuracy": 0.5716311931610107,
      "num_tokens": 291979.0,
      "step": 82
    },
    {
      "epoch": 0.6693548387096774,
      "grad_norm": 7.056520462036133,
      "learning_rate": 1.2317605245497323e-07,
      "loss": 1.7154,
      "mean_token_accuracy": 0.5758506059646606,
      "num_tokens": 295892.0,
      "step": 83
    },
    {
      "epoch": 0.6774193548387096,
      "grad_norm": 7.503317356109619,
      "learning_rate": 1.1775899741825945e-07,
      "loss": 1.6899,
      "mean_token_accuracy": 0.5809495449066162,
      "num_tokens": 299287.0,
      "step": 84
    },
    {
      "epoch": 0.6854838709677419,
      "grad_norm": 6.785901069641113,
      "learning_rate": 1.1242682119065216e-07,
      "loss": 1.7004,
      "mean_token_accuracy": 0.5904300808906555,
      "num_tokens": 302802.0,
      "step": 85
    },
    {
      "epoch": 0.6935483870967742,
      "grad_norm": 8.526169776916504,
      "learning_rate": 1.0718294622630186e-07,
      "loss": 1.764,
      "mean_token_accuracy": 0.5716612339019775,
      "num_tokens": 305876.0,
      "step": 86
    },
    {
      "epoch": 0.7016129032258065,
      "grad_norm": 8.141705513000488,
      "learning_rate": 1.0203073830324565e-07,
      "loss": 1.6696,
      "mean_token_accuracy": 0.5959654450416565,
      "num_tokens": 309350.0,
      "step": 87
    },
    {
      "epoch": 0.7096774193548387,
      "grad_norm": 7.417829513549805,
      "learning_rate": 9.697350436308427e-08,
      "loss": 1.6542,
      "mean_token_accuracy": 0.5915982127189636,
      "num_tokens": 312520.0,
      "step": 88
    },
    {
      "epoch": 0.717741935483871,
      "grad_norm": 7.390657424926758,
      "learning_rate": 9.201449038842401e-08,
      "loss": 1.7805,
      "mean_token_accuracy": 0.5598756074905396,
      "num_tokens": 316382.0,
      "step": 89
    },
    {
      "epoch": 0.7258064516129032,
      "grad_norm": 7.71380615234375,
      "learning_rate": 8.715687931944449e-08,
      "loss": 1.6886,
      "mean_token_accuracy": 0.578542947769165,
      "num_tokens": 319900.0,
      "step": 90
    },
    {
      "epoch": 0.7338709677419355,
      "grad_norm": 7.551061153411865,
      "learning_rate": 8.240378901093034e-08,
      "loss": 1.7469,
      "mean_token_accuracy": 0.570781409740448,
      "num_tokens": 323436.0,
      "step": 91
    },
    {
      "epoch": 0.7419354838709677,
      "grad_norm": 9.389298439025879,
      "learning_rate": 7.775827023107834e-08,
      "loss": 1.7162,
      "mean_token_accuracy": 0.571618914604187,
      "num_tokens": 326442.0,
      "step": 92
    },
    {
      "epoch": 0.75,
      "grad_norm": 7.125860691070557,
      "learning_rate": 7.322330470336313e-08,
      "loss": 1.7112,
      "mean_token_accuracy": 0.568901777267456,
      "num_tokens": 330234.0,
      "step": 93
    },
    {
      "epoch": 0.7580645161290323,
      "grad_norm": 7.604709148406982,
      "learning_rate": 6.880180319272006e-08,
      "loss": 1.653,
      "mean_token_accuracy": 0.5836288928985596,
      "num_tokens": 333903.0,
      "step": 94
    },
    {
      "epoch": 0.7661290322580645,
      "grad_norm": 6.952591896057129,
      "learning_rate": 6.449660363727236e-08,
      "loss": 1.6965,
      "mean_token_accuracy": 0.5798975825309753,
      "num_tokens": 337618.0,
      "step": 95
    },
    {
      "epoch": 0.7741935483870968,
      "grad_norm": 10.35627555847168,
      "learning_rate": 6.031046932680229e-08,
      "loss": 1.6945,
      "mean_token_accuracy": 0.5744901299476624,
      "num_tokens": 340515.0,
      "step": 96
    },
    {
      "epoch": 0.782258064516129,
      "grad_norm": 7.425570487976074,
      "learning_rate": 5.624608712913531e-08,
      "loss": 1.6154,
      "mean_token_accuracy": 0.5917431116104126,
      "num_tokens": 344007.0,
      "step": 97
    },
    {
      "epoch": 0.7903225806451613,
      "grad_norm": 10.059042930603027,
      "learning_rate": 5.230606576557539e-08,
      "loss": 1.6855,
      "mean_token_accuracy": 0.5853906869888306,
      "num_tokens": 347940.0,
      "step": 98
    },
    {
      "epoch": 0.7983870967741935,
      "grad_norm": 6.582756042480469,
      "learning_rate": 4.84929341364988e-08,
      "loss": 1.6939,
      "mean_token_accuracy": 0.5809850096702576,
      "num_tokens": 351741.0,
      "step": 99
    },
    {
      "epoch": 0.8064516129032258,
      "grad_norm": 6.744507312774658,
      "learning_rate": 4.480913969818098e-08,
      "loss": 1.6739,
      "mean_token_accuracy": 0.585814356803894,
      "num_tokens": 355171.0,
      "step": 100
    },
    {
      "epoch": 0.8145161290322581,
      "grad_norm": 6.796554088592529,
      "learning_rate": 4.125704689189818e-08,
      "loss": 1.7039,
      "mean_token_accuracy": 0.5784457325935364,
      "num_tokens": 359267.0,
      "step": 101
    },
    {
      "epoch": 0.8225806451612904,
      "grad_norm": 6.474706649780273,
      "learning_rate": 3.783893562631224e-08,
      "loss": 1.6828,
      "mean_token_accuracy": 0.5838965773582458,
      "num_tokens": 362674.0,
      "step": 102
    },
    {
      "epoch": 0.8306451612903226,
      "grad_norm": 7.683028221130371,
      "learning_rate": 3.455699981411259e-08,
      "loss": 1.7309,
      "mean_token_accuracy": 0.5660528540611267,
      "num_tokens": 366425.0,
      "step": 103
    },
    {
      "epoch": 0.8387096774193549,
      "grad_norm": 7.123412609100342,
      "learning_rate": 3.141334596385447e-08,
      "loss": 1.6526,
      "mean_token_accuracy": 0.5894578099250793,
      "num_tokens": 369749.0,
      "step": 104
    },
    {
      "epoch": 0.8467741935483871,
      "grad_norm": 6.300046920776367,
      "learning_rate": 2.8409991827897968e-08,
      "loss": 1.6927,
      "mean_token_accuracy": 0.5794023275375366,
      "num_tokens": 373802.0,
      "step": 105
    },
    {
      "epoch": 0.8548387096774194,
      "grad_norm": 6.947690486907959,
      "learning_rate": 2.5548865107314604e-08,
      "loss": 1.7412,
      "mean_token_accuracy": 0.5656623244285583,
      "num_tokens": 377324.0,
      "step": 106
    },
    {
      "epoch": 0.8629032258064516,
      "grad_norm": 7.4605231285095215,
      "learning_rate": 2.283180221459377e-08,
      "loss": 1.7679,
      "mean_token_accuracy": 0.5644617676734924,
      "num_tokens": 381183.0,
      "step": 107
    },
    {
      "epoch": 0.8709677419354839,
      "grad_norm": 6.687449932098389,
      "learning_rate": 2.0260547094942348e-08,
      "loss": 1.7144,
      "mean_token_accuracy": 0.5790040493011475,
      "num_tokens": 384902.0,
      "step": 108
    },
    {
      "epoch": 0.8790322580645161,
      "grad_norm": 9.971823692321777,
      "learning_rate": 1.7836750106934474e-08,
      "loss": 1.6716,
      "mean_token_accuracy": 0.5851125121116638,
      "num_tokens": 388372.0,
      "step": 109
    },
    {
      "epoch": 0.8870967741935484,
      "grad_norm": 8.792091369628906,
      "learning_rate": 1.5561966963229923e-08,
      "loss": 1.66,
      "mean_token_accuracy": 0.5835981965065002,
      "num_tokens": 391522.0,
      "step": 110
    },
    {
      "epoch": 0.8951612903225806,
      "grad_norm": 7.2787981033325195,
      "learning_rate": 1.3437657732040781e-08,
      "loss": 1.6463,
      "mean_token_accuracy": 0.5921754837036133,
      "num_tokens": 394900.0,
      "step": 111
    },
    {
      "epoch": 0.9032258064516129,
      "grad_norm": 7.135777473449707,
      "learning_rate": 1.1465185899987794e-08,
      "loss": 1.7416,
      "mean_token_accuracy": 0.5735452771186829,
      "num_tokens": 398616.0,
      "step": 112
    },
    {
      "epoch": 0.9112903225806451,
      "grad_norm": 8.646471977233887,
      "learning_rate": 9.6458174969469e-09,
      "loss": 1.7218,
      "mean_token_accuracy": 0.5801047086715698,
      "num_tokens": 402440.0,
      "step": 113
    },
    {
      "epoch": 0.9193548387096774,
      "grad_norm": 6.620112895965576,
      "learning_rate": 7.980720283448955e-09,
      "loss": 1.8393,
      "mean_token_accuracy": 0.5539906024932861,
      "num_tokens": 406278.0,
      "step": 114
    },
    {
      "epoch": 0.9274193548387096,
      "grad_norm": 8.354401588439941,
      "learning_rate": 6.470963001153268e-09,
      "loss": 1.7239,
      "mean_token_accuracy": 0.5888278484344482,
      "num_tokens": 409558.0,
      "step": 115
    },
    {
      "epoch": 0.9354838709677419,
      "grad_norm": 7.131601810455322,
      "learning_rate": 5.117514686876378e-09,
      "loss": 1.6136,
      "mean_token_accuracy": 0.5943509340286255,
      "num_tokens": 412890.0,
      "step": 116
    },
    {
      "epoch": 0.9435483870967742,
      "grad_norm": 7.938460826873779,
      "learning_rate": 3.921244050616446e-09,
      "loss": 1.7333,
      "mean_token_accuracy": 0.5793216824531555,
      "num_tokens": 416550.0,
      "step": 117
    },
    {
      "epoch": 0.9516129032258065,
      "grad_norm": 7.657175540924072,
      "learning_rate": 2.8829189179721547e-09,
      "loss": 1.7309,
      "mean_token_accuracy": 0.5766178369522095,
      "num_tokens": 419830.0,
      "step": 118
    },
    {
      "epoch": 0.9596774193548387,
      "grad_norm": 7.384486198425293,
      "learning_rate": 2.0032057373142453e-09,
      "loss": 1.6211,
      "mean_token_accuracy": 0.5845938324928284,
      "num_tokens": 423404.0,
      "step": 119
    },
    {
      "epoch": 0.967741935483871,
      "grad_norm": 8.56894588470459,
      "learning_rate": 1.2826691520262112e-09,
      "loss": 1.7963,
      "mean_token_accuracy": 0.57413250207901,
      "num_tokens": 426895.0,
      "step": 120
    },
    {
      "epoch": 0.9758064516129032,
      "grad_norm": 6.813057899475098,
      "learning_rate": 7.217716380881477e-10,
      "loss": 1.726,
      "mean_token_accuracy": 0.5653586387634277,
      "num_tokens": 430900.0,
      "step": 121
    },
    {
      "epoch": 0.9838709677419355,
      "grad_norm": 6.8685431480407715,
      "learning_rate": 3.2087320723681033e-10,
      "loss": 1.6799,
      "mean_token_accuracy": 0.5889589190483093,
      "num_tokens": 434237.0,
      "step": 122
    },
    {
      "epoch": 0.9919354838709677,
      "grad_norm": 7.024580955505371,
      "learning_rate": 8.023117589237016e-11,
      "loss": 1.7037,
      "mean_token_accuracy": 0.5847502946853638,
      "num_tokens": 438005.0,
      "step": 123
    },
    {
      "epoch": 1.0,
      "grad_norm": 8.294002532958984,
      "learning_rate": 0.0,
      "loss": 1.6386,
      "mean_token_accuracy": 0.5888568758964539,
      "num_tokens": 441132.0,
      "step": 124
    }
  ],
  "logging_steps": 1,
  "max_steps": 124,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 3000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0678265542868992e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|