{
"best_metric": 0.011125968769192696,
"best_model_checkpoint": "./mistral-magyar-portas-results/checkpoint-2500",
"epoch": 2.7429467084639496,
"eval_steps": 500,
"global_step": 3500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03918495297805643,
"grad_norm": 0.0693359375,
"learning_rate": 0.0001,
"loss": 0.0137,
"step": 50
},
{
"epoch": 0.07836990595611286,
"grad_norm": 0.890625,
"learning_rate": 0.0002,
"loss": 0.0186,
"step": 100
},
{
"epoch": 0.11755485893416928,
"grad_norm": 0.96875,
"learning_rate": 0.00019991124482238458,
"loss": 0.0345,
"step": 150
},
{
"epoch": 0.15673981191222572,
"grad_norm": 0.146484375,
"learning_rate": 0.00019964513683916945,
"loss": 0.0232,
"step": 200
},
{
"epoch": 0.19592476489028213,
"grad_norm": 0.09228515625,
"learning_rate": 0.00019920214841958082,
"loss": 0.0165,
"step": 250
},
{
"epoch": 0.23510971786833856,
"grad_norm": 0.0537109375,
"learning_rate": 0.00019858306591393602,
"loss": 0.0145,
"step": 300
},
{
"epoch": 0.274294670846395,
"grad_norm": 0.09130859375,
"learning_rate": 0.00019778898825778996,
"loss": 0.0136,
"step": 350
},
{
"epoch": 0.31347962382445144,
"grad_norm": 0.06494140625,
"learning_rate": 0.00019682132502121086,
"loss": 0.0135,
"step": 400
},
{
"epoch": 0.35266457680250785,
"grad_norm": 0.2373046875,
"learning_rate": 0.00019568179390664744,
"loss": 0.0169,
"step": 450
},
{
"epoch": 0.39184952978056425,
"grad_norm": 0.953125,
"learning_rate": 0.00019437241769982907,
"loss": 0.0323,
"step": 500
},
{
"epoch": 0.39184952978056425,
"eval_loss": 0.06335489451885223,
"eval_runtime": 438.733,
"eval_samples_per_second": 2.587,
"eval_steps_per_second": 1.295,
"step": 500
},
{
"epoch": 0.43103448275862066,
"grad_norm": 0.115234375,
"learning_rate": 0.00019289552067911186,
"loss": 0.0273,
"step": 550
},
{
"epoch": 0.4702194357366771,
"grad_norm": 4.03125,
"learning_rate": 0.00019125372448964363,
"loss": 0.0197,
"step": 600
},
{
"epoch": 0.5094043887147336,
"grad_norm": 0.11572265625,
"learning_rate": 0.00018944994348967247,
"loss": 0.0201,
"step": 650
},
{
"epoch": 0.54858934169279,
"grad_norm": 0.1865234375,
"learning_rate": 0.00018748737957725904,
"loss": 0.022,
"step": 700
},
{
"epoch": 0.5877742946708464,
"grad_norm": 0.0888671875,
"learning_rate": 0.00018536951650657585,
"loss": 0.0157,
"step": 750
},
{
"epoch": 0.6269592476489029,
"grad_norm": 0.07421875,
"learning_rate": 0.00018310011370388304,
"loss": 0.0144,
"step": 800
},
{
"epoch": 0.6661442006269592,
"grad_norm": 0.08984375,
"learning_rate": 0.00018068319959415723,
"loss": 0.0149,
"step": 850
},
{
"epoch": 0.7053291536050157,
"grad_norm": 0.140625,
"learning_rate": 0.00017812306445022025,
"loss": 0.0137,
"step": 900
},
{
"epoch": 0.7445141065830722,
"grad_norm": 0.05810546875,
"learning_rate": 0.0001754242527770605,
"loss": 0.0142,
"step": 950
},
{
"epoch": 0.7836990595611285,
"grad_norm": 0.10791015625,
"learning_rate": 0.000172591555244866,
"loss": 0.0123,
"step": 1000
},
{
"epoch": 0.7836990595611285,
"eval_loss": 0.013054505921900272,
"eval_runtime": 438.9352,
"eval_samples_per_second": 2.586,
"eval_steps_per_second": 1.294,
"step": 1000
},
{
"epoch": 0.822884012539185,
"grad_norm": 0.08447265625,
"learning_rate": 0.0001696300001850887,
"loss": 0.013,
"step": 1050
},
{
"epoch": 0.8620689655172413,
"grad_norm": 0.0615234375,
"learning_rate": 0.0001665448446646357,
"loss": 0.0127,
"step": 1100
},
{
"epoch": 0.9012539184952978,
"grad_norm": 0.08642578125,
"learning_rate": 0.00016334156515403065,
"loss": 0.0124,
"step": 1150
},
{
"epoch": 0.9404388714733543,
"grad_norm": 0.16015625,
"learning_rate": 0.00016002584780611194,
"loss": 0.0124,
"step": 1200
},
{
"epoch": 0.9796238244514106,
"grad_norm": 0.0654296875,
"learning_rate": 0.00015660357836252232,
"loss": 0.0127,
"step": 1250
},
{
"epoch": 1.0188087774294672,
"grad_norm": 0.10400390625,
"learning_rate": 0.000153080831705908,
"loss": 0.0107,
"step": 1300
},
{
"epoch": 1.0579937304075235,
"grad_norm": 0.099609375,
"learning_rate": 0.00014946386107637306,
"loss": 0.0092,
"step": 1350
},
{
"epoch": 1.09717868338558,
"grad_norm": 0.0556640625,
"learning_rate": 0.00014575908697133058,
"loss": 0.0093,
"step": 1400
},
{
"epoch": 1.1363636363636362,
"grad_norm": 0.07177734375,
"learning_rate": 0.00014197308574845488,
"loss": 0.0099,
"step": 1450
},
{
"epoch": 1.1755485893416928,
"grad_norm": 0.0869140625,
"learning_rate": 0.0001381125779519658,
"loss": 0.0094,
"step": 1500
},
{
"epoch": 1.1755485893416928,
"eval_loss": 0.012080053798854351,
"eval_runtime": 438.8603,
"eval_samples_per_second": 2.586,
"eval_steps_per_second": 1.294,
"step": 1500
},
{
"epoch": 1.2147335423197492,
"grad_norm": 0.078125,
"learning_rate": 0.00013418441638296652,
"loss": 0.0095,
"step": 1550
},
{
"epoch": 1.2539184952978055,
"grad_norm": 0.1396484375,
"learning_rate": 0.00013019557393501228,
"loss": 0.0093,
"step": 1600
},
{
"epoch": 1.293103448275862,
"grad_norm": 0.0537109375,
"learning_rate": 0.00012615313121650204,
"loss": 0.0098,
"step": 1650
},
{
"epoch": 1.3322884012539185,
"grad_norm": 0.057861328125,
"learning_rate": 0.00012206426398186534,
"loss": 0.01,
"step": 1700
},
{
"epoch": 1.3714733542319748,
"grad_norm": 0.0693359375,
"learning_rate": 0.00011793623039385545,
"loss": 0.0094,
"step": 1750
},
{
"epoch": 1.4106583072100314,
"grad_norm": 0.12109375,
"learning_rate": 0.00011377635813955834,
"loss": 0.0096,
"step": 1800
},
{
"epoch": 1.4498432601880877,
"grad_norm": 0.07177734375,
"learning_rate": 0.00010959203142298981,
"loss": 0.0091,
"step": 1850
},
{
"epoch": 1.489028213166144,
"grad_norm": 0.050537109375,
"learning_rate": 0.00010539067785736856,
"loss": 0.0086,
"step": 1900
},
{
"epoch": 1.5282131661442007,
"grad_norm": 0.150390625,
"learning_rate": 0.0001011797552803333,
"loss": 0.0091,
"step": 1950
},
{
"epoch": 1.567398119122257,
"grad_norm": 0.1318359375,
"learning_rate": 9.696673851550907e-05,
"loss": 0.0093,
"step": 2000
},
{
"epoch": 1.567398119122257,
"eval_loss": 0.011984162032604218,
"eval_runtime": 438.9114,
"eval_samples_per_second": 2.586,
"eval_steps_per_second": 1.294,
"step": 2000
},
{
"epoch": 1.6065830721003134,
"grad_norm": 0.09814453125,
"learning_rate": 9.275910610392104e-05,
"loss": 0.0091,
"step": 2050
},
{
"epoch": 1.64576802507837,
"grad_norm": 0.13671875,
"learning_rate": 8.856432702880984e-05,
"loss": 0.0096,
"step": 2100
},
{
"epoch": 1.6849529780564263,
"grad_norm": 0.07568359375,
"learning_rate": 8.43898474574128e-05,
"loss": 0.0092,
"step": 2150
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.046875,
"learning_rate": 8.02430775232462e-05,
"loss": 0.0084,
"step": 2200
},
{
"epoch": 1.7633228840125392,
"grad_norm": 0.062255859375,
"learning_rate": 7.61313781723508e-05,
"loss": 0.0087,
"step": 2250
},
{
"epoch": 1.8025078369905956,
"grad_norm": 0.06591796875,
"learning_rate": 7.206204809685029e-05,
"loss": 0.0086,
"step": 2300
},
{
"epoch": 1.841692789968652,
"grad_norm": 0.052978515625,
"learning_rate": 6.804231077901733e-05,
"loss": 0.0085,
"step": 2350
},
{
"epoch": 1.8808777429467085,
"grad_norm": 0.07421875,
"learning_rate": 6.407930166884409e-05,
"loss": 0.0094,
"step": 2400
},
{
"epoch": 1.9200626959247649,
"grad_norm": 0.1044921875,
"learning_rate": 6.018005551787984e-05,
"loss": 0.008,
"step": 2450
},
{
"epoch": 1.9592476489028212,
"grad_norm": 0.05615234375,
"learning_rate": 5.635149389181855e-05,
"loss": 0.0088,
"step": 2500
},
{
"epoch": 1.9592476489028212,
"eval_loss": 0.011125968769192696,
"eval_runtime": 438.9494,
"eval_samples_per_second": 2.586,
"eval_steps_per_second": 1.294,
"step": 2500
},
{
"epoch": 1.9984326018808778,
"grad_norm": 0.038330078125,
"learning_rate": 5.260041288400284e-05,
"loss": 0.0092,
"step": 2550
},
{
"epoch": 2.0376175548589344,
"grad_norm": 0.056396484375,
"learning_rate": 4.893347105165468e-05,
"loss": 0.0064,
"step": 2600
},
{
"epoch": 2.0768025078369905,
"grad_norm": 0.0517578125,
"learning_rate": 4.535717759624677e-05,
"loss": 0.0065,
"step": 2650
},
{
"epoch": 2.115987460815047,
"grad_norm": 0.049560546875,
"learning_rate": 4.187788080899591e-05,
"loss": 0.0064,
"step": 2700
},
{
"epoch": 2.1551724137931036,
"grad_norm": 0.049072265625,
"learning_rate": 3.8501756801988675e-05,
"loss": 0.0062,
"step": 2750
},
{
"epoch": 2.19435736677116,
"grad_norm": 0.0908203125,
"learning_rate": 3.5234798544942914e-05,
"loss": 0.0063,
"step": 2800
},
{
"epoch": 2.2335423197492164,
"grad_norm": 0.0595703125,
"learning_rate": 3.208280522706602e-05,
"loss": 0.0066,
"step": 2850
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.044677734375,
"learning_rate": 2.9051371962893358e-05,
"loss": 0.0063,
"step": 2900
},
{
"epoch": 2.311912225705329,
"grad_norm": 0.05224609375,
"learning_rate": 2.6145879860380773e-05,
"loss": 0.0065,
"step": 2950
},
{
"epoch": 2.3510971786833856,
"grad_norm": 0.0908203125,
"learning_rate": 2.337148646888061e-05,
"loss": 0.0065,
"step": 3000
},
{
"epoch": 2.3510971786833856,
"eval_loss": 0.01185206975787878,
"eval_runtime": 438.7429,
"eval_samples_per_second": 2.587,
"eval_steps_per_second": 1.295,
"step": 3000
},
{
"epoch": 2.3902821316614418,
"grad_norm": 0.054931640625,
"learning_rate": 2.073311662395764e-05,
"loss": 0.0064,
"step": 3050
},
{
"epoch": 2.4294670846394983,
"grad_norm": 0.0546875,
"learning_rate": 1.8235453705295848e-05,
"loss": 0.0063,
"step": 3100
},
{
"epoch": 2.468652037617555,
"grad_norm": 0.1494140625,
"learning_rate": 1.5882931323214713e-05,
"loss": 0.0065,
"step": 3150
},
{
"epoch": 2.507836990595611,
"grad_norm": 0.072265625,
"learning_rate": 1.3679725448551451e-05,
"loss": 0.0062,
"step": 3200
},
{
"epoch": 2.5470219435736676,
"grad_norm": 0.078125,
"learning_rate": 1.1629746999880697e-05,
"loss": 0.0065,
"step": 3250
},
{
"epoch": 2.586206896551724,
"grad_norm": 0.0625,
"learning_rate": 9.736634901228814e-06,
"loss": 0.0064,
"step": 3300
},
{
"epoch": 2.6253918495297803,
"grad_norm": 0.07177734375,
"learning_rate": 8.00374962260706e-06,
"loss": 0.0062,
"step": 3350
},
{
"epoch": 2.664576802507837,
"grad_norm": 0.051513671875,
"learning_rate": 6.434167214829267e-06,
"loss": 0.0062,
"step": 3400
},
{
"epoch": 2.7037617554858935,
"grad_norm": 0.251953125,
"learning_rate": 5.030673849203082e-06,
"loss": 0.007,
"step": 3450
},
{
"epoch": 2.7429467084639496,
"grad_norm": 0.11572265625,
"learning_rate": 3.7957608717875015e-06,
"loss": 0.0064,
"step": 3500
},
{
"epoch": 2.7429467084639496,
"eval_loss": 0.011658398434519768,
"eval_runtime": 438.6997,
"eval_samples_per_second": 2.587,
"eval_steps_per_second": 1.295,
"step": 3500
}
],
"logging_steps": 50,
"max_steps": 3828,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 3.9128851808256e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}