|
{ |
|
"best_metric": 0.011125968769192696, |
|
"best_model_checkpoint": "./mistral-magyar-portas-results/checkpoint-2500", |
|
"epoch": 2.7429467084639496, |
|
"eval_steps": 500, |
|
"global_step": 3500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03918495297805643, |
|
"grad_norm": 0.0693359375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0137, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07836990595611286, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0186, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11755485893416928, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00019991124482238458, |
|
"loss": 0.0345, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.15673981191222572, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 0.00019964513683916945, |
|
"loss": 0.0232, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.19592476489028213, |
|
"grad_norm": 0.09228515625, |
|
"learning_rate": 0.00019920214841958082, |
|
"loss": 0.0165, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.23510971786833856, |
|
"grad_norm": 0.0537109375, |
|
"learning_rate": 0.00019858306591393602, |
|
"loss": 0.0145, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.274294670846395, |
|
"grad_norm": 0.09130859375, |
|
"learning_rate": 0.00019778898825778996, |
|
"loss": 0.0136, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.31347962382445144, |
|
"grad_norm": 0.06494140625, |
|
"learning_rate": 0.00019682132502121086, |
|
"loss": 0.0135, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.35266457680250785, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.00019568179390664744, |
|
"loss": 0.0169, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.39184952978056425, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00019437241769982907, |
|
"loss": 0.0323, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.39184952978056425, |
|
"eval_loss": 0.06335489451885223, |
|
"eval_runtime": 438.733, |
|
"eval_samples_per_second": 2.587, |
|
"eval_steps_per_second": 1.295, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.43103448275862066, |
|
"grad_norm": 0.115234375, |
|
"learning_rate": 0.00019289552067911186, |
|
"loss": 0.0273, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4702194357366771, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 0.00019125372448964363, |
|
"loss": 0.0197, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5094043887147336, |
|
"grad_norm": 0.11572265625, |
|
"learning_rate": 0.00018944994348967247, |
|
"loss": 0.0201, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.54858934169279, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.00018748737957725904, |
|
"loss": 0.022, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5877742946708464, |
|
"grad_norm": 0.0888671875, |
|
"learning_rate": 0.00018536951650657585, |
|
"loss": 0.0157, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.6269592476489029, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 0.00018310011370388304, |
|
"loss": 0.0144, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6661442006269592, |
|
"grad_norm": 0.08984375, |
|
"learning_rate": 0.00018068319959415723, |
|
"loss": 0.0149, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.7053291536050157, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 0.00017812306445022025, |
|
"loss": 0.0137, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.7445141065830722, |
|
"grad_norm": 0.05810546875, |
|
"learning_rate": 0.0001754242527770605, |
|
"loss": 0.0142, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.7836990595611285, |
|
"grad_norm": 0.10791015625, |
|
"learning_rate": 0.000172591555244866, |
|
"loss": 0.0123, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7836990595611285, |
|
"eval_loss": 0.013054505921900272, |
|
"eval_runtime": 438.9352, |
|
"eval_samples_per_second": 2.586, |
|
"eval_steps_per_second": 1.294, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.822884012539185, |
|
"grad_norm": 0.08447265625, |
|
"learning_rate": 0.0001696300001850887, |
|
"loss": 0.013, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.8620689655172413, |
|
"grad_norm": 0.0615234375, |
|
"learning_rate": 0.0001665448446646357, |
|
"loss": 0.0127, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.9012539184952978, |
|
"grad_norm": 0.08642578125, |
|
"learning_rate": 0.00016334156515403065, |
|
"loss": 0.0124, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.9404388714733543, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 0.00016002584780611194, |
|
"loss": 0.0124, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.9796238244514106, |
|
"grad_norm": 0.0654296875, |
|
"learning_rate": 0.00015660357836252232, |
|
"loss": 0.0127, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.0188087774294672, |
|
"grad_norm": 0.10400390625, |
|
"learning_rate": 0.000153080831705908, |
|
"loss": 0.0107, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.0579937304075235, |
|
"grad_norm": 0.099609375, |
|
"learning_rate": 0.00014946386107637306, |
|
"loss": 0.0092, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.09717868338558, |
|
"grad_norm": 0.0556640625, |
|
"learning_rate": 0.00014575908697133058, |
|
"loss": 0.0093, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.1363636363636362, |
|
"grad_norm": 0.07177734375, |
|
"learning_rate": 0.00014197308574845488, |
|
"loss": 0.0099, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.1755485893416928, |
|
"grad_norm": 0.0869140625, |
|
"learning_rate": 0.0001381125779519658, |
|
"loss": 0.0094, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.1755485893416928, |
|
"eval_loss": 0.012080053798854351, |
|
"eval_runtime": 438.8603, |
|
"eval_samples_per_second": 2.586, |
|
"eval_steps_per_second": 1.294, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.2147335423197492, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 0.00013418441638296652, |
|
"loss": 0.0095, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.2539184952978055, |
|
"grad_norm": 0.1396484375, |
|
"learning_rate": 0.00013019557393501228, |
|
"loss": 0.0093, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.293103448275862, |
|
"grad_norm": 0.0537109375, |
|
"learning_rate": 0.00012615313121650204, |
|
"loss": 0.0098, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.3322884012539185, |
|
"grad_norm": 0.057861328125, |
|
"learning_rate": 0.00012206426398186534, |
|
"loss": 0.01, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.3714733542319748, |
|
"grad_norm": 0.0693359375, |
|
"learning_rate": 0.00011793623039385545, |
|
"loss": 0.0094, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.4106583072100314, |
|
"grad_norm": 0.12109375, |
|
"learning_rate": 0.00011377635813955834, |
|
"loss": 0.0096, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.4498432601880877, |
|
"grad_norm": 0.07177734375, |
|
"learning_rate": 0.00010959203142298981, |
|
"loss": 0.0091, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.489028213166144, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 0.00010539067785736856, |
|
"loss": 0.0086, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.5282131661442007, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 0.0001011797552803333, |
|
"loss": 0.0091, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.567398119122257, |
|
"grad_norm": 0.1318359375, |
|
"learning_rate": 9.696673851550907e-05, |
|
"loss": 0.0093, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.567398119122257, |
|
"eval_loss": 0.011984162032604218, |
|
"eval_runtime": 438.9114, |
|
"eval_samples_per_second": 2.586, |
|
"eval_steps_per_second": 1.294, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.6065830721003134, |
|
"grad_norm": 0.09814453125, |
|
"learning_rate": 9.275910610392104e-05, |
|
"loss": 0.0091, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.64576802507837, |
|
"grad_norm": 0.13671875, |
|
"learning_rate": 8.856432702880984e-05, |
|
"loss": 0.0096, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.6849529780564263, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 8.43898474574128e-05, |
|
"loss": 0.0092, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 8.02430775232462e-05, |
|
"loss": 0.0084, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.7633228840125392, |
|
"grad_norm": 0.062255859375, |
|
"learning_rate": 7.61313781723508e-05, |
|
"loss": 0.0087, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.8025078369905956, |
|
"grad_norm": 0.06591796875, |
|
"learning_rate": 7.206204809685029e-05, |
|
"loss": 0.0086, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.841692789968652, |
|
"grad_norm": 0.052978515625, |
|
"learning_rate": 6.804231077901733e-05, |
|
"loss": 0.0085, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.8808777429467085, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 6.407930166884409e-05, |
|
"loss": 0.0094, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.9200626959247649, |
|
"grad_norm": 0.1044921875, |
|
"learning_rate": 6.018005551787984e-05, |
|
"loss": 0.008, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.9592476489028212, |
|
"grad_norm": 0.05615234375, |
|
"learning_rate": 5.635149389181855e-05, |
|
"loss": 0.0088, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.9592476489028212, |
|
"eval_loss": 0.011125968769192696, |
|
"eval_runtime": 438.9494, |
|
"eval_samples_per_second": 2.586, |
|
"eval_steps_per_second": 1.294, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.9984326018808778, |
|
"grad_norm": 0.038330078125, |
|
"learning_rate": 5.260041288400284e-05, |
|
"loss": 0.0092, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.0376175548589344, |
|
"grad_norm": 0.056396484375, |
|
"learning_rate": 4.893347105165468e-05, |
|
"loss": 0.0064, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.0768025078369905, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 4.535717759624677e-05, |
|
"loss": 0.0065, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.115987460815047, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 4.187788080899591e-05, |
|
"loss": 0.0064, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.1551724137931036, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 3.8501756801988675e-05, |
|
"loss": 0.0062, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.19435736677116, |
|
"grad_norm": 0.0908203125, |
|
"learning_rate": 3.5234798544942914e-05, |
|
"loss": 0.0063, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.2335423197492164, |
|
"grad_norm": 0.0595703125, |
|
"learning_rate": 3.208280522706602e-05, |
|
"loss": 0.0066, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.2727272727272725, |
|
"grad_norm": 0.044677734375, |
|
"learning_rate": 2.9051371962893358e-05, |
|
"loss": 0.0063, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.311912225705329, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 2.6145879860380773e-05, |
|
"loss": 0.0065, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.3510971786833856, |
|
"grad_norm": 0.0908203125, |
|
"learning_rate": 2.337148646888061e-05, |
|
"loss": 0.0065, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.3510971786833856, |
|
"eval_loss": 0.01185206975787878, |
|
"eval_runtime": 438.7429, |
|
"eval_samples_per_second": 2.587, |
|
"eval_steps_per_second": 1.295, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.3902821316614418, |
|
"grad_norm": 0.054931640625, |
|
"learning_rate": 2.073311662395764e-05, |
|
"loss": 0.0064, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.4294670846394983, |
|
"grad_norm": 0.0546875, |
|
"learning_rate": 1.8235453705295848e-05, |
|
"loss": 0.0063, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.468652037617555, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 1.5882931323214713e-05, |
|
"loss": 0.0065, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.507836990595611, |
|
"grad_norm": 0.072265625, |
|
"learning_rate": 1.3679725448551451e-05, |
|
"loss": 0.0062, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.5470219435736676, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 1.1629746999880697e-05, |
|
"loss": 0.0065, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.586206896551724, |
|
"grad_norm": 0.0625, |
|
"learning_rate": 9.736634901228814e-06, |
|
"loss": 0.0064, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.6253918495297803, |
|
"grad_norm": 0.07177734375, |
|
"learning_rate": 8.00374962260706e-06, |
|
"loss": 0.0062, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.664576802507837, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 6.434167214829267e-06, |
|
"loss": 0.0062, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.7037617554858935, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 5.030673849203082e-06, |
|
"loss": 0.007, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.7429467084639496, |
|
"grad_norm": 0.11572265625, |
|
"learning_rate": 3.7957608717875015e-06, |
|
"loss": 0.0064, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.7429467084639496, |
|
"eval_loss": 0.011658398434519768, |
|
"eval_runtime": 438.6997, |
|
"eval_samples_per_second": 2.587, |
|
"eval_steps_per_second": 1.295, |
|
"step": 3500 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 3828, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 3.9128851808256e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|