{ "best_metric": 0.011125968769192696, "best_model_checkpoint": "./mistral-magyar-portas-results/checkpoint-2500", "epoch": 2.7429467084639496, "eval_steps": 500, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03918495297805643, "grad_norm": 0.0693359375, "learning_rate": 0.0001, "loss": 0.0137, "step": 50 }, { "epoch": 0.07836990595611286, "grad_norm": 0.890625, "learning_rate": 0.0002, "loss": 0.0186, "step": 100 }, { "epoch": 0.11755485893416928, "grad_norm": 0.96875, "learning_rate": 0.00019991124482238458, "loss": 0.0345, "step": 150 }, { "epoch": 0.15673981191222572, "grad_norm": 0.146484375, "learning_rate": 0.00019964513683916945, "loss": 0.0232, "step": 200 }, { "epoch": 0.19592476489028213, "grad_norm": 0.09228515625, "learning_rate": 0.00019920214841958082, "loss": 0.0165, "step": 250 }, { "epoch": 0.23510971786833856, "grad_norm": 0.0537109375, "learning_rate": 0.00019858306591393602, "loss": 0.0145, "step": 300 }, { "epoch": 0.274294670846395, "grad_norm": 0.09130859375, "learning_rate": 0.00019778898825778996, "loss": 0.0136, "step": 350 }, { "epoch": 0.31347962382445144, "grad_norm": 0.06494140625, "learning_rate": 0.00019682132502121086, "loss": 0.0135, "step": 400 }, { "epoch": 0.35266457680250785, "grad_norm": 0.2373046875, "learning_rate": 0.00019568179390664744, "loss": 0.0169, "step": 450 }, { "epoch": 0.39184952978056425, "grad_norm": 0.953125, "learning_rate": 0.00019437241769982907, "loss": 0.0323, "step": 500 }, { "epoch": 0.39184952978056425, "eval_loss": 0.06335489451885223, "eval_runtime": 438.733, "eval_samples_per_second": 2.587, "eval_steps_per_second": 1.295, "step": 500 }, { "epoch": 0.43103448275862066, "grad_norm": 0.115234375, "learning_rate": 0.00019289552067911186, "loss": 0.0273, "step": 550 }, { "epoch": 0.4702194357366771, "grad_norm": 4.03125, "learning_rate": 0.00019125372448964363, "loss": 0.0197, "step": 600 }, { "epoch": 0.5094043887147336, "grad_norm": 0.11572265625, "learning_rate": 0.00018944994348967247, "loss": 0.0201, "step": 650 }, { "epoch": 0.54858934169279, "grad_norm": 0.1865234375, "learning_rate": 0.00018748737957725904, "loss": 0.022, "step": 700 }, { "epoch": 0.5877742946708464, "grad_norm": 0.0888671875, "learning_rate": 0.00018536951650657585, "loss": 0.0157, "step": 750 }, { "epoch": 0.6269592476489029, "grad_norm": 0.07421875, "learning_rate": 0.00018310011370388304, "loss": 0.0144, "step": 800 }, { "epoch": 0.6661442006269592, "grad_norm": 0.08984375, "learning_rate": 0.00018068319959415723, "loss": 0.0149, "step": 850 }, { "epoch": 0.7053291536050157, "grad_norm": 0.140625, "learning_rate": 0.00017812306445022025, "loss": 0.0137, "step": 900 }, { "epoch": 0.7445141065830722, "grad_norm": 0.05810546875, "learning_rate": 0.0001754242527770605, "loss": 0.0142, "step": 950 }, { "epoch": 0.7836990595611285, "grad_norm": 0.10791015625, "learning_rate": 0.000172591555244866, "loss": 0.0123, "step": 1000 }, { "epoch": 0.7836990595611285, "eval_loss": 0.013054505921900272, "eval_runtime": 438.9352, "eval_samples_per_second": 2.586, "eval_steps_per_second": 1.294, "step": 1000 }, { "epoch": 0.822884012539185, "grad_norm": 0.08447265625, "learning_rate": 0.0001696300001850887, "loss": 0.013, "step": 1050 }, { "epoch": 0.8620689655172413, "grad_norm": 0.0615234375, "learning_rate": 0.0001665448446646357, "loss": 0.0127, "step": 1100 }, { "epoch": 0.9012539184952978, "grad_norm": 0.08642578125, "learning_rate": 0.00016334156515403065, "loss": 0.0124, "step": 1150 }, { "epoch": 0.9404388714733543, "grad_norm": 0.16015625, "learning_rate": 0.00016002584780611194, "loss": 0.0124, "step": 1200 }, { "epoch": 0.9796238244514106, "grad_norm": 0.0654296875, "learning_rate": 0.00015660357836252232, "loss": 0.0127, "step": 1250 }, { "epoch": 1.0188087774294672, "grad_norm": 0.10400390625, "learning_rate": 0.000153080831705908, "loss": 0.0107, "step": 1300 }, { "epoch": 1.0579937304075235, "grad_norm": 0.099609375, "learning_rate": 0.00014946386107637306, "loss": 0.0092, "step": 1350 }, { "epoch": 1.09717868338558, "grad_norm": 0.0556640625, "learning_rate": 0.00014575908697133058, "loss": 0.0093, "step": 1400 }, { "epoch": 1.1363636363636362, "grad_norm": 0.07177734375, "learning_rate": 0.00014197308574845488, "loss": 0.0099, "step": 1450 }, { "epoch": 1.1755485893416928, "grad_norm": 0.0869140625, "learning_rate": 0.0001381125779519658, "loss": 0.0094, "step": 1500 }, { "epoch": 1.1755485893416928, "eval_loss": 0.012080053798854351, "eval_runtime": 438.8603, "eval_samples_per_second": 2.586, "eval_steps_per_second": 1.294, "step": 1500 }, { "epoch": 1.2147335423197492, "grad_norm": 0.078125, "learning_rate": 0.00013418441638296652, "loss": 0.0095, "step": 1550 }, { "epoch": 1.2539184952978055, "grad_norm": 0.1396484375, "learning_rate": 0.00013019557393501228, "loss": 0.0093, "step": 1600 }, { "epoch": 1.293103448275862, "grad_norm": 0.0537109375, "learning_rate": 0.00012615313121650204, "loss": 0.0098, "step": 1650 }, { "epoch": 1.3322884012539185, "grad_norm": 0.057861328125, "learning_rate": 0.00012206426398186534, "loss": 0.01, "step": 1700 }, { "epoch": 1.3714733542319748, "grad_norm": 0.0693359375, "learning_rate": 0.00011793623039385545, "loss": 0.0094, "step": 1750 }, { "epoch": 1.4106583072100314, "grad_norm": 0.12109375, "learning_rate": 0.00011377635813955834, "loss": 0.0096, "step": 1800 }, { "epoch": 1.4498432601880877, "grad_norm": 0.07177734375, "learning_rate": 0.00010959203142298981, "loss": 0.0091, "step": 1850 }, { "epoch": 1.489028213166144, "grad_norm": 0.050537109375, "learning_rate": 0.00010539067785736856, "loss": 0.0086, "step": 1900 }, { "epoch": 1.5282131661442007, "grad_norm": 0.150390625, "learning_rate": 0.0001011797552803333, "loss": 0.0091, "step": 1950 }, { "epoch": 1.567398119122257, "grad_norm": 0.1318359375, "learning_rate": 9.696673851550907e-05, "loss": 0.0093, "step": 2000 }, { "epoch": 1.567398119122257, "eval_loss": 0.011984162032604218, "eval_runtime": 438.9114, "eval_samples_per_second": 2.586, "eval_steps_per_second": 1.294, "step": 2000 }, { "epoch": 1.6065830721003134, "grad_norm": 0.09814453125, "learning_rate": 9.275910610392104e-05, "loss": 0.0091, "step": 2050 }, { "epoch": 1.64576802507837, "grad_norm": 0.13671875, "learning_rate": 8.856432702880984e-05, "loss": 0.0096, "step": 2100 }, { "epoch": 1.6849529780564263, "grad_norm": 0.07568359375, "learning_rate": 8.43898474574128e-05, "loss": 0.0092, "step": 2150 }, { "epoch": 1.7241379310344827, "grad_norm": 0.046875, "learning_rate": 8.02430775232462e-05, "loss": 0.0084, "step": 2200 }, { "epoch": 1.7633228840125392, "grad_norm": 0.062255859375, "learning_rate": 7.61313781723508e-05, "loss": 0.0087, "step": 2250 }, { "epoch": 1.8025078369905956, "grad_norm": 0.06591796875, "learning_rate": 7.206204809685029e-05, "loss": 0.0086, "step": 2300 }, { "epoch": 1.841692789968652, "grad_norm": 0.052978515625, "learning_rate": 6.804231077901733e-05, "loss": 0.0085, "step": 2350 }, { "epoch": 1.8808777429467085, "grad_norm": 0.07421875, "learning_rate": 6.407930166884409e-05, "loss": 0.0094, "step": 2400 }, { "epoch": 1.9200626959247649, "grad_norm": 0.1044921875, "learning_rate": 6.018005551787984e-05, "loss": 0.008, "step": 2450 }, { "epoch": 1.9592476489028212, "grad_norm": 0.05615234375, "learning_rate": 5.635149389181855e-05, "loss": 0.0088, "step": 2500 }, { "epoch": 1.9592476489028212, "eval_loss": 0.011125968769192696, "eval_runtime": 438.9494, "eval_samples_per_second": 2.586, "eval_steps_per_second": 1.294, "step": 2500 }, { "epoch": 1.9984326018808778, "grad_norm": 0.038330078125, "learning_rate": 5.260041288400284e-05, "loss": 0.0092, "step": 2550 }, { "epoch": 2.0376175548589344, "grad_norm": 0.056396484375, "learning_rate": 4.893347105165468e-05, "loss": 0.0064, "step": 2600 }, { "epoch": 2.0768025078369905, "grad_norm": 0.0517578125, "learning_rate": 4.535717759624677e-05, "loss": 0.0065, "step": 2650 }, { "epoch": 2.115987460815047, "grad_norm": 0.049560546875, "learning_rate": 4.187788080899591e-05, "loss": 0.0064, "step": 2700 }, { "epoch": 2.1551724137931036, "grad_norm": 0.049072265625, "learning_rate": 3.8501756801988675e-05, "loss": 0.0062, "step": 2750 }, { "epoch": 2.19435736677116, "grad_norm": 0.0908203125, "learning_rate": 3.5234798544942914e-05, "loss": 0.0063, "step": 2800 }, { "epoch": 2.2335423197492164, "grad_norm": 0.0595703125, "learning_rate": 3.208280522706602e-05, "loss": 0.0066, "step": 2850 }, { "epoch": 2.2727272727272725, "grad_norm": 0.044677734375, "learning_rate": 2.9051371962893358e-05, "loss": 0.0063, "step": 2900 }, { "epoch": 2.311912225705329, "grad_norm": 0.05224609375, "learning_rate": 2.6145879860380773e-05, "loss": 0.0065, "step": 2950 }, { "epoch": 2.3510971786833856, "grad_norm": 0.0908203125, "learning_rate": 2.337148646888061e-05, "loss": 0.0065, "step": 3000 }, { "epoch": 2.3510971786833856, "eval_loss": 0.01185206975787878, "eval_runtime": 438.7429, "eval_samples_per_second": 2.587, "eval_steps_per_second": 1.295, "step": 3000 }, { "epoch": 2.3902821316614418, "grad_norm": 0.054931640625, "learning_rate": 2.073311662395764e-05, "loss": 0.0064, "step": 3050 }, { "epoch": 2.4294670846394983, "grad_norm": 0.0546875, "learning_rate": 1.8235453705295848e-05, "loss": 0.0063, "step": 3100 }, { "epoch": 2.468652037617555, "grad_norm": 0.1494140625, "learning_rate": 1.5882931323214713e-05, "loss": 0.0065, "step": 3150 }, { "epoch": 2.507836990595611, "grad_norm": 0.072265625, "learning_rate": 1.3679725448551451e-05, "loss": 0.0062, "step": 3200 }, { "epoch": 2.5470219435736676, "grad_norm": 0.078125, "learning_rate": 1.1629746999880697e-05, "loss": 0.0065, "step": 3250 }, { "epoch": 2.586206896551724, "grad_norm": 0.0625, "learning_rate": 9.736634901228814e-06, "loss": 0.0064, "step": 3300 }, { "epoch": 2.6253918495297803, "grad_norm": 0.07177734375, "learning_rate": 8.00374962260706e-06, "loss": 0.0062, "step": 3350 }, { "epoch": 2.664576802507837, "grad_norm": 0.051513671875, "learning_rate": 6.434167214829267e-06, "loss": 0.0062, "step": 3400 }, { "epoch": 2.7037617554858935, "grad_norm": 0.251953125, "learning_rate": 5.030673849203082e-06, "loss": 0.007, "step": 3450 }, { "epoch": 2.7429467084639496, "grad_norm": 0.11572265625, "learning_rate": 3.7957608717875015e-06, "loss": 0.0064, "step": 3500 }, { "epoch": 2.7429467084639496, "eval_loss": 0.011658398434519768, "eval_runtime": 438.6997, "eval_samples_per_second": 2.587, "eval_steps_per_second": 1.295, "step": 3500 } ], "logging_steps": 50, "max_steps": 3828, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 3.9128851808256e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }