{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 66, "global_step": 66, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015151515151515152, "grad_norm": 2.1806390285491943, "learning_rate": 1e-05, "loss": 2.259, "step": 1 }, { "epoch": 0.030303030303030304, "grad_norm": 2.0509750843048096, "learning_rate": 9.84848484848485e-06, "loss": 2.1749, "step": 2 }, { "epoch": 0.045454545454545456, "grad_norm": 1.945802092552185, "learning_rate": 9.696969696969698e-06, "loss": 2.1865, "step": 3 }, { "epoch": 0.06060606060606061, "grad_norm": 1.7818379402160645, "learning_rate": 9.545454545454547e-06, "loss": 2.1723, "step": 4 }, { "epoch": 0.07575757575757576, "grad_norm": 1.5140708684921265, "learning_rate": 9.393939393939396e-06, "loss": 2.0943, "step": 5 }, { "epoch": 0.09090909090909091, "grad_norm": 1.168502688407898, "learning_rate": 9.242424242424244e-06, "loss": 1.9775, "step": 6 }, { "epoch": 0.10606060606060606, "grad_norm": 0.9731884002685547, "learning_rate": 9.090909090909091e-06, "loss": 1.9921, "step": 7 }, { "epoch": 0.12121212121212122, "grad_norm": 0.8054101467132568, "learning_rate": 8.93939393939394e-06, "loss": 1.9888, "step": 8 }, { "epoch": 0.13636363636363635, "grad_norm": 0.6741383075714111, "learning_rate": 8.787878787878788e-06, "loss": 1.8612, "step": 9 }, { "epoch": 0.15151515151515152, "grad_norm": 0.6076740026473999, "learning_rate": 8.636363636363637e-06, "loss": 1.7527, "step": 10 }, { "epoch": 0.16666666666666666, "grad_norm": 0.6412762403488159, "learning_rate": 8.484848484848486e-06, "loss": 1.8131, "step": 11 }, { "epoch": 0.18181818181818182, "grad_norm": 0.6954469680786133, "learning_rate": 8.333333333333334e-06, "loss": 1.8204, "step": 12 }, { "epoch": 0.19696969696969696, "grad_norm": 0.7794247269630432, "learning_rate": 8.181818181818183e-06, "loss": 1.86, "step": 13 }, { "epoch": 0.21212121212121213, "grad_norm": 0.811967134475708, "learning_rate": 8.03030303030303e-06, "loss": 1.8567, "step": 14 }, { "epoch": 0.22727272727272727, "grad_norm": 0.7707127928733826, "learning_rate": 7.87878787878788e-06, "loss": 1.7976, "step": 15 }, { "epoch": 0.24242424242424243, "grad_norm": 0.7628827691078186, "learning_rate": 7.727272727272727e-06, "loss": 1.7364, "step": 16 }, { "epoch": 0.25757575757575757, "grad_norm": 0.7113344073295593, "learning_rate": 7.5757575757575764e-06, "loss": 1.7219, "step": 17 }, { "epoch": 0.2727272727272727, "grad_norm": 0.6681413054466248, "learning_rate": 7.424242424242425e-06, "loss": 1.7664, "step": 18 }, { "epoch": 0.2878787878787879, "grad_norm": 0.6175987720489502, "learning_rate": 7.272727272727273e-06, "loss": 1.6869, "step": 19 }, { "epoch": 0.30303030303030304, "grad_norm": 0.5634602904319763, "learning_rate": 7.121212121212122e-06, "loss": 1.7165, "step": 20 }, { "epoch": 0.3181818181818182, "grad_norm": 0.4820578694343567, "learning_rate": 6.969696969696971e-06, "loss": 1.6105, "step": 21 }, { "epoch": 0.3333333333333333, "grad_norm": 0.5000921487808228, "learning_rate": 6.818181818181818e-06, "loss": 1.6858, "step": 22 }, { "epoch": 0.3484848484848485, "grad_norm": 0.4802851676940918, "learning_rate": 6.666666666666667e-06, "loss": 1.6478, "step": 23 }, { "epoch": 0.36363636363636365, "grad_norm": 0.45336592197418213, "learning_rate": 6.515151515151516e-06, "loss": 1.5984, "step": 24 }, { "epoch": 0.3787878787878788, "grad_norm": 0.4636070132255554, "learning_rate": 6.363636363636364e-06, "loss": 1.613, "step": 25 }, { "epoch": 0.3939393939393939, "grad_norm": 0.4598033130168915, "learning_rate": 6.212121212121213e-06, "loss": 1.6106, "step": 26 }, { "epoch": 0.4090909090909091, "grad_norm": 0.46231794357299805, "learning_rate": 6.060606060606061e-06, "loss": 1.581, "step": 27 }, { "epoch": 0.42424242424242425, "grad_norm": 0.4495490789413452, "learning_rate": 5.90909090909091e-06, "loss": 1.571, "step": 28 }, { "epoch": 0.4393939393939394, "grad_norm": 0.43504828214645386, "learning_rate": 5.7575757575757586e-06, "loss": 1.4945, "step": 29 }, { "epoch": 0.45454545454545453, "grad_norm": 0.4237779974937439, "learning_rate": 5.606060606060606e-06, "loss": 1.5492, "step": 30 }, { "epoch": 0.4696969696969697, "grad_norm": 0.40998709201812744, "learning_rate": 5.4545454545454545e-06, "loss": 1.5383, "step": 31 }, { "epoch": 0.48484848484848486, "grad_norm": 0.3991314470767975, "learning_rate": 5.303030303030303e-06, "loss": 1.5579, "step": 32 }, { "epoch": 0.5, "grad_norm": 0.3889806866645813, "learning_rate": 5.151515151515152e-06, "loss": 1.5391, "step": 33 }, { "epoch": 0.5151515151515151, "grad_norm": 0.382036030292511, "learning_rate": 5e-06, "loss": 1.5403, "step": 34 }, { "epoch": 0.5303030303030303, "grad_norm": 0.3747563064098358, "learning_rate": 4.848484848484849e-06, "loss": 1.5569, "step": 35 }, { "epoch": 0.5454545454545454, "grad_norm": 0.3503689765930176, "learning_rate": 4.696969696969698e-06, "loss": 1.4986, "step": 36 }, { "epoch": 0.5606060606060606, "grad_norm": 0.34654778242111206, "learning_rate": 4.5454545454545455e-06, "loss": 1.4863, "step": 37 }, { "epoch": 0.5757575757575758, "grad_norm": 0.3559574782848358, "learning_rate": 4.393939393939394e-06, "loss": 1.5173, "step": 38 }, { "epoch": 0.5909090909090909, "grad_norm": 0.32772454619407654, "learning_rate": 4.242424242424243e-06, "loss": 1.4364, "step": 39 }, { "epoch": 0.6060606060606061, "grad_norm": 0.3556043207645416, "learning_rate": 4.0909090909090915e-06, "loss": 1.4833, "step": 40 }, { "epoch": 0.6212121212121212, "grad_norm": 0.3289327323436737, "learning_rate": 3.93939393939394e-06, "loss": 1.4412, "step": 41 }, { "epoch": 0.6363636363636364, "grad_norm": 0.38381296396255493, "learning_rate": 3.7878787878787882e-06, "loss": 1.5059, "step": 42 }, { "epoch": 0.6515151515151515, "grad_norm": 0.33431607484817505, "learning_rate": 3.6363636363636366e-06, "loss": 1.4532, "step": 43 }, { "epoch": 0.6666666666666666, "grad_norm": 0.34528639912605286, "learning_rate": 3.4848484848484854e-06, "loss": 1.4567, "step": 44 }, { "epoch": 0.6818181818181818, "grad_norm": 0.3294561803340912, "learning_rate": 3.3333333333333333e-06, "loss": 1.4228, "step": 45 }, { "epoch": 0.696969696969697, "grad_norm": 0.32518258690834045, "learning_rate": 3.181818181818182e-06, "loss": 1.4301, "step": 46 }, { "epoch": 0.7121212121212122, "grad_norm": 0.3199913203716278, "learning_rate": 3.0303030303030305e-06, "loss": 1.4139, "step": 47 }, { "epoch": 0.7272727272727273, "grad_norm": 0.3203113377094269, "learning_rate": 2.8787878787878793e-06, "loss": 1.4344, "step": 48 }, { "epoch": 0.7424242424242424, "grad_norm": 0.3189423680305481, "learning_rate": 2.7272727272727272e-06, "loss": 1.407, "step": 49 }, { "epoch": 0.7575757575757576, "grad_norm": 0.3475227653980255, "learning_rate": 2.575757575757576e-06, "loss": 1.4637, "step": 50 }, { "epoch": 0.7727272727272727, "grad_norm": 0.3078831136226654, "learning_rate": 2.4242424242424244e-06, "loss": 1.395, "step": 51 }, { "epoch": 0.7878787878787878, "grad_norm": 0.3077380061149597, "learning_rate": 2.2727272727272728e-06, "loss": 1.3932, "step": 52 }, { "epoch": 0.803030303030303, "grad_norm": 0.33048221468925476, "learning_rate": 2.1212121212121216e-06, "loss": 1.4297, "step": 53 }, { "epoch": 0.8181818181818182, "grad_norm": 0.3076639175415039, "learning_rate": 1.96969696969697e-06, "loss": 1.3972, "step": 54 }, { "epoch": 0.8333333333333334, "grad_norm": 0.31399327516555786, "learning_rate": 1.8181818181818183e-06, "loss": 1.4242, "step": 55 }, { "epoch": 0.8484848484848485, "grad_norm": 0.29711833596229553, "learning_rate": 1.6666666666666667e-06, "loss": 1.3937, "step": 56 }, { "epoch": 0.8636363636363636, "grad_norm": 0.30907464027404785, "learning_rate": 1.5151515151515152e-06, "loss": 1.4002, "step": 57 }, { "epoch": 0.8787878787878788, "grad_norm": 0.3013785481452942, "learning_rate": 1.3636363636363636e-06, "loss": 1.3923, "step": 58 }, { "epoch": 0.8939393939393939, "grad_norm": 0.31994277238845825, "learning_rate": 1.2121212121212122e-06, "loss": 1.4338, "step": 59 }, { "epoch": 0.9090909090909091, "grad_norm": 0.3298662006855011, "learning_rate": 1.0606060606060608e-06, "loss": 1.4199, "step": 60 }, { "epoch": 0.9242424242424242, "grad_norm": 0.317755788564682, "learning_rate": 9.090909090909091e-07, "loss": 1.4137, "step": 61 }, { "epoch": 0.9393939393939394, "grad_norm": 0.29229098558425903, "learning_rate": 7.575757575757576e-07, "loss": 1.3405, "step": 62 }, { "epoch": 0.9545454545454546, "grad_norm": 0.3137110471725464, "learning_rate": 6.060606060606061e-07, "loss": 1.4083, "step": 63 }, { "epoch": 0.9696969696969697, "grad_norm": 0.30957749485969543, "learning_rate": 4.5454545454545457e-07, "loss": 1.4063, "step": 64 }, { "epoch": 0.9848484848484849, "grad_norm": 0.3310662806034088, "learning_rate": 3.0303030303030305e-07, "loss": 1.4201, "step": 65 }, { "epoch": 1.0, "grad_norm": 0.297313928604126, "learning_rate": 1.5151515151515152e-07, "loss": 1.3758, "step": 66 }, { "epoch": 1.0, "eval_loss": 1.4051569700241089, "eval_runtime": 4.459, "eval_samples_per_second": 1.57, "eval_steps_per_second": 0.224, "step": 66 } ], "logging_steps": 1.0, "max_steps": 66, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.3786133191275315e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }