{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 25.0, "global_step": 116, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03463203463203463, "grad_norm": null, "learning_rate": 0.0002, "loss": 73.3116, "step": 1 }, { "epoch": 0.06926406926406926, "grad_norm": 20.237655639648438, "learning_rate": 0.0002, "loss": 74.1209, "step": 2 }, { "epoch": 0.1038961038961039, "grad_norm": 22.083927154541016, "learning_rate": 0.00019827586206896554, "loss": 70.1702, "step": 3 }, { "epoch": 0.13852813852813853, "grad_norm": 24.736431121826172, "learning_rate": 0.00019655172413793104, "loss": 69.115, "step": 4 }, { "epoch": 0.17316017316017315, "grad_norm": 30.25436782836914, "learning_rate": 0.00019482758620689657, "loss": 62.927, "step": 5 }, { "epoch": 0.2077922077922078, "grad_norm": 32.069908142089844, "learning_rate": 0.0001931034482758621, "loss": 60.1745, "step": 6 }, { "epoch": 0.24242424242424243, "grad_norm": 30.88319969177246, "learning_rate": 0.0001913793103448276, "loss": 51.702, "step": 7 }, { "epoch": 0.27705627705627706, "grad_norm": 32.261192321777344, "learning_rate": 0.00018965517241379312, "loss": 50.5157, "step": 8 }, { "epoch": 0.3116883116883117, "grad_norm": 33.49891662597656, "learning_rate": 0.00018793103448275865, "loss": 51.5589, "step": 9 }, { "epoch": 0.3463203463203463, "grad_norm": 24.870943069458008, "learning_rate": 0.00018620689655172415, "loss": 50.8659, "step": 10 }, { "epoch": 0.38095238095238093, "grad_norm": 25.71776580810547, "learning_rate": 0.00018448275862068968, "loss": 51.3838, "step": 11 }, { "epoch": 0.4155844155844156, "grad_norm": null, "learning_rate": 0.00018275862068965518, "loss": 43.7134, "step": 12 }, { "epoch": 0.45021645021645024, "grad_norm": 30.822378158569336, "learning_rate": 0.00018275862068965518, "loss": 47.239, "step": 13 }, { "epoch": 0.48484848484848486, "grad_norm": 
22.92112159729004, "learning_rate": 0.0001810344827586207, "loss": 47.168, "step": 14 }, { "epoch": 0.5194805194805194, "grad_norm": 28.55995750427246, "learning_rate": 0.0001793103448275862, "loss": 46.0921, "step": 15 }, { "epoch": 0.5541125541125541, "grad_norm": 28.638643264770508, "learning_rate": 0.00017758620689655173, "loss": 43.2701, "step": 16 }, { "epoch": 0.5887445887445888, "grad_norm": 13.875872611999512, "learning_rate": 0.00017586206896551723, "loss": 40.5568, "step": 17 }, { "epoch": 0.6233766233766234, "grad_norm": 15.718329429626465, "learning_rate": 0.00017413793103448276, "loss": 39.8683, "step": 18 }, { "epoch": 0.658008658008658, "grad_norm": 20.134868621826172, "learning_rate": 0.00017241379310344826, "loss": 43.4786, "step": 19 }, { "epoch": 0.6926406926406926, "grad_norm": 17.95608139038086, "learning_rate": 0.0001706896551724138, "loss": 39.757, "step": 20 }, { "epoch": 0.7272727272727273, "grad_norm": 12.064809799194336, "learning_rate": 0.00016896551724137932, "loss": 37.5556, "step": 21 }, { "epoch": 0.7619047619047619, "grad_norm": 22.8414249420166, "learning_rate": 0.00016724137931034482, "loss": 47.1283, "step": 22 }, { "epoch": 0.7965367965367965, "grad_norm": 12.476370811462402, "learning_rate": 0.00016551724137931035, "loss": 44.1665, "step": 23 }, { "epoch": 0.8311688311688312, "grad_norm": 19.783571243286133, "learning_rate": 0.00016379310344827587, "loss": 40.582, "step": 24 }, { "epoch": 0.8658008658008658, "grad_norm": 20.8145751953125, "learning_rate": 0.00016206896551724137, "loss": 41.1066, "step": 25 }, { "epoch": 0.9004329004329005, "grad_norm": 18.939414978027344, "learning_rate": 0.0001603448275862069, "loss": 40.8233, "step": 26 }, { "epoch": 0.935064935064935, "grad_norm": 16.40350914001465, "learning_rate": 0.00015862068965517243, "loss": 38.9867, "step": 27 }, { "epoch": 0.9696969696969697, "grad_norm": 12.22852897644043, "learning_rate": 0.00015689655172413793, "loss": 37.3049, "step": 28 }, { "epoch": 1.0, 
"grad_norm": 13.104403495788574, "learning_rate": 0.00015517241379310346, "loss": 35.9869, "step": 29 }, { "epoch": 1.0346320346320346, "grad_norm": 18.48545265197754, "learning_rate": 0.00015344827586206899, "loss": 41.11, "step": 30 }, { "epoch": 1.0692640692640694, "grad_norm": 22.71863555908203, "learning_rate": 0.00015172413793103449, "loss": 42.4246, "step": 31 }, { "epoch": 1.103896103896104, "grad_norm": 17.317848205566406, "learning_rate": 0.00015000000000000001, "loss": 41.0728, "step": 32 }, { "epoch": 1.1385281385281385, "grad_norm": 9.099658012390137, "learning_rate": 0.00014827586206896554, "loss": 35.8977, "step": 33 }, { "epoch": 1.173160173160173, "grad_norm": 18.253623962402344, "learning_rate": 0.00014655172413793104, "loss": 45.2388, "step": 34 }, { "epoch": 1.2077922077922079, "grad_norm": 11.06932258605957, "learning_rate": 0.00014482758620689657, "loss": 38.9349, "step": 35 }, { "epoch": 1.2424242424242424, "grad_norm": 13.193914413452148, "learning_rate": 0.0001431034482758621, "loss": 39.5845, "step": 36 }, { "epoch": 1.277056277056277, "grad_norm": 17.09387969970703, "learning_rate": 0.0001413793103448276, "loss": 37.5467, "step": 37 }, { "epoch": 1.3116883116883118, "grad_norm": 10.892671585083008, "learning_rate": 0.0001396551724137931, "loss": 40.0311, "step": 38 }, { "epoch": 1.3463203463203464, "grad_norm": 12.092562675476074, "learning_rate": 0.00013793103448275863, "loss": 37.58, "step": 39 }, { "epoch": 1.380952380952381, "grad_norm": 14.318760871887207, "learning_rate": 0.00013620689655172413, "loss": 44.1895, "step": 40 }, { "epoch": 1.4155844155844157, "grad_norm": 8.378585815429688, "learning_rate": 0.00013448275862068965, "loss": 40.4597, "step": 41 }, { "epoch": 1.4502164502164503, "grad_norm": 12.586594581604004, "learning_rate": 0.00013275862068965518, "loss": 34.0271, "step": 42 }, { "epoch": 1.4848484848484849, "grad_norm": 13.52869701385498, "learning_rate": 0.00013103448275862068, "loss": 40.9999, "step": 43 }, { 
"epoch": 1.5194805194805194, "grad_norm": 9.904869079589844, "learning_rate": 0.0001293103448275862, "loss": 39.7496, "step": 44 }, { "epoch": 1.554112554112554, "grad_norm": 17.99386978149414, "learning_rate": 0.00012758620689655174, "loss": 44.0324, "step": 45 }, { "epoch": 1.5887445887445888, "grad_norm": 13.183422088623047, "learning_rate": 0.00012586206896551724, "loss": 43.4125, "step": 46 }, { "epoch": 1.6233766233766234, "grad_norm": 10.06049919128418, "learning_rate": 0.00012413793103448277, "loss": 39.1023, "step": 47 }, { "epoch": 1.658008658008658, "grad_norm": 15.202058792114258, "learning_rate": 0.00012241379310344827, "loss": 37.0854, "step": 48 }, { "epoch": 1.6926406926406927, "grad_norm": 14.160669326782227, "learning_rate": 0.0001206896551724138, "loss": 36.6519, "step": 49 }, { "epoch": 1.7272727272727273, "grad_norm": 10.129295349121094, "learning_rate": 0.00011896551724137932, "loss": 40.1695, "step": 50 }, { "epoch": 1.7619047619047619, "grad_norm": 10.78943920135498, "learning_rate": 0.00011724137931034482, "loss": 40.4436, "step": 51 }, { "epoch": 1.7965367965367967, "grad_norm": 9.714445114135742, "learning_rate": 0.00011551724137931035, "loss": 41.4942, "step": 52 }, { "epoch": 1.8311688311688312, "grad_norm": 17.142423629760742, "learning_rate": 0.00011379310344827588, "loss": 45.8405, "step": 53 }, { "epoch": 1.8658008658008658, "grad_norm": 14.116847038269043, "learning_rate": 0.00011206896551724138, "loss": 38.918, "step": 54 }, { "epoch": 1.9004329004329006, "grad_norm": 8.169567108154297, "learning_rate": 0.0001103448275862069, "loss": 39.6313, "step": 55 }, { "epoch": 1.935064935064935, "grad_norm": 10.515144348144531, "learning_rate": 0.00010862068965517242, "loss": 42.3468, "step": 56 }, { "epoch": 1.9696969696969697, "grad_norm": 9.634146690368652, "learning_rate": 0.00010689655172413792, "loss": 43.1881, "step": 57 }, { "epoch": 2.0, "grad_norm": 16.54326629638672, "learning_rate": 0.00010517241379310345, "loss": 31.4997, 
"step": 58 }, { "epoch": 2.034632034632035, "grad_norm": 13.723237991333008, "learning_rate": 0.00010344827586206898, "loss": 39.0238, "step": 59 }, { "epoch": 2.069264069264069, "grad_norm": 11.90444564819336, "learning_rate": 0.00010172413793103448, "loss": 45.2243, "step": 60 }, { "epoch": 2.103896103896104, "grad_norm": 10.994711875915527, "learning_rate": 0.0001, "loss": 39.9984, "step": 61 }, { "epoch": 2.1385281385281387, "grad_norm": 11.435210227966309, "learning_rate": 9.827586206896552e-05, "loss": 43.82, "step": 62 }, { "epoch": 2.173160173160173, "grad_norm": 13.717733383178711, "learning_rate": 9.655172413793105e-05, "loss": 38.5156, "step": 63 }, { "epoch": 2.207792207792208, "grad_norm": 10.357719421386719, "learning_rate": 9.482758620689656e-05, "loss": 43.2746, "step": 64 }, { "epoch": 2.242424242424242, "grad_norm": 14.937288284301758, "learning_rate": 9.310344827586207e-05, "loss": 38.4409, "step": 65 }, { "epoch": 2.277056277056277, "grad_norm": 14.609394073486328, "learning_rate": 9.137931034482759e-05, "loss": 37.9292, "step": 66 }, { "epoch": 2.311688311688312, "grad_norm": 14.482377052307129, "learning_rate": 8.96551724137931e-05, "loss": 37.5054, "step": 67 }, { "epoch": 2.346320346320346, "grad_norm": 27.845836639404297, "learning_rate": 8.793103448275862e-05, "loss": 31.3865, "step": 68 }, { "epoch": 2.380952380952381, "grad_norm": 12.11486530303955, "learning_rate": 8.620689655172413e-05, "loss": 39.3685, "step": 69 }, { "epoch": 2.4155844155844157, "grad_norm": 12.73064136505127, "learning_rate": 8.448275862068966e-05, "loss": 39.5741, "step": 70 }, { "epoch": 2.45021645021645, "grad_norm": 12.202842712402344, "learning_rate": 8.275862068965517e-05, "loss": 39.1132, "step": 71 }, { "epoch": 2.484848484848485, "grad_norm": 10.822341918945312, "learning_rate": 8.103448275862069e-05, "loss": 37.3896, "step": 72 }, { "epoch": 2.5194805194805197, "grad_norm": 18.123933792114258, "learning_rate": 7.931034482758621e-05, "loss": 40.2132, 
"step": 73 }, { "epoch": 2.554112554112554, "grad_norm": 11.42330265045166, "learning_rate": 7.758620689655173e-05, "loss": 36.5826, "step": 74 }, { "epoch": 2.588744588744589, "grad_norm": 14.098088264465332, "learning_rate": 7.586206896551724e-05, "loss": 36.8519, "step": 75 }, { "epoch": 2.6233766233766236, "grad_norm": 21.339242935180664, "learning_rate": 7.413793103448277e-05, "loss": 44.4439, "step": 76 }, { "epoch": 2.658008658008658, "grad_norm": 12.531341552734375, "learning_rate": 7.241379310344828e-05, "loss": 36.067, "step": 77 }, { "epoch": 2.6926406926406927, "grad_norm": 14.97744083404541, "learning_rate": 7.06896551724138e-05, "loss": 43.6598, "step": 78 }, { "epoch": 2.7272727272727275, "grad_norm": 12.019708633422852, "learning_rate": 6.896551724137931e-05, "loss": 41.1947, "step": 79 }, { "epoch": 2.761904761904762, "grad_norm": 11.916413307189941, "learning_rate": 6.724137931034483e-05, "loss": 38.0417, "step": 80 }, { "epoch": 2.7965367965367967, "grad_norm": 24.30942726135254, "learning_rate": 6.551724137931034e-05, "loss": 34.9891, "step": 81 }, { "epoch": 2.8311688311688314, "grad_norm": 13.871292114257812, "learning_rate": 6.379310344827587e-05, "loss": 38.7756, "step": 82 }, { "epoch": 2.865800865800866, "grad_norm": 13.42234992980957, "learning_rate": 6.206896551724138e-05, "loss": 37.8582, "step": 83 }, { "epoch": 2.9004329004329006, "grad_norm": 10.090055465698242, "learning_rate": 6.03448275862069e-05, "loss": 41.1415, "step": 84 }, { "epoch": 2.935064935064935, "grad_norm": 14.918863296508789, "learning_rate": 5.862068965517241e-05, "loss": 35.2293, "step": 85 }, { "epoch": 2.9696969696969697, "grad_norm": 16.72150993347168, "learning_rate": 5.689655172413794e-05, "loss": 34.67, "step": 86 }, { "epoch": 3.0, "grad_norm": 10.314674377441406, "learning_rate": 5.517241379310345e-05, "loss": 34.2737, "step": 87 }, { "epoch": 3.034632034632035, "grad_norm": 11.975030899047852, "learning_rate": 5.344827586206896e-05, "loss": 36.4709, 
"step": 88 }, { "epoch": 3.069264069264069, "grad_norm": 12.001708984375, "learning_rate": 5.172413793103449e-05, "loss": 37.6535, "step": 89 }, { "epoch": 3.103896103896104, "grad_norm": 24.412235260009766, "learning_rate": 5e-05, "loss": 42.9897, "step": 90 }, { "epoch": 3.1385281385281387, "grad_norm": 18.475101470947266, "learning_rate": 4.827586206896552e-05, "loss": 40.2994, "step": 91 }, { "epoch": 3.173160173160173, "grad_norm": 16.03579330444336, "learning_rate": 4.655172413793104e-05, "loss": 39.8546, "step": 92 }, { "epoch": 3.207792207792208, "grad_norm": 14.9187650680542, "learning_rate": 4.482758620689655e-05, "loss": 36.7785, "step": 93 }, { "epoch": 3.242424242424242, "grad_norm": 12.255768775939941, "learning_rate": 4.3103448275862066e-05, "loss": 36.3773, "step": 94 }, { "epoch": 3.277056277056277, "grad_norm": 10.56830883026123, "learning_rate": 4.1379310344827587e-05, "loss": 36.5246, "step": 95 }, { "epoch": 3.311688311688312, "grad_norm": 11.949898719787598, "learning_rate": 3.965517241379311e-05, "loss": 37.3966, "step": 96 }, { "epoch": 3.346320346320346, "grad_norm": 12.314105987548828, "learning_rate": 3.793103448275862e-05, "loss": 39.7101, "step": 97 }, { "epoch": 3.380952380952381, "grad_norm": 12.357243537902832, "learning_rate": 3.620689655172414e-05, "loss": 40.0369, "step": 98 }, { "epoch": 3.4155844155844157, "grad_norm": 16.851078033447266, "learning_rate": 3.4482758620689657e-05, "loss": 36.3794, "step": 99 }, { "epoch": 3.45021645021645, "grad_norm": 14.092662811279297, "learning_rate": 3.275862068965517e-05, "loss": 40.9533, "step": 100 }, { "epoch": 3.484848484848485, "grad_norm": 11.862981796264648, "learning_rate": 3.103448275862069e-05, "loss": 40.7241, "step": 101 }, { "epoch": 3.5194805194805197, "grad_norm": 24.208559036254883, "learning_rate": 2.9310344827586206e-05, "loss": 33.0654, "step": 102 }, { "epoch": 3.554112554112554, "grad_norm": 11.848682403564453, "learning_rate": 2.7586206896551727e-05, "loss": 38.5032, 
"step": 103 }, { "epoch": 3.588744588744589, "grad_norm": 12.40089225769043, "learning_rate": 2.5862068965517244e-05, "loss": 40.0596, "step": 104 }, { "epoch": 3.6233766233766236, "grad_norm": 22.941667556762695, "learning_rate": 2.413793103448276e-05, "loss": 35.6433, "step": 105 }, { "epoch": 3.658008658008658, "grad_norm": 20.270925521850586, "learning_rate": 2.2413793103448276e-05, "loss": 36.9345, "step": 106 }, { "epoch": 3.6926406926406927, "grad_norm": 11.919129371643066, "learning_rate": 2.0689655172413793e-05, "loss": 38.8896, "step": 107 }, { "epoch": 3.7272727272727275, "grad_norm": 11.444902420043945, "learning_rate": 1.896551724137931e-05, "loss": 40.4934, "step": 108 }, { "epoch": 3.761904761904762, "grad_norm": 10.795780181884766, "learning_rate": 1.7241379310344828e-05, "loss": 37.4473, "step": 109 }, { "epoch": 3.7965367965367967, "grad_norm": 29.648780822753906, "learning_rate": 1.5517241379310346e-05, "loss": 32.2658, "step": 110 }, { "epoch": 3.8311688311688314, "grad_norm": 11.925921440124512, "learning_rate": 1.3793103448275863e-05, "loss": 39.5132, "step": 111 }, { "epoch": 3.865800865800866, "grad_norm": 13.764852523803711, "learning_rate": 1.206896551724138e-05, "loss": 34.6145, "step": 112 }, { "epoch": 3.9004329004329006, "grad_norm": 11.574369430541992, "learning_rate": 1.0344827586206897e-05, "loss": 38.1931, "step": 113 }, { "epoch": 3.935064935064935, "grad_norm": 13.354058265686035, "learning_rate": 8.620689655172414e-06, "loss": 41.1929, "step": 114 }, { "epoch": 3.9696969696969697, "grad_norm": 12.630919456481934, "learning_rate": 6.896551724137932e-06, "loss": 35.7694, "step": 115 }, { "epoch": 4.0, "grad_norm": 12.714266777038574, "learning_rate": 5.172413793103448e-06, "loss": 29.9168, "step": 116 }, { "epoch": 4.0, "step": 116, "total_flos": 34002530627808.0, "train_loss": 41.40085841869486, "train_runtime": 6671.9706, "train_samples_per_second": 0.276, "train_steps_per_second": 0.017 } ], "logging_steps": 1.0, "max_steps": 
116, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 34002530627808.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }