{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.18066847335140018, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018066847335140017, "grad_norm": 2.8401210755443786, "learning_rate": 0.0, "loss": 0.8251, "step": 1 }, { "epoch": 0.0036133694670280035, "grad_norm": 2.884596771811998, "learning_rate": 1.7857142857142857e-06, "loss": 0.8284, "step": 2 }, { "epoch": 0.005420054200542005, "grad_norm": 2.885426730353446, "learning_rate": 3.5714285714285714e-06, "loss": 0.8426, "step": 3 }, { "epoch": 0.007226738934056007, "grad_norm": 2.651910724864975, "learning_rate": 5.357142857142857e-06, "loss": 0.8329, "step": 4 }, { "epoch": 0.009033423667570008, "grad_norm": 2.138099539200911, "learning_rate": 7.142857142857143e-06, "loss": 0.8087, "step": 5 }, { "epoch": 0.01084010840108401, "grad_norm": 1.5516804515831568, "learning_rate": 8.92857142857143e-06, "loss": 0.7878, "step": 6 }, { "epoch": 0.012646793134598013, "grad_norm": 1.3537216088647457, "learning_rate": 1.0714285714285714e-05, "loss": 0.76, "step": 7 }, { "epoch": 0.014453477868112014, "grad_norm": 2.2798033166818095, "learning_rate": 1.25e-05, "loss": 0.7502, "step": 8 }, { "epoch": 0.016260162601626018, "grad_norm": 2.4280919600136315, "learning_rate": 1.4285714285714285e-05, "loss": 0.7449, "step": 9 }, { "epoch": 0.018066847335140017, "grad_norm": 1.931081683969551, "learning_rate": 1.6071428571428572e-05, "loss": 0.7271, "step": 10 }, { "epoch": 0.01987353206865402, "grad_norm": 2.033957326273334, "learning_rate": 1.785714285714286e-05, "loss": 0.7224, "step": 11 }, { "epoch": 0.02168021680216802, "grad_norm": 1.7440303482576578, "learning_rate": 1.9642857142857145e-05, "loss": 0.7373, "step": 12 }, { "epoch": 0.023486901535682024, "grad_norm": 1.1423427090231095, "learning_rate": 2.1428571428571428e-05, "loss": 0.7033, "step": 13 }, { "epoch": 0.025293586269196026, "grad_norm": 0.9350758811837806, "learning_rate": 2.3214285714285715e-05, "loss": 0.6886, "step": 14 }, { "epoch": 0.02710027100271003, "grad_norm": 0.9041529205527444, "learning_rate": 2.5e-05, "loss": 0.6858, "step": 15 }, { "epoch": 0.028906955736224028, "grad_norm": 0.7774159470583337, "learning_rate": 2.6785714285714288e-05, "loss": 0.676, "step": 16 }, { "epoch": 0.03071364046973803, "grad_norm": 0.6642849851627606, "learning_rate": 2.857142857142857e-05, "loss": 0.6816, "step": 17 }, { "epoch": 0.032520325203252036, "grad_norm": 0.6696862181877565, "learning_rate": 3.0357142857142857e-05, "loss": 0.6727, "step": 18 }, { "epoch": 0.03432700993676603, "grad_norm": 0.6199596133483118, "learning_rate": 3.2142857142857144e-05, "loss": 0.6678, "step": 19 }, { "epoch": 0.036133694670280034, "grad_norm": 0.5724104094717255, "learning_rate": 3.392857142857143e-05, "loss": 0.6448, "step": 20 }, { "epoch": 0.037940379403794036, "grad_norm": 0.5762214978597714, "learning_rate": 3.571428571428572e-05, "loss": 0.6506, "step": 21 }, { "epoch": 0.03974706413730804, "grad_norm": 0.5444644294356963, "learning_rate": 3.7500000000000003e-05, "loss": 0.6413, "step": 22 }, { "epoch": 0.04155374887082204, "grad_norm": 0.4801008888914334, "learning_rate": 3.928571428571429e-05, "loss": 0.6425, "step": 23 }, { "epoch": 0.04336043360433604, "grad_norm": 0.4780671650041637, "learning_rate": 4.107142857142857e-05, "loss": 0.6489, "step": 24 }, { "epoch": 0.045167118337850046, "grad_norm": 0.5145358853730851, "learning_rate": 4.2857142857142856e-05, "loss": 0.6306, "step": 25 }, { "epoch": 0.04697380307136405, "grad_norm": 0.4229563893889767, "learning_rate": 4.464285714285715e-05, "loss": 0.6453, "step": 26 }, { "epoch": 0.04878048780487805, "grad_norm": 0.4021332182222708, "learning_rate": 4.642857142857143e-05, "loss": 0.6251, "step": 27 }, { "epoch": 0.05058717253839205, "grad_norm": 0.48615223659558016, "learning_rate": 4.8214285714285716e-05, "loss": 0.6272, "step": 28 }, { "epoch": 0.052393857271906055, "grad_norm": 0.46304419453924084, "learning_rate": 5e-05, "loss": 0.6385, "step": 29 }, { "epoch": 0.05420054200542006, "grad_norm": 0.4055914485047887, "learning_rate": 4.999955240022902e-05, "loss": 0.6406, "step": 30 }, { "epoch": 0.05600722673893405, "grad_norm": 0.4320919468210144, "learning_rate": 4.999820961694372e-05, "loss": 0.6303, "step": 31 }, { "epoch": 0.057813911472448055, "grad_norm": 0.4042983593828653, "learning_rate": 4.999597169822646e-05, "loss": 0.6186, "step": 32 }, { "epoch": 0.05962059620596206, "grad_norm": 0.4310835292518631, "learning_rate": 4.9992838724212585e-05, "loss": 0.6287, "step": 33 }, { "epoch": 0.06142728093947606, "grad_norm": 0.412271814827728, "learning_rate": 4.9988810807087584e-05, "loss": 0.6165, "step": 34 }, { "epoch": 0.06323396567299007, "grad_norm": 0.40598837987994935, "learning_rate": 4.998388809108303e-05, "loss": 0.622, "step": 35 }, { "epoch": 0.06504065040650407, "grad_norm": 0.4223908220324365, "learning_rate": 4.997807075247146e-05, "loss": 0.6189, "step": 36 }, { "epoch": 0.06684733514001806, "grad_norm": 0.38197899028265064, "learning_rate": 4.997135899956001e-05, "loss": 0.623, "step": 37 }, { "epoch": 0.06865401987353206, "grad_norm": 0.3885330150920031, "learning_rate": 4.9963753072683025e-05, "loss": 0.6164, "step": 38 }, { "epoch": 0.07046070460704607, "grad_norm": 0.41115019726029983, "learning_rate": 4.9955253244193375e-05, "loss": 0.6182, "step": 39 }, { "epoch": 0.07226738934056007, "grad_norm": 0.3495037257115665, "learning_rate": 4.994585981845278e-05, "loss": 0.6072, "step": 40 }, { "epoch": 0.07407407407407407, "grad_norm": 0.3973831198418077, "learning_rate": 4.9935573131820854e-05, "loss": 0.6137, "step": 41 }, { "epoch": 0.07588075880758807, "grad_norm": 0.3339390206679997, "learning_rate": 4.9924393552643075e-05, "loss": 0.6064, "step": 42 }, { "epoch": 0.07768744354110207, "grad_norm": 0.3384678373992373, "learning_rate": 4.991232148123761e-05, "loss": 0.606, "step": 43 }, { "epoch": 0.07949412827461608, "grad_norm": 0.3764492630812678, "learning_rate": 4.989935734988098e-05, "loss": 0.601, "step": 44 }, { "epoch": 0.08130081300813008, "grad_norm": 0.35707353533847597, "learning_rate": 4.988550162279255e-05, "loss": 0.6097, "step": 45 }, { "epoch": 0.08310749774164408, "grad_norm": 0.34040533355863656, "learning_rate": 4.987075479611796e-05, "loss": 0.6094, "step": 46 }, { "epoch": 0.08491418247515808, "grad_norm": 0.34496537693497575, "learning_rate": 4.985511739791129e-05, "loss": 0.6129, "step": 47 }, { "epoch": 0.08672086720867209, "grad_norm": 0.3655410266089117, "learning_rate": 4.983858998811622e-05, "loss": 0.5994, "step": 48 }, { "epoch": 0.08852755194218609, "grad_norm": 0.32151319632227454, "learning_rate": 4.9821173158545936e-05, "loss": 0.607, "step": 49 }, { "epoch": 0.09033423667570009, "grad_norm": 0.3573869377855791, "learning_rate": 4.980286753286195e-05, "loss": 0.6067, "step": 50 }, { "epoch": 0.0921409214092141, "grad_norm": 0.3601371463623409, "learning_rate": 4.978367376655177e-05, "loss": 0.5965, "step": 51 }, { "epoch": 0.0939476061427281, "grad_norm": 0.3288839599265164, "learning_rate": 4.976359254690543e-05, "loss": 0.6128, "step": 52 }, { "epoch": 0.0957542908762421, "grad_norm": 0.3899144366212297, "learning_rate": 4.974262459299087e-05, "loss": 0.5992, "step": 53 }, { "epoch": 0.0975609756097561, "grad_norm": 0.31373036051997816, "learning_rate": 4.972077065562821e-05, "loss": 0.5894, "step": 54 }, { "epoch": 0.0993676603432701, "grad_norm": 0.3317064945312652, "learning_rate": 4.969803151736284e-05, "loss": 0.6046, "step": 55 }, { "epoch": 0.1011743450767841, "grad_norm": 0.4008096818415782, "learning_rate": 4.9674407992437394e-05, "loss": 0.6091, "step": 56 }, { "epoch": 0.10298102981029811, "grad_norm": 0.32205121061079506, "learning_rate": 4.964990092676263e-05, "loss": 0.5936, "step": 57 }, { "epoch": 0.10478771454381211, "grad_norm": 0.3834592472356572, "learning_rate": 4.962451119788709e-05, "loss": 0.6035, "step": 58 }, { "epoch": 0.10659439927732611, "grad_norm": 0.4364607834229955, "learning_rate": 4.959823971496574e-05, "loss": 0.5895, "step": 59 }, { "epoch": 0.10840108401084012, "grad_norm": 0.39798614932188364, "learning_rate": 4.957108741872736e-05, "loss": 0.5903, "step": 60 }, { "epoch": 0.1102077687443541, "grad_norm": 0.4078983073259993, "learning_rate": 4.954305528144085e-05, "loss": 0.594, "step": 61 }, { "epoch": 0.1120144534778681, "grad_norm": 0.5064139886264023, "learning_rate": 4.9514144306880506e-05, "loss": 0.5989, "step": 62 }, { "epoch": 0.11382113821138211, "grad_norm": 0.3049367414530056, "learning_rate": 4.9484355530289944e-05, "loss": 0.5982, "step": 63 }, { "epoch": 0.11562782294489611, "grad_norm": 0.45932591541254997, "learning_rate": 4.9453690018345144e-05, "loss": 0.598, "step": 64 }, { "epoch": 0.11743450767841011, "grad_norm": 0.34088884376991047, "learning_rate": 4.9422148869116194e-05, "loss": 0.5914, "step": 65 }, { "epoch": 0.11924119241192412, "grad_norm": 0.4408196994719187, "learning_rate": 4.938973321202799e-05, "loss": 0.5943, "step": 66 }, { "epoch": 0.12104787714543812, "grad_norm": 0.4034840944061305, "learning_rate": 4.935644420781978e-05, "loss": 0.5852, "step": 67 }, { "epoch": 0.12285456187895212, "grad_norm": 0.3532258693927161, "learning_rate": 4.932228304850363e-05, "loss": 0.6003, "step": 68 }, { "epoch": 0.12466124661246612, "grad_norm": 0.4041417285254443, "learning_rate": 4.928725095732169e-05, "loss": 0.6019, "step": 69 }, { "epoch": 0.12646793134598014, "grad_norm": 0.338119605860217, "learning_rate": 4.925134918870245e-05, "loss": 0.6056, "step": 70 }, { "epoch": 0.12827461607949414, "grad_norm": 0.2924641177682486, "learning_rate": 4.9214579028215776e-05, "loss": 0.5784, "step": 71 }, { "epoch": 0.13008130081300814, "grad_norm": 0.37576892630911196, "learning_rate": 4.917694179252692e-05, "loss": 0.5966, "step": 72 }, { "epoch": 0.13188798554652212, "grad_norm": 0.3066908816324021, "learning_rate": 4.91384388293493e-05, "loss": 0.5944, "step": 73 }, { "epoch": 0.13369467028003612, "grad_norm": 0.3296017371410444, "learning_rate": 4.909907151739633e-05, "loss": 0.5863, "step": 74 }, { "epoch": 0.13550135501355012, "grad_norm": 0.3510646842800697, "learning_rate": 4.9058841266332e-05, "loss": 0.5854, "step": 75 }, { "epoch": 0.13730803974706413, "grad_norm": 0.33050013383675975, "learning_rate": 4.90177495167204e-05, "loss": 0.5816, "step": 76 }, { "epoch": 0.13911472448057813, "grad_norm": 0.336545072709173, "learning_rate": 4.897579773997415e-05, "loss": 0.5768, "step": 77 }, { "epoch": 0.14092140921409213, "grad_norm": 0.3401542906091868, "learning_rate": 4.893298743830168e-05, "loss": 0.5877, "step": 78 }, { "epoch": 0.14272809394760613, "grad_norm": 0.30833272476392615, "learning_rate": 4.888932014465352e-05, "loss": 0.5949, "step": 79 }, { "epoch": 0.14453477868112014, "grad_norm": 0.40829913126640544, "learning_rate": 4.88447974226673e-05, "loss": 0.6045, "step": 80 }, { "epoch": 0.14634146341463414, "grad_norm": 0.3247095550760803, "learning_rate": 4.879942086661184e-05, "loss": 0.5901, "step": 81 }, { "epoch": 0.14814814814814814, "grad_norm": 0.40484730371305205, "learning_rate": 4.875319210133004e-05, "loss": 0.5825, "step": 82 }, { "epoch": 0.14995483288166214, "grad_norm": 0.37019522474324174, "learning_rate": 4.870611278218066e-05, "loss": 0.5918, "step": 83 }, { "epoch": 0.15176151761517614, "grad_norm": 0.3654744041300334, "learning_rate": 4.865818459497911e-05, "loss": 0.5865, "step": 84 }, { "epoch": 0.15356820234869015, "grad_norm": 0.3001653084612634, "learning_rate": 4.860940925593703e-05, "loss": 0.5889, "step": 85 }, { "epoch": 0.15537488708220415, "grad_norm": 0.3463142769052332, "learning_rate": 4.8559788511600876e-05, "loss": 0.5881, "step": 86 }, { "epoch": 0.15718157181571815, "grad_norm": 0.3221292284344934, "learning_rate": 4.850932413878934e-05, "loss": 0.5901, "step": 87 }, { "epoch": 0.15898825654923215, "grad_norm": 0.33253415770699135, "learning_rate": 4.8458017944529776e-05, "loss": 0.5952, "step": 88 }, { "epoch": 0.16079494128274616, "grad_norm": 0.37021022779245716, "learning_rate": 4.8405871765993433e-05, "loss": 0.5928, "step": 89 }, { "epoch": 0.16260162601626016, "grad_norm": 0.3197338147243217, "learning_rate": 4.8352887470429726e-05, "loss": 0.5837, "step": 90 }, { "epoch": 0.16440831074977416, "grad_norm": 0.35706780201968874, "learning_rate": 4.8299066955099335e-05, "loss": 0.5811, "step": 91 }, { "epoch": 0.16621499548328816, "grad_norm": 0.3730971582648582, "learning_rate": 4.8244412147206284e-05, "loss": 0.586, "step": 92 }, { "epoch": 0.16802168021680217, "grad_norm": 0.33467520904104064, "learning_rate": 4.8188925003828945e-05, "loss": 0.5919, "step": 93 }, { "epoch": 0.16982836495031617, "grad_norm": 0.41181901183464464, "learning_rate": 4.813260751184992e-05, "loss": 0.5922, "step": 94 }, { "epoch": 0.17163504968383017, "grad_norm": 0.31057237794043846, "learning_rate": 4.807546168788494e-05, "loss": 0.5835, "step": 95 }, { "epoch": 0.17344173441734417, "grad_norm": 0.4045185112342142, "learning_rate": 4.8017489578210604e-05, "loss": 0.5839, "step": 96 }, { "epoch": 0.17524841915085818, "grad_norm": 0.342091450071029, "learning_rate": 4.7958693258691167e-05, "loss": 0.5891, "step": 97 }, { "epoch": 0.17705510388437218, "grad_norm": 0.32193383766669476, "learning_rate": 4.7899074834704165e-05, "loss": 0.5814, "step": 98 }, { "epoch": 0.17886178861788618, "grad_norm": 0.33881983844597735, "learning_rate": 4.783863644106502e-05, "loss": 0.5951, "step": 99 }, { "epoch": 0.18066847335140018, "grad_norm": 0.3141049036238513, "learning_rate": 4.7777380241950645e-05, "loss": 0.5672, "step": 100 } ], "logging_steps": 1, "max_steps": 553, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 115440686923776.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }