{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04189359028068706, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00041893590280687055, "grad_norm": 8.333969116210938, "learning_rate": 2e-05, "loss": 3.3261, "step": 1 }, { "epoch": 0.0008378718056137411, "grad_norm": 7.348582744598389, "learning_rate": 1.9998324256388776e-05, "loss": 3.2312, "step": 2 }, { "epoch": 0.0012568077084206116, "grad_norm": 3.1899449825286865, "learning_rate": 1.9996648512777547e-05, "loss": 2.9318, "step": 3 }, { "epoch": 0.0016757436112274822, "grad_norm": 2.177359104156494, "learning_rate": 1.9994972769166317e-05, "loss": 2.8876, "step": 4 }, { "epoch": 0.0020946795140343527, "grad_norm": 2.1708009243011475, "learning_rate": 1.999329702555509e-05, "loss": 3.068, "step": 5 }, { "epoch": 0.0025136154168412233, "grad_norm": 1.77362060546875, "learning_rate": 1.9991621281943865e-05, "loss": 2.6598, "step": 6 }, { "epoch": 0.002932551319648094, "grad_norm": 1.709831714630127, "learning_rate": 1.9989945538332636e-05, "loss": 2.767, "step": 7 }, { "epoch": 0.0033514872224549644, "grad_norm": 1.5493128299713135, "learning_rate": 1.998826979472141e-05, "loss": 2.7182, "step": 8 }, { "epoch": 0.003770423125261835, "grad_norm": 1.39557945728302, "learning_rate": 1.998659405111018e-05, "loss": 2.8695, "step": 9 }, { "epoch": 0.0041893590280687055, "grad_norm": 1.1357755661010742, "learning_rate": 1.9984918307498955e-05, "loss": 2.5497, "step": 10 }, { "epoch": 0.004608294930875576, "grad_norm": 1.0683544874191284, "learning_rate": 1.9983242563887726e-05, "loss": 2.7363, "step": 11 }, { "epoch": 0.005027230833682447, "grad_norm": 0.9199109673500061, "learning_rate": 1.99815668202765e-05, "loss": 2.4438, "step": 12 }, { "epoch": 0.005446166736489317, "grad_norm": 0.9708887338638306, "learning_rate": 1.997989107666527e-05, "loss": 2.6098, "step": 13 }, { "epoch": 0.005865102639296188, "grad_norm": 0.9026995301246643, "learning_rate": 1.9978215333054045e-05, "loss": 2.4444, "step": 14 }, { "epoch": 0.006284038542103058, "grad_norm": 1.2418183088302612, "learning_rate": 1.9976539589442816e-05, "loss": 2.5436, "step": 15 }, { "epoch": 0.006702974444909929, "grad_norm": 0.8761052489280701, "learning_rate": 1.997486384583159e-05, "loss": 2.3926, "step": 16 }, { "epoch": 0.007121910347716799, "grad_norm": 0.8849633932113647, "learning_rate": 1.997318810222036e-05, "loss": 2.5935, "step": 17 }, { "epoch": 0.00754084625052367, "grad_norm": 0.8368175029754639, "learning_rate": 1.9971512358609135e-05, "loss": 2.4684, "step": 18 }, { "epoch": 0.007959782153330541, "grad_norm": 0.697807788848877, "learning_rate": 1.9969836614997905e-05, "loss": 2.3537, "step": 19 }, { "epoch": 0.008378718056137411, "grad_norm": 0.7476556301116943, "learning_rate": 1.996816087138668e-05, "loss": 2.4659, "step": 20 }, { "epoch": 0.008797653958944282, "grad_norm": 0.7885666489601135, "learning_rate": 1.9966485127775454e-05, "loss": 2.4466, "step": 21 }, { "epoch": 0.009216589861751152, "grad_norm": 0.7271686792373657, "learning_rate": 1.9964809384164224e-05, "loss": 2.3659, "step": 22 }, { "epoch": 0.009635525764558023, "grad_norm": 0.7286465764045715, "learning_rate": 1.9963133640552995e-05, "loss": 2.4473, "step": 23 }, { "epoch": 0.010054461667364893, "grad_norm": 0.8572853207588196, "learning_rate": 1.996145789694177e-05, "loss": 2.3595, "step": 24 }, { "epoch": 0.010473397570171765, "grad_norm": 0.8283334374427795, "learning_rate": 1.9959782153330543e-05, "loss": 2.5291, "step": 25 }, { "epoch": 0.010892333472978634, "grad_norm": 0.6586313843727112, "learning_rate": 1.9958106409719314e-05, "loss": 2.3985, "step": 26 }, { "epoch": 0.011311269375785506, "grad_norm": 0.6430657505989075, "learning_rate": 1.9956430666108085e-05, "loss": 2.3578, "step": 27 }, { "epoch": 0.011730205278592375, "grad_norm": 0.6550448536872864, "learning_rate": 1.995475492249686e-05, "loss": 2.4077, "step": 28 }, { "epoch": 0.012149141181399247, "grad_norm": 0.7592840194702148, "learning_rate": 1.9953079178885633e-05, "loss": 2.5008, "step": 29 }, { "epoch": 0.012568077084206116, "grad_norm": 0.7858672738075256, "learning_rate": 1.9951403435274407e-05, "loss": 2.4674, "step": 30 }, { "epoch": 0.012987012987012988, "grad_norm": 0.6130352020263672, "learning_rate": 1.9949727691663178e-05, "loss": 2.3526, "step": 31 }, { "epoch": 0.013405948889819858, "grad_norm": 0.6684207320213318, "learning_rate": 1.994805194805195e-05, "loss": 2.3732, "step": 32 }, { "epoch": 0.013824884792626729, "grad_norm": 0.8275600671768188, "learning_rate": 1.9946376204440723e-05, "loss": 2.135, "step": 33 }, { "epoch": 0.014243820695433599, "grad_norm": 0.5858725309371948, "learning_rate": 1.9944700460829494e-05, "loss": 2.1368, "step": 34 }, { "epoch": 0.01466275659824047, "grad_norm": 0.7133444547653198, "learning_rate": 1.9943024717218268e-05, "loss": 2.304, "step": 35 }, { "epoch": 0.01508169250104734, "grad_norm": 0.5466803312301636, "learning_rate": 1.994134897360704e-05, "loss": 2.1682, "step": 36 }, { "epoch": 0.015500628403854211, "grad_norm": 0.5196086168289185, "learning_rate": 1.9939673229995813e-05, "loss": 2.1546, "step": 37 }, { "epoch": 0.015919564306661083, "grad_norm": 0.5088497400283813, "learning_rate": 1.9937997486384583e-05, "loss": 2.1018, "step": 38 }, { "epoch": 0.016338500209467952, "grad_norm": 0.6117899417877197, "learning_rate": 1.9936321742773358e-05, "loss": 2.2346, "step": 39 }, { "epoch": 0.016757436112274822, "grad_norm": 0.5710458159446716, "learning_rate": 1.993464599916213e-05, "loss": 2.2147, "step": 40 }, { "epoch": 0.01717637201508169, "grad_norm": 0.5152861475944519, "learning_rate": 1.9932970255550902e-05, "loss": 2.2716, "step": 41 }, { "epoch": 0.017595307917888565, "grad_norm": 0.6851192712783813, "learning_rate": 1.9931294511939673e-05, "loss": 2.3158, "step": 42 }, { "epoch": 0.018014243820695434, "grad_norm": 0.5485531687736511, "learning_rate": 1.9929618768328447e-05, "loss": 2.2679, "step": 43 }, { "epoch": 0.018433179723502304, "grad_norm": 0.48592010140419006, "learning_rate": 1.992794302471722e-05, "loss": 2.1303, "step": 44 }, { "epoch": 0.018852115626309174, "grad_norm": 0.5533665418624878, "learning_rate": 1.9926267281105992e-05, "loss": 2.1981, "step": 45 }, { "epoch": 0.019271051529116047, "grad_norm": 0.5932656526565552, "learning_rate": 1.9924591537494763e-05, "loss": 2.3737, "step": 46 }, { "epoch": 0.019689987431922917, "grad_norm": 0.5236673951148987, "learning_rate": 1.9922915793883537e-05, "loss": 2.2694, "step": 47 }, { "epoch": 0.020108923334729786, "grad_norm": 0.5357316732406616, "learning_rate": 1.992124005027231e-05, "loss": 2.2368, "step": 48 }, { "epoch": 0.020527859237536656, "grad_norm": 0.5500349998474121, "learning_rate": 1.9919564306661082e-05, "loss": 2.213, "step": 49 }, { "epoch": 0.02094679514034353, "grad_norm": 0.48040810227394104, "learning_rate": 1.9917888563049853e-05, "loss": 2.1892, "step": 50 }, { "epoch": 0.0213657310431504, "grad_norm": 0.5716186165809631, "learning_rate": 1.9916212819438627e-05, "loss": 2.2039, "step": 51 }, { "epoch": 0.02178466694595727, "grad_norm": 0.5564374923706055, "learning_rate": 1.99145370758274e-05, "loss": 2.1411, "step": 52 }, { "epoch": 0.022203602848764138, "grad_norm": 0.4996980130672455, "learning_rate": 1.9912861332216175e-05, "loss": 2.1521, "step": 53 }, { "epoch": 0.02262253875157101, "grad_norm": 0.5239240527153015, "learning_rate": 1.9911185588604946e-05, "loss": 2.0742, "step": 54 }, { "epoch": 0.02304147465437788, "grad_norm": 0.4403076767921448, "learning_rate": 1.9909509844993716e-05, "loss": 1.9841, "step": 55 }, { "epoch": 0.02346041055718475, "grad_norm": 0.5169032216072083, "learning_rate": 1.990783410138249e-05, "loss": 2.0327, "step": 56 }, { "epoch": 0.02387934645999162, "grad_norm": 0.4901898503303528, "learning_rate": 1.9906158357771265e-05, "loss": 2.0063, "step": 57 }, { "epoch": 0.024298282362798494, "grad_norm": 0.6581910252571106, "learning_rate": 1.9904482614160035e-05, "loss": 2.1385, "step": 58 }, { "epoch": 0.024717218265605363, "grad_norm": 0.4522070586681366, "learning_rate": 1.9902806870548806e-05, "loss": 1.9944, "step": 59 }, { "epoch": 0.025136154168412233, "grad_norm": 0.5315820574760437, "learning_rate": 1.990113112693758e-05, "loss": 2.1579, "step": 60 }, { "epoch": 0.025555090071219103, "grad_norm": 0.4661259353160858, "learning_rate": 1.9899455383326354e-05, "loss": 2.1193, "step": 61 }, { "epoch": 0.025974025974025976, "grad_norm": 0.4940222203731537, "learning_rate": 1.9897779639715125e-05, "loss": 2.0844, "step": 62 }, { "epoch": 0.026392961876832845, "grad_norm": 0.46520665287971497, "learning_rate": 1.98961038961039e-05, "loss": 1.9306, "step": 63 }, { "epoch": 0.026811897779639715, "grad_norm": 0.5645989179611206, "learning_rate": 1.989442815249267e-05, "loss": 2.1236, "step": 64 }, { "epoch": 0.027230833682446585, "grad_norm": 0.47880157828330994, "learning_rate": 1.989275240888144e-05, "loss": 2.0206, "step": 65 }, { "epoch": 0.027649769585253458, "grad_norm": 0.6371349692344666, "learning_rate": 1.9891076665270215e-05, "loss": 2.019, "step": 66 }, { "epoch": 0.028068705488060328, "grad_norm": 0.5742272734642029, "learning_rate": 1.988940092165899e-05, "loss": 2.0899, "step": 67 }, { "epoch": 0.028487641390867197, "grad_norm": 0.5579768419265747, "learning_rate": 1.988772517804776e-05, "loss": 2.081, "step": 68 }, { "epoch": 0.028906577293674067, "grad_norm": 0.5897182822227478, "learning_rate": 1.988604943443653e-05, "loss": 1.9601, "step": 69 }, { "epoch": 0.02932551319648094, "grad_norm": 0.46881428360939026, "learning_rate": 1.9884373690825305e-05, "loss": 1.9085, "step": 70 }, { "epoch": 0.02974444909928781, "grad_norm": 0.6095844507217407, "learning_rate": 1.988269794721408e-05, "loss": 1.9762, "step": 71 }, { "epoch": 0.03016338500209468, "grad_norm": 0.599513053894043, "learning_rate": 1.988102220360285e-05, "loss": 1.8723, "step": 72 }, { "epoch": 0.03058232090490155, "grad_norm": 0.585457980632782, "learning_rate": 1.987934645999162e-05, "loss": 1.9209, "step": 73 }, { "epoch": 0.031001256807708422, "grad_norm": 0.42224225401878357, "learning_rate": 1.9877670716380394e-05, "loss": 1.9186, "step": 74 }, { "epoch": 0.03142019271051529, "grad_norm": 0.4566991329193115, "learning_rate": 1.987599497276917e-05, "loss": 2.018, "step": 75 }, { "epoch": 0.031839128613322165, "grad_norm": 0.47718995809555054, "learning_rate": 1.9874319229157943e-05, "loss": 2.0119, "step": 76 }, { "epoch": 0.03225806451612903, "grad_norm": 0.4412285089492798, "learning_rate": 1.9872643485546713e-05, "loss": 1.9211, "step": 77 }, { "epoch": 0.032677000418935905, "grad_norm": 0.4711454212665558, "learning_rate": 1.9870967741935484e-05, "loss": 1.9051, "step": 78 }, { "epoch": 0.03309593632174277, "grad_norm": 0.4665948450565338, "learning_rate": 1.9869291998324258e-05, "loss": 1.9571, "step": 79 }, { "epoch": 0.033514872224549644, "grad_norm": 0.46011775732040405, "learning_rate": 1.9867616254713032e-05, "loss": 1.9599, "step": 80 }, { "epoch": 0.03393380812735652, "grad_norm": 0.46272069215774536, "learning_rate": 1.9865940511101803e-05, "loss": 1.9161, "step": 81 }, { "epoch": 0.03435274403016338, "grad_norm": 0.5554195046424866, "learning_rate": 1.9864264767490574e-05, "loss": 2.0202, "step": 82 }, { "epoch": 0.034771679932970256, "grad_norm": 0.5324104428291321, "learning_rate": 1.9862589023879348e-05, "loss": 1.9356, "step": 83 }, { "epoch": 0.03519061583577713, "grad_norm": 0.5279750823974609, "learning_rate": 1.9860913280268122e-05, "loss": 1.9511, "step": 84 }, { "epoch": 0.035609551738583996, "grad_norm": 0.5002080202102661, "learning_rate": 1.9859237536656893e-05, "loss": 1.9248, "step": 85 }, { "epoch": 0.03602848764139087, "grad_norm": 0.5625497102737427, "learning_rate": 1.9857561793045667e-05, "loss": 2.0023, "step": 86 }, { "epoch": 0.036447423544197735, "grad_norm": 0.6030247807502747, "learning_rate": 1.9855886049434438e-05, "loss": 1.893, "step": 87 }, { "epoch": 0.03686635944700461, "grad_norm": 0.4760509729385376, "learning_rate": 1.9854210305823212e-05, "loss": 1.8902, "step": 88 }, { "epoch": 0.03728529534981148, "grad_norm": 0.6618624925613403, "learning_rate": 1.9852534562211983e-05, "loss": 1.9173, "step": 89 }, { "epoch": 0.03770423125261835, "grad_norm": 0.47204822301864624, "learning_rate": 1.9850858818600757e-05, "loss": 1.9266, "step": 90 }, { "epoch": 0.03812316715542522, "grad_norm": 0.5421533584594727, "learning_rate": 1.9849183074989527e-05, "loss": 1.9796, "step": 91 }, { "epoch": 0.038542103058232094, "grad_norm": 0.48972201347351074, "learning_rate": 1.98475073313783e-05, "loss": 1.91, "step": 92 }, { "epoch": 0.03896103896103896, "grad_norm": 0.5566658973693848, "learning_rate": 1.9845831587767072e-05, "loss": 1.8992, "step": 93 }, { "epoch": 0.03937997486384583, "grad_norm": 0.4685937464237213, "learning_rate": 1.9844155844155846e-05, "loss": 1.9231, "step": 94 }, { "epoch": 0.0397989107666527, "grad_norm": 0.6744531393051147, "learning_rate": 1.9842480100544617e-05, "loss": 1.9109, "step": 95 }, { "epoch": 0.04021784666945957, "grad_norm": 0.6984325051307678, "learning_rate": 1.984080435693339e-05, "loss": 1.9566, "step": 96 }, { "epoch": 0.040636782572266446, "grad_norm": 0.6627328991889954, "learning_rate": 1.9839128613322162e-05, "loss": 1.9933, "step": 97 }, { "epoch": 0.04105571847507331, "grad_norm": 0.4586343765258789, "learning_rate": 1.9837452869710936e-05, "loss": 1.7939, "step": 98 }, { "epoch": 0.041474654377880185, "grad_norm": 0.6211162805557251, "learning_rate": 1.983577712609971e-05, "loss": 1.9164, "step": 99 }, { "epoch": 0.04189359028068706, "grad_norm": 0.9397639632225037, "learning_rate": 1.983410138248848e-05, "loss": 2.0262, "step": 100 } ], "logging_steps": 1.0, "max_steps": 11935, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.051799292837888e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }