diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1000/trainer_state.json" @@ -0,0 +1,7034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25130363762015456, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00025130363762015457, + "grad_norm": 2.0144803524017334, + "learning_rate": 0.0, + "loss": 2.7742, + "step": 1 + }, + { + "epoch": 0.0005026072752403091, + "grad_norm": 1.5290026664733887, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.9485, + "step": 2 + }, + { + "epoch": 0.0007539109128604636, + "grad_norm": 1.7576098442077637, + "learning_rate": 4.000000000000001e-06, + "loss": 3.2699, + "step": 3 + }, + { + "epoch": 0.0010052145504806183, + "grad_norm": 1.4803149700164795, + "learning_rate": 6e-06, + "loss": 2.6531, + "step": 4 + }, + { + "epoch": 0.0012565181881007727, + "grad_norm": 1.2919175624847412, + "learning_rate": 8.000000000000001e-06, + "loss": 2.7198, + "step": 5 + }, + { + "epoch": 0.0015078218257209273, + "grad_norm": 1.0887187719345093, + "learning_rate": 1e-05, + "loss": 2.9058, + "step": 6 + }, + { + "epoch": 0.001759125463341082, + "grad_norm": 1.176196575164795, + "learning_rate": 1.2e-05, + "loss": 2.769, + "step": 7 + }, + { + "epoch": 0.0020104291009612365, + "grad_norm": 1.4506360292434692, + "learning_rate": 1.4000000000000001e-05, + "loss": 2.8493, + "step": 8 + }, + { + "epoch": 0.002261732738581391, + "grad_norm": 0.8232998251914978, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.7691, + "step": 9 + }, + { + "epoch": 0.0025130363762015454, + "grad_norm": 0.8385952711105347, + "learning_rate": 1.8e-05, + "loss": 2.7073, + "step": 10 + }, + { + "epoch": 0.0027643400138217, + "grad_norm": 0.594434916973114, + "learning_rate": 2e-05, + "loss": 2.1646, + "step": 11 + }, + { + "epoch": 0.0030156436514418546, + "grad_norm": 0.8067689538002014, + "learning_rate": 2.2000000000000003e-05, + "loss": 2.5741, + "step": 12 + }, + { + "epoch": 0.0032669472890620092, + "grad_norm": 0.8038071393966675, + "learning_rate": 2.4e-05, + "loss": 2.6733, + "step": 13 + }, + { + "epoch": 0.003518250926682164, + "grad_norm": 0.48843103647232056, + "learning_rate": 2.6000000000000002e-05, + "loss": 2.598, + "step": 14 + }, + { + "epoch": 0.0037695545643023185, + "grad_norm": 0.8283182978630066, + "learning_rate": 2.8000000000000003e-05, + "loss": 2.8211, + "step": 15 + }, + { + "epoch": 0.004020858201922473, + "grad_norm": 0.4953489601612091, + "learning_rate": 3e-05, + "loss": 2.3855, + "step": 16 + }, + { + "epoch": 0.004272161839542627, + "grad_norm": 0.4421069025993347, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.6665, + "step": 17 + }, + { + "epoch": 0.004523465477162782, + "grad_norm": 0.6795067191123962, + "learning_rate": 3.4000000000000007e-05, + "loss": 2.5962, + "step": 18 + }, + { + "epoch": 0.0047747691147829365, + "grad_norm": 0.5411877036094666, + "learning_rate": 3.6e-05, + "loss": 2.4962, + "step": 19 + }, + { + "epoch": 0.005026072752403091, + "grad_norm": 0.5864161252975464, + "learning_rate": 3.8e-05, + "loss": 2.4219, + "step": 20 + }, + { + "epoch": 0.005277376390023246, + "grad_norm": 0.5454627871513367, + "learning_rate": 4e-05, + "loss": 2.8872, + "step": 21 + }, + { + "epoch": 0.0055286800276434, + "grad_norm": 0.4071284532546997, + "learning_rate": 4.2e-05, + "loss": 2.4573, + "step": 22 + }, + { + "epoch": 0.005779983665263555, + "grad_norm": 0.2068735957145691, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.8764, + "step": 23 + }, + { + "epoch": 0.006031287302883709, + "grad_norm": 0.5132932662963867, + "learning_rate": 4.600000000000001e-05, + "loss": 2.4306, + "step": 24 + }, + { + "epoch": 0.006282590940503863, + "grad_norm": 0.3241199851036072, + "learning_rate": 4.8e-05, + "loss": 2.1521, + "step": 25 + }, + { + "epoch": 0.0065338945781240184, + "grad_norm": 0.4157683551311493, + "learning_rate": 5e-05, + "loss": 2.7139, + "step": 26 + }, + { + "epoch": 0.006785198215744173, + "grad_norm": 0.5854943990707397, + "learning_rate": 5.2000000000000004e-05, + "loss": 2.2383, + "step": 27 + }, + { + "epoch": 0.007036501853364328, + "grad_norm": 0.5483741760253906, + "learning_rate": 5.4000000000000005e-05, + "loss": 2.4569, + "step": 28 + }, + { + "epoch": 0.007287805490984482, + "grad_norm": 0.41493722796440125, + "learning_rate": 5.6000000000000006e-05, + "loss": 2.9473, + "step": 29 + }, + { + "epoch": 0.007539109128604637, + "grad_norm": 0.3696433901786804, + "learning_rate": 5.8e-05, + "loss": 2.2744, + "step": 30 + }, + { + "epoch": 0.007790412766224791, + "grad_norm": 0.18437762558460236, + "learning_rate": 6e-05, + "loss": 1.0493, + "step": 31 + }, + { + "epoch": 0.008041716403844946, + "grad_norm": 0.46951478719711304, + "learning_rate": 6.2e-05, + "loss": 2.4904, + "step": 32 + }, + { + "epoch": 0.0082930200414651, + "grad_norm": 0.32901376485824585, + "learning_rate": 6.400000000000001e-05, + "loss": 2.0627, + "step": 33 + }, + { + "epoch": 0.008544323679085255, + "grad_norm": 0.668229877948761, + "learning_rate": 6.6e-05, + "loss": 2.3694, + "step": 34 + }, + { + "epoch": 0.008795627316705409, + "grad_norm": 0.9680635333061218, + "learning_rate": 6.800000000000001e-05, + "loss": 2.3719, + "step": 35 + }, + { + "epoch": 0.009046930954325565, + "grad_norm": 0.6750502586364746, + "learning_rate": 7e-05, + "loss": 2.3538, + "step": 36 + }, + { + "epoch": 0.009298234591945719, + "grad_norm": 0.1759599894285202, + "learning_rate": 7.2e-05, + "loss": 1.0368, + "step": 37 + }, + { + "epoch": 0.009549538229565873, + "grad_norm": 0.5437096953392029, + "learning_rate": 7.4e-05, + "loss": 2.7952, + "step": 38 + }, + { + "epoch": 0.009800841867186027, + "grad_norm": 0.31723931431770325, + "learning_rate": 7.6e-05, + "loss": 2.4566, + "step": 39 + }, + { + "epoch": 0.010052145504806181, + "grad_norm": 0.34227266907691956, + "learning_rate": 7.800000000000001e-05, + "loss": 2.281, + "step": 40 + }, + { + "epoch": 0.010303449142426337, + "grad_norm": 0.3893303871154785, + "learning_rate": 8e-05, + "loss": 2.7128, + "step": 41 + }, + { + "epoch": 0.010554752780046492, + "grad_norm": 0.32380804419517517, + "learning_rate": 8.2e-05, + "loss": 2.7928, + "step": 42 + }, + { + "epoch": 0.010806056417666646, + "grad_norm": 0.29380202293395996, + "learning_rate": 8.4e-05, + "loss": 2.305, + "step": 43 + }, + { + "epoch": 0.0110573600552868, + "grad_norm": 0.4324714243412018, + "learning_rate": 8.6e-05, + "loss": 2.4824, + "step": 44 + }, + { + "epoch": 0.011308663692906954, + "grad_norm": 0.4421226978302002, + "learning_rate": 8.800000000000001e-05, + "loss": 2.1697, + "step": 45 + }, + { + "epoch": 0.01155996733052711, + "grad_norm": 0.5161215662956238, + "learning_rate": 9e-05, + "loss": 2.4065, + "step": 46 + }, + { + "epoch": 0.011811270968147264, + "grad_norm": 0.3017684817314148, + "learning_rate": 9.200000000000001e-05, + "loss": 2.4247, + "step": 47 + }, + { + "epoch": 0.012062574605767418, + "grad_norm": 0.5015223622322083, + "learning_rate": 9.4e-05, + "loss": 2.3724, + "step": 48 + }, + { + "epoch": 0.012313878243387573, + "grad_norm": 0.3141859471797943, + "learning_rate": 9.6e-05, + "loss": 1.8871, + "step": 49 + }, + { + "epoch": 0.012565181881007727, + "grad_norm": 0.1826818883419037, + "learning_rate": 9.8e-05, + "loss": 1.9633, + "step": 50 + }, + { + "epoch": 0.012816485518627883, + "grad_norm": 0.37197422981262207, + "learning_rate": 0.0001, + "loss": 2.4161, + "step": 51 + }, + { + "epoch": 0.013067789156248037, + "grad_norm": 0.3732236623764038, + "learning_rate": 9.99999840163606e-05, + "loss": 2.2943, + "step": 52 + }, + { + "epoch": 0.013319092793868191, + "grad_norm": 0.5340951681137085, + "learning_rate": 9.99999360654526e-05, + "loss": 2.2039, + "step": 53 + }, + { + "epoch": 0.013570396431488345, + "grad_norm": 0.394661545753479, + "learning_rate": 9.999985614730664e-05, + "loss": 2.3037, + "step": 54 + }, + { + "epoch": 0.013821700069108501, + "grad_norm": 0.3042058050632477, + "learning_rate": 9.999974426197384e-05, + "loss": 2.5117, + "step": 55 + }, + { + "epoch": 0.014073003706728655, + "grad_norm": 0.362106591463089, + "learning_rate": 9.999960040952574e-05, + "loss": 2.6846, + "step": 56 + }, + { + "epoch": 0.01432430734434881, + "grad_norm": 0.728489875793457, + "learning_rate": 9.99994245900543e-05, + "loss": 2.4936, + "step": 57 + }, + { + "epoch": 0.014575610981968964, + "grad_norm": 0.2623450756072998, + "learning_rate": 9.999921680367191e-05, + "loss": 2.6129, + "step": 58 + }, + { + "epoch": 0.014826914619589118, + "grad_norm": 0.24758170545101166, + "learning_rate": 9.999897705051145e-05, + "loss": 2.1818, + "step": 59 + }, + { + "epoch": 0.015078218257209274, + "grad_norm": 0.3045499324798584, + "learning_rate": 9.99987053307262e-05, + "loss": 2.3918, + "step": 60 + }, + { + "epoch": 0.015329521894829428, + "grad_norm": 0.3922071158885956, + "learning_rate": 9.999840164448984e-05, + "loss": 2.2996, + "step": 61 + }, + { + "epoch": 0.015580825532449582, + "grad_norm": 0.20261235535144806, + "learning_rate": 9.999806599199659e-05, + "loss": 1.9872, + "step": 62 + }, + { + "epoch": 0.015832129170069736, + "grad_norm": 0.3178957402706146, + "learning_rate": 9.999769837346103e-05, + "loss": 2.4412, + "step": 63 + }, + { + "epoch": 0.016083432807689892, + "grad_norm": 0.304047167301178, + "learning_rate": 9.999729878911816e-05, + "loss": 2.705, + "step": 64 + }, + { + "epoch": 0.016334736445310045, + "grad_norm": 0.5113909840583801, + "learning_rate": 9.99968672392235e-05, + "loss": 2.533, + "step": 65 + }, + { + "epoch": 0.0165860400829302, + "grad_norm": 0.29634010791778564, + "learning_rate": 9.999640372405295e-05, + "loss": 2.4283, + "step": 66 + }, + { + "epoch": 0.016837343720550357, + "grad_norm": 0.3125896453857422, + "learning_rate": 9.999590824390281e-05, + "loss": 2.4421, + "step": 67 + }, + { + "epoch": 0.01708864735817051, + "grad_norm": 0.38444283604621887, + "learning_rate": 9.999538079908993e-05, + "loss": 2.1928, + "step": 68 + }, + { + "epoch": 0.017339950995790665, + "grad_norm": 0.3269999325275421, + "learning_rate": 9.999482138995149e-05, + "loss": 2.6259, + "step": 69 + }, + { + "epoch": 0.017591254633410817, + "grad_norm": 0.30345460772514343, + "learning_rate": 9.999423001684513e-05, + "loss": 2.6393, + "step": 70 + }, + { + "epoch": 0.017842558271030973, + "grad_norm": 1.1956677436828613, + "learning_rate": 9.9993606680149e-05, + "loss": 2.9455, + "step": 71 + }, + { + "epoch": 0.01809386190865113, + "grad_norm": 0.4432305097579956, + "learning_rate": 9.999295138026157e-05, + "loss": 2.3158, + "step": 72 + }, + { + "epoch": 0.018345165546271282, + "grad_norm": 0.24386799335479736, + "learning_rate": 9.999226411760185e-05, + "loss": 2.6777, + "step": 73 + }, + { + "epoch": 0.018596469183891438, + "grad_norm": 0.34685081243515015, + "learning_rate": 9.99915448926092e-05, + "loss": 2.4423, + "step": 74 + }, + { + "epoch": 0.01884777282151159, + "grad_norm": 0.5708550214767456, + "learning_rate": 9.999079370574347e-05, + "loss": 2.2399, + "step": 75 + }, + { + "epoch": 0.019099076459131746, + "grad_norm": 0.2775181829929352, + "learning_rate": 9.999001055748492e-05, + "loss": 1.4073, + "step": 76 + }, + { + "epoch": 0.019350380096751902, + "grad_norm": 0.44185924530029297, + "learning_rate": 9.998919544833427e-05, + "loss": 2.2447, + "step": 77 + }, + { + "epoch": 0.019601683734372054, + "grad_norm": 0.35135698318481445, + "learning_rate": 9.998834837881263e-05, + "loss": 2.4701, + "step": 78 + }, + { + "epoch": 0.01985298737199221, + "grad_norm": 0.2450733631849289, + "learning_rate": 9.998746934946159e-05, + "loss": 2.1066, + "step": 79 + }, + { + "epoch": 0.020104291009612363, + "grad_norm": 0.3851965367794037, + "learning_rate": 9.998655836084316e-05, + "loss": 2.2371, + "step": 80 + }, + { + "epoch": 0.02035559464723252, + "grad_norm": 0.2562990188598633, + "learning_rate": 9.998561541353976e-05, + "loss": 2.5131, + "step": 81 + }, + { + "epoch": 0.020606898284852675, + "grad_norm": 0.35332947969436646, + "learning_rate": 9.998464050815426e-05, + "loss": 2.4474, + "step": 82 + }, + { + "epoch": 0.020858201922472827, + "grad_norm": 0.3566109836101532, + "learning_rate": 9.998363364530998e-05, + "loss": 2.5007, + "step": 83 + }, + { + "epoch": 0.021109505560092983, + "grad_norm": 0.27204859256744385, + "learning_rate": 9.998259482565063e-05, + "loss": 2.221, + "step": 84 + }, + { + "epoch": 0.021360809197713135, + "grad_norm": 0.1774623841047287, + "learning_rate": 9.998152404984036e-05, + "loss": 1.5693, + "step": 85 + }, + { + "epoch": 0.02161211283533329, + "grad_norm": 0.31386587023735046, + "learning_rate": 9.998042131856382e-05, + "loss": 2.1838, + "step": 86 + }, + { + "epoch": 0.021863416472953447, + "grad_norm": 0.4387069344520569, + "learning_rate": 9.997928663252601e-05, + "loss": 2.2176, + "step": 87 + }, + { + "epoch": 0.0221147201105736, + "grad_norm": 0.5427992343902588, + "learning_rate": 9.997811999245236e-05, + "loss": 2.2632, + "step": 88 + }, + { + "epoch": 0.022366023748193756, + "grad_norm": 0.14437389373779297, + "learning_rate": 9.997692139908879e-05, + "loss": 1.0033, + "step": 89 + }, + { + "epoch": 0.022617327385813908, + "grad_norm": 0.18704228103160858, + "learning_rate": 9.99756908532016e-05, + "loss": 1.1516, + "step": 90 + }, + { + "epoch": 0.022868631023434064, + "grad_norm": 0.23126636445522308, + "learning_rate": 9.997442835557753e-05, + "loss": 1.9892, + "step": 91 + }, + { + "epoch": 0.02311993466105422, + "grad_norm": 0.2967139780521393, + "learning_rate": 9.997313390702377e-05, + "loss": 2.0742, + "step": 92 + }, + { + "epoch": 0.023371238298674372, + "grad_norm": 0.25964412093162537, + "learning_rate": 9.997180750836792e-05, + "loss": 2.4486, + "step": 93 + }, + { + "epoch": 0.02362254193629453, + "grad_norm": 0.21999791264533997, + "learning_rate": 9.997044916045799e-05, + "loss": 1.0311, + "step": 94 + }, + { + "epoch": 0.02387384557391468, + "grad_norm": 0.22438450157642365, + "learning_rate": 9.996905886416244e-05, + "loss": 1.2788, + "step": 95 + }, + { + "epoch": 0.024125149211534837, + "grad_norm": 0.3627947270870209, + "learning_rate": 9.996763662037014e-05, + "loss": 2.2382, + "step": 96 + }, + { + "epoch": 0.024376452849154993, + "grad_norm": 0.2582647502422333, + "learning_rate": 9.996618242999042e-05, + "loss": 2.2999, + "step": 97 + }, + { + "epoch": 0.024627756486775145, + "grad_norm": 0.34755995869636536, + "learning_rate": 9.9964696293953e-05, + "loss": 2.2329, + "step": 98 + }, + { + "epoch": 0.0248790601243953, + "grad_norm": 0.15312296152114868, + "learning_rate": 9.996317821320802e-05, + "loss": 0.6043, + "step": 99 + }, + { + "epoch": 0.025130363762015454, + "grad_norm": 0.22593067586421967, + "learning_rate": 9.996162818872607e-05, + "loss": 2.3993, + "step": 100 + }, + { + "epoch": 0.02538166739963561, + "grad_norm": 0.3924887776374817, + "learning_rate": 9.996004622149814e-05, + "loss": 2.3133, + "step": 101 + }, + { + "epoch": 0.025632971037255765, + "grad_norm": 0.5138020515441895, + "learning_rate": 9.995843231253569e-05, + "loss": 2.4782, + "step": 102 + }, + { + "epoch": 0.025884274674875918, + "grad_norm": 0.42053958773612976, + "learning_rate": 9.995678646287053e-05, + "loss": 2.0569, + "step": 103 + }, + { + "epoch": 0.026135578312496074, + "grad_norm": 0.2818872034549713, + "learning_rate": 9.995510867355494e-05, + "loss": 2.7051, + "step": 104 + }, + { + "epoch": 0.02638688195011623, + "grad_norm": 0.4185803532600403, + "learning_rate": 9.995339894566158e-05, + "loss": 2.2182, + "step": 105 + }, + { + "epoch": 0.026638185587736382, + "grad_norm": 0.3512636721134186, + "learning_rate": 9.995165728028359e-05, + "loss": 2.5701, + "step": 106 + }, + { + "epoch": 0.026889489225356538, + "grad_norm": 0.3916986584663391, + "learning_rate": 9.994988367853451e-05, + "loss": 2.034, + "step": 107 + }, + { + "epoch": 0.02714079286297669, + "grad_norm": 0.3517094850540161, + "learning_rate": 9.994807814154824e-05, + "loss": 2.0577, + "step": 108 + }, + { + "epoch": 0.027392096500596846, + "grad_norm": 0.3244116008281708, + "learning_rate": 9.994624067047917e-05, + "loss": 2.1661, + "step": 109 + }, + { + "epoch": 0.027643400138217002, + "grad_norm": 0.42575743794441223, + "learning_rate": 9.994437126650207e-05, + "loss": 2.5129, + "step": 110 + }, + { + "epoch": 0.027894703775837155, + "grad_norm": 0.1691725254058838, + "learning_rate": 9.994246993081213e-05, + "loss": 1.5939, + "step": 111 + }, + { + "epoch": 0.02814600741345731, + "grad_norm": 0.33062514662742615, + "learning_rate": 9.994053666462498e-05, + "loss": 2.222, + "step": 112 + }, + { + "epoch": 0.028397311051077463, + "grad_norm": 0.3275945484638214, + "learning_rate": 9.993857146917662e-05, + "loss": 2.3336, + "step": 113 + }, + { + "epoch": 0.02864861468869762, + "grad_norm": 0.3942461907863617, + "learning_rate": 9.993657434572353e-05, + "loss": 2.3289, + "step": 114 + }, + { + "epoch": 0.028899918326317775, + "grad_norm": 0.3180122971534729, + "learning_rate": 9.993454529554251e-05, + "loss": 2.4353, + "step": 115 + }, + { + "epoch": 0.029151221963937927, + "grad_norm": 0.7072325944900513, + "learning_rate": 9.993248431993086e-05, + "loss": 1.9157, + "step": 116 + }, + { + "epoch": 0.029402525601558083, + "grad_norm": 0.32835853099823, + "learning_rate": 9.993039142020622e-05, + "loss": 2.8819, + "step": 117 + }, + { + "epoch": 0.029653829239178236, + "grad_norm": 0.2762772738933563, + "learning_rate": 9.992826659770672e-05, + "loss": 2.504, + "step": 118 + }, + { + "epoch": 0.029905132876798392, + "grad_norm": 0.2669197618961334, + "learning_rate": 9.992610985379082e-05, + "loss": 2.3047, + "step": 119 + }, + { + "epoch": 0.030156436514418548, + "grad_norm": 0.47167858481407166, + "learning_rate": 9.992392118983746e-05, + "loss": 2.2432, + "step": 120 + }, + { + "epoch": 0.0304077401520387, + "grad_norm": 0.18089920282363892, + "learning_rate": 9.992170060724593e-05, + "loss": 1.5848, + "step": 121 + }, + { + "epoch": 0.030659043789658856, + "grad_norm": 0.33109933137893677, + "learning_rate": 9.991944810743597e-05, + "loss": 2.3503, + "step": 122 + }, + { + "epoch": 0.03091034742727901, + "grad_norm": 0.24957267940044403, + "learning_rate": 9.991716369184765e-05, + "loss": 2.2894, + "step": 123 + }, + { + "epoch": 0.031161651064899164, + "grad_norm": 0.3461008667945862, + "learning_rate": 9.991484736194157e-05, + "loss": 2.2987, + "step": 124 + }, + { + "epoch": 0.03141295470251932, + "grad_norm": 0.2855774760246277, + "learning_rate": 9.991249911919862e-05, + "loss": 2.5193, + "step": 125 + }, + { + "epoch": 0.03166425834013947, + "grad_norm": 0.30880868434906006, + "learning_rate": 9.991011896512016e-05, + "loss": 2.5596, + "step": 126 + }, + { + "epoch": 0.031915561977759625, + "grad_norm": 0.4237726032733917, + "learning_rate": 9.990770690122793e-05, + "loss": 2.1401, + "step": 127 + }, + { + "epoch": 0.032166865615379785, + "grad_norm": 0.2743227183818817, + "learning_rate": 9.990526292906405e-05, + "loss": 1.858, + "step": 128 + }, + { + "epoch": 0.03241816925299994, + "grad_norm": 0.28934118151664734, + "learning_rate": 9.99027870501911e-05, + "loss": 1.9751, + "step": 129 + }, + { + "epoch": 0.03266947289062009, + "grad_norm": 0.4869619905948639, + "learning_rate": 9.990027926619197e-05, + "loss": 2.7332, + "step": 130 + }, + { + "epoch": 0.03292077652824025, + "grad_norm": 0.2246621996164322, + "learning_rate": 9.989773957867006e-05, + "loss": 2.2951, + "step": 131 + }, + { + "epoch": 0.0331720801658604, + "grad_norm": 0.46311891078948975, + "learning_rate": 9.989516798924908e-05, + "loss": 2.0357, + "step": 132 + }, + { + "epoch": 0.033423383803480554, + "grad_norm": 0.45459482073783875, + "learning_rate": 9.989256449957316e-05, + "loss": 2.2661, + "step": 133 + }, + { + "epoch": 0.03367468744110071, + "grad_norm": 0.4114730656147003, + "learning_rate": 9.988992911130683e-05, + "loss": 2.096, + "step": 134 + }, + { + "epoch": 0.033925991078720866, + "grad_norm": 0.468787282705307, + "learning_rate": 9.988726182613502e-05, + "loss": 2.2083, + "step": 135 + }, + { + "epoch": 0.03417729471634102, + "grad_norm": 0.30824795365333557, + "learning_rate": 9.988456264576305e-05, + "loss": 2.4525, + "step": 136 + }, + { + "epoch": 0.03442859835396117, + "grad_norm": 0.22959741950035095, + "learning_rate": 9.988183157191662e-05, + "loss": 2.4861, + "step": 137 + }, + { + "epoch": 0.03467990199158133, + "grad_norm": 0.37763601541519165, + "learning_rate": 9.987906860634184e-05, + "loss": 2.4736, + "step": 138 + }, + { + "epoch": 0.03493120562920148, + "grad_norm": 0.3474363386631012, + "learning_rate": 9.987627375080519e-05, + "loss": 2.4363, + "step": 139 + }, + { + "epoch": 0.035182509266821635, + "grad_norm": 0.3558574616909027, + "learning_rate": 9.987344700709356e-05, + "loss": 2.0973, + "step": 140 + }, + { + "epoch": 0.035433812904441794, + "grad_norm": 0.19336606562137604, + "learning_rate": 9.98705883770142e-05, + "loss": 1.3915, + "step": 141 + }, + { + "epoch": 0.03568511654206195, + "grad_norm": 0.4760534465312958, + "learning_rate": 9.986769786239477e-05, + "loss": 2.0163, + "step": 142 + }, + { + "epoch": 0.0359364201796821, + "grad_norm": 0.2839144170284271, + "learning_rate": 9.98647754650833e-05, + "loss": 1.7691, + "step": 143 + }, + { + "epoch": 0.03618772381730226, + "grad_norm": 0.3936741352081299, + "learning_rate": 9.986182118694825e-05, + "loss": 2.1423, + "step": 144 + }, + { + "epoch": 0.03643902745492241, + "grad_norm": 0.35313600301742554, + "learning_rate": 9.985883502987838e-05, + "loss": 2.3156, + "step": 145 + }, + { + "epoch": 0.036690331092542564, + "grad_norm": 0.3352813720703125, + "learning_rate": 9.985581699578287e-05, + "loss": 2.2992, + "step": 146 + }, + { + "epoch": 0.036941634730162716, + "grad_norm": 0.43075451254844666, + "learning_rate": 9.985276708659134e-05, + "loss": 2.7181, + "step": 147 + }, + { + "epoch": 0.037192938367782875, + "grad_norm": 0.4140123128890991, + "learning_rate": 9.984968530425369e-05, + "loss": 2.2454, + "step": 148 + }, + { + "epoch": 0.03744424200540303, + "grad_norm": 0.31063738465309143, + "learning_rate": 9.984657165074027e-05, + "loss": 2.4994, + "step": 149 + }, + { + "epoch": 0.03769554564302318, + "grad_norm": 0.22530898451805115, + "learning_rate": 9.984342612804176e-05, + "loss": 2.0645, + "step": 150 + }, + { + "epoch": 0.03794684928064334, + "grad_norm": 0.2811500132083893, + "learning_rate": 9.984024873816924e-05, + "loss": 2.3546, + "step": 151 + }, + { + "epoch": 0.03819815291826349, + "grad_norm": 0.4520877003669739, + "learning_rate": 9.983703948315417e-05, + "loss": 2.1208, + "step": 152 + }, + { + "epoch": 0.038449456555883645, + "grad_norm": 0.37516894936561584, + "learning_rate": 9.983379836504838e-05, + "loss": 2.3911, + "step": 153 + }, + { + "epoch": 0.038700760193503804, + "grad_norm": 0.4740954637527466, + "learning_rate": 9.983052538592404e-05, + "loss": 2.4538, + "step": 154 + }, + { + "epoch": 0.038952063831123956, + "grad_norm": 0.34146809577941895, + "learning_rate": 9.982722054787372e-05, + "loss": 2.5654, + "step": 155 + }, + { + "epoch": 0.03920336746874411, + "grad_norm": 0.3912610709667206, + "learning_rate": 9.982388385301038e-05, + "loss": 2.4403, + "step": 156 + }, + { + "epoch": 0.03945467110636426, + "grad_norm": 0.5822238922119141, + "learning_rate": 9.98205153034673e-05, + "loss": 2.3008, + "step": 157 + }, + { + "epoch": 0.03970597474398442, + "grad_norm": 0.1491788774728775, + "learning_rate": 9.981711490139814e-05, + "loss": 1.1215, + "step": 158 + }, + { + "epoch": 0.03995727838160457, + "grad_norm": 0.3438681960105896, + "learning_rate": 9.981368264897694e-05, + "loss": 2.7011, + "step": 159 + }, + { + "epoch": 0.040208582019224726, + "grad_norm": 0.3378736972808838, + "learning_rate": 9.98102185483981e-05, + "loss": 2.4961, + "step": 160 + }, + { + "epoch": 0.040459885656844885, + "grad_norm": 0.2592676281929016, + "learning_rate": 9.980672260187638e-05, + "loss": 1.8838, + "step": 161 + }, + { + "epoch": 0.04071118929446504, + "grad_norm": 0.31087106466293335, + "learning_rate": 9.980319481164688e-05, + "loss": 2.2719, + "step": 162 + }, + { + "epoch": 0.04096249293208519, + "grad_norm": 0.33119067549705505, + "learning_rate": 9.979963517996509e-05, + "loss": 2.0298, + "step": 163 + }, + { + "epoch": 0.04121379656970535, + "grad_norm": 0.4013742506504059, + "learning_rate": 9.979604370910685e-05, + "loss": 2.5852, + "step": 164 + }, + { + "epoch": 0.0414651002073255, + "grad_norm": 0.21013818681240082, + "learning_rate": 9.979242040136835e-05, + "loss": 1.1489, + "step": 165 + }, + { + "epoch": 0.041716403844945654, + "grad_norm": 0.5467535257339478, + "learning_rate": 9.978876525906613e-05, + "loss": 2.2094, + "step": 166 + }, + { + "epoch": 0.04196770748256581, + "grad_norm": 0.3550753593444824, + "learning_rate": 9.978507828453708e-05, + "loss": 2.2849, + "step": 167 + }, + { + "epoch": 0.042219011120185966, + "grad_norm": 0.2620997726917267, + "learning_rate": 9.978135948013847e-05, + "loss": 2.3229, + "step": 168 + }, + { + "epoch": 0.04247031475780612, + "grad_norm": 0.2706509530544281, + "learning_rate": 9.977760884824788e-05, + "loss": 2.2731, + "step": 169 + }, + { + "epoch": 0.04272161839542627, + "grad_norm": 0.41886723041534424, + "learning_rate": 9.977382639126328e-05, + "loss": 2.42, + "step": 170 + }, + { + "epoch": 0.04297292203304643, + "grad_norm": 0.33128440380096436, + "learning_rate": 9.977001211160296e-05, + "loss": 2.3203, + "step": 171 + }, + { + "epoch": 0.04322422567066658, + "grad_norm": 0.24908378720283508, + "learning_rate": 9.976616601170557e-05, + "loss": 1.8275, + "step": 172 + }, + { + "epoch": 0.043475529308286735, + "grad_norm": 0.3278106451034546, + "learning_rate": 9.976228809403008e-05, + "loss": 2.6484, + "step": 173 + }, + { + "epoch": 0.043726832945906895, + "grad_norm": 0.5737075209617615, + "learning_rate": 9.975837836105581e-05, + "loss": 2.2196, + "step": 174 + }, + { + "epoch": 0.04397813658352705, + "grad_norm": 0.3396255373954773, + "learning_rate": 9.975443681528247e-05, + "loss": 2.1756, + "step": 175 + }, + { + "epoch": 0.0442294402211472, + "grad_norm": 0.4173082411289215, + "learning_rate": 9.975046345923004e-05, + "loss": 1.9394, + "step": 176 + }, + { + "epoch": 0.04448074385876736, + "grad_norm": 0.2630142867565155, + "learning_rate": 9.974645829543889e-05, + "loss": 1.9798, + "step": 177 + }, + { + "epoch": 0.04473204749638751, + "grad_norm": 0.23479297757148743, + "learning_rate": 9.974242132646967e-05, + "loss": 1.8764, + "step": 178 + }, + { + "epoch": 0.044983351134007664, + "grad_norm": 0.2806346118450165, + "learning_rate": 9.973835255490343e-05, + "loss": 2.2942, + "step": 179 + }, + { + "epoch": 0.045234654771627816, + "grad_norm": 0.2745998203754425, + "learning_rate": 9.97342519833415e-05, + "loss": 2.1606, + "step": 180 + }, + { + "epoch": 0.045485958409247976, + "grad_norm": 0.46454527974128723, + "learning_rate": 9.973011961440559e-05, + "loss": 2.2312, + "step": 181 + }, + { + "epoch": 0.04573726204686813, + "grad_norm": 0.34017616510391235, + "learning_rate": 9.972595545073769e-05, + "loss": 2.3171, + "step": 182 + }, + { + "epoch": 0.04598856568448828, + "grad_norm": 0.34439125657081604, + "learning_rate": 9.972175949500012e-05, + "loss": 2.3098, + "step": 183 + }, + { + "epoch": 0.04623986932210844, + "grad_norm": 0.47229647636413574, + "learning_rate": 9.97175317498756e-05, + "loss": 2.1515, + "step": 184 + }, + { + "epoch": 0.04649117295972859, + "grad_norm": 0.3148500621318817, + "learning_rate": 9.971327221806706e-05, + "loss": 2.5827, + "step": 185 + }, + { + "epoch": 0.046742476597348745, + "grad_norm": 0.20196139812469482, + "learning_rate": 9.970898090229785e-05, + "loss": 1.5597, + "step": 186 + }, + { + "epoch": 0.046993780234968904, + "grad_norm": 0.26498332619667053, + "learning_rate": 9.97046578053116e-05, + "loss": 2.4454, + "step": 187 + }, + { + "epoch": 0.04724508387258906, + "grad_norm": 0.4051288068294525, + "learning_rate": 9.970030292987225e-05, + "loss": 2.3758, + "step": 188 + }, + { + "epoch": 0.04749638751020921, + "grad_norm": 0.3003399074077606, + "learning_rate": 9.969591627876409e-05, + "loss": 2.691, + "step": 189 + }, + { + "epoch": 0.04774769114782936, + "grad_norm": 0.22101780772209167, + "learning_rate": 9.96914978547917e-05, + "loss": 2.361, + "step": 190 + }, + { + "epoch": 0.04799899478544952, + "grad_norm": 0.342926025390625, + "learning_rate": 9.968704766077997e-05, + "loss": 2.4845, + "step": 191 + }, + { + "epoch": 0.048250298423069674, + "grad_norm": 0.3619941174983978, + "learning_rate": 9.968256569957411e-05, + "loss": 2.6731, + "step": 192 + }, + { + "epoch": 0.048501602060689826, + "grad_norm": 0.4262664020061493, + "learning_rate": 9.967805197403965e-05, + "loss": 2.6423, + "step": 193 + }, + { + "epoch": 0.048752905698309985, + "grad_norm": 0.21453841030597687, + "learning_rate": 9.96735064870624e-05, + "loss": 1.0993, + "step": 194 + }, + { + "epoch": 0.04900420933593014, + "grad_norm": 0.5756468176841736, + "learning_rate": 9.966892924154853e-05, + "loss": 2.1122, + "step": 195 + }, + { + "epoch": 0.04925551297355029, + "grad_norm": 0.5691533088684082, + "learning_rate": 9.96643202404245e-05, + "loss": 2.3865, + "step": 196 + }, + { + "epoch": 0.04950681661117045, + "grad_norm": 0.4420826733112335, + "learning_rate": 9.965967948663698e-05, + "loss": 2.1786, + "step": 197 + }, + { + "epoch": 0.0497581202487906, + "grad_norm": 0.4547330141067505, + "learning_rate": 9.965500698315306e-05, + "loss": 2.5167, + "step": 198 + }, + { + "epoch": 0.050009423886410755, + "grad_norm": 0.18379782140254974, + "learning_rate": 9.96503027329601e-05, + "loss": 1.0926, + "step": 199 + }, + { + "epoch": 0.05026072752403091, + "grad_norm": 0.20280440151691437, + "learning_rate": 9.964556673906572e-05, + "loss": 2.0984, + "step": 200 + }, + { + "epoch": 0.050512031161651066, + "grad_norm": 0.5554583072662354, + "learning_rate": 9.964079900449785e-05, + "loss": 2.8414, + "step": 201 + }, + { + "epoch": 0.05076333479927122, + "grad_norm": 0.3430071473121643, + "learning_rate": 9.963599953230473e-05, + "loss": 2.4415, + "step": 202 + }, + { + "epoch": 0.05101463843689137, + "grad_norm": 0.34130024909973145, + "learning_rate": 9.96311683255549e-05, + "loss": 2.1872, + "step": 203 + }, + { + "epoch": 0.05126594207451153, + "grad_norm": 0.3914225995540619, + "learning_rate": 9.962630538733715e-05, + "loss": 2.3896, + "step": 204 + }, + { + "epoch": 0.05151724571213168, + "grad_norm": 0.6636744141578674, + "learning_rate": 9.962141072076057e-05, + "loss": 2.5256, + "step": 205 + }, + { + "epoch": 0.051768549349751836, + "grad_norm": 0.419360876083374, + "learning_rate": 9.961648432895454e-05, + "loss": 2.1897, + "step": 206 + }, + { + "epoch": 0.052019852987371995, + "grad_norm": 0.3426320552825928, + "learning_rate": 9.961152621506876e-05, + "loss": 2.433, + "step": 207 + }, + { + "epoch": 0.05227115662499215, + "grad_norm": 0.24097320437431335, + "learning_rate": 9.960653638227315e-05, + "loss": 1.899, + "step": 208 + }, + { + "epoch": 0.0525224602626123, + "grad_norm": 0.40818917751312256, + "learning_rate": 9.960151483375795e-05, + "loss": 2.4313, + "step": 209 + }, + { + "epoch": 0.05277376390023246, + "grad_norm": 0.34848183393478394, + "learning_rate": 9.959646157273366e-05, + "loss": 1.9527, + "step": 210 + }, + { + "epoch": 0.05302506753785261, + "grad_norm": 0.18201400339603424, + "learning_rate": 9.959137660243105e-05, + "loss": 1.8784, + "step": 211 + }, + { + "epoch": 0.053276371175472764, + "grad_norm": 0.3388751745223999, + "learning_rate": 9.95862599261012e-05, + "loss": 2.4083, + "step": 212 + }, + { + "epoch": 0.05352767481309292, + "grad_norm": 0.2567930519580841, + "learning_rate": 9.958111154701542e-05, + "loss": 1.9071, + "step": 213 + }, + { + "epoch": 0.053778978450713076, + "grad_norm": 0.39157772064208984, + "learning_rate": 9.957593146846529e-05, + "loss": 2.072, + "step": 214 + }, + { + "epoch": 0.05403028208833323, + "grad_norm": 0.40348386764526367, + "learning_rate": 9.95707196937627e-05, + "loss": 2.427, + "step": 215 + }, + { + "epoch": 0.05428158572595338, + "grad_norm": 0.47843194007873535, + "learning_rate": 9.956547622623973e-05, + "loss": 2.0144, + "step": 216 + }, + { + "epoch": 0.05453288936357354, + "grad_norm": 0.337272971868515, + "learning_rate": 9.956020106924882e-05, + "loss": 1.7783, + "step": 217 + }, + { + "epoch": 0.05478419300119369, + "grad_norm": 0.40009111166000366, + "learning_rate": 9.955489422616258e-05, + "loss": 2.2961, + "step": 218 + }, + { + "epoch": 0.055035496638813845, + "grad_norm": 0.28611692786216736, + "learning_rate": 9.954955570037395e-05, + "loss": 2.44, + "step": 219 + }, + { + "epoch": 0.055286800276434005, + "grad_norm": 0.3352760374546051, + "learning_rate": 9.954418549529605e-05, + "loss": 2.2449, + "step": 220 + }, + { + "epoch": 0.05553810391405416, + "grad_norm": 0.295691579580307, + "learning_rate": 9.953878361436232e-05, + "loss": 1.9398, + "step": 221 + }, + { + "epoch": 0.05578940755167431, + "grad_norm": 0.41174155473709106, + "learning_rate": 9.953335006102643e-05, + "loss": 2.241, + "step": 222 + }, + { + "epoch": 0.05604071118929446, + "grad_norm": 0.32655176520347595, + "learning_rate": 9.95278848387623e-05, + "loss": 2.078, + "step": 223 + }, + { + "epoch": 0.05629201482691462, + "grad_norm": 0.26872923970222473, + "learning_rate": 9.95223879510641e-05, + "loss": 1.9153, + "step": 224 + }, + { + "epoch": 0.056543318464534774, + "grad_norm": 0.37920647859573364, + "learning_rate": 9.951685940144622e-05, + "loss": 2.5823, + "step": 225 + }, + { + "epoch": 0.056794622102154926, + "grad_norm": 0.18780489265918732, + "learning_rate": 9.951129919344334e-05, + "loss": 0.9544, + "step": 226 + }, + { + "epoch": 0.057045925739775086, + "grad_norm": 0.3492078185081482, + "learning_rate": 9.950570733061033e-05, + "loss": 2.3073, + "step": 227 + }, + { + "epoch": 0.05729722937739524, + "grad_norm": 0.29748043417930603, + "learning_rate": 9.950008381652235e-05, + "loss": 1.6764, + "step": 228 + }, + { + "epoch": 0.05754853301501539, + "grad_norm": 0.30301064252853394, + "learning_rate": 9.949442865477474e-05, + "loss": 2.4839, + "step": 229 + }, + { + "epoch": 0.05779983665263555, + "grad_norm": 1.056869387626648, + "learning_rate": 9.948874184898313e-05, + "loss": 1.9098, + "step": 230 + }, + { + "epoch": 0.0580511402902557, + "grad_norm": 0.33054810762405396, + "learning_rate": 9.948302340278333e-05, + "loss": 2.6708, + "step": 231 + }, + { + "epoch": 0.058302443927875855, + "grad_norm": 0.37433305382728577, + "learning_rate": 9.94772733198314e-05, + "loss": 2.4231, + "step": 232 + }, + { + "epoch": 0.05855374756549601, + "grad_norm": 0.35789650678634644, + "learning_rate": 9.947149160380366e-05, + "loss": 2.1926, + "step": 233 + }, + { + "epoch": 0.05880505120311617, + "grad_norm": 0.3637252151966095, + "learning_rate": 9.94656782583966e-05, + "loss": 2.0955, + "step": 234 + }, + { + "epoch": 0.05905635484073632, + "grad_norm": 0.37392285466194153, + "learning_rate": 9.945983328732698e-05, + "loss": 2.2662, + "step": 235 + }, + { + "epoch": 0.05930765847835647, + "grad_norm": 0.36526796221733093, + "learning_rate": 9.945395669433172e-05, + "loss": 2.4013, + "step": 236 + }, + { + "epoch": 0.05955896211597663, + "grad_norm": 0.3178480267524719, + "learning_rate": 9.944804848316802e-05, + "loss": 2.4751, + "step": 237 + }, + { + "epoch": 0.059810265753596784, + "grad_norm": 0.5924585461616516, + "learning_rate": 9.944210865761328e-05, + "loss": 2.4306, + "step": 238 + }, + { + "epoch": 0.060061569391216936, + "grad_norm": 0.19464784860610962, + "learning_rate": 9.943613722146505e-05, + "loss": 1.7291, + "step": 239 + }, + { + "epoch": 0.060312873028837095, + "grad_norm": 0.43970218300819397, + "learning_rate": 9.943013417854122e-05, + "loss": 2.6384, + "step": 240 + }, + { + "epoch": 0.06056417666645725, + "grad_norm": 0.24550621211528778, + "learning_rate": 9.942409953267972e-05, + "loss": 1.5974, + "step": 241 + }, + { + "epoch": 0.0608154803040774, + "grad_norm": 0.33134451508522034, + "learning_rate": 9.941803328773885e-05, + "loss": 2.4358, + "step": 242 + }, + { + "epoch": 0.06106678394169755, + "grad_norm": 0.5241190791130066, + "learning_rate": 9.941193544759699e-05, + "loss": 2.3483, + "step": 243 + }, + { + "epoch": 0.06131808757931771, + "grad_norm": 0.5123705863952637, + "learning_rate": 9.940580601615279e-05, + "loss": 1.9421, + "step": 244 + }, + { + "epoch": 0.061569391216937865, + "grad_norm": 0.3747979998588562, + "learning_rate": 9.939964499732507e-05, + "loss": 2.207, + "step": 245 + }, + { + "epoch": 0.06182069485455802, + "grad_norm": 0.3442586362361908, + "learning_rate": 9.939345239505284e-05, + "loss": 2.2754, + "step": 246 + }, + { + "epoch": 0.062071998492178176, + "grad_norm": 0.3288307785987854, + "learning_rate": 9.938722821329532e-05, + "loss": 2.3217, + "step": 247 + }, + { + "epoch": 0.06232330212979833, + "grad_norm": 0.33456501364707947, + "learning_rate": 9.938097245603193e-05, + "loss": 2.1507, + "step": 248 + }, + { + "epoch": 0.06257460576741848, + "grad_norm": 0.4627265930175781, + "learning_rate": 9.937468512726223e-05, + "loss": 2.4072, + "step": 249 + }, + { + "epoch": 0.06282590940503864, + "grad_norm": 0.2955438196659088, + "learning_rate": 9.9368366231006e-05, + "loss": 2.452, + "step": 250 + }, + { + "epoch": 0.06307721304265879, + "grad_norm": 0.7906093001365662, + "learning_rate": 9.936201577130324e-05, + "loss": 2.1179, + "step": 251 + }, + { + "epoch": 0.06332851668027895, + "grad_norm": 0.41081616282463074, + "learning_rate": 9.935563375221404e-05, + "loss": 2.0524, + "step": 252 + }, + { + "epoch": 0.0635798203178991, + "grad_norm": 0.18861036002635956, + "learning_rate": 9.934922017781873e-05, + "loss": 1.4761, + "step": 253 + }, + { + "epoch": 0.06383112395551925, + "grad_norm": 0.27071478962898254, + "learning_rate": 9.93427750522178e-05, + "loss": 2.2497, + "step": 254 + }, + { + "epoch": 0.06408242759313941, + "grad_norm": 0.335267573595047, + "learning_rate": 9.933629837953191e-05, + "loss": 2.3577, + "step": 255 + }, + { + "epoch": 0.06433373123075957, + "grad_norm": 0.3902337849140167, + "learning_rate": 9.932979016390192e-05, + "loss": 2.2602, + "step": 256 + }, + { + "epoch": 0.06458503486837971, + "grad_norm": 0.550452470779419, + "learning_rate": 9.932325040948878e-05, + "loss": 2.1322, + "step": 257 + }, + { + "epoch": 0.06483633850599987, + "grad_norm": 0.42229825258255005, + "learning_rate": 9.93166791204737e-05, + "loss": 2.449, + "step": 258 + }, + { + "epoch": 0.06508764214362003, + "grad_norm": 0.2767508924007416, + "learning_rate": 9.931007630105798e-05, + "loss": 2.552, + "step": 259 + }, + { + "epoch": 0.06533894578124018, + "grad_norm": 0.3925112783908844, + "learning_rate": 9.93034419554631e-05, + "loss": 2.5078, + "step": 260 + }, + { + "epoch": 0.06559024941886034, + "grad_norm": 0.38622385263442993, + "learning_rate": 9.929677608793072e-05, + "loss": 2.5889, + "step": 261 + }, + { + "epoch": 0.0658415530564805, + "grad_norm": 0.3186779022216797, + "learning_rate": 9.929007870272262e-05, + "loss": 2.3722, + "step": 262 + }, + { + "epoch": 0.06609285669410064, + "grad_norm": 0.308403342962265, + "learning_rate": 9.928334980412073e-05, + "loss": 2.8672, + "step": 263 + }, + { + "epoch": 0.0663441603317208, + "grad_norm": 0.37506890296936035, + "learning_rate": 9.927658939642716e-05, + "loss": 2.3802, + "step": 264 + }, + { + "epoch": 0.06659546396934096, + "grad_norm": 0.43554747104644775, + "learning_rate": 9.926979748396415e-05, + "loss": 2.1515, + "step": 265 + }, + { + "epoch": 0.06684676760696111, + "grad_norm": 0.4737273156642914, + "learning_rate": 9.926297407107406e-05, + "loss": 2.4804, + "step": 266 + }, + { + "epoch": 0.06709807124458127, + "grad_norm": 0.30481746792793274, + "learning_rate": 9.925611916211943e-05, + "loss": 2.5455, + "step": 267 + }, + { + "epoch": 0.06734937488220143, + "grad_norm": 0.29122671484947205, + "learning_rate": 9.92492327614829e-05, + "loss": 1.9563, + "step": 268 + }, + { + "epoch": 0.06760067851982157, + "grad_norm": 0.2516386806964874, + "learning_rate": 9.924231487356725e-05, + "loss": 2.1242, + "step": 269 + }, + { + "epoch": 0.06785198215744173, + "grad_norm": 0.1990005224943161, + "learning_rate": 9.923536550279544e-05, + "loss": 1.6236, + "step": 270 + }, + { + "epoch": 0.06810328579506188, + "grad_norm": 0.3531213402748108, + "learning_rate": 9.92283846536105e-05, + "loss": 2.3706, + "step": 271 + }, + { + "epoch": 0.06835458943268204, + "grad_norm": 0.358190655708313, + "learning_rate": 9.922137233047558e-05, + "loss": 2.3467, + "step": 272 + }, + { + "epoch": 0.0686058930703022, + "grad_norm": 0.3931328058242798, + "learning_rate": 9.9214328537874e-05, + "loss": 2.359, + "step": 273 + }, + { + "epoch": 0.06885719670792234, + "grad_norm": 0.30483195185661316, + "learning_rate": 9.92072532803092e-05, + "loss": 2.8789, + "step": 274 + }, + { + "epoch": 0.0691085003455425, + "grad_norm": 0.155193030834198, + "learning_rate": 9.920014656230468e-05, + "loss": 1.4218, + "step": 275 + }, + { + "epoch": 0.06935980398316266, + "grad_norm": 0.2718715965747833, + "learning_rate": 9.919300838840409e-05, + "loss": 2.6687, + "step": 276 + }, + { + "epoch": 0.0696111076207828, + "grad_norm": 0.45890912413597107, + "learning_rate": 9.91858387631712e-05, + "loss": 2.4566, + "step": 277 + }, + { + "epoch": 0.06986241125840296, + "grad_norm": 0.39152559638023376, + "learning_rate": 9.917863769118988e-05, + "loss": 2.7171, + "step": 278 + }, + { + "epoch": 0.07011371489602312, + "grad_norm": 0.423405259847641, + "learning_rate": 9.91714051770641e-05, + "loss": 2.4389, + "step": 279 + }, + { + "epoch": 0.07036501853364327, + "grad_norm": 0.2914890944957733, + "learning_rate": 9.916414122541794e-05, + "loss": 2.8702, + "step": 280 + }, + { + "epoch": 0.07061632217126343, + "grad_norm": 0.45607659220695496, + "learning_rate": 9.915684584089557e-05, + "loss": 2.2563, + "step": 281 + }, + { + "epoch": 0.07086762580888359, + "grad_norm": 0.8603537678718567, + "learning_rate": 9.914951902816128e-05, + "loss": 2.0017, + "step": 282 + }, + { + "epoch": 0.07111892944650373, + "grad_norm": 0.34362930059432983, + "learning_rate": 9.91421607918994e-05, + "loss": 2.5167, + "step": 283 + }, + { + "epoch": 0.0713702330841239, + "grad_norm": 0.3992220163345337, + "learning_rate": 9.913477113681441e-05, + "loss": 2.1986, + "step": 284 + }, + { + "epoch": 0.07162153672174405, + "grad_norm": 0.409201443195343, + "learning_rate": 9.912735006763085e-05, + "loss": 1.8954, + "step": 285 + }, + { + "epoch": 0.0718728403593642, + "grad_norm": 0.293445348739624, + "learning_rate": 9.911989758909335e-05, + "loss": 2.3246, + "step": 286 + }, + { + "epoch": 0.07212414399698436, + "grad_norm": 0.3958424925804138, + "learning_rate": 9.911241370596663e-05, + "loss": 2.35, + "step": 287 + }, + { + "epoch": 0.07237544763460452, + "grad_norm": 0.5355744957923889, + "learning_rate": 9.910489842303544e-05, + "loss": 2.3103, + "step": 288 + }, + { + "epoch": 0.07262675127222466, + "grad_norm": 0.4253835678100586, + "learning_rate": 9.909735174510467e-05, + "loss": 2.5327, + "step": 289 + }, + { + "epoch": 0.07287805490984482, + "grad_norm": 0.2699333727359772, + "learning_rate": 9.908977367699926e-05, + "loss": 1.4629, + "step": 290 + }, + { + "epoch": 0.07312935854746497, + "grad_norm": 0.44158613681793213, + "learning_rate": 9.90821642235642e-05, + "loss": 2.1242, + "step": 291 + }, + { + "epoch": 0.07338066218508513, + "grad_norm": 0.33713415265083313, + "learning_rate": 9.907452338966457e-05, + "loss": 2.5233, + "step": 292 + }, + { + "epoch": 0.07363196582270529, + "grad_norm": 0.21191152930259705, + "learning_rate": 9.906685118018549e-05, + "loss": 1.9315, + "step": 293 + }, + { + "epoch": 0.07388326946032543, + "grad_norm": 0.3546162247657776, + "learning_rate": 9.905914760003216e-05, + "loss": 2.3661, + "step": 294 + }, + { + "epoch": 0.07413457309794559, + "grad_norm": 0.33006587624549866, + "learning_rate": 9.905141265412984e-05, + "loss": 1.7989, + "step": 295 + }, + { + "epoch": 0.07438587673556575, + "grad_norm": 0.4529229700565338, + "learning_rate": 9.904364634742385e-05, + "loss": 2.4695, + "step": 296 + }, + { + "epoch": 0.0746371803731859, + "grad_norm": 0.29525983333587646, + "learning_rate": 9.90358486848795e-05, + "loss": 2.1218, + "step": 297 + }, + { + "epoch": 0.07488848401080606, + "grad_norm": 0.273483544588089, + "learning_rate": 9.902801967148219e-05, + "loss": 2.4696, + "step": 298 + }, + { + "epoch": 0.07513978764842622, + "grad_norm": 0.3772584795951843, + "learning_rate": 9.902015931223742e-05, + "loss": 2.3749, + "step": 299 + }, + { + "epoch": 0.07539109128604636, + "grad_norm": 0.15759634971618652, + "learning_rate": 9.901226761217062e-05, + "loss": 0.7498, + "step": 300 + }, + { + "epoch": 0.07564239492366652, + "grad_norm": 0.2802472710609436, + "learning_rate": 9.900434457632734e-05, + "loss": 2.3353, + "step": 301 + }, + { + "epoch": 0.07589369856128668, + "grad_norm": 0.7529959678649902, + "learning_rate": 9.899639020977314e-05, + "loss": 1.7885, + "step": 302 + }, + { + "epoch": 0.07614500219890682, + "grad_norm": 0.3906523585319519, + "learning_rate": 9.89884045175936e-05, + "loss": 2.4677, + "step": 303 + }, + { + "epoch": 0.07639630583652698, + "grad_norm": 0.31307291984558105, + "learning_rate": 9.898038750489433e-05, + "loss": 2.408, + "step": 304 + }, + { + "epoch": 0.07664760947414714, + "grad_norm": 0.3155834674835205, + "learning_rate": 9.897233917680098e-05, + "loss": 2.0308, + "step": 305 + }, + { + "epoch": 0.07689891311176729, + "grad_norm": 0.2838704586029053, + "learning_rate": 9.896425953845923e-05, + "loss": 2.1338, + "step": 306 + }, + { + "epoch": 0.07715021674938745, + "grad_norm": 0.3904295563697815, + "learning_rate": 9.895614859503472e-05, + "loss": 2.4403, + "step": 307 + }, + { + "epoch": 0.07740152038700761, + "grad_norm": 0.27329105138778687, + "learning_rate": 9.89480063517132e-05, + "loss": 1.7174, + "step": 308 + }, + { + "epoch": 0.07765282402462775, + "grad_norm": 0.20863182842731476, + "learning_rate": 9.893983281370034e-05, + "loss": 2.1524, + "step": 309 + }, + { + "epoch": 0.07790412766224791, + "grad_norm": 0.3970401883125305, + "learning_rate": 9.893162798622185e-05, + "loss": 3.0095, + "step": 310 + }, + { + "epoch": 0.07815543129986807, + "grad_norm": 0.3557833731174469, + "learning_rate": 9.892339187452347e-05, + "loss": 2.2407, + "step": 311 + }, + { + "epoch": 0.07840673493748822, + "grad_norm": 0.3668101131916046, + "learning_rate": 9.891512448387092e-05, + "loss": 1.8899, + "step": 312 + }, + { + "epoch": 0.07865803857510838, + "grad_norm": 0.2897646129131317, + "learning_rate": 9.890682581954991e-05, + "loss": 2.4487, + "step": 313 + }, + { + "epoch": 0.07890934221272852, + "grad_norm": 0.31541863083839417, + "learning_rate": 9.889849588686617e-05, + "loss": 1.9136, + "step": 314 + }, + { + "epoch": 0.07916064585034868, + "grad_norm": 0.4142214357852936, + "learning_rate": 9.889013469114539e-05, + "loss": 2.4315, + "step": 315 + }, + { + "epoch": 0.07941194948796884, + "grad_norm": 0.39966338872909546, + "learning_rate": 9.888174223773325e-05, + "loss": 2.0768, + "step": 316 + }, + { + "epoch": 0.07966325312558899, + "grad_norm": 0.3038378357887268, + "learning_rate": 9.887331853199546e-05, + "loss": 2.4825, + "step": 317 + }, + { + "epoch": 0.07991455676320915, + "grad_norm": 0.36689433455467224, + "learning_rate": 9.886486357931767e-05, + "loss": 2.3153, + "step": 318 + }, + { + "epoch": 0.0801658604008293, + "grad_norm": 0.3301517963409424, + "learning_rate": 9.885637738510551e-05, + "loss": 2.4927, + "step": 319 + }, + { + "epoch": 0.08041716403844945, + "grad_norm": 0.26406022906303406, + "learning_rate": 9.884785995478458e-05, + "loss": 2.3793, + "step": 320 + }, + { + "epoch": 0.08066846767606961, + "grad_norm": 0.3765343427658081, + "learning_rate": 9.883931129380049e-05, + "loss": 2.3813, + "step": 321 + }, + { + "epoch": 0.08091977131368977, + "grad_norm": 0.42643749713897705, + "learning_rate": 9.883073140761876e-05, + "loss": 2.444, + "step": 322 + }, + { + "epoch": 0.08117107495130992, + "grad_norm": 0.42273804545402527, + "learning_rate": 9.882212030172493e-05, + "loss": 2.3634, + "step": 323 + }, + { + "epoch": 0.08142237858893007, + "grad_norm": 0.3113279938697815, + "learning_rate": 9.881347798162443e-05, + "loss": 2.236, + "step": 324 + }, + { + "epoch": 0.08167368222655023, + "grad_norm": 0.4435007870197296, + "learning_rate": 9.880480445284274e-05, + "loss": 2.3091, + "step": 325 + }, + { + "epoch": 0.08192498586417038, + "grad_norm": 0.5760218501091003, + "learning_rate": 9.879609972092522e-05, + "loss": 1.7405, + "step": 326 + }, + { + "epoch": 0.08217628950179054, + "grad_norm": 0.3486250638961792, + "learning_rate": 9.878736379143719e-05, + "loss": 1.9632, + "step": 327 + }, + { + "epoch": 0.0824275931394107, + "grad_norm": 0.33690646290779114, + "learning_rate": 9.877859666996395e-05, + "loss": 2.454, + "step": 328 + }, + { + "epoch": 0.08267889677703084, + "grad_norm": 0.4438174068927765, + "learning_rate": 9.876979836211069e-05, + "loss": 2.0577, + "step": 329 + }, + { + "epoch": 0.082930200414651, + "grad_norm": 0.13432446122169495, + "learning_rate": 9.87609688735026e-05, + "loss": 0.8846, + "step": 330 + }, + { + "epoch": 0.08318150405227116, + "grad_norm": 0.4359930753707886, + "learning_rate": 9.875210820978475e-05, + "loss": 2.1561, + "step": 331 + }, + { + "epoch": 0.08343280768989131, + "grad_norm": 0.38827580213546753, + "learning_rate": 9.87432163766222e-05, + "loss": 2.2525, + "step": 332 + }, + { + "epoch": 0.08368411132751147, + "grad_norm": 0.3862955868244171, + "learning_rate": 9.873429337969985e-05, + "loss": 2.3337, + "step": 333 + }, + { + "epoch": 0.08393541496513161, + "grad_norm": 0.3881922960281372, + "learning_rate": 9.872533922472264e-05, + "loss": 2.3624, + "step": 334 + }, + { + "epoch": 0.08418671860275177, + "grad_norm": 0.21210597455501556, + "learning_rate": 9.871635391741533e-05, + "loss": 2.0269, + "step": 335 + }, + { + "epoch": 0.08443802224037193, + "grad_norm": 0.38510724902153015, + "learning_rate": 9.870733746352265e-05, + "loss": 2.1879, + "step": 336 + }, + { + "epoch": 0.08468932587799208, + "grad_norm": 0.31321218609809875, + "learning_rate": 9.869828986880924e-05, + "loss": 1.8977, + "step": 337 + }, + { + "epoch": 0.08494062951561224, + "grad_norm": 0.3835349678993225, + "learning_rate": 9.868921113905961e-05, + "loss": 2.5596, + "step": 338 + }, + { + "epoch": 0.0851919331532324, + "grad_norm": 0.2992005944252014, + "learning_rate": 9.868010128007823e-05, + "loss": 2.3065, + "step": 339 + }, + { + "epoch": 0.08544323679085254, + "grad_norm": 0.31576502323150635, + "learning_rate": 9.867096029768943e-05, + "loss": 2.4326, + "step": 340 + }, + { + "epoch": 0.0856945404284727, + "grad_norm": 0.265114426612854, + "learning_rate": 9.866178819773747e-05, + "loss": 2.4567, + "step": 341 + }, + { + "epoch": 0.08594584406609286, + "grad_norm": 0.40755563974380493, + "learning_rate": 9.86525849860865e-05, + "loss": 2.4134, + "step": 342 + }, + { + "epoch": 0.086197147703713, + "grad_norm": 0.35845133662223816, + "learning_rate": 9.864335066862054e-05, + "loss": 2.1912, + "step": 343 + }, + { + "epoch": 0.08644845134133317, + "grad_norm": 0.2640887498855591, + "learning_rate": 9.863408525124349e-05, + "loss": 2.3818, + "step": 344 + }, + { + "epoch": 0.08669975497895333, + "grad_norm": 0.19019848108291626, + "learning_rate": 9.862478873987919e-05, + "loss": 1.2064, + "step": 345 + }, + { + "epoch": 0.08695105861657347, + "grad_norm": 0.3398876488208771, + "learning_rate": 9.861546114047131e-05, + "loss": 2.513, + "step": 346 + }, + { + "epoch": 0.08720236225419363, + "grad_norm": 0.3897905945777893, + "learning_rate": 9.86061024589834e-05, + "loss": 2.3802, + "step": 347 + }, + { + "epoch": 0.08745366589181379, + "grad_norm": 0.5403574109077454, + "learning_rate": 9.859671270139892e-05, + "loss": 2.492, + "step": 348 + }, + { + "epoch": 0.08770496952943393, + "grad_norm": 0.38538798689842224, + "learning_rate": 9.858729187372114e-05, + "loss": 2.6697, + "step": 349 + }, + { + "epoch": 0.0879562731670541, + "grad_norm": 0.47243911027908325, + "learning_rate": 9.857783998197321e-05, + "loss": 2.5493, + "step": 350 + }, + { + "epoch": 0.08820757680467425, + "grad_norm": 0.24692249298095703, + "learning_rate": 9.85683570321982e-05, + "loss": 1.9117, + "step": 351 + }, + { + "epoch": 0.0884588804422944, + "grad_norm": 0.35984736680984497, + "learning_rate": 9.855884303045897e-05, + "loss": 2.0952, + "step": 352 + }, + { + "epoch": 0.08871018407991456, + "grad_norm": 0.4461250901222229, + "learning_rate": 9.854929798283826e-05, + "loss": 2.1836, + "step": 353 + }, + { + "epoch": 0.08896148771753472, + "grad_norm": 0.4806773066520691, + "learning_rate": 9.853972189543864e-05, + "loss": 2.0937, + "step": 354 + }, + { + "epoch": 0.08921279135515486, + "grad_norm": 0.5709269642829895, + "learning_rate": 9.853011477438254e-05, + "loss": 1.9677, + "step": 355 + }, + { + "epoch": 0.08946409499277502, + "grad_norm": 0.3929988443851471, + "learning_rate": 9.852047662581225e-05, + "loss": 2.5909, + "step": 356 + }, + { + "epoch": 0.08971539863039517, + "grad_norm": 0.33998608589172363, + "learning_rate": 9.851080745588987e-05, + "loss": 2.0388, + "step": 357 + }, + { + "epoch": 0.08996670226801533, + "grad_norm": 0.4756825268268585, + "learning_rate": 9.850110727079735e-05, + "loss": 2.1712, + "step": 358 + }, + { + "epoch": 0.09021800590563549, + "grad_norm": 0.3752896785736084, + "learning_rate": 9.849137607673643e-05, + "loss": 2.4498, + "step": 359 + }, + { + "epoch": 0.09046930954325563, + "grad_norm": 0.4232666790485382, + "learning_rate": 9.848161387992874e-05, + "loss": 2.0123, + "step": 360 + }, + { + "epoch": 0.09072061318087579, + "grad_norm": 0.4741215705871582, + "learning_rate": 9.847182068661567e-05, + "loss": 2.0118, + "step": 361 + }, + { + "epoch": 0.09097191681849595, + "grad_norm": 0.41064271330833435, + "learning_rate": 9.846199650305846e-05, + "loss": 2.2685, + "step": 362 + }, + { + "epoch": 0.0912232204561161, + "grad_norm": 0.31049537658691406, + "learning_rate": 9.845214133553817e-05, + "loss": 1.9623, + "step": 363 + }, + { + "epoch": 0.09147452409373626, + "grad_norm": 0.2547619342803955, + "learning_rate": 9.844225519035565e-05, + "loss": 1.8739, + "step": 364 + }, + { + "epoch": 0.09172582773135642, + "grad_norm": 0.26991865038871765, + "learning_rate": 9.843233807383159e-05, + "loss": 2.2907, + "step": 365 + }, + { + "epoch": 0.09197713136897656, + "grad_norm": 0.3443757891654968, + "learning_rate": 9.84223899923064e-05, + "loss": 2.1534, + "step": 366 + }, + { + "epoch": 0.09222843500659672, + "grad_norm": 0.2624412477016449, + "learning_rate": 9.841241095214038e-05, + "loss": 2.4799, + "step": 367 + }, + { + "epoch": 0.09247973864421688, + "grad_norm": 0.34518447518348694, + "learning_rate": 9.840240095971358e-05, + "loss": 2.2512, + "step": 368 + }, + { + "epoch": 0.09273104228183703, + "grad_norm": 0.4448896646499634, + "learning_rate": 9.839236002142584e-05, + "loss": 2.3634, + "step": 369 + }, + { + "epoch": 0.09298234591945718, + "grad_norm": 0.1713920682668686, + "learning_rate": 9.83822881436968e-05, + "loss": 1.9883, + "step": 370 + }, + { + "epoch": 0.09323364955707734, + "grad_norm": 0.26538336277008057, + "learning_rate": 9.837218533296587e-05, + "loss": 2.3259, + "step": 371 + }, + { + "epoch": 0.09348495319469749, + "grad_norm": 0.3171433210372925, + "learning_rate": 9.83620515956922e-05, + "loss": 2.3368, + "step": 372 + }, + { + "epoch": 0.09373625683231765, + "grad_norm": 0.47525712847709656, + "learning_rate": 9.83518869383548e-05, + "loss": 2.0206, + "step": 373 + }, + { + "epoch": 0.09398756046993781, + "grad_norm": 0.6884750723838806, + "learning_rate": 9.834169136745237e-05, + "loss": 2.2423, + "step": 374 + }, + { + "epoch": 0.09423886410755795, + "grad_norm": 0.3786754012107849, + "learning_rate": 9.833146488950342e-05, + "loss": 2.5151, + "step": 375 + }, + { + "epoch": 0.09449016774517811, + "grad_norm": 0.15857549011707306, + "learning_rate": 9.832120751104617e-05, + "loss": 1.4356, + "step": 376 + }, + { + "epoch": 0.09474147138279826, + "grad_norm": 0.6414403915405273, + "learning_rate": 9.831091923863868e-05, + "loss": 2.2734, + "step": 377 + }, + { + "epoch": 0.09499277502041842, + "grad_norm": 0.25082263350486755, + "learning_rate": 9.830060007885868e-05, + "loss": 2.8101, + "step": 378 + }, + { + "epoch": 0.09524407865803858, + "grad_norm": 0.29801589250564575, + "learning_rate": 9.829025003830368e-05, + "loss": 1.8527, + "step": 379 + }, + { + "epoch": 0.09549538229565872, + "grad_norm": 0.293905109167099, + "learning_rate": 9.827986912359094e-05, + "loss": 2.1692, + "step": 380 + }, + { + "epoch": 0.09574668593327888, + "grad_norm": 0.5134365558624268, + "learning_rate": 9.826945734135744e-05, + "loss": 2.1052, + "step": 381 + }, + { + "epoch": 0.09599798957089904, + "grad_norm": 0.24011516571044922, + "learning_rate": 9.825901469825994e-05, + "loss": 1.8188, + "step": 382 + }, + { + "epoch": 0.09624929320851919, + "grad_norm": 0.39140889048576355, + "learning_rate": 9.824854120097485e-05, + "loss": 2.1372, + "step": 383 + }, + { + "epoch": 0.09650059684613935, + "grad_norm": 0.35121777653694153, + "learning_rate": 9.82380368561984e-05, + "loss": 2.2186, + "step": 384 + }, + { + "epoch": 0.0967519004837595, + "grad_norm": 0.3034665286540985, + "learning_rate": 9.822750167064645e-05, + "loss": 2.2167, + "step": 385 + }, + { + "epoch": 0.09700320412137965, + "grad_norm": 0.18376107513904572, + "learning_rate": 9.821693565105465e-05, + "loss": 1.573, + "step": 386 + }, + { + "epoch": 0.09725450775899981, + "grad_norm": 0.18543782830238342, + "learning_rate": 9.820633880417836e-05, + "loss": 2.0766, + "step": 387 + }, + { + "epoch": 0.09750581139661997, + "grad_norm": 0.3020473122596741, + "learning_rate": 9.819571113679258e-05, + "loss": 2.4098, + "step": 388 + }, + { + "epoch": 0.09775711503424012, + "grad_norm": 0.43857908248901367, + "learning_rate": 9.818505265569209e-05, + "loss": 2.3636, + "step": 389 + }, + { + "epoch": 0.09800841867186028, + "grad_norm": 0.31539830565452576, + "learning_rate": 9.817436336769135e-05, + "loss": 2.3027, + "step": 390 + }, + { + "epoch": 0.09825972230948044, + "grad_norm": 0.38662660121917725, + "learning_rate": 9.816364327962449e-05, + "loss": 2.7305, + "step": 391 + }, + { + "epoch": 0.09851102594710058, + "grad_norm": 0.6164030432701111, + "learning_rate": 9.815289239834536e-05, + "loss": 1.8858, + "step": 392 + }, + { + "epoch": 0.09876232958472074, + "grad_norm": 0.2521904408931732, + "learning_rate": 9.814211073072748e-05, + "loss": 1.9274, + "step": 393 + }, + { + "epoch": 0.0990136332223409, + "grad_norm": 0.35303381085395813, + "learning_rate": 9.813129828366407e-05, + "loss": 2.1945, + "step": 394 + }, + { + "epoch": 0.09926493685996104, + "grad_norm": 0.276737242937088, + "learning_rate": 9.812045506406803e-05, + "loss": 2.2649, + "step": 395 + }, + { + "epoch": 0.0995162404975812, + "grad_norm": 0.45490264892578125, + "learning_rate": 9.81095810788719e-05, + "loss": 2.37, + "step": 396 + }, + { + "epoch": 0.09976754413520136, + "grad_norm": 0.5738433599472046, + "learning_rate": 9.809867633502794e-05, + "loss": 2.1393, + "step": 397 + }, + { + "epoch": 0.10001884777282151, + "grad_norm": 0.2685263454914093, + "learning_rate": 9.808774083950802e-05, + "loss": 2.6325, + "step": 398 + }, + { + "epoch": 0.10027015141044167, + "grad_norm": 0.41066989302635193, + "learning_rate": 9.807677459930374e-05, + "loss": 2.1897, + "step": 399 + }, + { + "epoch": 0.10052145504806181, + "grad_norm": 0.41453301906585693, + "learning_rate": 9.806577762142628e-05, + "loss": 2.243, + "step": 400 + }, + { + "epoch": 0.10077275868568197, + "grad_norm": 0.39212143421173096, + "learning_rate": 9.805474991290652e-05, + "loss": 2.502, + "step": 401 + }, + { + "epoch": 0.10102406232330213, + "grad_norm": 0.23721270263195038, + "learning_rate": 9.804369148079498e-05, + "loss": 1.7259, + "step": 402 + }, + { + "epoch": 0.10127536596092228, + "grad_norm": 0.22118382155895233, + "learning_rate": 9.803260233216184e-05, + "loss": 1.1238, + "step": 403 + }, + { + "epoch": 0.10152666959854244, + "grad_norm": 0.386078804731369, + "learning_rate": 9.802148247409686e-05, + "loss": 2.1155, + "step": 404 + }, + { + "epoch": 0.1017779732361626, + "grad_norm": 0.3332570195198059, + "learning_rate": 9.80103319137095e-05, + "loss": 2.5907, + "step": 405 + }, + { + "epoch": 0.10202927687378274, + "grad_norm": 0.20631489157676697, + "learning_rate": 9.799915065812882e-05, + "loss": 2.3401, + "step": 406 + }, + { + "epoch": 0.1022805805114029, + "grad_norm": 0.529591977596283, + "learning_rate": 9.798793871450346e-05, + "loss": 2.3916, + "step": 407 + }, + { + "epoch": 0.10253188414902306, + "grad_norm": 0.32578417658805847, + "learning_rate": 9.79766960900018e-05, + "loss": 2.3661, + "step": 408 + }, + { + "epoch": 0.10278318778664321, + "grad_norm": 0.4134072959423065, + "learning_rate": 9.796542279181172e-05, + "loss": 2.0656, + "step": 409 + }, + { + "epoch": 0.10303449142426337, + "grad_norm": 0.3845951557159424, + "learning_rate": 9.795411882714076e-05, + "loss": 2.3176, + "step": 410 + }, + { + "epoch": 0.10328579506188353, + "grad_norm": 0.6262491345405579, + "learning_rate": 9.794278420321605e-05, + "loss": 2.42, + "step": 411 + }, + { + "epoch": 0.10353709869950367, + "grad_norm": 0.4291568696498871, + "learning_rate": 9.793141892728436e-05, + "loss": 1.9455, + "step": 412 + }, + { + "epoch": 0.10378840233712383, + "grad_norm": 0.24446240067481995, + "learning_rate": 9.792002300661201e-05, + "loss": 2.6102, + "step": 413 + }, + { + "epoch": 0.10403970597474399, + "grad_norm": 0.4551761746406555, + "learning_rate": 9.79085964484849e-05, + "loss": 2.0785, + "step": 414 + }, + { + "epoch": 0.10429100961236414, + "grad_norm": 0.384036123752594, + "learning_rate": 9.789713926020863e-05, + "loss": 2.3401, + "step": 415 + }, + { + "epoch": 0.1045423132499843, + "grad_norm": 0.2475469559431076, + "learning_rate": 9.788565144910822e-05, + "loss": 2.2387, + "step": 416 + }, + { + "epoch": 0.10479361688760445, + "grad_norm": 0.269940048456192, + "learning_rate": 9.78741330225284e-05, + "loss": 2.2894, + "step": 417 + }, + { + "epoch": 0.1050449205252246, + "grad_norm": 0.27409201860427856, + "learning_rate": 9.786258398783341e-05, + "loss": 2.2478, + "step": 418 + }, + { + "epoch": 0.10529622416284476, + "grad_norm": 0.28689828515052795, + "learning_rate": 9.785100435240706e-05, + "loss": 1.9406, + "step": 419 + }, + { + "epoch": 0.10554752780046492, + "grad_norm": 0.5282906293869019, + "learning_rate": 9.783939412365278e-05, + "loss": 2.1423, + "step": 420 + }, + { + "epoch": 0.10579883143808506, + "grad_norm": 0.2722564935684204, + "learning_rate": 9.782775330899347e-05, + "loss": 2.3398, + "step": 421 + }, + { + "epoch": 0.10605013507570522, + "grad_norm": 0.22443710267543793, + "learning_rate": 9.781608191587166e-05, + "loss": 2.2233, + "step": 422 + }, + { + "epoch": 0.10630143871332537, + "grad_norm": 1.7654945850372314, + "learning_rate": 9.78043799517494e-05, + "loss": 2.2441, + "step": 423 + }, + { + "epoch": 0.10655274235094553, + "grad_norm": 0.272491455078125, + "learning_rate": 9.779264742410829e-05, + "loss": 2.4267, + "step": 424 + }, + { + "epoch": 0.10680404598856569, + "grad_norm": 0.4860396683216095, + "learning_rate": 9.778088434044945e-05, + "loss": 2.3139, + "step": 425 + }, + { + "epoch": 0.10705534962618583, + "grad_norm": 0.5187298059463501, + "learning_rate": 9.77690907082936e-05, + "loss": 2.0393, + "step": 426 + }, + { + "epoch": 0.10730665326380599, + "grad_norm": 0.25615808367729187, + "learning_rate": 9.775726653518091e-05, + "loss": 2.4811, + "step": 427 + }, + { + "epoch": 0.10755795690142615, + "grad_norm": 0.26843705773353577, + "learning_rate": 9.774541182867112e-05, + "loss": 2.0025, + "step": 428 + }, + { + "epoch": 0.1078092605390463, + "grad_norm": 0.42573508620262146, + "learning_rate": 9.773352659634348e-05, + "loss": 2.3841, + "step": 429 + }, + { + "epoch": 0.10806056417666646, + "grad_norm": 0.34817081689834595, + "learning_rate": 9.772161084579679e-05, + "loss": 2.4695, + "step": 430 + }, + { + "epoch": 0.10831186781428662, + "grad_norm": 0.3741927742958069, + "learning_rate": 9.770966458464927e-05, + "loss": 2.099, + "step": 431 + }, + { + "epoch": 0.10856317145190676, + "grad_norm": 0.4013387858867645, + "learning_rate": 9.769768782053879e-05, + "loss": 2.2978, + "step": 432 + }, + { + "epoch": 0.10881447508952692, + "grad_norm": 0.4488285779953003, + "learning_rate": 9.768568056112258e-05, + "loss": 2.064, + "step": 433 + }, + { + "epoch": 0.10906577872714708, + "grad_norm": 0.43745100498199463, + "learning_rate": 9.767364281407745e-05, + "loss": 2.0517, + "step": 434 + }, + { + "epoch": 0.10931708236476723, + "grad_norm": 0.2905375361442566, + "learning_rate": 9.766157458709967e-05, + "loss": 2.43, + "step": 435 + }, + { + "epoch": 0.10956838600238739, + "grad_norm": 0.3817865252494812, + "learning_rate": 9.764947588790502e-05, + "loss": 2.439, + "step": 436 + }, + { + "epoch": 0.10981968964000755, + "grad_norm": 0.44527363777160645, + "learning_rate": 9.763734672422876e-05, + "loss": 2.4307, + "step": 437 + }, + { + "epoch": 0.11007099327762769, + "grad_norm": 0.5595135688781738, + "learning_rate": 9.76251871038256e-05, + "loss": 2.1648, + "step": 438 + }, + { + "epoch": 0.11032229691524785, + "grad_norm": 0.4044279456138611, + "learning_rate": 9.761299703446973e-05, + "loss": 2.3435, + "step": 439 + }, + { + "epoch": 0.11057360055286801, + "grad_norm": 0.1558169573545456, + "learning_rate": 9.760077652395483e-05, + "loss": 0.9334, + "step": 440 + }, + { + "epoch": 0.11082490419048815, + "grad_norm": 0.42228519916534424, + "learning_rate": 9.758852558009404e-05, + "loss": 2.4764, + "step": 441 + }, + { + "epoch": 0.11107620782810831, + "grad_norm": 0.18882694840431213, + "learning_rate": 9.757624421071993e-05, + "loss": 1.3726, + "step": 442 + }, + { + "epoch": 0.11132751146572846, + "grad_norm": 0.37905827164649963, + "learning_rate": 9.756393242368453e-05, + "loss": 1.8814, + "step": 443 + }, + { + "epoch": 0.11157881510334862, + "grad_norm": 0.347260445356369, + "learning_rate": 9.755159022685936e-05, + "loss": 2.6184, + "step": 444 + }, + { + "epoch": 0.11183011874096878, + "grad_norm": 0.4600488543510437, + "learning_rate": 9.753921762813534e-05, + "loss": 2.2642, + "step": 445 + }, + { + "epoch": 0.11208142237858892, + "grad_norm": 0.3841269612312317, + "learning_rate": 9.75268146354228e-05, + "loss": 2.2473, + "step": 446 + }, + { + "epoch": 0.11233272601620908, + "grad_norm": 0.4765447676181793, + "learning_rate": 9.751438125665158e-05, + "loss": 2.3474, + "step": 447 + }, + { + "epoch": 0.11258402965382924, + "grad_norm": 0.3766055405139923, + "learning_rate": 9.750191749977089e-05, + "loss": 1.809, + "step": 448 + }, + { + "epoch": 0.11283533329144939, + "grad_norm": 0.3452647626399994, + "learning_rate": 9.748942337274938e-05, + "loss": 2.5878, + "step": 449 + }, + { + "epoch": 0.11308663692906955, + "grad_norm": 0.21866516768932343, + "learning_rate": 9.747689888357509e-05, + "loss": 0.992, + "step": 450 + }, + { + "epoch": 0.11333794056668971, + "grad_norm": 0.36812421679496765, + "learning_rate": 9.746434404025555e-05, + "loss": 2.0005, + "step": 451 + }, + { + "epoch": 0.11358924420430985, + "grad_norm": 0.44335392117500305, + "learning_rate": 9.74517588508176e-05, + "loss": 2.4726, + "step": 452 + }, + { + "epoch": 0.11384054784193001, + "grad_norm": 0.448779433965683, + "learning_rate": 9.743914332330754e-05, + "loss": 2.2657, + "step": 453 + }, + { + "epoch": 0.11409185147955017, + "grad_norm": 0.5500572323799133, + "learning_rate": 9.742649746579105e-05, + "loss": 2.4726, + "step": 454 + }, + { + "epoch": 0.11434315511717032, + "grad_norm": 0.47495442628860474, + "learning_rate": 9.741382128635321e-05, + "loss": 2.4044, + "step": 455 + }, + { + "epoch": 0.11459445875479048, + "grad_norm": 0.4240530729293823, + "learning_rate": 9.740111479309847e-05, + "loss": 2.6976, + "step": 456 + }, + { + "epoch": 0.11484576239241064, + "grad_norm": 0.48315781354904175, + "learning_rate": 9.738837799415067e-05, + "loss": 2.4035, + "step": 457 + }, + { + "epoch": 0.11509706603003078, + "grad_norm": 0.28931689262390137, + "learning_rate": 9.737561089765303e-05, + "loss": 1.9762, + "step": 458 + }, + { + "epoch": 0.11534836966765094, + "grad_norm": 0.2892288267612457, + "learning_rate": 9.736281351176813e-05, + "loss": 2.1718, + "step": 459 + }, + { + "epoch": 0.1155996733052711, + "grad_norm": 0.4304305911064148, + "learning_rate": 9.734998584467794e-05, + "loss": 2.2799, + "step": 460 + }, + { + "epoch": 0.11585097694289125, + "grad_norm": 0.2887556552886963, + "learning_rate": 9.733712790458375e-05, + "loss": 2.4617, + "step": 461 + }, + { + "epoch": 0.1161022805805114, + "grad_norm": 0.37903836369514465, + "learning_rate": 9.732423969970626e-05, + "loss": 2.4154, + "step": 462 + }, + { + "epoch": 0.11635358421813156, + "grad_norm": 0.34230297803878784, + "learning_rate": 9.731132123828543e-05, + "loss": 1.9664, + "step": 463 + }, + { + "epoch": 0.11660488785575171, + "grad_norm": 0.4099084138870239, + "learning_rate": 9.729837252858067e-05, + "loss": 2.473, + "step": 464 + }, + { + "epoch": 0.11685619149337187, + "grad_norm": 0.4079136252403259, + "learning_rate": 9.728539357887068e-05, + "loss": 1.8914, + "step": 465 + }, + { + "epoch": 0.11710749513099201, + "grad_norm": 0.2663559317588806, + "learning_rate": 9.727238439745346e-05, + "loss": 2.1168, + "step": 466 + }, + { + "epoch": 0.11735879876861217, + "grad_norm": 0.35732075572013855, + "learning_rate": 9.72593449926464e-05, + "loss": 2.3938, + "step": 467 + }, + { + "epoch": 0.11761010240623233, + "grad_norm": 0.26939693093299866, + "learning_rate": 9.724627537278616e-05, + "loss": 1.9446, + "step": 468 + }, + { + "epoch": 0.11786140604385248, + "grad_norm": 0.3639311194419861, + "learning_rate": 9.72331755462288e-05, + "loss": 2.8929, + "step": 469 + }, + { + "epoch": 0.11811270968147264, + "grad_norm": 0.26295900344848633, + "learning_rate": 9.722004552134956e-05, + "loss": 2.3815, + "step": 470 + }, + { + "epoch": 0.1183640133190928, + "grad_norm": 0.4366108179092407, + "learning_rate": 9.720688530654311e-05, + "loss": 2.1099, + "step": 471 + }, + { + "epoch": 0.11861531695671294, + "grad_norm": 0.5005189776420593, + "learning_rate": 9.719369491022339e-05, + "loss": 2.7562, + "step": 472 + }, + { + "epoch": 0.1188666205943331, + "grad_norm": 0.3057880401611328, + "learning_rate": 9.718047434082357e-05, + "loss": 2.5181, + "step": 473 + }, + { + "epoch": 0.11911792423195326, + "grad_norm": 0.2341679483652115, + "learning_rate": 9.716722360679619e-05, + "loss": 1.2066, + "step": 474 + }, + { + "epoch": 0.11936922786957341, + "grad_norm": 0.27327990531921387, + "learning_rate": 9.715394271661306e-05, + "loss": 2.3469, + "step": 475 + }, + { + "epoch": 0.11962053150719357, + "grad_norm": 0.39172980189323425, + "learning_rate": 9.714063167876527e-05, + "loss": 2.2347, + "step": 476 + }, + { + "epoch": 0.11987183514481373, + "grad_norm": 0.49665653705596924, + "learning_rate": 9.71272905017631e-05, + "loss": 1.954, + "step": 477 + }, + { + "epoch": 0.12012313878243387, + "grad_norm": 0.2704184949398041, + "learning_rate": 9.711391919413626e-05, + "loss": 2.3236, + "step": 478 + }, + { + "epoch": 0.12037444242005403, + "grad_norm": 0.17571642994880676, + "learning_rate": 9.710051776443358e-05, + "loss": 1.277, + "step": 479 + }, + { + "epoch": 0.12062574605767419, + "grad_norm": 0.2642328143119812, + "learning_rate": 9.708708622122322e-05, + "loss": 2.4537, + "step": 480 + }, + { + "epoch": 0.12087704969529434, + "grad_norm": 0.21277543902397156, + "learning_rate": 9.707362457309261e-05, + "loss": 1.3385, + "step": 481 + }, + { + "epoch": 0.1211283533329145, + "grad_norm": 0.3256551921367645, + "learning_rate": 9.706013282864834e-05, + "loss": 2.1567, + "step": 482 + }, + { + "epoch": 0.12137965697053466, + "grad_norm": 0.4194876551628113, + "learning_rate": 9.704661099651633e-05, + "loss": 2.2617, + "step": 483 + }, + { + "epoch": 0.1216309606081548, + "grad_norm": 0.13989388942718506, + "learning_rate": 9.70330590853417e-05, + "loss": 0.8074, + "step": 484 + }, + { + "epoch": 0.12188226424577496, + "grad_norm": 0.2916125953197479, + "learning_rate": 9.701947710378881e-05, + "loss": 2.0217, + "step": 485 + }, + { + "epoch": 0.1221335678833951, + "grad_norm": 0.17038998007774353, + "learning_rate": 9.700586506054121e-05, + "loss": 0.8153, + "step": 486 + }, + { + "epoch": 0.12238487152101526, + "grad_norm": 0.39059990644454956, + "learning_rate": 9.699222296430172e-05, + "loss": 2.161, + "step": 487 + }, + { + "epoch": 0.12263617515863542, + "grad_norm": 0.5335647463798523, + "learning_rate": 9.697855082379239e-05, + "loss": 2.2604, + "step": 488 + }, + { + "epoch": 0.12288747879625557, + "grad_norm": 0.4468785524368286, + "learning_rate": 9.696484864775437e-05, + "loss": 2.3022, + "step": 489 + }, + { + "epoch": 0.12313878243387573, + "grad_norm": 0.38395336270332336, + "learning_rate": 9.695111644494814e-05, + "loss": 2.2467, + "step": 490 + }, + { + "epoch": 0.12339008607149589, + "grad_norm": 0.35352465510368347, + "learning_rate": 9.693735422415332e-05, + "loss": 2.7816, + "step": 491 + }, + { + "epoch": 0.12364138970911603, + "grad_norm": 0.5462369322776794, + "learning_rate": 9.692356199416868e-05, + "loss": 2.2723, + "step": 492 + }, + { + "epoch": 0.1238926933467362, + "grad_norm": 0.24535368382930756, + "learning_rate": 9.690973976381228e-05, + "loss": 2.0469, + "step": 493 + }, + { + "epoch": 0.12414399698435635, + "grad_norm": 0.3857629597187042, + "learning_rate": 9.689588754192126e-05, + "loss": 2.1396, + "step": 494 + }, + { + "epoch": 0.1243953006219765, + "grad_norm": 0.4369061291217804, + "learning_rate": 9.688200533735199e-05, + "loss": 2.3099, + "step": 495 + }, + { + "epoch": 0.12464660425959666, + "grad_norm": 0.22359801828861237, + "learning_rate": 9.686809315898e-05, + "loss": 2.113, + "step": 496 + }, + { + "epoch": 0.12489790789721682, + "grad_norm": 0.3624935746192932, + "learning_rate": 9.685415101569999e-05, + "loss": 2.2168, + "step": 497 + }, + { + "epoch": 0.12514921153483696, + "grad_norm": 0.4480370581150055, + "learning_rate": 9.684017891642578e-05, + "loss": 2.5805, + "step": 498 + }, + { + "epoch": 0.12540051517245712, + "grad_norm": 0.47670865058898926, + "learning_rate": 9.682617687009039e-05, + "loss": 1.5993, + "step": 499 + }, + { + "epoch": 0.12565181881007728, + "grad_norm": 0.37690502405166626, + "learning_rate": 9.681214488564596e-05, + "loss": 2.125, + "step": 500 + }, + { + "epoch": 0.12590312244769744, + "grad_norm": 0.29597729444503784, + "learning_rate": 9.679808297206377e-05, + "loss": 2.6068, + "step": 501 + }, + { + "epoch": 0.12615442608531757, + "grad_norm": 0.33130696415901184, + "learning_rate": 9.678399113833425e-05, + "loss": 2.2649, + "step": 502 + }, + { + "epoch": 0.12640572972293773, + "grad_norm": 0.4030790627002716, + "learning_rate": 9.676986939346696e-05, + "loss": 1.8589, + "step": 503 + }, + { + "epoch": 0.1266570333605579, + "grad_norm": 0.37136292457580566, + "learning_rate": 9.675571774649057e-05, + "loss": 2.1863, + "step": 504 + }, + { + "epoch": 0.12690833699817805, + "grad_norm": 0.439748615026474, + "learning_rate": 9.674153620645287e-05, + "loss": 2.6519, + "step": 505 + }, + { + "epoch": 0.1271596406357982, + "grad_norm": 0.3839961588382721, + "learning_rate": 9.672732478242075e-05, + "loss": 2.4758, + "step": 506 + }, + { + "epoch": 0.12741094427341837, + "grad_norm": 0.2992021441459656, + "learning_rate": 9.671308348348025e-05, + "loss": 2.4348, + "step": 507 + }, + { + "epoch": 0.1276622479110385, + "grad_norm": 0.3640328645706177, + "learning_rate": 9.669881231873646e-05, + "loss": 2.6048, + "step": 508 + }, + { + "epoch": 0.12791355154865866, + "grad_norm": 0.27911099791526794, + "learning_rate": 9.66845112973136e-05, + "loss": 2.4449, + "step": 509 + }, + { + "epoch": 0.12816485518627882, + "grad_norm": 0.4001411199569702, + "learning_rate": 9.667018042835496e-05, + "loss": 3.0271, + "step": 510 + }, + { + "epoch": 0.12841615882389898, + "grad_norm": 0.4469778537750244, + "learning_rate": 9.665581972102291e-05, + "loss": 2.192, + "step": 511 + }, + { + "epoch": 0.12866746246151914, + "grad_norm": 0.4696493446826935, + "learning_rate": 9.66414291844989e-05, + "loss": 2.0927, + "step": 512 + }, + { + "epoch": 0.1289187660991393, + "grad_norm": 0.485324889421463, + "learning_rate": 9.662700882798348e-05, + "loss": 1.8437, + "step": 513 + }, + { + "epoch": 0.12917006973675943, + "grad_norm": 0.2740216851234436, + "learning_rate": 9.661255866069622e-05, + "loss": 2.0895, + "step": 514 + }, + { + "epoch": 0.1294213733743796, + "grad_norm": 0.48760735988616943, + "learning_rate": 9.659807869187578e-05, + "loss": 2.3416, + "step": 515 + }, + { + "epoch": 0.12967267701199975, + "grad_norm": 0.14928022027015686, + "learning_rate": 9.658356893077987e-05, + "loss": 1.1877, + "step": 516 + }, + { + "epoch": 0.1299239806496199, + "grad_norm": 0.46163851022720337, + "learning_rate": 9.656902938668524e-05, + "loss": 2.6305, + "step": 517 + }, + { + "epoch": 0.13017528428724007, + "grad_norm": 0.30622944235801697, + "learning_rate": 9.655446006888766e-05, + "loss": 1.5633, + "step": 518 + }, + { + "epoch": 0.1304265879248602, + "grad_norm": 0.30690157413482666, + "learning_rate": 9.653986098670198e-05, + "loss": 2.5689, + "step": 519 + }, + { + "epoch": 0.13067789156248036, + "grad_norm": 0.3105219602584839, + "learning_rate": 9.652523214946205e-05, + "loss": 2.564, + "step": 520 + }, + { + "epoch": 0.13092919520010052, + "grad_norm": 0.2955935299396515, + "learning_rate": 9.651057356652077e-05, + "loss": 2.5073, + "step": 521 + }, + { + "epoch": 0.13118049883772068, + "grad_norm": 0.19297459721565247, + "learning_rate": 9.649588524725002e-05, + "loss": 1.8412, + "step": 522 + }, + { + "epoch": 0.13143180247534084, + "grad_norm": 0.3480035066604614, + "learning_rate": 9.64811672010407e-05, + "loss": 2.5528, + "step": 523 + }, + { + "epoch": 0.131683106112961, + "grad_norm": 0.15509484708309174, + "learning_rate": 9.646641943730277e-05, + "loss": 0.7748, + "step": 524 + }, + { + "epoch": 0.13193440975058113, + "grad_norm": 0.33523187041282654, + "learning_rate": 9.645164196546512e-05, + "loss": 1.939, + "step": 525 + }, + { + "epoch": 0.1321857133882013, + "grad_norm": 0.3940199613571167, + "learning_rate": 9.643683479497567e-05, + "loss": 2.3232, + "step": 526 + }, + { + "epoch": 0.13243701702582145, + "grad_norm": 0.3470746576786041, + "learning_rate": 9.64219979353013e-05, + "loss": 1.933, + "step": 527 + }, + { + "epoch": 0.1326883206634416, + "grad_norm": 0.3262689709663391, + "learning_rate": 9.640713139592792e-05, + "loss": 2.3454, + "step": 528 + }, + { + "epoch": 0.13293962430106177, + "grad_norm": 0.5404649972915649, + "learning_rate": 9.639223518636036e-05, + "loss": 2.4712, + "step": 529 + }, + { + "epoch": 0.13319092793868192, + "grad_norm": 0.30311527848243713, + "learning_rate": 9.637730931612245e-05, + "loss": 2.0744, + "step": 530 + }, + { + "epoch": 0.13344223157630206, + "grad_norm": 0.9992802739143372, + "learning_rate": 9.6362353794757e-05, + "loss": 2.1378, + "step": 531 + }, + { + "epoch": 0.13369353521392222, + "grad_norm": 0.2992432415485382, + "learning_rate": 9.634736863182574e-05, + "loss": 2.4675, + "step": 532 + }, + { + "epoch": 0.13394483885154237, + "grad_norm": 0.3760550618171692, + "learning_rate": 9.633235383690937e-05, + "loss": 2.3762, + "step": 533 + }, + { + "epoch": 0.13419614248916253, + "grad_norm": 0.3664836287498474, + "learning_rate": 9.631730941960752e-05, + "loss": 2.2417, + "step": 534 + }, + { + "epoch": 0.1344474461267827, + "grad_norm": 0.4195888042449951, + "learning_rate": 9.630223538953881e-05, + "loss": 1.9261, + "step": 535 + }, + { + "epoch": 0.13469874976440285, + "grad_norm": 0.39428257942199707, + "learning_rate": 9.628713175634072e-05, + "loss": 2.2189, + "step": 536 + }, + { + "epoch": 0.13495005340202298, + "grad_norm": 0.3145690858364105, + "learning_rate": 9.627199852966969e-05, + "loss": 2.1252, + "step": 537 + }, + { + "epoch": 0.13520135703964314, + "grad_norm": 0.41197821497917175, + "learning_rate": 9.625683571920108e-05, + "loss": 2.0061, + "step": 538 + }, + { + "epoch": 0.1354526606772633, + "grad_norm": 0.4640690088272095, + "learning_rate": 9.62416433346292e-05, + "loss": 1.9037, + "step": 539 + }, + { + "epoch": 0.13570396431488346, + "grad_norm": 0.39173510670661926, + "learning_rate": 9.62264213856672e-05, + "loss": 2.1323, + "step": 540 + }, + { + "epoch": 0.13595526795250362, + "grad_norm": 0.33818063139915466, + "learning_rate": 9.62111698820472e-05, + "loss": 2.4662, + "step": 541 + }, + { + "epoch": 0.13620657159012375, + "grad_norm": 0.3888367712497711, + "learning_rate": 9.619588883352011e-05, + "loss": 1.987, + "step": 542 + }, + { + "epoch": 0.1364578752277439, + "grad_norm": 0.49469324946403503, + "learning_rate": 9.61805782498559e-05, + "loss": 2.075, + "step": 543 + }, + { + "epoch": 0.13670917886536407, + "grad_norm": 0.3214045464992523, + "learning_rate": 9.616523814084324e-05, + "loss": 2.6687, + "step": 544 + }, + { + "epoch": 0.13696048250298423, + "grad_norm": 0.3943631649017334, + "learning_rate": 9.61498685162898e-05, + "loss": 1.9817, + "step": 545 + }, + { + "epoch": 0.1372117861406044, + "grad_norm": 0.46393269300460815, + "learning_rate": 9.613446938602209e-05, + "loss": 2.2147, + "step": 546 + }, + { + "epoch": 0.13746308977822455, + "grad_norm": 0.3647920489311218, + "learning_rate": 9.611904075988544e-05, + "loss": 1.9163, + "step": 547 + }, + { + "epoch": 0.13771439341584468, + "grad_norm": 0.3861480951309204, + "learning_rate": 9.610358264774411e-05, + "loss": 2.7924, + "step": 548 + }, + { + "epoch": 0.13796569705346484, + "grad_norm": 0.4985499083995819, + "learning_rate": 9.608809505948114e-05, + "loss": 2.0298, + "step": 549 + }, + { + "epoch": 0.138217000691085, + "grad_norm": 0.4473382532596588, + "learning_rate": 9.607257800499849e-05, + "loss": 2.0965, + "step": 550 + }, + { + "epoch": 0.13846830432870516, + "grad_norm": 0.4468131959438324, + "learning_rate": 9.60570314942169e-05, + "loss": 2.311, + "step": 551 + }, + { + "epoch": 0.13871960796632532, + "grad_norm": 0.1555897295475006, + "learning_rate": 9.604145553707595e-05, + "loss": 0.8849, + "step": 552 + }, + { + "epoch": 0.13897091160394548, + "grad_norm": 0.17197562754154205, + "learning_rate": 9.602585014353409e-05, + "loss": 1.0516, + "step": 553 + }, + { + "epoch": 0.1392222152415656, + "grad_norm": 0.22020426392555237, + "learning_rate": 9.601021532356854e-05, + "loss": 1.9116, + "step": 554 + }, + { + "epoch": 0.13947351887918577, + "grad_norm": 0.33761653304100037, + "learning_rate": 9.599455108717535e-05, + "loss": 2.3363, + "step": 555 + }, + { + "epoch": 0.13972482251680593, + "grad_norm": 0.5171981453895569, + "learning_rate": 9.59788574443694e-05, + "loss": 2.0701, + "step": 556 + }, + { + "epoch": 0.1399761261544261, + "grad_norm": 0.22373630106449127, + "learning_rate": 9.596313440518432e-05, + "loss": 1.0946, + "step": 557 + }, + { + "epoch": 0.14022742979204625, + "grad_norm": 0.5129598379135132, + "learning_rate": 9.594738197967259e-05, + "loss": 2.3418, + "step": 558 + }, + { + "epoch": 0.1404787334296664, + "grad_norm": 0.4349968433380127, + "learning_rate": 9.593160017790546e-05, + "loss": 2.0397, + "step": 559 + }, + { + "epoch": 0.14073003706728654, + "grad_norm": 0.39511239528656006, + "learning_rate": 9.591578900997292e-05, + "loss": 2.2823, + "step": 560 + }, + { + "epoch": 0.1409813407049067, + "grad_norm": 0.25913137197494507, + "learning_rate": 9.58999484859838e-05, + "loss": 2.3511, + "step": 561 + }, + { + "epoch": 0.14123264434252686, + "grad_norm": 0.19200782477855682, + "learning_rate": 9.588407861606566e-05, + "loss": 1.6859, + "step": 562 + }, + { + "epoch": 0.14148394798014702, + "grad_norm": 0.30504798889160156, + "learning_rate": 9.586817941036483e-05, + "loss": 2.1132, + "step": 563 + }, + { + "epoch": 0.14173525161776718, + "grad_norm": 0.46058428287506104, + "learning_rate": 9.585225087904641e-05, + "loss": 2.4114, + "step": 564 + }, + { + "epoch": 0.1419865552553873, + "grad_norm": 0.2597549557685852, + "learning_rate": 9.583629303229423e-05, + "loss": 2.4956, + "step": 565 + }, + { + "epoch": 0.14223785889300747, + "grad_norm": 0.35064586997032166, + "learning_rate": 9.582030588031084e-05, + "loss": 2.4188, + "step": 566 + }, + { + "epoch": 0.14248916253062763, + "grad_norm": 0.21857944130897522, + "learning_rate": 9.580428943331758e-05, + "loss": 1.4052, + "step": 567 + }, + { + "epoch": 0.1427404661682478, + "grad_norm": 0.4945662021636963, + "learning_rate": 9.578824370155451e-05, + "loss": 2.6965, + "step": 568 + }, + { + "epoch": 0.14299176980586795, + "grad_norm": 0.37999972701072693, + "learning_rate": 9.577216869528038e-05, + "loss": 2.5524, + "step": 569 + }, + { + "epoch": 0.1432430734434881, + "grad_norm": 0.47479113936424255, + "learning_rate": 9.575606442477267e-05, + "loss": 2.166, + "step": 570 + }, + { + "epoch": 0.14349437708110824, + "grad_norm": 0.42284709215164185, + "learning_rate": 9.573993090032758e-05, + "loss": 1.7279, + "step": 571 + }, + { + "epoch": 0.1437456807187284, + "grad_norm": 0.412218302488327, + "learning_rate": 9.572376813225999e-05, + "loss": 2.1049, + "step": 572 + }, + { + "epoch": 0.14399698435634856, + "grad_norm": 0.6507567167282104, + "learning_rate": 9.570757613090353e-05, + "loss": 2.5453, + "step": 573 + }, + { + "epoch": 0.14424828799396872, + "grad_norm": 0.31104111671447754, + "learning_rate": 9.569135490661046e-05, + "loss": 2.6578, + "step": 574 + }, + { + "epoch": 0.14449959163158888, + "grad_norm": 0.3139590620994568, + "learning_rate": 9.567510446975176e-05, + "loss": 2.5413, + "step": 575 + }, + { + "epoch": 0.14475089526920903, + "grad_norm": 0.5592718720436096, + "learning_rate": 9.565882483071706e-05, + "loss": 2.3341, + "step": 576 + }, + { + "epoch": 0.14500219890682917, + "grad_norm": 0.4113386869430542, + "learning_rate": 9.564251599991467e-05, + "loss": 2.7414, + "step": 577 + }, + { + "epoch": 0.14525350254444933, + "grad_norm": 0.2924419045448303, + "learning_rate": 9.56261779877716e-05, + "loss": 1.882, + "step": 578 + }, + { + "epoch": 0.14550480618206948, + "grad_norm": 0.3123188316822052, + "learning_rate": 9.560981080473346e-05, + "loss": 2.2884, + "step": 579 + }, + { + "epoch": 0.14575610981968964, + "grad_norm": 0.3279459476470947, + "learning_rate": 9.559341446126455e-05, + "loss": 1.8245, + "step": 580 + }, + { + "epoch": 0.1460074134573098, + "grad_norm": 0.2936881184577942, + "learning_rate": 9.55769889678478e-05, + "loss": 2.2238, + "step": 581 + }, + { + "epoch": 0.14625871709492994, + "grad_norm": 0.3411659598350525, + "learning_rate": 9.556053433498475e-05, + "loss": 2.1781, + "step": 582 + }, + { + "epoch": 0.1465100207325501, + "grad_norm": 0.42901405692100525, + "learning_rate": 9.554405057319565e-05, + "loss": 2.5198, + "step": 583 + }, + { + "epoch": 0.14676132437017025, + "grad_norm": 0.37800028920173645, + "learning_rate": 9.552753769301925e-05, + "loss": 2.009, + "step": 584 + }, + { + "epoch": 0.1470126280077904, + "grad_norm": 0.35843050479888916, + "learning_rate": 9.551099570501305e-05, + "loss": 2.0567, + "step": 585 + }, + { + "epoch": 0.14726393164541057, + "grad_norm": 0.4403095245361328, + "learning_rate": 9.549442461975306e-05, + "loss": 2.242, + "step": 586 + }, + { + "epoch": 0.14751523528303073, + "grad_norm": 0.3680509626865387, + "learning_rate": 9.547782444783393e-05, + "loss": 1.9327, + "step": 587 + }, + { + "epoch": 0.14776653892065086, + "grad_norm": 0.26541033387184143, + "learning_rate": 9.546119519986894e-05, + "loss": 2.5879, + "step": 588 + }, + { + "epoch": 0.14801784255827102, + "grad_norm": 0.3599735200405121, + "learning_rate": 9.544453688648989e-05, + "loss": 2.448, + "step": 589 + }, + { + "epoch": 0.14826914619589118, + "grad_norm": 0.46123915910720825, + "learning_rate": 9.542784951834721e-05, + "loss": 2.1933, + "step": 590 + }, + { + "epoch": 0.14852044983351134, + "grad_norm": 0.3718903064727783, + "learning_rate": 9.54111331061099e-05, + "loss": 2.0662, + "step": 591 + }, + { + "epoch": 0.1487717534711315, + "grad_norm": 0.5671700239181519, + "learning_rate": 9.539438766046554e-05, + "loss": 2.2881, + "step": 592 + }, + { + "epoch": 0.14902305710875166, + "grad_norm": 0.415791779756546, + "learning_rate": 9.537761319212021e-05, + "loss": 2.1724, + "step": 593 + }, + { + "epoch": 0.1492743607463718, + "grad_norm": 0.33377009630203247, + "learning_rate": 9.536080971179864e-05, + "loss": 2.2665, + "step": 594 + }, + { + "epoch": 0.14952566438399195, + "grad_norm": 0.48479607701301575, + "learning_rate": 9.534397723024402e-05, + "loss": 1.9262, + "step": 595 + }, + { + "epoch": 0.1497769680216121, + "grad_norm": 0.4698795974254608, + "learning_rate": 9.532711575821816e-05, + "loss": 2.3923, + "step": 596 + }, + { + "epoch": 0.15002827165923227, + "grad_norm": 0.45373061299324036, + "learning_rate": 9.531022530650135e-05, + "loss": 2.2584, + "step": 597 + }, + { + "epoch": 0.15027957529685243, + "grad_norm": 0.25719451904296875, + "learning_rate": 9.529330588589243e-05, + "loss": 2.1529, + "step": 598 + }, + { + "epoch": 0.1505308789344726, + "grad_norm": 0.27690425515174866, + "learning_rate": 9.527635750720875e-05, + "loss": 2.1076, + "step": 599 + }, + { + "epoch": 0.15078218257209272, + "grad_norm": 0.11603706330060959, + "learning_rate": 9.525938018128617e-05, + "loss": 0.5727, + "step": 600 + }, + { + "epoch": 0.15103348620971288, + "grad_norm": 0.39744696021080017, + "learning_rate": 9.524237391897909e-05, + "loss": 2.1513, + "step": 601 + }, + { + "epoch": 0.15128478984733304, + "grad_norm": 0.3867836594581604, + "learning_rate": 9.522533873116041e-05, + "loss": 2.3665, + "step": 602 + }, + { + "epoch": 0.1515360934849532, + "grad_norm": 0.16841183602809906, + "learning_rate": 9.520827462872144e-05, + "loss": 1.1614, + "step": 603 + }, + { + "epoch": 0.15178739712257336, + "grad_norm": 0.34245559573173523, + "learning_rate": 9.519118162257209e-05, + "loss": 2.2823, + "step": 604 + }, + { + "epoch": 0.1520387007601935, + "grad_norm": 0.47992077469825745, + "learning_rate": 9.517405972364067e-05, + "loss": 1.9997, + "step": 605 + }, + { + "epoch": 0.15229000439781365, + "grad_norm": 0.2888699471950531, + "learning_rate": 9.5156908942874e-05, + "loss": 2.2298, + "step": 606 + }, + { + "epoch": 0.1525413080354338, + "grad_norm": 0.1741029918193817, + "learning_rate": 9.513972929123737e-05, + "loss": 0.7344, + "step": 607 + }, + { + "epoch": 0.15279261167305397, + "grad_norm": 0.3610229790210724, + "learning_rate": 9.512252077971448e-05, + "loss": 2.0037, + "step": 608 + }, + { + "epoch": 0.15304391531067413, + "grad_norm": 0.30153217911720276, + "learning_rate": 9.510528341930756e-05, + "loss": 2.1396, + "step": 609 + }, + { + "epoch": 0.1532952189482943, + "grad_norm": 0.5049226880073547, + "learning_rate": 9.50880172210372e-05, + "loss": 2.1197, + "step": 610 + }, + { + "epoch": 0.15354652258591442, + "grad_norm": 0.27209773659706116, + "learning_rate": 9.507072219594249e-05, + "loss": 2.37, + "step": 611 + }, + { + "epoch": 0.15379782622353458, + "grad_norm": 0.33535251021385193, + "learning_rate": 9.505339835508091e-05, + "loss": 2.4393, + "step": 612 + }, + { + "epoch": 0.15404912986115474, + "grad_norm": 0.4262019991874695, + "learning_rate": 9.50360457095284e-05, + "loss": 2.0283, + "step": 613 + }, + { + "epoch": 0.1543004334987749, + "grad_norm": 0.5614545941352844, + "learning_rate": 9.50186642703793e-05, + "loss": 2.0785, + "step": 614 + }, + { + "epoch": 0.15455173713639506, + "grad_norm": 0.3979303240776062, + "learning_rate": 9.500125404874631e-05, + "loss": 2.6146, + "step": 615 + }, + { + "epoch": 0.15480304077401522, + "grad_norm": 0.39508217573165894, + "learning_rate": 9.498381505576064e-05, + "loss": 2.2959, + "step": 616 + }, + { + "epoch": 0.15505434441163535, + "grad_norm": 0.36526814103126526, + "learning_rate": 9.49663473025718e-05, + "loss": 2.3149, + "step": 617 + }, + { + "epoch": 0.1553056480492555, + "grad_norm": 0.6422840356826782, + "learning_rate": 9.494885080034774e-05, + "loss": 2.0929, + "step": 618 + }, + { + "epoch": 0.15555695168687567, + "grad_norm": 0.32328036427497864, + "learning_rate": 9.493132556027475e-05, + "loss": 2.2233, + "step": 619 + }, + { + "epoch": 0.15580825532449583, + "grad_norm": 0.25220245122909546, + "learning_rate": 9.491377159355752e-05, + "loss": 2.5095, + "step": 620 + }, + { + "epoch": 0.15605955896211599, + "grad_norm": 0.32473161816596985, + "learning_rate": 9.489618891141911e-05, + "loss": 2.6889, + "step": 621 + }, + { + "epoch": 0.15631086259973614, + "grad_norm": 0.4301775395870209, + "learning_rate": 9.487857752510093e-05, + "loss": 2.227, + "step": 622 + }, + { + "epoch": 0.15656216623735628, + "grad_norm": 0.4302082061767578, + "learning_rate": 9.486093744586271e-05, + "loss": 2.2468, + "step": 623 + }, + { + "epoch": 0.15681346987497644, + "grad_norm": 0.39020925760269165, + "learning_rate": 9.484326868498261e-05, + "loss": 2.3726, + "step": 624 + }, + { + "epoch": 0.1570647735125966, + "grad_norm": 0.3023484945297241, + "learning_rate": 9.482557125375704e-05, + "loss": 2.4235, + "step": 625 + }, + { + "epoch": 0.15731607715021675, + "grad_norm": 0.3555540442466736, + "learning_rate": 9.480784516350079e-05, + "loss": 2.4532, + "step": 626 + }, + { + "epoch": 0.1575673807878369, + "grad_norm": 0.3152056634426117, + "learning_rate": 9.479009042554694e-05, + "loss": 2.7172, + "step": 627 + }, + { + "epoch": 0.15781868442545705, + "grad_norm": 0.5882668495178223, + "learning_rate": 9.477230705124692e-05, + "loss": 2.5912, + "step": 628 + }, + { + "epoch": 0.1580699880630772, + "grad_norm": 0.36808767914772034, + "learning_rate": 9.475449505197043e-05, + "loss": 2.2798, + "step": 629 + }, + { + "epoch": 0.15832129170069736, + "grad_norm": 0.44441547989845276, + "learning_rate": 9.473665443910551e-05, + "loss": 1.9648, + "step": 630 + }, + { + "epoch": 0.15857259533831752, + "grad_norm": 0.40813305974006653, + "learning_rate": 9.471878522405849e-05, + "loss": 1.7885, + "step": 631 + }, + { + "epoch": 0.15882389897593768, + "grad_norm": 0.32029062509536743, + "learning_rate": 9.470088741825394e-05, + "loss": 2.6975, + "step": 632 + }, + { + "epoch": 0.15907520261355784, + "grad_norm": 0.270475834608078, + "learning_rate": 9.468296103313476e-05, + "loss": 0.8822, + "step": 633 + }, + { + "epoch": 0.15932650625117797, + "grad_norm": 0.3409525454044342, + "learning_rate": 9.46650060801621e-05, + "loss": 1.7725, + "step": 634 + }, + { + "epoch": 0.15957780988879813, + "grad_norm": 0.4669889807701111, + "learning_rate": 9.464702257081539e-05, + "loss": 2.379, + "step": 635 + }, + { + "epoch": 0.1598291135264183, + "grad_norm": 0.5293301939964294, + "learning_rate": 9.462901051659232e-05, + "loss": 2.3118, + "step": 636 + }, + { + "epoch": 0.16008041716403845, + "grad_norm": 0.38586729764938354, + "learning_rate": 9.461096992900879e-05, + "loss": 2.0312, + "step": 637 + }, + { + "epoch": 0.1603317208016586, + "grad_norm": 0.3819951117038727, + "learning_rate": 9.459290081959897e-05, + "loss": 1.973, + "step": 638 + }, + { + "epoch": 0.16058302443927877, + "grad_norm": 0.37507110834121704, + "learning_rate": 9.457480319991529e-05, + "loss": 2.3668, + "step": 639 + }, + { + "epoch": 0.1608343280768989, + "grad_norm": 0.3354203701019287, + "learning_rate": 9.455667708152836e-05, + "loss": 1.7392, + "step": 640 + }, + { + "epoch": 0.16108563171451906, + "grad_norm": 0.31567585468292236, + "learning_rate": 9.453852247602704e-05, + "loss": 2.2258, + "step": 641 + }, + { + "epoch": 0.16133693535213922, + "grad_norm": 0.35445067286491394, + "learning_rate": 9.452033939501839e-05, + "loss": 2.5792, + "step": 642 + }, + { + "epoch": 0.16158823898975938, + "grad_norm": 0.3172896206378937, + "learning_rate": 9.45021278501277e-05, + "loss": 2.4493, + "step": 643 + }, + { + "epoch": 0.16183954262737954, + "grad_norm": 0.4169299602508545, + "learning_rate": 9.448388785299842e-05, + "loss": 2.3125, + "step": 644 + }, + { + "epoch": 0.1620908462649997, + "grad_norm": 0.5329498648643494, + "learning_rate": 9.446561941529224e-05, + "loss": 2.1722, + "step": 645 + }, + { + "epoch": 0.16234214990261983, + "grad_norm": 0.21547356247901917, + "learning_rate": 9.444732254868898e-05, + "loss": 2.1758, + "step": 646 + }, + { + "epoch": 0.16259345354024, + "grad_norm": 0.29865655303001404, + "learning_rate": 9.442899726488665e-05, + "loss": 1.5479, + "step": 647 + }, + { + "epoch": 0.16284475717786015, + "grad_norm": 0.38035672903060913, + "learning_rate": 9.441064357560147e-05, + "loss": 2.8413, + "step": 648 + }, + { + "epoch": 0.1630960608154803, + "grad_norm": 0.6323506832122803, + "learning_rate": 9.439226149256779e-05, + "loss": 2.1557, + "step": 649 + }, + { + "epoch": 0.16334736445310047, + "grad_norm": 0.5052780508995056, + "learning_rate": 9.43738510275381e-05, + "loss": 1.7929, + "step": 650 + }, + { + "epoch": 0.1635986680907206, + "grad_norm": 0.3237319588661194, + "learning_rate": 9.435541219228303e-05, + "loss": 2.336, + "step": 651 + }, + { + "epoch": 0.16384997172834076, + "grad_norm": 0.18192382156848907, + "learning_rate": 9.433694499859141e-05, + "loss": 0.9273, + "step": 652 + }, + { + "epoch": 0.16410127536596092, + "grad_norm": 0.31093844771385193, + "learning_rate": 9.431844945827014e-05, + "loss": 2.633, + "step": 653 + }, + { + "epoch": 0.16435257900358108, + "grad_norm": 0.40673384070396423, + "learning_rate": 9.429992558314423e-05, + "loss": 1.8868, + "step": 654 + }, + { + "epoch": 0.16460388264120124, + "grad_norm": 0.30386146903038025, + "learning_rate": 9.428137338505687e-05, + "loss": 2.1055, + "step": 655 + }, + { + "epoch": 0.1648551862788214, + "grad_norm": 0.3704879581928253, + "learning_rate": 9.426279287586934e-05, + "loss": 1.9851, + "step": 656 + }, + { + "epoch": 0.16510648991644153, + "grad_norm": 0.17675453424453735, + "learning_rate": 9.424418406746098e-05, + "loss": 1.0456, + "step": 657 + }, + { + "epoch": 0.1653577935540617, + "grad_norm": 0.2338314950466156, + "learning_rate": 9.422554697172925e-05, + "loss": 1.8722, + "step": 658 + }, + { + "epoch": 0.16560909719168185, + "grad_norm": 0.2262151688337326, + "learning_rate": 9.420688160058972e-05, + "loss": 1.2118, + "step": 659 + }, + { + "epoch": 0.165860400829302, + "grad_norm": 0.32973191142082214, + "learning_rate": 9.418818796597597e-05, + "loss": 1.974, + "step": 660 + }, + { + "epoch": 0.16611170446692217, + "grad_norm": 0.3016055226325989, + "learning_rate": 9.416946607983975e-05, + "loss": 1.8877, + "step": 661 + }, + { + "epoch": 0.16636300810454233, + "grad_norm": 0.39721089601516724, + "learning_rate": 9.415071595415075e-05, + "loss": 2.1139, + "step": 662 + }, + { + "epoch": 0.16661431174216246, + "grad_norm": 0.3398868143558502, + "learning_rate": 9.413193760089682e-05, + "loss": 1.9081, + "step": 663 + }, + { + "epoch": 0.16686561537978262, + "grad_norm": 0.3406602144241333, + "learning_rate": 9.411313103208382e-05, + "loss": 2.1456, + "step": 664 + }, + { + "epoch": 0.16711691901740278, + "grad_norm": 0.4096097946166992, + "learning_rate": 9.409429625973563e-05, + "loss": 2.294, + "step": 665 + }, + { + "epoch": 0.16736822265502294, + "grad_norm": 0.2669360041618347, + "learning_rate": 9.407543329589418e-05, + "loss": 1.8453, + "step": 666 + }, + { + "epoch": 0.1676195262926431, + "grad_norm": 0.5418170690536499, + "learning_rate": 9.405654215261944e-05, + "loss": 2.3642, + "step": 667 + }, + { + "epoch": 0.16787082993026323, + "grad_norm": 0.32345104217529297, + "learning_rate": 9.403762284198936e-05, + "loss": 2.1211, + "step": 668 + }, + { + "epoch": 0.1681221335678834, + "grad_norm": 0.49764448404312134, + "learning_rate": 9.401867537609991e-05, + "loss": 1.9661, + "step": 669 + }, + { + "epoch": 0.16837343720550355, + "grad_norm": 1.5639890432357788, + "learning_rate": 9.399969976706509e-05, + "loss": 1.9938, + "step": 670 + }, + { + "epoch": 0.1686247408431237, + "grad_norm": 1.2142359018325806, + "learning_rate": 9.398069602701687e-05, + "loss": 2.148, + "step": 671 + }, + { + "epoch": 0.16887604448074386, + "grad_norm": 0.2211383879184723, + "learning_rate": 9.396166416810519e-05, + "loss": 2.0333, + "step": 672 + }, + { + "epoch": 0.16912734811836402, + "grad_norm": 0.3100007176399231, + "learning_rate": 9.394260420249801e-05, + "loss": 2.529, + "step": 673 + }, + { + "epoch": 0.16937865175598416, + "grad_norm": 0.39237165451049805, + "learning_rate": 9.39235161423812e-05, + "loss": 2.2316, + "step": 674 + }, + { + "epoch": 0.16962995539360431, + "grad_norm": 0.553925096988678, + "learning_rate": 9.390439999995865e-05, + "loss": 2.1865, + "step": 675 + }, + { + "epoch": 0.16988125903122447, + "grad_norm": 0.45522618293762207, + "learning_rate": 9.38852557874522e-05, + "loss": 2.346, + "step": 676 + }, + { + "epoch": 0.17013256266884463, + "grad_norm": 0.4678781032562256, + "learning_rate": 9.386608351710157e-05, + "loss": 1.9335, + "step": 677 + }, + { + "epoch": 0.1703838663064648, + "grad_norm": 0.3529011905193329, + "learning_rate": 9.38468832011645e-05, + "loss": 2.6414, + "step": 678 + }, + { + "epoch": 0.17063516994408495, + "grad_norm": 0.26705560088157654, + "learning_rate": 9.382765485191662e-05, + "loss": 2.3328, + "step": 679 + }, + { + "epoch": 0.17088647358170508, + "grad_norm": 0.3495092988014221, + "learning_rate": 9.380839848165149e-05, + "loss": 2.3455, + "step": 680 + }, + { + "epoch": 0.17113777721932524, + "grad_norm": 0.32814642786979675, + "learning_rate": 9.378911410268058e-05, + "loss": 2.4395, + "step": 681 + }, + { + "epoch": 0.1713890808569454, + "grad_norm": 0.33247315883636475, + "learning_rate": 9.376980172733329e-05, + "loss": 1.6115, + "step": 682 + }, + { + "epoch": 0.17164038449456556, + "grad_norm": 0.44238927960395813, + "learning_rate": 9.375046136795686e-05, + "loss": 2.1676, + "step": 683 + }, + { + "epoch": 0.17189168813218572, + "grad_norm": 0.3289899528026581, + "learning_rate": 9.373109303691652e-05, + "loss": 2.3906, + "step": 684 + }, + { + "epoch": 0.17214299176980588, + "grad_norm": 0.3287547826766968, + "learning_rate": 9.371169674659529e-05, + "loss": 2.1608, + "step": 685 + }, + { + "epoch": 0.172394295407426, + "grad_norm": 0.2546299397945404, + "learning_rate": 9.36922725093941e-05, + "loss": 2.4981, + "step": 686 + }, + { + "epoch": 0.17264559904504617, + "grad_norm": 0.2905830144882202, + "learning_rate": 9.367282033773177e-05, + "loss": 2.3735, + "step": 687 + }, + { + "epoch": 0.17289690268266633, + "grad_norm": 0.5941200256347656, + "learning_rate": 9.365334024404495e-05, + "loss": 2.7027, + "step": 688 + }, + { + "epoch": 0.1731482063202865, + "grad_norm": 0.3145928680896759, + "learning_rate": 9.363383224078814e-05, + "loss": 2.1827, + "step": 689 + }, + { + "epoch": 0.17339950995790665, + "grad_norm": 0.3063415586948395, + "learning_rate": 9.361429634043372e-05, + "loss": 2.0369, + "step": 690 + }, + { + "epoch": 0.17365081359552678, + "grad_norm": 0.40637439489364624, + "learning_rate": 9.359473255547186e-05, + "loss": 2.4201, + "step": 691 + }, + { + "epoch": 0.17390211723314694, + "grad_norm": 0.5028929710388184, + "learning_rate": 9.357514089841061e-05, + "loss": 1.7536, + "step": 692 + }, + { + "epoch": 0.1741534208707671, + "grad_norm": 0.19582106173038483, + "learning_rate": 9.355552138177577e-05, + "loss": 1.6803, + "step": 693 + }, + { + "epoch": 0.17440472450838726, + "grad_norm": 0.42780154943466187, + "learning_rate": 9.353587401811101e-05, + "loss": 2.0536, + "step": 694 + }, + { + "epoch": 0.17465602814600742, + "grad_norm": 0.41308510303497314, + "learning_rate": 9.351619881997779e-05, + "loss": 2.1088, + "step": 695 + }, + { + "epoch": 0.17490733178362758, + "grad_norm": 0.37217557430267334, + "learning_rate": 9.349649579995536e-05, + "loss": 2.3313, + "step": 696 + }, + { + "epoch": 0.1751586354212477, + "grad_norm": 0.49139076471328735, + "learning_rate": 9.347676497064074e-05, + "loss": 2.048, + "step": 697 + }, + { + "epoch": 0.17540993905886787, + "grad_norm": 0.49698978662490845, + "learning_rate": 9.345700634464876e-05, + "loss": 2.2736, + "step": 698 + }, + { + "epoch": 0.17566124269648803, + "grad_norm": 0.3499569296836853, + "learning_rate": 9.343721993461203e-05, + "loss": 2.4421, + "step": 699 + }, + { + "epoch": 0.1759125463341082, + "grad_norm": 0.2008151412010193, + "learning_rate": 9.341740575318088e-05, + "loss": 1.8728, + "step": 700 + }, + { + "epoch": 0.17616384997172835, + "grad_norm": 0.43613699078559875, + "learning_rate": 9.339756381302341e-05, + "loss": 2.1268, + "step": 701 + }, + { + "epoch": 0.1764151536093485, + "grad_norm": 0.24621760845184326, + "learning_rate": 9.337769412682551e-05, + "loss": 2.3428, + "step": 702 + }, + { + "epoch": 0.17666645724696864, + "grad_norm": 0.6878277659416199, + "learning_rate": 9.335779670729075e-05, + "loss": 2.1971, + "step": 703 + }, + { + "epoch": 0.1769177608845888, + "grad_norm": 0.3094477653503418, + "learning_rate": 9.333787156714047e-05, + "loss": 1.6276, + "step": 704 + }, + { + "epoch": 0.17716906452220896, + "grad_norm": 0.2712198495864868, + "learning_rate": 9.331791871911371e-05, + "loss": 2.1207, + "step": 705 + }, + { + "epoch": 0.17742036815982912, + "grad_norm": 0.40410006046295166, + "learning_rate": 9.329793817596724e-05, + "loss": 2.5001, + "step": 706 + }, + { + "epoch": 0.17767167179744928, + "grad_norm": 0.37285852432250977, + "learning_rate": 9.327792995047553e-05, + "loss": 2.513, + "step": 707 + }, + { + "epoch": 0.17792297543506944, + "grad_norm": 0.3589307963848114, + "learning_rate": 9.325789405543075e-05, + "loss": 2.3407, + "step": 708 + }, + { + "epoch": 0.17817427907268957, + "grad_norm": 0.16013433039188385, + "learning_rate": 9.323783050364276e-05, + "loss": 1.0699, + "step": 709 + }, + { + "epoch": 0.17842558271030973, + "grad_norm": 0.3747367858886719, + "learning_rate": 9.321773930793914e-05, + "loss": 2.2041, + "step": 710 + }, + { + "epoch": 0.1786768863479299, + "grad_norm": 0.48945263028144836, + "learning_rate": 9.319762048116503e-05, + "loss": 1.8284, + "step": 711 + }, + { + "epoch": 0.17892818998555005, + "grad_norm": 0.5572097897529602, + "learning_rate": 9.317747403618337e-05, + "loss": 1.8432, + "step": 712 + }, + { + "epoch": 0.1791794936231702, + "grad_norm": 0.4351899027824402, + "learning_rate": 9.31572999858747e-05, + "loss": 2.0469, + "step": 713 + }, + { + "epoch": 0.17943079726079034, + "grad_norm": 0.747898519039154, + "learning_rate": 9.31370983431372e-05, + "loss": 2.3444, + "step": 714 + }, + { + "epoch": 0.1796821008984105, + "grad_norm": 0.3677506148815155, + "learning_rate": 9.311686912088669e-05, + "loss": 1.7389, + "step": 715 + }, + { + "epoch": 0.17993340453603066, + "grad_norm": 0.7408022880554199, + "learning_rate": 9.309661233205663e-05, + "loss": 2.3839, + "step": 716 + }, + { + "epoch": 0.18018470817365081, + "grad_norm": 0.46684297919273376, + "learning_rate": 9.307632798959813e-05, + "loss": 2.4899, + "step": 717 + }, + { + "epoch": 0.18043601181127097, + "grad_norm": 0.5166415572166443, + "learning_rate": 9.305601610647989e-05, + "loss": 2.2496, + "step": 718 + }, + { + "epoch": 0.18068731544889113, + "grad_norm": 0.17087407410144806, + "learning_rate": 9.30356766956882e-05, + "loss": 0.624, + "step": 719 + }, + { + "epoch": 0.18093861908651127, + "grad_norm": 0.3293837904930115, + "learning_rate": 9.301530977022701e-05, + "loss": 2.1589, + "step": 720 + }, + { + "epoch": 0.18118992272413142, + "grad_norm": 0.5174190402030945, + "learning_rate": 9.29949153431178e-05, + "loss": 1.9908, + "step": 721 + }, + { + "epoch": 0.18144122636175158, + "grad_norm": 0.6492531895637512, + "learning_rate": 9.297449342739964e-05, + "loss": 2.3542, + "step": 722 + }, + { + "epoch": 0.18169252999937174, + "grad_norm": 0.3231172263622284, + "learning_rate": 9.295404403612924e-05, + "loss": 2.0346, + "step": 723 + }, + { + "epoch": 0.1819438336369919, + "grad_norm": 0.3993067145347595, + "learning_rate": 9.293356718238077e-05, + "loss": 2.0821, + "step": 724 + }, + { + "epoch": 0.18219513727461206, + "grad_norm": 0.3920503854751587, + "learning_rate": 9.291306287924608e-05, + "loss": 2.1273, + "step": 725 + }, + { + "epoch": 0.1824464409122322, + "grad_norm": 0.3167310655117035, + "learning_rate": 9.289253113983444e-05, + "loss": 2.0075, + "step": 726 + }, + { + "epoch": 0.18269774454985235, + "grad_norm": 0.3710818290710449, + "learning_rate": 9.287197197727277e-05, + "loss": 2.0204, + "step": 727 + }, + { + "epoch": 0.1829490481874725, + "grad_norm": 0.18097934126853943, + "learning_rate": 9.285138540470546e-05, + "loss": 0.749, + "step": 728 + }, + { + "epoch": 0.18320035182509267, + "grad_norm": 0.18432289361953735, + "learning_rate": 9.283077143529446e-05, + "loss": 0.8822, + "step": 729 + }, + { + "epoch": 0.18345165546271283, + "grad_norm": 0.3951958417892456, + "learning_rate": 9.281013008221921e-05, + "loss": 2.1613, + "step": 730 + }, + { + "epoch": 0.183702959100333, + "grad_norm": 0.3688110113143921, + "learning_rate": 9.278946135867665e-05, + "loss": 2.8058, + "step": 731 + }, + { + "epoch": 0.18395426273795312, + "grad_norm": 0.29696959257125854, + "learning_rate": 9.276876527788127e-05, + "loss": 1.7129, + "step": 732 + }, + { + "epoch": 0.18420556637557328, + "grad_norm": 0.405823677778244, + "learning_rate": 9.274804185306503e-05, + "loss": 2.334, + "step": 733 + }, + { + "epoch": 0.18445687001319344, + "grad_norm": 0.4110073447227478, + "learning_rate": 9.27272910974773e-05, + "loss": 2.4929, + "step": 734 + }, + { + "epoch": 0.1847081736508136, + "grad_norm": 0.3715936541557312, + "learning_rate": 9.270651302438502e-05, + "loss": 1.7891, + "step": 735 + }, + { + "epoch": 0.18495947728843376, + "grad_norm": 0.1682804673910141, + "learning_rate": 9.268570764707257e-05, + "loss": 0.726, + "step": 736 + }, + { + "epoch": 0.1852107809260539, + "grad_norm": 0.2395019680261612, + "learning_rate": 9.266487497884176e-05, + "loss": 2.4391, + "step": 737 + }, + { + "epoch": 0.18546208456367405, + "grad_norm": 0.3601885437965393, + "learning_rate": 9.264401503301185e-05, + "loss": 2.5297, + "step": 738 + }, + { + "epoch": 0.1857133882012942, + "grad_norm": 0.26369959115982056, + "learning_rate": 9.262312782291959e-05, + "loss": 1.7459, + "step": 739 + }, + { + "epoch": 0.18596469183891437, + "grad_norm": 0.15436404943466187, + "learning_rate": 9.26022133619191e-05, + "loss": 1.0022, + "step": 740 + }, + { + "epoch": 0.18621599547653453, + "grad_norm": 0.3303896486759186, + "learning_rate": 9.258127166338196e-05, + "loss": 2.3745, + "step": 741 + }, + { + "epoch": 0.1864672991141547, + "grad_norm": 0.3118177056312561, + "learning_rate": 9.256030274069713e-05, + "loss": 2.8046, + "step": 742 + }, + { + "epoch": 0.18671860275177482, + "grad_norm": 0.17974944412708282, + "learning_rate": 9.253930660727104e-05, + "loss": 0.8354, + "step": 743 + }, + { + "epoch": 0.18696990638939498, + "grad_norm": 0.2209557741880417, + "learning_rate": 9.251828327652742e-05, + "loss": 2.1792, + "step": 744 + }, + { + "epoch": 0.18722121002701514, + "grad_norm": 0.4132773280143738, + "learning_rate": 9.24972327619075e-05, + "loss": 2.3944, + "step": 745 + }, + { + "epoch": 0.1874725136646353, + "grad_norm": 0.47139856219291687, + "learning_rate": 9.24761550768698e-05, + "loss": 2.3182, + "step": 746 + }, + { + "epoch": 0.18772381730225546, + "grad_norm": 0.40354403853416443, + "learning_rate": 9.245505023489024e-05, + "loss": 2.1719, + "step": 747 + }, + { + "epoch": 0.18797512093987562, + "grad_norm": 0.23668596148490906, + "learning_rate": 9.243391824946213e-05, + "loss": 1.9976, + "step": 748 + }, + { + "epoch": 0.18822642457749575, + "grad_norm": 0.48701080679893494, + "learning_rate": 9.24127591340961e-05, + "loss": 2.3892, + "step": 749 + }, + { + "epoch": 0.1884777282151159, + "grad_norm": 0.48195892572402954, + "learning_rate": 9.239157290232014e-05, + "loss": 2.2488, + "step": 750 + }, + { + "epoch": 0.18872903185273607, + "grad_norm": 0.3957456946372986, + "learning_rate": 9.237035956767956e-05, + "loss": 2.2675, + "step": 751 + }, + { + "epoch": 0.18898033549035623, + "grad_norm": 0.419040709733963, + "learning_rate": 9.234911914373702e-05, + "loss": 1.9331, + "step": 752 + }, + { + "epoch": 0.1892316391279764, + "grad_norm": 0.3198854327201843, + "learning_rate": 9.23278516440725e-05, + "loss": 1.9211, + "step": 753 + }, + { + "epoch": 0.18948294276559652, + "grad_norm": 0.4320249855518341, + "learning_rate": 9.230655708228328e-05, + "loss": 1.9932, + "step": 754 + }, + { + "epoch": 0.18973424640321668, + "grad_norm": 0.34588703513145447, + "learning_rate": 9.228523547198393e-05, + "loss": 1.6818, + "step": 755 + }, + { + "epoch": 0.18998555004083684, + "grad_norm": 0.17924979329109192, + "learning_rate": 9.226388682680633e-05, + "loss": 1.2715, + "step": 756 + }, + { + "epoch": 0.190236853678457, + "grad_norm": 0.3479664921760559, + "learning_rate": 9.224251116039965e-05, + "loss": 2.6595, + "step": 757 + }, + { + "epoch": 0.19048815731607716, + "grad_norm": 0.9396395087242126, + "learning_rate": 9.222110848643035e-05, + "loss": 2.1373, + "step": 758 + }, + { + "epoch": 0.19073946095369732, + "grad_norm": 0.423880934715271, + "learning_rate": 9.219967881858209e-05, + "loss": 2.0013, + "step": 759 + }, + { + "epoch": 0.19099076459131745, + "grad_norm": 0.18442866206169128, + "learning_rate": 9.217822217055586e-05, + "loss": 1.1016, + "step": 760 + }, + { + "epoch": 0.1912420682289376, + "grad_norm": 0.33031755685806274, + "learning_rate": 9.215673855606986e-05, + "loss": 2.208, + "step": 761 + }, + { + "epoch": 0.19149337186655777, + "grad_norm": 0.5207613706588745, + "learning_rate": 9.213522798885956e-05, + "loss": 2.0212, + "step": 762 + }, + { + "epoch": 0.19174467550417792, + "grad_norm": 0.29409703612327576, + "learning_rate": 9.211369048267764e-05, + "loss": 2.5577, + "step": 763 + }, + { + "epoch": 0.19199597914179808, + "grad_norm": 0.44755882024765015, + "learning_rate": 9.2092126051294e-05, + "loss": 2.1216, + "step": 764 + }, + { + "epoch": 0.19224728277941824, + "grad_norm": 0.33680227398872375, + "learning_rate": 9.207053470849576e-05, + "loss": 2.5058, + "step": 765 + }, + { + "epoch": 0.19249858641703838, + "grad_norm": 0.41669735312461853, + "learning_rate": 9.204891646808726e-05, + "loss": 2.5137, + "step": 766 + }, + { + "epoch": 0.19274989005465853, + "grad_norm": 0.4869091808795929, + "learning_rate": 9.202727134389004e-05, + "loss": 2.2094, + "step": 767 + }, + { + "epoch": 0.1930011936922787, + "grad_norm": 0.3771580159664154, + "learning_rate": 9.20055993497428e-05, + "loss": 2.5748, + "step": 768 + }, + { + "epoch": 0.19325249732989885, + "grad_norm": 0.4663945734500885, + "learning_rate": 9.198390049950143e-05, + "loss": 2.6845, + "step": 769 + }, + { + "epoch": 0.193503800967519, + "grad_norm": 0.6000380516052246, + "learning_rate": 9.196217480703899e-05, + "loss": 2.4598, + "step": 770 + }, + { + "epoch": 0.19375510460513917, + "grad_norm": 0.4322783946990967, + "learning_rate": 9.194042228624572e-05, + "loss": 2.5049, + "step": 771 + }, + { + "epoch": 0.1940064082427593, + "grad_norm": 0.194077730178833, + "learning_rate": 9.191864295102899e-05, + "loss": 1.5018, + "step": 772 + }, + { + "epoch": 0.19425771188037946, + "grad_norm": 0.28692805767059326, + "learning_rate": 9.189683681531333e-05, + "loss": 1.8483, + "step": 773 + }, + { + "epoch": 0.19450901551799962, + "grad_norm": 0.8345639109611511, + "learning_rate": 9.187500389304037e-05, + "loss": 1.8403, + "step": 774 + }, + { + "epoch": 0.19476031915561978, + "grad_norm": 0.3533509373664856, + "learning_rate": 9.185314419816892e-05, + "loss": 2.4375, + "step": 775 + }, + { + "epoch": 0.19501162279323994, + "grad_norm": 0.40252941846847534, + "learning_rate": 9.18312577446749e-05, + "loss": 2.1452, + "step": 776 + }, + { + "epoch": 0.19526292643086007, + "grad_norm": 0.4904803931713104, + "learning_rate": 9.180934454655126e-05, + "loss": 2.2475, + "step": 777 + }, + { + "epoch": 0.19551423006848023, + "grad_norm": 0.4086427688598633, + "learning_rate": 9.178740461780812e-05, + "loss": 1.9234, + "step": 778 + }, + { + "epoch": 0.1957655337061004, + "grad_norm": 0.32106295228004456, + "learning_rate": 9.176543797247271e-05, + "loss": 2.3433, + "step": 779 + }, + { + "epoch": 0.19601683734372055, + "grad_norm": 0.45663875341415405, + "learning_rate": 9.17434446245893e-05, + "loss": 2.4102, + "step": 780 + }, + { + "epoch": 0.1962681409813407, + "grad_norm": 0.16669417917728424, + "learning_rate": 9.17214245882192e-05, + "loss": 0.9148, + "step": 781 + }, + { + "epoch": 0.19651944461896087, + "grad_norm": 0.21185293793678284, + "learning_rate": 9.169937787744088e-05, + "loss": 2.032, + "step": 782 + }, + { + "epoch": 0.196770748256581, + "grad_norm": 0.28057172894477844, + "learning_rate": 9.167730450634975e-05, + "loss": 2.3357, + "step": 783 + }, + { + "epoch": 0.19702205189420116, + "grad_norm": 0.30073508620262146, + "learning_rate": 9.165520448905835e-05, + "loss": 1.9842, + "step": 784 + }, + { + "epoch": 0.19727335553182132, + "grad_norm": 0.5807662606239319, + "learning_rate": 9.163307783969624e-05, + "loss": 2.1852, + "step": 785 + }, + { + "epoch": 0.19752465916944148, + "grad_norm": 0.43151628971099854, + "learning_rate": 9.161092457240999e-05, + "loss": 2.3249, + "step": 786 + }, + { + "epoch": 0.19777596280706164, + "grad_norm": 0.31615135073661804, + "learning_rate": 9.158874470136319e-05, + "loss": 2.3183, + "step": 787 + }, + { + "epoch": 0.1980272664446818, + "grad_norm": 0.4318180978298187, + "learning_rate": 9.156653824073642e-05, + "loss": 2.0189, + "step": 788 + }, + { + "epoch": 0.19827857008230193, + "grad_norm": 0.6335446834564209, + "learning_rate": 9.154430520472731e-05, + "loss": 1.8264, + "step": 789 + }, + { + "epoch": 0.1985298737199221, + "grad_norm": 0.1641846001148224, + "learning_rate": 9.152204560755045e-05, + "loss": 1.3867, + "step": 790 + }, + { + "epoch": 0.19878117735754225, + "grad_norm": 0.44742926955223083, + "learning_rate": 9.149975946343741e-05, + "loss": 1.9269, + "step": 791 + }, + { + "epoch": 0.1990324809951624, + "grad_norm": 0.438804566860199, + "learning_rate": 9.147744678663672e-05, + "loss": 1.8561, + "step": 792 + }, + { + "epoch": 0.19928378463278257, + "grad_norm": 0.6063904166221619, + "learning_rate": 9.145510759141393e-05, + "loss": 2.1038, + "step": 793 + }, + { + "epoch": 0.19953508827040273, + "grad_norm": 0.3686808943748474, + "learning_rate": 9.143274189205147e-05, + "loss": 2.811, + "step": 794 + }, + { + "epoch": 0.19978639190802286, + "grad_norm": 0.45831429958343506, + "learning_rate": 9.141034970284877e-05, + "loss": 2.1029, + "step": 795 + }, + { + "epoch": 0.20003769554564302, + "grad_norm": 0.4418196678161621, + "learning_rate": 9.138793103812218e-05, + "loss": 1.9126, + "step": 796 + }, + { + "epoch": 0.20028899918326318, + "grad_norm": 0.23358654975891113, + "learning_rate": 9.136548591220495e-05, + "loss": 2.0087, + "step": 797 + }, + { + "epoch": 0.20054030282088334, + "grad_norm": 0.5014088749885559, + "learning_rate": 9.134301433944731e-05, + "loss": 2.1698, + "step": 798 + }, + { + "epoch": 0.2007916064585035, + "grad_norm": 0.48934677243232727, + "learning_rate": 9.132051633421632e-05, + "loss": 1.9628, + "step": 799 + }, + { + "epoch": 0.20104291009612363, + "grad_norm": 0.46975913643836975, + "learning_rate": 9.129799191089601e-05, + "loss": 2.1432, + "step": 800 + }, + { + "epoch": 0.2012942137337438, + "grad_norm": 0.33380022644996643, + "learning_rate": 9.127544108388725e-05, + "loss": 1.7332, + "step": 801 + }, + { + "epoch": 0.20154551737136395, + "grad_norm": 0.6146292090415955, + "learning_rate": 9.125286386760785e-05, + "loss": 2.2721, + "step": 802 + }, + { + "epoch": 0.2017968210089841, + "grad_norm": 0.14950276911258698, + "learning_rate": 9.12302602764924e-05, + "loss": 1.2602, + "step": 803 + }, + { + "epoch": 0.20204812464660427, + "grad_norm": 0.4614298641681671, + "learning_rate": 9.120763032499242e-05, + "loss": 2.2327, + "step": 804 + }, + { + "epoch": 0.20229942828422443, + "grad_norm": 0.249393031001091, + "learning_rate": 9.118497402757631e-05, + "loss": 1.8148, + "step": 805 + }, + { + "epoch": 0.20255073192184456, + "grad_norm": 0.41946738958358765, + "learning_rate": 9.116229139872922e-05, + "loss": 2.5221, + "step": 806 + }, + { + "epoch": 0.20280203555946472, + "grad_norm": 0.4390411078929901, + "learning_rate": 9.113958245295321e-05, + "loss": 2.1989, + "step": 807 + }, + { + "epoch": 0.20305333919708488, + "grad_norm": 0.5600268244743347, + "learning_rate": 9.111684720476717e-05, + "loss": 2.6773, + "step": 808 + }, + { + "epoch": 0.20330464283470503, + "grad_norm": 0.2843821346759796, + "learning_rate": 9.109408566870673e-05, + "loss": 1.9472, + "step": 809 + }, + { + "epoch": 0.2035559464723252, + "grad_norm": 0.3715212643146515, + "learning_rate": 9.107129785932443e-05, + "loss": 2.1466, + "step": 810 + }, + { + "epoch": 0.20380725010994535, + "grad_norm": 0.8551393151283264, + "learning_rate": 9.10484837911895e-05, + "loss": 2.199, + "step": 811 + }, + { + "epoch": 0.20405855374756549, + "grad_norm": 0.5052684545516968, + "learning_rate": 9.102564347888806e-05, + "loss": 1.9972, + "step": 812 + }, + { + "epoch": 0.20430985738518564, + "grad_norm": 0.24479907751083374, + "learning_rate": 9.100277693702294e-05, + "loss": 2.3708, + "step": 813 + }, + { + "epoch": 0.2045611610228058, + "grad_norm": 0.42139294743537903, + "learning_rate": 9.097988418021377e-05, + "loss": 1.9225, + "step": 814 + }, + { + "epoch": 0.20481246466042596, + "grad_norm": 0.342489629983902, + "learning_rate": 9.095696522309693e-05, + "loss": 2.7236, + "step": 815 + }, + { + "epoch": 0.20506376829804612, + "grad_norm": 0.47755831480026245, + "learning_rate": 9.093402008032554e-05, + "loss": 2.2168, + "step": 816 + }, + { + "epoch": 0.20531507193566628, + "grad_norm": 0.32807457447052, + "learning_rate": 9.09110487665695e-05, + "loss": 2.4625, + "step": 817 + }, + { + "epoch": 0.20556637557328641, + "grad_norm": 0.3581337034702301, + "learning_rate": 9.088805129651542e-05, + "loss": 2.6607, + "step": 818 + }, + { + "epoch": 0.20581767921090657, + "grad_norm": 0.24006719887256622, + "learning_rate": 9.08650276848666e-05, + "loss": 2.0933, + "step": 819 + }, + { + "epoch": 0.20606898284852673, + "grad_norm": 0.22163568437099457, + "learning_rate": 9.084197794634312e-05, + "loss": 2.0709, + "step": 820 + }, + { + "epoch": 0.2063202864861469, + "grad_norm": 0.20495516061782837, + "learning_rate": 9.081890209568169e-05, + "loss": 1.8137, + "step": 821 + }, + { + "epoch": 0.20657159012376705, + "grad_norm": 0.33006590604782104, + "learning_rate": 9.079580014763579e-05, + "loss": 2.242, + "step": 822 + }, + { + "epoch": 0.20682289376138718, + "grad_norm": 0.31708237528800964, + "learning_rate": 9.077267211697554e-05, + "loss": 2.5707, + "step": 823 + }, + { + "epoch": 0.20707419739900734, + "grad_norm": 0.3039303719997406, + "learning_rate": 9.07495180184877e-05, + "loss": 2.4013, + "step": 824 + }, + { + "epoch": 0.2073255010366275, + "grad_norm": 0.3243713974952698, + "learning_rate": 9.072633786697581e-05, + "loss": 1.9324, + "step": 825 + }, + { + "epoch": 0.20757680467424766, + "grad_norm": 0.376941055059433, + "learning_rate": 9.070313167725995e-05, + "loss": 1.9856, + "step": 826 + }, + { + "epoch": 0.20782810831186782, + "grad_norm": 0.4256725013256073, + "learning_rate": 9.06798994641769e-05, + "loss": 1.9451, + "step": 827 + }, + { + "epoch": 0.20807941194948798, + "grad_norm": 0.21601825952529907, + "learning_rate": 9.06566412425801e-05, + "loss": 0.9348, + "step": 828 + }, + { + "epoch": 0.2083307155871081, + "grad_norm": 0.5165765881538391, + "learning_rate": 9.063335702733958e-05, + "loss": 2.3604, + "step": 829 + }, + { + "epoch": 0.20858201922472827, + "grad_norm": 0.4365144670009613, + "learning_rate": 9.061004683334196e-05, + "loss": 2.0167, + "step": 830 + }, + { + "epoch": 0.20883332286234843, + "grad_norm": 0.2237500250339508, + "learning_rate": 9.058671067549056e-05, + "loss": 1.7844, + "step": 831 + }, + { + "epoch": 0.2090846264999686, + "grad_norm": 0.26887792348861694, + "learning_rate": 9.056334856870522e-05, + "loss": 2.3547, + "step": 832 + }, + { + "epoch": 0.20933593013758875, + "grad_norm": 0.21619755029678345, + "learning_rate": 9.053996052792244e-05, + "loss": 2.0557, + "step": 833 + }, + { + "epoch": 0.2095872337752089, + "grad_norm": 0.5002549886703491, + "learning_rate": 9.051654656809521e-05, + "loss": 1.962, + "step": 834 + }, + { + "epoch": 0.20983853741282904, + "grad_norm": 0.3360225260257721, + "learning_rate": 9.049310670419316e-05, + "loss": 1.9531, + "step": 835 + }, + { + "epoch": 0.2100898410504492, + "grad_norm": 0.24657025933265686, + "learning_rate": 9.046964095120248e-05, + "loss": 1.2244, + "step": 836 + }, + { + "epoch": 0.21034114468806936, + "grad_norm": 0.29951533675193787, + "learning_rate": 9.044614932412587e-05, + "loss": 1.9471, + "step": 837 + }, + { + "epoch": 0.21059244832568952, + "grad_norm": 0.3678789734840393, + "learning_rate": 9.04226318379826e-05, + "loss": 2.3963, + "step": 838 + }, + { + "epoch": 0.21084375196330968, + "grad_norm": 0.45650580525398254, + "learning_rate": 9.03990885078085e-05, + "loss": 2.342, + "step": 839 + }, + { + "epoch": 0.21109505560092984, + "grad_norm": 0.4444562792778015, + "learning_rate": 9.037551934865587e-05, + "loss": 1.9851, + "step": 840 + }, + { + "epoch": 0.21134635923854997, + "grad_norm": 0.2063484787940979, + "learning_rate": 9.035192437559354e-05, + "loss": 2.2532, + "step": 841 + }, + { + "epoch": 0.21159766287617013, + "grad_norm": 0.3520076274871826, + "learning_rate": 9.032830360370688e-05, + "loss": 1.6042, + "step": 842 + }, + { + "epoch": 0.2118489665137903, + "grad_norm": 0.4106435477733612, + "learning_rate": 9.03046570480977e-05, + "loss": 2.0197, + "step": 843 + }, + { + "epoch": 0.21210027015141045, + "grad_norm": 0.37212634086608887, + "learning_rate": 9.028098472388433e-05, + "loss": 2.1224, + "step": 844 + }, + { + "epoch": 0.2123515737890306, + "grad_norm": 0.36991527676582336, + "learning_rate": 9.025728664620157e-05, + "loss": 2.5759, + "step": 845 + }, + { + "epoch": 0.21260287742665074, + "grad_norm": 0.3646388649940491, + "learning_rate": 9.023356283020067e-05, + "loss": 2.2962, + "step": 846 + }, + { + "epoch": 0.2128541810642709, + "grad_norm": 0.21754935383796692, + "learning_rate": 9.020981329104936e-05, + "loss": 1.7198, + "step": 847 + }, + { + "epoch": 0.21310548470189106, + "grad_norm": 0.24025185406208038, + "learning_rate": 9.01860380439318e-05, + "loss": 1.271, + "step": 848 + }, + { + "epoch": 0.21335678833951122, + "grad_norm": 0.42848479747772217, + "learning_rate": 9.016223710404856e-05, + "loss": 2.3559, + "step": 849 + }, + { + "epoch": 0.21360809197713138, + "grad_norm": 0.4237200915813446, + "learning_rate": 9.013841048661673e-05, + "loss": 2.2002, + "step": 850 + }, + { + "epoch": 0.21385939561475154, + "grad_norm": 0.31660404801368713, + "learning_rate": 9.01145582068697e-05, + "loss": 2.441, + "step": 851 + }, + { + "epoch": 0.21411069925237167, + "grad_norm": 0.4812658131122589, + "learning_rate": 9.009068028005732e-05, + "loss": 2.2861, + "step": 852 + }, + { + "epoch": 0.21436200288999183, + "grad_norm": 0.37174031138420105, + "learning_rate": 9.006677672144586e-05, + "loss": 2.4933, + "step": 853 + }, + { + "epoch": 0.21461330652761199, + "grad_norm": 0.5248540639877319, + "learning_rate": 9.004284754631793e-05, + "loss": 1.9397, + "step": 854 + }, + { + "epoch": 0.21486461016523214, + "grad_norm": 0.2802974283695221, + "learning_rate": 9.001889276997258e-05, + "loss": 2.2688, + "step": 855 + }, + { + "epoch": 0.2151159138028523, + "grad_norm": 0.4122345745563507, + "learning_rate": 8.999491240772516e-05, + "loss": 2.1688, + "step": 856 + }, + { + "epoch": 0.21536721744047246, + "grad_norm": 0.49358898401260376, + "learning_rate": 8.99709064749074e-05, + "loss": 2.0717, + "step": 857 + }, + { + "epoch": 0.2156185210780926, + "grad_norm": 0.415002703666687, + "learning_rate": 8.994687498686742e-05, + "loss": 2.4572, + "step": 858 + }, + { + "epoch": 0.21586982471571275, + "grad_norm": 0.3565453290939331, + "learning_rate": 8.992281795896962e-05, + "loss": 2.2275, + "step": 859 + }, + { + "epoch": 0.21612112835333291, + "grad_norm": 0.25147542357444763, + "learning_rate": 8.989873540659476e-05, + "loss": 1.6368, + "step": 860 + }, + { + "epoch": 0.21637243199095307, + "grad_norm": 0.330954372882843, + "learning_rate": 8.987462734513993e-05, + "loss": 1.6743, + "step": 861 + }, + { + "epoch": 0.21662373562857323, + "grad_norm": 0.3177106976509094, + "learning_rate": 8.985049379001849e-05, + "loss": 2.373, + "step": 862 + }, + { + "epoch": 0.21687503926619336, + "grad_norm": 0.4051658511161804, + "learning_rate": 8.982633475666014e-05, + "loss": 2.504, + "step": 863 + }, + { + "epoch": 0.21712634290381352, + "grad_norm": 0.1954205185174942, + "learning_rate": 8.980215026051083e-05, + "loss": 1.8698, + "step": 864 + }, + { + "epoch": 0.21737764654143368, + "grad_norm": 0.31402403116226196, + "learning_rate": 8.977794031703282e-05, + "loss": 2.2363, + "step": 865 + }, + { + "epoch": 0.21762895017905384, + "grad_norm": 0.28115931153297424, + "learning_rate": 8.975370494170463e-05, + "loss": 2.3547, + "step": 866 + }, + { + "epoch": 0.217880253816674, + "grad_norm": 0.4603864252567291, + "learning_rate": 8.972944415002105e-05, + "loss": 2.3678, + "step": 867 + }, + { + "epoch": 0.21813155745429416, + "grad_norm": 0.21994365751743317, + "learning_rate": 8.97051579574931e-05, + "loss": 2.2028, + "step": 868 + }, + { + "epoch": 0.2183828610919143, + "grad_norm": 0.3067527711391449, + "learning_rate": 8.968084637964804e-05, + "loss": 2.6017, + "step": 869 + }, + { + "epoch": 0.21863416472953445, + "grad_norm": 0.33018767833709717, + "learning_rate": 8.96565094320294e-05, + "loss": 2.3301, + "step": 870 + }, + { + "epoch": 0.2188854683671546, + "grad_norm": 0.6793331503868103, + "learning_rate": 8.963214713019687e-05, + "loss": 2.7803, + "step": 871 + }, + { + "epoch": 0.21913677200477477, + "grad_norm": 0.3478843569755554, + "learning_rate": 8.96077594897264e-05, + "loss": 2.2915, + "step": 872 + }, + { + "epoch": 0.21938807564239493, + "grad_norm": 0.400673508644104, + "learning_rate": 8.95833465262101e-05, + "loss": 1.2611, + "step": 873 + }, + { + "epoch": 0.2196393792800151, + "grad_norm": 0.27117395401000977, + "learning_rate": 8.955890825525631e-05, + "loss": 2.6418, + "step": 874 + }, + { + "epoch": 0.21989068291763522, + "grad_norm": 0.41636621952056885, + "learning_rate": 8.953444469248952e-05, + "loss": 2.0555, + "step": 875 + }, + { + "epoch": 0.22014198655525538, + "grad_norm": 0.5339227318763733, + "learning_rate": 8.95099558535504e-05, + "loss": 2.4363, + "step": 876 + }, + { + "epoch": 0.22039329019287554, + "grad_norm": 0.4329914450645447, + "learning_rate": 8.948544175409579e-05, + "loss": 2.198, + "step": 877 + }, + { + "epoch": 0.2206445938304957, + "grad_norm": 0.377668559551239, + "learning_rate": 8.946090240979865e-05, + "loss": 2.2962, + "step": 878 + }, + { + "epoch": 0.22089589746811586, + "grad_norm": 0.3951661288738251, + "learning_rate": 8.943633783634813e-05, + "loss": 2.1264, + "step": 879 + }, + { + "epoch": 0.22114720110573602, + "grad_norm": 0.36469566822052, + "learning_rate": 8.941174804944948e-05, + "loss": 2.6947, + "step": 880 + }, + { + "epoch": 0.22139850474335615, + "grad_norm": 0.21884319186210632, + "learning_rate": 8.938713306482403e-05, + "loss": 1.9526, + "step": 881 + }, + { + "epoch": 0.2216498083809763, + "grad_norm": 0.38323378562927246, + "learning_rate": 8.936249289820931e-05, + "loss": 2.1726, + "step": 882 + }, + { + "epoch": 0.22190111201859647, + "grad_norm": 0.653200089931488, + "learning_rate": 8.933782756535887e-05, + "loss": 2.505, + "step": 883 + }, + { + "epoch": 0.22215241565621663, + "grad_norm": 0.41666847467422485, + "learning_rate": 8.931313708204239e-05, + "loss": 2.218, + "step": 884 + }, + { + "epoch": 0.2224037192938368, + "grad_norm": 0.3992173373699188, + "learning_rate": 8.928842146404562e-05, + "loss": 2.4002, + "step": 885 + }, + { + "epoch": 0.22265502293145692, + "grad_norm": 0.5349919199943542, + "learning_rate": 8.92636807271704e-05, + "loss": 1.5443, + "step": 886 + }, + { + "epoch": 0.22290632656907708, + "grad_norm": 0.3590314984321594, + "learning_rate": 8.923891488723459e-05, + "loss": 2.3424, + "step": 887 + }, + { + "epoch": 0.22315763020669724, + "grad_norm": 0.4399639666080475, + "learning_rate": 8.921412396007212e-05, + "loss": 2.3039, + "step": 888 + }, + { + "epoch": 0.2234089338443174, + "grad_norm": 0.476307213306427, + "learning_rate": 8.918930796153297e-05, + "loss": 2.0807, + "step": 889 + }, + { + "epoch": 0.22366023748193756, + "grad_norm": 0.40012243390083313, + "learning_rate": 8.916446690748315e-05, + "loss": 2.2535, + "step": 890 + }, + { + "epoch": 0.22391154111955772, + "grad_norm": 0.4795278012752533, + "learning_rate": 8.913960081380465e-05, + "loss": 1.9693, + "step": 891 + }, + { + "epoch": 0.22416284475717785, + "grad_norm": 0.1798836588859558, + "learning_rate": 8.911470969639551e-05, + "loss": 0.6201, + "step": 892 + }, + { + "epoch": 0.224414148394798, + "grad_norm": 0.8967116475105286, + "learning_rate": 8.908979357116976e-05, + "loss": 2.3321, + "step": 893 + }, + { + "epoch": 0.22466545203241817, + "grad_norm": 0.363643616437912, + "learning_rate": 8.90648524540574e-05, + "loss": 2.2787, + "step": 894 + }, + { + "epoch": 0.22491675567003833, + "grad_norm": 0.46721211075782776, + "learning_rate": 8.903988636100445e-05, + "loss": 2.05, + "step": 895 + }, + { + "epoch": 0.22516805930765849, + "grad_norm": 0.41919320821762085, + "learning_rate": 8.901489530797282e-05, + "loss": 2.2821, + "step": 896 + }, + { + "epoch": 0.22541936294527865, + "grad_norm": 0.29892247915267944, + "learning_rate": 8.898987931094049e-05, + "loss": 2.3336, + "step": 897 + }, + { + "epoch": 0.22567066658289878, + "grad_norm": 0.41926395893096924, + "learning_rate": 8.896483838590131e-05, + "loss": 2.1726, + "step": 898 + }, + { + "epoch": 0.22592197022051894, + "grad_norm": 0.3506767451763153, + "learning_rate": 8.893977254886505e-05, + "loss": 1.8011, + "step": 899 + }, + { + "epoch": 0.2261732738581391, + "grad_norm": 0.39030522108078003, + "learning_rate": 8.891468181585747e-05, + "loss": 1.962, + "step": 900 + }, + { + "epoch": 0.22642457749575925, + "grad_norm": 0.40221068263053894, + "learning_rate": 8.888956620292022e-05, + "loss": 2.2385, + "step": 901 + }, + { + "epoch": 0.22667588113337941, + "grad_norm": 0.312210351228714, + "learning_rate": 8.886442572611087e-05, + "loss": 2.3336, + "step": 902 + }, + { + "epoch": 0.22692718477099957, + "grad_norm": 0.1972939372062683, + "learning_rate": 8.883926040150283e-05, + "loss": 1.8672, + "step": 903 + }, + { + "epoch": 0.2271784884086197, + "grad_norm": 0.30837222933769226, + "learning_rate": 8.881407024518548e-05, + "loss": 2.1923, + "step": 904 + }, + { + "epoch": 0.22742979204623986, + "grad_norm": 0.32954657077789307, + "learning_rate": 8.8788855273264e-05, + "loss": 2.53, + "step": 905 + }, + { + "epoch": 0.22768109568386002, + "grad_norm": 0.4983203113079071, + "learning_rate": 8.87636155018595e-05, + "loss": 1.7686, + "step": 906 + }, + { + "epoch": 0.22793239932148018, + "grad_norm": 0.29157817363739014, + "learning_rate": 8.873835094710891e-05, + "loss": 2.0444, + "step": 907 + }, + { + "epoch": 0.22818370295910034, + "grad_norm": 0.7063765525817871, + "learning_rate": 8.8713061625165e-05, + "loss": 2.3444, + "step": 908 + }, + { + "epoch": 0.22843500659672047, + "grad_norm": 0.2752906382083893, + "learning_rate": 8.868774755219641e-05, + "loss": 2.2363, + "step": 909 + }, + { + "epoch": 0.22868631023434063, + "grad_norm": 0.4020974338054657, + "learning_rate": 8.866240874438755e-05, + "loss": 2.2792, + "step": 910 + }, + { + "epoch": 0.2289376138719608, + "grad_norm": 0.3203171193599701, + "learning_rate": 8.863704521793869e-05, + "loss": 2.1895, + "step": 911 + }, + { + "epoch": 0.22918891750958095, + "grad_norm": 0.7605288028717041, + "learning_rate": 8.861165698906589e-05, + "loss": 2.2526, + "step": 912 + }, + { + "epoch": 0.2294402211472011, + "grad_norm": 0.4583891034126282, + "learning_rate": 8.8586244074001e-05, + "loss": 2.5682, + "step": 913 + }, + { + "epoch": 0.22969152478482127, + "grad_norm": 0.29976171255111694, + "learning_rate": 8.856080648899163e-05, + "loss": 1.8839, + "step": 914 + }, + { + "epoch": 0.2299428284224414, + "grad_norm": 0.4007084369659424, + "learning_rate": 8.853534425030123e-05, + "loss": 2.2063, + "step": 915 + }, + { + "epoch": 0.23019413206006156, + "grad_norm": 0.37610113620758057, + "learning_rate": 8.850985737420896e-05, + "loss": 2.5636, + "step": 916 + }, + { + "epoch": 0.23044543569768172, + "grad_norm": 0.3937523663043976, + "learning_rate": 8.84843458770097e-05, + "loss": 2.2865, + "step": 917 + }, + { + "epoch": 0.23069673933530188, + "grad_norm": 0.44338393211364746, + "learning_rate": 8.845880977501419e-05, + "loss": 2.2287, + "step": 918 + }, + { + "epoch": 0.23094804297292204, + "grad_norm": 0.4235187768936157, + "learning_rate": 8.843324908454875e-05, + "loss": 2.3839, + "step": 919 + }, + { + "epoch": 0.2311993466105422, + "grad_norm": 0.5745797157287598, + "learning_rate": 8.840766382195553e-05, + "loss": 1.9735, + "step": 920 + }, + { + "epoch": 0.23145065024816233, + "grad_norm": 0.35707423090934753, + "learning_rate": 8.838205400359234e-05, + "loss": 1.8084, + "step": 921 + }, + { + "epoch": 0.2317019538857825, + "grad_norm": 0.323047935962677, + "learning_rate": 8.835641964583272e-05, + "loss": 2.422, + "step": 922 + }, + { + "epoch": 0.23195325752340265, + "grad_norm": 0.4362463355064392, + "learning_rate": 8.833076076506588e-05, + "loss": 2.5153, + "step": 923 + }, + { + "epoch": 0.2322045611610228, + "grad_norm": 0.3632015883922577, + "learning_rate": 8.830507737769669e-05, + "loss": 2.7776, + "step": 924 + }, + { + "epoch": 0.23245586479864297, + "grad_norm": 0.48653021454811096, + "learning_rate": 8.827936950014573e-05, + "loss": 2.3679, + "step": 925 + }, + { + "epoch": 0.23270716843626313, + "grad_norm": 0.37184858322143555, + "learning_rate": 8.825363714884922e-05, + "loss": 1.9735, + "step": 926 + }, + { + "epoch": 0.23295847207388326, + "grad_norm": 0.38167500495910645, + "learning_rate": 8.822788034025903e-05, + "loss": 2.8957, + "step": 927 + }, + { + "epoch": 0.23320977571150342, + "grad_norm": 0.4858744144439697, + "learning_rate": 8.820209909084265e-05, + "loss": 2.4265, + "step": 928 + }, + { + "epoch": 0.23346107934912358, + "grad_norm": 0.4078523516654968, + "learning_rate": 8.81762934170832e-05, + "loss": 2.0574, + "step": 929 + }, + { + "epoch": 0.23371238298674374, + "grad_norm": 0.3559398353099823, + "learning_rate": 8.815046333547943e-05, + "loss": 2.2669, + "step": 930 + }, + { + "epoch": 0.2339636866243639, + "grad_norm": 0.4299301207065582, + "learning_rate": 8.81246088625457e-05, + "loss": 2.1053, + "step": 931 + }, + { + "epoch": 0.23421499026198403, + "grad_norm": 0.40756756067276, + "learning_rate": 8.809873001481193e-05, + "loss": 2.3635, + "step": 932 + }, + { + "epoch": 0.2344662938996042, + "grad_norm": 0.37873560190200806, + "learning_rate": 8.807282680882367e-05, + "loss": 1.0149, + "step": 933 + }, + { + "epoch": 0.23471759753722435, + "grad_norm": 0.40375521779060364, + "learning_rate": 8.8046899261142e-05, + "loss": 2.5615, + "step": 934 + }, + { + "epoch": 0.2349689011748445, + "grad_norm": 0.4414771497249603, + "learning_rate": 8.802094738834361e-05, + "loss": 2.513, + "step": 935 + }, + { + "epoch": 0.23522020481246467, + "grad_norm": 0.40930548310279846, + "learning_rate": 8.799497120702069e-05, + "loss": 1.5781, + "step": 936 + }, + { + "epoch": 0.23547150845008483, + "grad_norm": 0.4570627808570862, + "learning_rate": 8.7968970733781e-05, + "loss": 2.2293, + "step": 937 + }, + { + "epoch": 0.23572281208770496, + "grad_norm": 0.38974353671073914, + "learning_rate": 8.794294598524784e-05, + "loss": 1.959, + "step": 938 + }, + { + "epoch": 0.23597411572532512, + "grad_norm": 0.5479612946510315, + "learning_rate": 8.791689697806e-05, + "loss": 2.0345, + "step": 939 + }, + { + "epoch": 0.23622541936294528, + "grad_norm": 0.35705479979515076, + "learning_rate": 8.789082372887183e-05, + "loss": 2.2542, + "step": 940 + }, + { + "epoch": 0.23647672300056544, + "grad_norm": 0.4570086896419525, + "learning_rate": 8.786472625435311e-05, + "loss": 2.4306, + "step": 941 + }, + { + "epoch": 0.2367280266381856, + "grad_norm": 1.2224934101104736, + "learning_rate": 8.783860457118918e-05, + "loss": 2.031, + "step": 942 + }, + { + "epoch": 0.23697933027580576, + "grad_norm": 0.47019338607788086, + "learning_rate": 8.781245869608077e-05, + "loss": 2.5845, + "step": 943 + }, + { + "epoch": 0.2372306339134259, + "grad_norm": 0.4843159019947052, + "learning_rate": 8.778628864574419e-05, + "loss": 2.2976, + "step": 944 + }, + { + "epoch": 0.23748193755104605, + "grad_norm": 0.44156232476234436, + "learning_rate": 8.776009443691109e-05, + "loss": 1.7753, + "step": 945 + }, + { + "epoch": 0.2377332411886662, + "grad_norm": 0.4849649667739868, + "learning_rate": 8.773387608632867e-05, + "loss": 2.4643, + "step": 946 + }, + { + "epoch": 0.23798454482628636, + "grad_norm": 0.47490194439888, + "learning_rate": 8.770763361075949e-05, + "loss": 1.9425, + "step": 947 + }, + { + "epoch": 0.23823584846390652, + "grad_norm": 0.42135703563690186, + "learning_rate": 8.768136702698158e-05, + "loss": 2.2653, + "step": 948 + }, + { + "epoch": 0.23848715210152666, + "grad_norm": 0.3938981294631958, + "learning_rate": 8.765507635178832e-05, + "loss": 1.8336, + "step": 949 + }, + { + "epoch": 0.23873845573914682, + "grad_norm": 0.3590041399002075, + "learning_rate": 8.762876160198858e-05, + "loss": 2.083, + "step": 950 + }, + { + "epoch": 0.23898975937676697, + "grad_norm": 10.042701721191406, + "learning_rate": 8.760242279440657e-05, + "loss": 1.9433, + "step": 951 + }, + { + "epoch": 0.23924106301438713, + "grad_norm": 0.4117979407310486, + "learning_rate": 8.75760599458819e-05, + "loss": 1.9981, + "step": 952 + }, + { + "epoch": 0.2394923666520073, + "grad_norm": 0.3976224660873413, + "learning_rate": 8.754967307326951e-05, + "loss": 1.9221, + "step": 953 + }, + { + "epoch": 0.23974367028962745, + "grad_norm": 0.4007420241832733, + "learning_rate": 8.752326219343977e-05, + "loss": 2.1583, + "step": 954 + }, + { + "epoch": 0.23999497392724758, + "grad_norm": 0.43828290700912476, + "learning_rate": 8.74968273232783e-05, + "loss": 2.0958, + "step": 955 + }, + { + "epoch": 0.24024627756486774, + "grad_norm": 0.3099430501461029, + "learning_rate": 8.747036847968618e-05, + "loss": 2.0985, + "step": 956 + }, + { + "epoch": 0.2404975812024879, + "grad_norm": 0.2864762842655182, + "learning_rate": 8.744388567957971e-05, + "loss": 1.9034, + "step": 957 + }, + { + "epoch": 0.24074888484010806, + "grad_norm": 0.38019657135009766, + "learning_rate": 8.741737893989058e-05, + "loss": 2.2289, + "step": 958 + }, + { + "epoch": 0.24100018847772822, + "grad_norm": 0.506572425365448, + "learning_rate": 8.739084827756575e-05, + "loss": 2.3025, + "step": 959 + }, + { + "epoch": 0.24125149211534838, + "grad_norm": 0.4378896653652191, + "learning_rate": 8.736429370956746e-05, + "loss": 1.9396, + "step": 960 + }, + { + "epoch": 0.2415027957529685, + "grad_norm": 0.36668267846107483, + "learning_rate": 8.733771525287331e-05, + "loss": 2.7244, + "step": 961 + }, + { + "epoch": 0.24175409939058867, + "grad_norm": 0.3157023787498474, + "learning_rate": 8.731111292447605e-05, + "loss": 2.2407, + "step": 962 + }, + { + "epoch": 0.24200540302820883, + "grad_norm": 0.2563331127166748, + "learning_rate": 8.728448674138381e-05, + "loss": 2.4275, + "step": 963 + }, + { + "epoch": 0.242256706665829, + "grad_norm": 0.49870792031288147, + "learning_rate": 8.72578367206199e-05, + "loss": 2.0975, + "step": 964 + }, + { + "epoch": 0.24250801030344915, + "grad_norm": 0.328143835067749, + "learning_rate": 8.723116287922288e-05, + "loss": 2.0371, + "step": 965 + }, + { + "epoch": 0.2427593139410693, + "grad_norm": 0.40886473655700684, + "learning_rate": 8.72044652342466e-05, + "loss": 2.294, + "step": 966 + }, + { + "epoch": 0.24301061757868944, + "grad_norm": 0.39602747559547424, + "learning_rate": 8.717774380276002e-05, + "loss": 2.067, + "step": 967 + }, + { + "epoch": 0.2432619212163096, + "grad_norm": 0.33269646763801575, + "learning_rate": 8.715099860184743e-05, + "loss": 2.2003, + "step": 968 + }, + { + "epoch": 0.24351322485392976, + "grad_norm": 0.4260612726211548, + "learning_rate": 8.712422964860822e-05, + "loss": 2.4808, + "step": 969 + }, + { + "epoch": 0.24376452849154992, + "grad_norm": 0.3412139415740967, + "learning_rate": 8.7097436960157e-05, + "loss": 2.3457, + "step": 970 + }, + { + "epoch": 0.24401583212917008, + "grad_norm": 0.402170866727829, + "learning_rate": 8.707062055362359e-05, + "loss": 2.4638, + "step": 971 + }, + { + "epoch": 0.2442671357667902, + "grad_norm": 0.2597676217556, + "learning_rate": 8.70437804461529e-05, + "loss": 1.7648, + "step": 972 + }, + { + "epoch": 0.24451843940441037, + "grad_norm": 0.424844354391098, + "learning_rate": 8.701691665490504e-05, + "loss": 2.276, + "step": 973 + }, + { + "epoch": 0.24476974304203053, + "grad_norm": 0.1249924823641777, + "learning_rate": 8.699002919705527e-05, + "loss": 0.5754, + "step": 974 + }, + { + "epoch": 0.2450210466796507, + "grad_norm": 0.14425934851169586, + "learning_rate": 8.696311808979397e-05, + "loss": 0.9444, + "step": 975 + }, + { + "epoch": 0.24527235031727085, + "grad_norm": 0.5581234693527222, + "learning_rate": 8.693618335032663e-05, + "loss": 2.2831, + "step": 976 + }, + { + "epoch": 0.245523653954891, + "grad_norm": 0.17069493234157562, + "learning_rate": 8.690922499587387e-05, + "loss": 1.0646, + "step": 977 + }, + { + "epoch": 0.24577495759251114, + "grad_norm": 0.3384755551815033, + "learning_rate": 8.688224304367137e-05, + "loss": 1.8558, + "step": 978 + }, + { + "epoch": 0.2460262612301313, + "grad_norm": 0.3919682204723358, + "learning_rate": 8.685523751096994e-05, + "loss": 2.5674, + "step": 979 + }, + { + "epoch": 0.24627756486775146, + "grad_norm": 0.25237560272216797, + "learning_rate": 8.682820841503542e-05, + "loss": 1.7601, + "step": 980 + }, + { + "epoch": 0.24652886850537162, + "grad_norm": 0.3378541171550751, + "learning_rate": 8.680115577314877e-05, + "loss": 2.5141, + "step": 981 + }, + { + "epoch": 0.24678017214299178, + "grad_norm": 0.4153686761856079, + "learning_rate": 8.6774079602606e-05, + "loss": 1.1121, + "step": 982 + }, + { + "epoch": 0.24703147578061194, + "grad_norm": 0.3883324861526489, + "learning_rate": 8.67469799207181e-05, + "loss": 1.8358, + "step": 983 + }, + { + "epoch": 0.24728277941823207, + "grad_norm": 0.35025763511657715, + "learning_rate": 8.671985674481113e-05, + "loss": 1.7407, + "step": 984 + }, + { + "epoch": 0.24753408305585223, + "grad_norm": 0.1769835203886032, + "learning_rate": 8.66927100922262e-05, + "loss": 1.1858, + "step": 985 + }, + { + "epoch": 0.2477853866934724, + "grad_norm": 0.4593118131160736, + "learning_rate": 8.66655399803194e-05, + "loss": 2.4193, + "step": 986 + }, + { + "epoch": 0.24803669033109255, + "grad_norm": 0.24635930359363556, + "learning_rate": 8.663834642646178e-05, + "loss": 2.3587, + "step": 987 + }, + { + "epoch": 0.2482879939687127, + "grad_norm": 0.3709847927093506, + "learning_rate": 8.661112944803946e-05, + "loss": 2.5285, + "step": 988 + }, + { + "epoch": 0.24853929760633287, + "grad_norm": 0.3426840603351593, + "learning_rate": 8.65838890624535e-05, + "loss": 1.9006, + "step": 989 + }, + { + "epoch": 0.248790601243953, + "grad_norm": 0.29686790704727173, + "learning_rate": 8.655662528711987e-05, + "loss": 1.9789, + "step": 990 + }, + { + "epoch": 0.24904190488157316, + "grad_norm": 0.15335550904273987, + "learning_rate": 8.65293381394696e-05, + "loss": 1.379, + "step": 991 + }, + { + "epoch": 0.24929320851919332, + "grad_norm": 0.31200841069221497, + "learning_rate": 8.650202763694856e-05, + "loss": 2.6586, + "step": 992 + }, + { + "epoch": 0.24954451215681347, + "grad_norm": 0.28087717294692993, + "learning_rate": 8.647469379701765e-05, + "loss": 2.1206, + "step": 993 + }, + { + "epoch": 0.24979581579443363, + "grad_norm": 0.4623563289642334, + "learning_rate": 8.64473366371526e-05, + "loss": 2.3116, + "step": 994 + }, + { + "epoch": 0.25004711943205377, + "grad_norm": 0.3620995879173279, + "learning_rate": 8.641995617484411e-05, + "loss": 2.1746, + "step": 995 + }, + { + "epoch": 0.2502984230696739, + "grad_norm": 0.42614445090293884, + "learning_rate": 8.639255242759773e-05, + "loss": 2.1065, + "step": 996 + }, + { + "epoch": 0.2505497267072941, + "grad_norm": 0.5489148497581482, + "learning_rate": 8.636512541293396e-05, + "loss": 1.6889, + "step": 997 + }, + { + "epoch": 0.25080103034491424, + "grad_norm": 0.5294693112373352, + "learning_rate": 8.633767514838811e-05, + "loss": 1.9634, + "step": 998 + }, + { + "epoch": 0.2510523339825344, + "grad_norm": 0.2602441906929016, + "learning_rate": 8.631020165151041e-05, + "loss": 2.0264, + "step": 999 + }, + { + "epoch": 0.25130363762015456, + "grad_norm": 0.5113153457641602, + "learning_rate": 8.62827049398659e-05, + "loss": 2.47, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 3979, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.536126594304095e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}